In [1]:
import pandas as pd
import numpy as np

# --- 1. Load the dataset with the redundancy issue ---
# Reads the unified CSV into `df`; the sections below split and re-merge it.
source_file = 'final_unified_dataset.csv'

print(f"--- Loading data from '{source_file}' ---")
try:
    df = pd.read_csv(source_file)
except FileNotFoundError:
    print(f"ERROR: The file '{source_file}' was not found.")
    print("Please make sure the CSV file is in the same directory as your notebook.")
    # Re-raise instead of calling exit(): inside a Jupyter kernel exit() shuts the
    # kernel down (losing all state), whereas an exception simply stops this cell
    # and any "Run All" in progress.
    raise
else:
    print("File loaded successfully.")

# --- 2. Correct the Redundancy ---
# Split the table into month-level business metrics and per-channel metrics,
# de-duplicate the month-level part, then re-attach it to the channel rows.

print("\n--- Correcting data redundancy ---")

# Columns expected to hold one value per (YEAR, MONTH), repeated on every channel row.
business_level_cols = ['YEAR', 'MONTH', 'Total_Returns', 'Total_Revenue']

# Columns that vary per channel within a month.
channel_level_cols = ['YEAR', 'MONTH', 'CHANNEL', 'Spend', 'Orders_Existing',
                      'Customers_Existing', 'Orders_New', 'Customers_New']

# One row per distinct combination of the business-level columns. This is one row
# per month only if the totals really are identical on every channel row.
business_df = df[business_level_cols].drop_duplicates()
print("Created clean business-level summary.")

# Channel-specific metrics, one row per original row.
channel_df = df[channel_level_cols]
print("Created channel-level performance summary.")

# Re-attach the monthly totals to each channel row.
# validate='many_to_one' makes pandas raise MergeError if business_df holds more
# than one row for a (YEAR, MONTH) pair; without it, conflicting totals within a
# month would silently multiply the channel rows.
# NOTE: the merged frame still repeats Total_Returns / Total_Revenue on every
# channel row of a month, so do not sum those columns across rows; take monthly
# totals from business_df instead.
final_corrected_df = pd.merge(
    channel_df,
    business_df,
    on=['YEAR', 'MONTH'],
    how='left',
    validate='many_to_one',
)
print("Merged data back into a non-redundant structure.")

# --- 3. Final Verification and Saving ---
# Destination for the merged table.
output_filename = 'final_corrected_dataset.csv'

print("\n--- Verification and Saving ---")

# Replace every remaining NaN in the merged frame with 0 (the frame is modified in place).
final_corrected_df.fillna(value=0, inplace=True)

# Print a ten-row preview as plain text so the result can be eyeballed.
print("\n--- Corrected Data (First 10 Rows) ---")
preview_rows = final_corrected_df.head(n=10)
print(preview_rows.to_string())

# Write the full table to disk without the positional index column.
final_corrected_df.to_csv(output_filename, index=False)

print(f"\nSUCCESS: The corrected data has been saved to '{output_filename}'")


--- Loading data from 'final_unified_dataset.csv' ---
File loaded successfully.

--- Correcting data redundancy ---
Created clean business-level summary.
Created channel-level performance summary.
Merged data back into a non-redundant structure.

--- Verification and Saving ---

--- Corrected Data (First 10 Rows) ---
   YEAR    MONTH         CHANNEL  Spend  Orders_Existing  Customers_Existing  Orders_New  Customers_New  Total_Returns  Total_Revenue
0  2024  JANUARY       Affiliate    0.0            116.0                 0.0         0.0            0.0       -61116.9     1397391.57
1  2024  JANUARY          Direct    0.0            793.0                 0.0       800.0            0.0       -61116.9     1397391.57
2  2024  JANUARY         Display    0.0              0.0                58.0         0.0           24.0       -61116.9     1397391.57
3  2024  JANUARY           Email    0.0           1158.0                 0.0       271.0            0.0       -61116.9     1397391.57
4  2024  JA

In [7]:
import pandas as pd
import numpy as np

# --- 1. Load your manually preprocessed dataset ---
# Reads 'new.csv', normalizes the column names, and derives 'datetime', 'year'
# and 'month' columns from the first date-like column.
source_file = 'new.csv'

print(f"--- Loading your preprocessed data from '{source_file}' ---")
try:
    # The first row of the file is used as the header.
    df = pd.read_csv(source_file, encoding='latin-1', header=0)

    # Normalize column names: lowercase, and every character outside
    # [a-zA-Z0-9_] replaced with an underscore.
    df.columns = df.columns.str.lower().str.replace(r'[^a-zA-Z0-9_]', '_', regex=True)

    print("File loaded successfully.")

    # Pick the first column whose (normalized) name contains 'month' or 'date'.
    date_col = next((col for col in df.columns if 'month' in col or 'date' in col), None)
    if not date_col:
        raise ValueError("Could not find a date-like column (e.g., 'month' or 'date') in the CSV.")

    # Unparseable values become NaT rather than raising.
    df['datetime'] = pd.to_datetime(df[date_col], errors='coerce')

    # Drop rows whose date failed to parse BEFORE deriving year/month. Extracting
    # the year while NaT rows are still present forces the column to float, so the
    # saved CSV would contain values like 2024.0 instead of 2024.
    df = df.dropna(subset=['datetime']).copy()

    df['year'] = df['datetime'].dt.year
    # Upper-case month name; this overwrites the source column when it is itself
    # named 'month'.
    df['month'] = df['datetime'].dt.month_name().str.upper()
    print("Successfully extracted 'year' and 'month' columns.")

    print("Standardized columns found:", df.columns.tolist())

except (FileNotFoundError, ValueError) as e:
    print(f"ERROR: {e}")
    # Re-raise instead of calling exit(): inside a Jupyter kernel exit() shuts the
    # kernel down, whereas an exception simply stops this cell.
    raise

# --- 2. Correct the Redundancy ---
# Split the table into month-level business metrics and per-channel metrics,
# de-duplicate the month-level part, then re-attach it to the channel rows.

print("\n--- Correcting data redundancy ---")

# Columns treated as business-level (one value per year/month, repeated per channel).
business_level_cols = ['year', 'month', 'returns', 'gross_sales', 'net_sales']
# Keep only the ones actually present in the dataframe.
business_level_cols = [col for col in business_level_cols if col in df.columns]

# Everything else is channel-level. The merge keys and 'channel' lead the list;
# the helper 'datetime' column is left out.
channel_level_cols = ['year', 'month', 'channel'] + [
    col for col in df.columns
    if col not in business_level_cols and col not in ('channel', 'datetime')
]

# One row per distinct combination of the business-level columns. This is one row
# per month only if those values really are identical on every row of the month.
business_df = df[business_level_cols].drop_duplicates()
print("Created clean business-level summary.")

# Channel-specific metrics, one row per original row.
channel_df = df[channel_level_cols]
print("Created channel-level performance summary.")

# Re-attach the monthly values to each channel row.
# validate='many_to_one' makes pandas raise MergeError if business_df holds more
# than one row for a (year, month) pair (e.g. values that differ between rows of
# the same month); without it the channel rows would be silently multiplied.
# NOTE: the merged frame still repeats the business-level values on every channel
# row of a month, so do not sum those columns across rows.
final_corrected_df = pd.merge(
    channel_df,
    business_df,
    on=['year', 'month'],
    how='left',
    validate='many_to_one',
)
print("Merged data back into a non-redundant structure.")

# --- 3. Final Verification and Saving ---
# Destination for the merged table.
output_filename = 'final_master_dataset.csv'

print("\n--- Verification and Saving ---")

# Replace every remaining NaN in the merged frame with 0 (the frame is modified in place).
final_corrected_df.fillna(value=0, inplace=True)

# Drop rows whose channel label contains 'TOTAL' in any letter case.
if 'channel' in final_corrected_df.columns:
    is_summary_row = final_corrected_df['channel'].str.contains('TOTAL', case=False, na=False)
    final_corrected_df = final_corrected_df[~is_summary_row]


# Print a ten-row preview as plain text so the result can be eyeballed.
print("\n--- Final Corrected Data (First 10 Rows) ---")
preview_rows = final_corrected_df.head(n=10)
print(preview_rows.to_string())

# Write the full table to disk without the positional index column.
final_corrected_df.to_csv(output_filename, index=False)

print(f"\nSUCCESS: The definitive, non-redundant data has been saved to '{output_filename}'")


--- Loading your preprocessed data from 'new.csv' ---
File loaded successfully.
Successfully extracted 'year' and 'month' columns.
Standardized columns found: ['month', 'channel', 'ad_spend', 'gross_discount_shopify', 'sessions_mkt', 'ctr_mkt', 'clicks_mkt', 'conversion_rate_mkt', 'orders_mkt', 'new_customers_mkt', 'sessions_web', 'sessions_with_cart_additions', 'added_to_cart_rate', 'sessions_that_reached_checkout', 'reached_checkout_rate', 'sessions_that_completed_checkout', 'checkout_conversion_rate', 'conversion_rate_web', 'pageviews_per_session', 'pageviews', 'new_orders', 'existing_orders', 'new_customers', 'existing_customers', 'media_spend_by_channel', 'gross_sales', 'discounts', 'returns', 'net_sales', 'datetime', 'year']

--- Correcting data redundancy ---
Created clean business-level summary.
Created channel-level performance summary.
Merged data back into a non-redundant structure.

--- Verification and Saving ---

--- Final Corrected Data (First 10 Rows) ---
   year      m