In [8]:
import pandas as pd

# Find rows of the raw CSV export that are duplicated across every column and,
# if any exist, write a de-duplicated copy to a new CSV file.

# Both paths are named once, up front, so that every log and error message
# refers to the file that is actually read or written. Relative paths are
# resolved against the notebook's working directory.
raw_filename = '../raw_data/new.csv'
clean_filename = 'preprocessed_data_clean.csv'

try:
    # Load the raw CSV file
    df = pd.read_csv(raw_filename)
    print(f"Successfully loaded file. It has {df.shape[0]} rows.")

    # keep=False flags every member of a duplicate group (not only the repeats),
    # so `duplicates` holds all rows that have at least one identical twin.
    duplicates = df[df.duplicated(keep=False)]

    if duplicates.empty:
        # Nothing to remove; no clean file is written in this case.
        print("\n✅ SUCCESS: No duplicate rows were found in your file.")
        
    else:
        # One representative per group gives the number of distinct duplicated rows
        num_duplicate_sets = len(duplicates.drop_duplicates())
        num_total_duplicate_rows = len(duplicates)
        
        print(f"\n❗️ WARNING: Found {num_duplicate_sets} set(s) of duplicate rows, totaling {num_total_duplicate_rows} rows.")
        
        print("\nRemoving duplicates and creating a clean file...")
        
        # drop_duplicates() keeps the first occurrence of each duplicated row
        clean_df = df.drop_duplicates()
        
        # Save the clean DataFrame without the pandas index column
        clean_df.to_csv(clean_filename, index=False)
        
        print("-" * 50)
        print(f"✅ SUCCESS: A new, clean file has been saved as '{clean_filename}'")
        print(f"The clean file has {len(clean_df)} rows and is ready to be used.")
        print("-" * 50)


except FileNotFoundError:
    # Report the path that was actually requested, not a hard-coded name
    print(f"\nERROR: The file '{raw_filename}' was not found.")
except Exception as e:
    # Best-effort reporting: any other failure is printed rather than raised
    print(f"\nAn error occurred: {e}")



Successfully loaded file. It has 451 rows.


Removing duplicates and creating a clean file...
--------------------------------------------------
✅ SUCCESS: A new, clean file has been saved as 'preprocessed_data_clean.csv'
The clean file has 353 rows and is ready to be used.
--------------------------------------------------


In [9]:
# Read the raw CSV export again into its own frame, `df2`, independent of `df`.
df2 = pd.read_csv(filepath_or_buffer='../raw_data/new.csv')

In [10]:
# Bare last expression: renders `df2` as this cell's output.
df2

Unnamed: 0,month,channel,total_revenue,total_orders,aov,total_spend,new_customers,existing_customers,new_customer_orders,existing_customer_orders,...,users,spend,impressions,clicks,returned_value,technology_spend,revenue_share,returns_value_dist,technology_spend_dist,net_revenue
0,1/1/2024,Affiliate,64319.0,523.0,122.98,12864.0,306.0,102.0,310.0,213.0,...,2500.0,12864.0,643200.0,3216.0,25000.0,15000.0,0.05000,1250.0,750.0,63069.0
1,1/1/2024,Direct,195000.0,1500.0,130.00,0.0,800.0,1200.0,810.0,690.0,...,12000.0,0.0,0.0,0.0,25000.0,15000.0,0.15000,3750.0,2250.0,191250.0
2,1/1/2024,Email,280000.0,2500.0,112.00,5000.0,1000.0,4000.0,1010.0,1490.0,...,18000.0,5000.0,250000.0,25000.0,25000.0,15000.0,0.21000,5250.0,3150.0,274750.0
3,1/1/2024,Organic Social,120000.0,1100.0,109.09,2500.0,400.0,1600.0,405.0,695.0,...,8000.0,2500.0,125000.0,6250.0,25000.0,15000.0,0.09000,2250.0,1350.0,117750.0
4,1/1/2024,Paid Search,450000.0,3500.0,128.57,90000.0,2000.0,800.0,2020.0,1480.0,...,28000.0,90000.0,1800000.0,36000.0,25000.0,15000.0,0.34000,8500.0,5100.0,441500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446,12/1/2023,Direct,240000.0,1950.0,123.08,0.0,970.0,1455.0,980.0,970.0,...,16500.0,0.0,0.0,0.0,35000.0,20000.0,0.13962,4886.7,2792.4,235113.0
447,12/1/2023,Email,380000.0,3400.0,111.76,7000.0,1170.0,4680.0,1180.0,2220.0,...,22500.0,7000.0,350000.0,35000.0,35000.0,20000.0,0.22106,7737.1,4421.2,372263.0
448,12/1/2023,Organic Social,165000.0,1550.0,106.45,3500.0,490.0,1960.0,495.0,1055.0,...,12500.0,3500.0,175000.0,8750.0,35000.0,20000.0,0.09598,3359.3,1919.6,161641.0
449,12/1/2023,Paid Search,590000.0,4300.0,137.21,118000.0,2450.0,980.0,2470.0,1830.0,...,37000.0,118000.0,2360000.0,47200.0,35000.0,20000.0,0.34322,12012.7,6864.4,577987.0


In [11]:
# Bare last expression: renders `df` as this cell's output
# (presumably to compare it by eye with the `df2` display — confirm).
df

Unnamed: 0,month,channel,total_revenue,total_orders,aov,total_spend,new_customers,existing_customers,new_customer_orders,existing_customer_orders,...,users,spend,impressions,clicks,returned_value,technology_spend,revenue_share,returns_value_dist,technology_spend_dist,net_revenue
0,1/1/2024,Affiliate,64319.0,523.0,122.98,12864.0,306.0,102.0,310.0,213.0,...,2500.0,12864.0,643200.0,3216.0,25000.0,15000.0,0.05000,1250.0,750.0,63069.0
1,1/1/2024,Direct,195000.0,1500.0,130.00,0.0,800.0,1200.0,810.0,690.0,...,12000.0,0.0,0.0,0.0,25000.0,15000.0,0.15000,3750.0,2250.0,191250.0
2,1/1/2024,Email,280000.0,2500.0,112.00,5000.0,1000.0,4000.0,1010.0,1490.0,...,18000.0,5000.0,250000.0,25000.0,25000.0,15000.0,0.21000,5250.0,3150.0,274750.0
3,1/1/2024,Organic Social,120000.0,1100.0,109.09,2500.0,400.0,1600.0,405.0,695.0,...,8000.0,2500.0,125000.0,6250.0,25000.0,15000.0,0.09000,2250.0,1350.0,117750.0
4,1/1/2024,Paid Search,450000.0,3500.0,128.57,90000.0,2000.0,800.0,2020.0,1480.0,...,28000.0,90000.0,1800000.0,36000.0,25000.0,15000.0,0.34000,8500.0,5100.0,441500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446,12/1/2023,Direct,240000.0,1950.0,123.08,0.0,970.0,1455.0,980.0,970.0,...,16500.0,0.0,0.0,0.0,35000.0,20000.0,0.13962,4886.7,2792.4,235113.0
447,12/1/2023,Email,380000.0,3400.0,111.76,7000.0,1170.0,4680.0,1180.0,2220.0,...,22500.0,7000.0,350000.0,35000.0,35000.0,20000.0,0.22106,7737.1,4421.2,372263.0
448,12/1/2023,Organic Social,165000.0,1550.0,106.45,3500.0,490.0,1960.0,495.0,1055.0,...,12500.0,3500.0,175000.0,8750.0,35000.0,20000.0,0.09598,3359.3,1919.6,161641.0
449,12/1/2023,Paid Search,590000.0,4300.0,137.21,118000.0,2450.0,980.0,2470.0,1830.0,...,37000.0,118000.0,2360000.0,47200.0,35000.0,20000.0,0.34322,12012.7,6864.4,577987.0


In [13]:
import pandas as pd

# Compute total 2024 revenue from the de-duplicated data.

# The de-duplication step saves 'preprocessed_data_clean.csv' to the notebook's
# working directory, so read it from there (not from the parent folder) to keep
# the notebook reproducible on a fresh Restart & Run All.
df = pd.read_csv("preprocessed_data_clean.csv")

# Parse the month strings into datetimes so the year can be extracted
df['month'] = pd.to_datetime(df['month'])

# Keep only the rows whose month falls in 2024
df_2024 = df[df['month'].dt.year == 2024]

# Sum revenue over the 2024 rows
total_2024_revenue = df_2024['total_revenue'].sum()

print("Total 2024 Revenue in Cleaned Data: ${:,.2f}".format(total_2024_revenue))


Total 2024 Revenue in Cleaned Data: $23,622,638.00


In [16]:
# Count the missing values in each column of `df`.
df.isnull().sum(axis=0)

month                       1
channel                     1
total_revenue               1
total_orders                1
aov                         1
total_spend                 1
new_customers               1
existing_customers          1
new_customer_orders         1
existing_customer_orders    1
sessions                    1
users                       1
spend                       1
impressions                 1
clicks                      1
returned_value              1
technology_spend            1
revenue_share               1
returns_value_dist          1
technology_spend_dist       1
net_revenue                 1
dtype: int64

In [17]:
# Drop every row that contains a missing value (axis=0 works on rows).
# The result is only displayed: it is not assigned, so `df` itself is unchanged.
df.dropna(axis=0)

Unnamed: 0,month,channel,total_revenue,total_orders,aov,total_spend,new_customers,existing_customers,new_customer_orders,existing_customer_orders,...,users,spend,impressions,clicks,returned_value,technology_spend,revenue_share,returns_value_dist,technology_spend_dist,net_revenue
0,2024-01-01,Affiliate,64319.0,523.0,122.98,12864.0,306.0,102.0,310.0,213.0,...,2500.0,12864.0,643200.0,3216.0,25000.0,15000.0,0.05000,1250.00,750.00,63069.0
1,2024-01-01,Direct,195000.0,1500.0,130.00,0.0,800.0,1200.0,810.0,690.0,...,12000.0,0.0,0.0,0.0,25000.0,15000.0,0.15000,3750.00,2250.00,191250.0
2,2024-01-01,Email,280000.0,2500.0,112.00,5000.0,1000.0,4000.0,1010.0,1490.0,...,18000.0,5000.0,250000.0,25000.0,25000.0,15000.0,0.21000,5250.00,3150.00,274750.0
3,2024-01-01,Organic Social,120000.0,1100.0,109.09,2500.0,400.0,1600.0,405.0,695.0,...,8000.0,2500.0,125000.0,6250.0,25000.0,15000.0,0.09000,2250.00,1350.00,117750.0
4,2024-01-01,Paid Search,450000.0,3500.0,128.57,90000.0,2000.0,800.0,2020.0,1480.0,...,28000.0,90000.0,1800000.0,36000.0,25000.0,15000.0,0.34000,8500.00,5100.00,441500.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
348,2021-12-01,Email,300000.0,2700.0,111.11,5600.0,1070.0,4280.0,1080.0,1620.0,...,21000.0,5600.0,280000.0,28000.0,28000.0,16500.0,0.21389,5988.92,3529.19,294011.0
349,2021-12-01,Organic Social,125000.0,1150.0,108.70,2750.0,440.0,1760.0,450.0,700.0,...,8500.0,2750.0,135000.0,6750.0,28000.0,16500.0,0.08910,2494.80,1470.15,122505.0
350,2021-12-01,Paid Search,478000.0,3540.0,135.03,95600.0,2108.0,843.0,2125.0,1415.0,...,29000.0,95600.0,1912000.0,38240.0,28000.0,16500.0,0.34073,9540.44,5622.05,468459.0
351,2021-12-01,Paid Social,227000.0,1770.0,128.25,61000.0,1100.0,440.0,1110.0,660.0,...,22500.0,61000.0,1720000.0,34400.0,28000.0,16500.0,0.16184,4531.52,2670.36,222468.0
