In [1]:
import pandas as pd
import numpy as np
import warnings

In [11]:
# --- Stage 1: Setup and Helper Functions ---
print("--- Stage 1: Defining Helper Functions ---")

# Suppress potential warnings for cleaner output
warnings.filterwarnings('ignore')

def find_header_row(df):
    """Dynamically finds the most likely header row in a dataframe."""
    for i, row in df.iterrows():
        row_str = ' '.join(row.astype(str).str.upper().tolist())
        if ('YEAR' in row_str or 'CHANNEL' in row_str or 'KPI' in row_str) and row.notna().sum() > 2:
            return i
    return -1

--- Stage 1: Defining Helper Functions ---


In [12]:
def clean_and_process_wide_format(file_path, metric_name, id_col_name='CHANNEL'):
    """Reads and processes wide-format files with months as columns."""
    try:
        df = pd.read_csv(file_path, header=None, dtype=str, encoding='latin-1')
    except FileNotFoundError:
        print(f"Info: File '{file_path}' not found. Skipping.")
        return pd.DataFrame()

    header_row_index = find_header_row(df)
    if header_row_index == -1:
        print(f"Warning: No header found in '{file_path}'. Skipping.")
        return pd.DataFrame()

    header = df.iloc[header_row_index].str.upper().str.strip()
    data = df.iloc[header_row_index + 1:].copy()
    data.columns = header

    id_col = next((col for col in [id_col_name, 'KPI'] if col in data.columns), data.columns[0])
    data = data.rename(columns={id_col: 'ID_COLUMN'})

    if 'YEAR' not in data.columns: return pd.DataFrame()
    
    # --- FIX: Handle potential duplicate 'YEAR' columns ---
    # If multiple columns are named 'YEAR', pandas returns a DataFrame.
    # We select the first column to ensure we are working with a Series.
    if isinstance(data['YEAR'], pd.DataFrame):
        year_series = data['YEAR'].iloc[:, 0]
    else:
        year_series = data['YEAR']
        
    data['YEAR'] = pd.to_numeric(year_series, errors='coerce')
    data.dropna(subset=['YEAR'], inplace=True)
    data['YEAR'] = data['YEAR'].astype(int)
    
    month_map = {'JANUARY': 1, 'FEBRUARY': 2, 'MARCH': 3, 'APRIL': 4, 'MAY': 5, 'JUNE': 6,
                 'JULY': 7, 'AUGUST': 8, 'SEPTEMBER': 9, 'OCTOBER': 10, 'NOVEMBER': 11, 'DECEMBER': 12}
    month_cols = [col for col in data.columns if col in month_map]
    if not month_cols: return pd.DataFrame()

    melted = data.melt(id_vars=['ID_COLUMN', 'YEAR'], value_vars=month_cols, var_name='MONTH', value_name=metric_name)
    melted[metric_name] = pd.to_numeric(melted[metric_name].astype(str).str.replace(r'[$,]', '', regex=True), errors='coerce')
    melted.dropna(subset=[metric_name], inplace=True)
    melted['MONTH_NUM'] = melted['MONTH'].map(month_map)
    melted['Date'] = pd.to_datetime(dict(year=melted.YEAR, month=melted.MONTH_NUM, day=1))
    melted = melted.rename(columns={'ID_COLUMN': id_col_name})
    
    return melted[['Date', id_col_name, metric_name]]

print("Helper functions defined successfully.")

Helper functions defined successfully.


In [13]:
# --- Stage 2: Process Each Data Category ---
print("\n--- Stage 2: Processing Each Data Category ---")

# Process Spend Data
print("Processing Spend files...")
spend_files = ['Media Spend.csv', 'Media Spend by Channel.csv']
spend_dfs = [clean_and_process_wide_format(f, 'Spend', 'CHANNEL') for f in spend_files]
media_spend = pd.concat(spend_dfs).groupby(['Date', 'CHANNEL'], as_index=False).sum()

tech_spend = clean_and_process_wide_format('Technology Spend.csv', 'Technology_Spend', 'Vendor')
tech_spend_monthly = tech_spend.groupby('Date', as_index=False)['Technology_Spend'].sum() if not tech_spend.empty else pd.DataFrame()



--- Stage 2: Processing Each Data Category ---
Processing Spend files...


In [14]:
# Process Performance Data
print("Processing Performance files...")
web_analytics = clean_and_process_wide_format('Web Analytics.csv', 'Sessions', 'CHANNEL')
mkt_breakdown = clean_and_process_wide_format('Marketing Channel Breakdown.csv', 'Clicks', 'CHANNEL')


Processing Performance files...


In [15]:
# Process Sales & Customer Data (by Customer Type)
print("Processing Sales and Customer files...")
orders_new = clean_and_process_wide_format('Orders By Channel-New.csv', 'Orders_New', 'CHANNEL')
orders_ext = clean_and_process_wide_format('Orders By Channel-Ext.csv', 'Orders_Existing', 'CHANNEL')
cust_new = clean_and_process_wide_format('Cust By Channel-New.csv', 'Customers_New', 'CHANNEL')
cust_ext = clean_and_process_wide_format('Cust By Channel-Ext.csv', 'Customers_Existing', 'CHANNEL')


Processing Sales and Customer files...


In [23]:
# Process Returns Data
print("Processing Returns file...")
try:
    # FIX: More robust and direct logic for the specific format of Returns.csv
    returns_df = pd.read_csv('Returns.csv', header=0, dtype=str, encoding='latin-1')
    
    # Intelligently find the date and returns columns by keyword
    date_col = next((col for col in returns_df.columns if 'MONTH' in col.upper() or 'DATE' in col.upper()), None)
    amount_col = next((col for col in returns_df.columns if 'RETURN' in col.upper()), None)

    if date_col and amount_col:
        # Convert the date column to datetime objects
        returns_df['Date'] = pd.to_datetime(returns_df[date_col], errors='coerce')
        
        # Clean the currency string: remove parentheses, $, and commas, then convert to a negative float
        returns_df['Total_Returns'] = returns_df[amount_col].str.replace(r'[($),]', '', regex=True).astype(float) * -1
        
        # Drop any rows where date conversion failed
        returns_df.dropna(subset=['Date', 'Total_Returns'], inplace=True)
        
        # Select the final columns
        returns_monthly = returns_df[['Date', 'Total_Returns']]
    else:
        print("Warning: Could not find required 'Month' or 'Returns' columns in 'Returns.csv'. Skipping.")
        returns_monthly = pd.DataFrame()

except (FileNotFoundError, KeyError, Exception) as e:
    print(f"Info: Could not process 'Returns.csv'. Error: {e}. Skipping.")
    returns_monthly = pd.DataFrame()


Processing Returns file...


In [24]:
returns_monthly

Unnamed: 0,Date,Total_Returns
0,2024-01-01,-61116.9
1,2024-02-01,-46543.44
2,2024-03-01,-50138.12
3,2024-04-01,-42238.03
4,2024-05-01,-53802.15
5,2024-06-01,-51017.6
6,2024-07-01,-46815.46
7,2024-08-01,-35156.1
8,2024-09-01,-35440.03
9,2024-10-01,-32516.32


In [25]:
# Process High-Level KPIs
print("Processing TOPSHEET...")
topsheet = clean_and_process_wide_format('TOPSHEET.csv', 'Total_Revenue', 'KPI')
revenue_df = pd.DataFrame()
if not topsheet.empty:
    revenue_df = topsheet[topsheet['KPI'].str.contains('GROSS-DISC', na=False)][['Date', 'Total_Revenue']]

print("Individual file processing complete.")


Processing TOPSHEET...
Individual file processing complete.


In [26]:
# --- Stage 3: Unification and Merging ---
print("\n--- Stage 3: Unifying All Dataframes ---")
from functools import reduce


--- Stage 3: Unifying All Dataframes ---


In [27]:
# List of all dataframes that are broken down by channel
channel_dfs = [media_spend, web_analytics, mkt_breakdown, orders_new, orders_ext, cust_new, cust_ext]
channel_dfs_non_empty = [df for df in channel_dfs if not df.empty]

In [28]:
# Merge all channel-specific dataframes
master_df = reduce(lambda left, right: pd.merge(left, right, on=['Date', 'CHANNEL'], how='outer'), channel_dfs_non_empty)


In [29]:
# List of all dataframes aggregated at the business level (by date only)
business_dfs = [revenue_df, returns_monthly, tech_spend_monthly]
business_dfs_non_empty = [df for df in business_dfs if not df.empty]


In [30]:
# Merge business-level data onto the master dataframe
for df in business_dfs_non_empty:
    master_df = pd.merge(master_df, df, on='Date', how='left')

print("Merging complete.")

Merging complete.


In [31]:
master_df

Unnamed: 0,Date,CHANNEL,Spend,Orders_New,Orders_Existing,Customers_New,Customers_Existing,Total_Revenue,Total_Returns
0,2021-01-01,,,,,686.0,362.0,,
1,2021-01-01,,,,,686.0,70.0,,
2,2021-01-01,,,,,686.0,159.0,,
3,2021-01-01,,,,,686.0,1113.0,,
4,2021-01-01,,,,,686.0,412.0,,
...,...,...,...,...,...,...,...,...,...
131581,2025-10-01,TOTAL Media,0.0,,,,,,
131582,2025-11-01,TOTAL Agency,0.0,,,,,,
131583,2025-11-01,TOTAL Media,0.0,,,,,,
131584,2025-12-01,TOTAL Agency,0.0,,,,,,


In [32]:
# --- Stage 4: Final Cleanup and Structuring ---
print("\n--- Stage 4: Performing Final Cleanup ---")
# Standardize channel names
master_df['CHANNEL'] = master_df['CHANNEL'].str.replace(' Media', '', regex=False).str.strip()



--- Stage 4: Performing Final Cleanup ---


In [33]:
# Fill NaN values in numeric columns with 0
numeric_cols = master_df.select_dtypes(include=np.number).columns
master_df[numeric_cols] = master_df[numeric_cols].fillna(0)


In [34]:
# Aggregate data after standardizing channel names to remove any duplicates
group_cols = ['Date', 'CHANNEL']
agg_dict = {col: 'sum' for col in numeric_cols}
master_df = master_df.groupby(group_cols, as_index=False).agg(agg_dict)


In [35]:
# Add YEAR and MONTH columns for readability
master_df['YEAR'] = master_df['Date'].dt.year
master_df['MONTH'] = master_df['Date'].dt.month_name().str.upper()


In [36]:
master_df

Unnamed: 0,Date,CHANNEL,Spend,Orders_New,Orders_Existing,Customers_New,Customers_Existing,Total_Revenue,Total_Returns,YEAR,MONTH
0,2024-01-01,Affiliate,0.0,0.0,116.0,0.0,0.0,0.0,-61116.9,2024,JANUARY
1,2024-01-01,Direct,0.0,800.0,793.0,0.0,0.0,0.0,-61116.9,2024,JANUARY
2,2024-01-01,Display,0.0,0.0,0.0,24.0,58.0,0.0,-61116.9,2024,JANUARY
3,2024-01-01,Email,0.0,271.0,1158.0,0.0,0.0,0.0,-61116.9,2024,JANUARY
4,2024-01-01,Organic Search,0.0,682.0,382.0,0.0,0.0,0.0,-61116.9,2024,JANUARY
...,...,...,...,...,...,...,...,...,...,...,...
159,2025-10-01,TOTAL Agency,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025,OCTOBER
160,2025-11-01,TOTAL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025,NOVEMBER
161,2025-11-01,TOTAL Agency,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025,NOVEMBER
162,2025-12-01,TOTAL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2025,DECEMBER


In [37]:
# Define final column order
final_cols = ['YEAR', 'MONTH', 'CHANNEL', 'Sessions', 'Clicks', 'Spend', 
              'Total_Returns', 'Technology_Spend', 'Total_Revenue', 
              'Orders_Existing', 'Customers_Existing', 'Orders_New', 'Customers_New']
existing_final_cols = [col for col in final_cols if col in master_df.columns]
final_df = master_df[existing_final_cols]

print("Final cleanup complete.")

Final cleanup complete.


In [38]:
# --- Stage 5: Verification and Saving ---
print("\n--- Stage 5: Verifying and Saving Final File ---")
print("--- Final Preprocessed Data (First 5 Rows) --- ")
print(final_df.head().to_string())


--- Stage 5: Verifying and Saving Final File ---
--- Final Preprocessed Data (First 5 Rows) --- 
   YEAR    MONTH         CHANNEL  Spend  Total_Returns  Total_Revenue  Orders_Existing  Customers_Existing  Orders_New  Customers_New
0  2024  JANUARY       Affiliate    0.0       -61116.9            0.0            116.0                 0.0         0.0            0.0
1  2024  JANUARY          Direct    0.0       -61116.9            0.0            793.0                 0.0       800.0            0.0
2  2024  JANUARY         Display    0.0       -61116.9            0.0              0.0                58.0         0.0           24.0
3  2024  JANUARY           Email    0.0       -61116.9            0.0           1158.0                 0.0       271.0            0.0
4  2024  JANUARY  Organic Search    0.0       -61116.9            0.0            382.0                 0.0       682.0            0.0


In [39]:
# Save the final dataframe to a CSV file
output_filename = 'monthly_summary_v5_final_non_redundant.csv'
final_df.to_csv(output_filename, index=False)

print(f"\nSuccessfully saved the unified data to '{output_filename}'")



Successfully saved the unified data to 'monthly_summary_v5_final_non_redundant.csv'


In [40]:
tech_spend_monthly