In [1]:

import pandas as pd
import numpy as np
import warnings

In [2]:
# --- Stage 1: Setup and Helper Functions ---
print("--- Stage 1: Defining Helper Functions ---")

# Suppress potential warnings for cleaner output
warnings.filterwarnings('ignore')

def find_header_row(df):
    """Dynamically finds the most likely header row in a dataframe."""
    for i, row in df.iterrows():
        row_str = ' '.join(row.astype(str).str.upper().tolist())
        if ('YEAR' in row_str or 'CHANNEL' in row_str or 'KPI' in row_str) and row.notna().sum() > 2:
            return i
    return -1


--- Stage 1: Defining Helper Functions ---


In [3]:
def clean_and_process_wide_format(file_path, metric_name, id_col_name='CHANNEL'):
    """Reads and processes wide-format files with months as columns."""
    try:
        df = pd.read_csv(file_path, header=None, dtype=str)
    except FileNotFoundError:
        print(f"Info: File '{file_path}' not found. Skipping.")
        return pd.DataFrame()

    header_row_index = find_header_row(df)
    if header_row_index == -1:
        print(f"Warning: No header found in '{file_path}'. Skipping.")
        return pd.DataFrame()

    header = df.iloc[header_row_index].str.upper().str.strip()
    data = df.iloc[header_row_index + 1:].copy()
    data.columns = header

    id_col = next((col for col in [id_col_name, 'KPI'] if col in data.columns), data.columns[0])
    data = data.rename(columns={id_col: 'ID_COLUMN'})

    if 'YEAR' not in data.columns: return pd.DataFrame()
    data['YEAR'] = pd.to_numeric(data['YEAR'], errors='coerce')
    data.dropna(subset=['YEAR'], inplace=True)
    data['YEAR'] = data['YEAR'].astype(int)
    
    month_map = {'JANUARY': 1, 'FEBRUARY': 2, 'MARCH': 3, 'APRIL': 4, 'MAY': 5, 'JUNE': 6,
                 'JULY': 7, 'AUGUST': 8, 'SEPTEMBER': 9, 'OCTOBER': 10, 'NOVEMBER': 11, 'DECEMBER': 12}
    month_cols = [col for col in data.columns if col in month_map]
    if not month_cols: return pd.DataFrame()

    melted = data.melt(id_vars=['ID_COLUMN', 'YEAR'], value_vars=month_cols, var_name='MONTH', value_name=metric_name)
    melted[metric_name] = pd.to_numeric(melted[metric_name].astype(str).str.replace(r'[$,]', '', regex=True), errors='coerce')
    melted.dropna(subset=[metric_name], inplace=True)
    melted['MONTH_NUM'] = melted['MONTH'].map(month_map)
    melted['Date'] = pd.to_datetime(dict(year=melted.YEAR, month=melted.MONTH_NUM, day=1))
    melted = melted.rename(columns={'ID_COLUMN': id_col_name})
    
    return melted[['Date', id_col_name, metric_name]]
print("Helper functions defined successfully.")


Helper functions defined successfully.


In [5]:
# --- Stage 2: Process Each Data Category ---
print("\n--- Stage 2: Processing Each Data Category ---")

# Process Spend Data
print("Processing Spend files...")
spend_files = ['Media Spend.csv', 'Media Spend by Channel.csv']
spend_dfs = [clean_and_process_wide_format(f, 'Spend', 'CHANNEL') for f in spend_files]
media_spend = pd.concat(spend_dfs).groupby(['Date', 'CHANNEL'], as_index=False).sum()

tech_spend = clean_and_process_wide_format('Technology Spend.csv', 'Technology_Spend', 'Vendor')
tech_spend_monthly = tech_spend.groupby('Date', as_index=False)['Technology_Spend'].sum() if not tech_spend.empty else pd.DataFrame()



--- Stage 2: Processing Each Data Category ---
Processing Spend files...


In [8]:

# Process Performance Data
print("Processing Performance files...")
web_analytics = clean_and_process_wide_format('Web Analytics.csv', 'Sessions', 'CHANNEL')
mkt_breakdown = clean_and_process_wide_format('Marketing Channel Breakdown.csv', 'Clicks', 'CHANNEL')


Processing Performance files...


UnicodeDecodeError: 'utf-8' codec can't decode byte 0x96 in position 1752: invalid start byte