In [2]:
import pandas as pd
import numpy as np
import glob

def clean_and_process_all_files():
    """
    Finds all relevant CSVs in the folder, then loads, cleans, and processes them.
    - Cleans currency and percentage strings.
    - Reshapes wide-format data to long format.
    - Standardizes column names.
    - Handles file not found errors gracefully.
    - Saves cleaned files to new CSVs.
    """
    # Use glob to find the files in the current directory
    file_names = [f for f in glob.glob("*.csv") if not f.startswith('cleaned_')]
    
    cleaned_dataframes = {}
    errors = {}

    print(f"Found {len(file_names)} files to process...")

    # --- Helper Functions ---
    def clean_currency(s):
        if isinstance(s, str):
            s = s.replace('$', '').replace(',', '')
            if '(' in s and ')' in s:
                s = '-' + s.replace('(', '').replace(')', '')
            return pd.to_numeric(s, errors='coerce')
        return s

    def clean_numeric_string(s):
        if isinstance(s, str):
            return pd.to_numeric(s.replace(',', ''), errors='coerce')
        return s

    def clean_percentage(s):
        if isinstance(s, str):
            return pd.to_numeric(s.replace('%', ''), errors='coerce') / 100.0
        return s

    for file_name in file_names:
        try:
            print(f"--- Processing: {file_name} ---")
            
            separator = '\t' if file_name == "Marketing Channel Breakdown.csv" else ','
            df = pd.read_csv(file_name, sep=separator)
            
            original_file_name = file_name # Keep track of the original name
            
            # Standardize column names
            df.columns = [str(col).lower().replace(' ', '_').replace('-', '_') for col in df.columns]

            if 'returns' in file_name.lower():
                for col in ['gross_sales', 'discounts', 'returns', 'net_sales']:
                    df[col] = df[col].apply(clean_currency)
                df['month'] = pd.to_datetime(df['month'], format='%m/%d/%Y', errors='coerce')

            elif 'email' in file_name.lower():
                df.rename(columns={'type_(batch_marketing,_triggered,_transactional)': 'email_type'}, inplace=True)
                df['sends'] = df['sends'].apply(clean_numeric_string)
                df['clicks'] = df['clicks'].apply(clean_numeric_string)
                df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month'].astype(str) + '-01', errors='coerce')
                df = df[['date', 'email_type', 'sends', 'clicks']]
            
            elif 'web_analytics' in file_name.lower():
                for col in ['added_to_cart_rate', 'reached_checkout_rate', 'checkout_conversion_rate', 'conversion_rate']:
                    df[col] = df[col].apply(clean_percentage)
                df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month'].astype(str) + '-01', errors='coerce')
                df.drop(columns=['year', 'month'], inplace=True, errors='ignore')

            elif 'channel' in file_name.lower() or 'topsheet' in file_name.lower():
                if 'topsheet' in file_name.lower():
                    df.rename(columns={'kpi': 'metric', 'data_source': 'source'}, inplace=True)
                    id_vars = ['source', 'metric', 'year']
                else:
                    df.rename(columns={'channel': 'channel_name'}, inplace=True)
                    id_vars = ['channel_name', 'year']
                
                month_cols = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
                value_vars = [col for col in month_cols if col in df.columns]
                
                for col in value_vars:
                    df[col] = df[col].astype(str).str.replace(',', '').str.replace('$', '').str.replace('%', '')
                    df[col] = pd.to_numeric(df[col], errors='coerce')

                df = df.melt(id_vars=id_vars, value_vars=value_vars, var_name='month', value_name='value')
                
                month_map = {name: i+1 for i, name in enumerate(month_cols)}
                df['month_num'] = df['month'].str.lower().map(month_map)
                df['year'] = pd.to_numeric(df['year'], errors='coerce')
                df.dropna(subset=['year', 'month_num'], inplace=True)
                df['year'] = df['year'].astype(int)
                df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month_num'].astype(str) + '-01', errors='coerce')
                df.drop(columns=['year', 'month', 'month_num'], inplace=True, errors='ignore')
            
            elif 'marketing_channel_breakdown' in file_name.lower():
                 # This file might have a year as the first column, let's rename it
                df.rename(columns={df.columns[0]: 'year'}, inplace=True)
                # Add cleaning logic for its specific currency/numeric columns if needed
                for col in ['ad_spend', 'gross_discount_(shopify)', 'sessions', 'clicks', 'orders', 'new_customers']:
                    if col in df.columns:
                        df[col] = df[col].apply(clean_numeric_string)
                for col in ['ctr', 'conversion_rate']:
                     if col in df.columns:
                        df[col] = df[col].apply(clean_percentage)

            cleaned_dataframes[original_file_name] = df
            print(f"✅ Success.")

        except Exception as e:
            errors[original_file_name] = str(e)
            print(f"❌ Error: {e}")

    print("\n\n--- Processing Summary ---")
    if cleaned_dataframes:
        print(f"Successfully processed: {len(cleaned_dataframes)} files")
        for f in cleaned_dataframes.keys():
            cleaned_file_name = f"cleaned_{f}"
            cleaned_dataframes[f].to_csv(cleaned_file_name, index=False)
            print(f"  - {f} -> Saved to {cleaned_file_name}")

    if errors:
        print(f"\nFailed to process: {len(errors)} files")
        for f, reason in errors.items():
            print(f"  - {f}: {reason}")
    
    return cleaned_dataframes, errors

# --- Run the script ---
if __name__ == '__main__':
    cleaned_data, error_log = clean_and_process_all_files()

    # As an example, display the first 5 rows of a cleaned, reshaped file
    if 'Media Spend by Channel.csv' in cleaned_data:
        print("\n--- Sample of Cleaned Data (Media Spend by Channel.csv) ---")
        print(cleaned_data['Media Spend by Channel.csv'].head())

Found 10 files to process...
--- Processing: Cust By Channel-Ext.csv ---
✅ Success.
--- Processing: Cust By Channel-New.csv ---
✅ Success.
--- Processing: Email.csv ---
✅ Success.
--- Processing: Marketing Channel Breakdown.csv ---
❌ Error: "The following id_vars or value_vars are not present in the DataFrame: ['channel_name', 'year']"
--- Processing: Media Spend by Channel.csv ---
✅ Success.
--- Processing: Orders By Channel-Ext.csv ---
✅ Success.
--- Processing: Orders By Channel-New.csv ---
✅ Success.
--- Processing: Returns.csv ---
✅ Success.
--- Processing: TOPSHEET.csv ---
✅ Success.
--- Processing: Web Analytics.csv ---
✅ Success.


--- Processing Summary ---
Successfully processed: 9 files
  - Cust By Channel-Ext.csv -> Saved to cleaned_Cust By Channel-Ext.csv
  - Cust By Channel-New.csv -> Saved to cleaned_Cust By Channel-New.csv
  - Email.csv -> Saved to cleaned_Email.csv
  - Media Spend by Channel.csv -> Saved to cleaned_Media Spend by Channel.csv
  - Orders By Channel-Ext.c

In [5]:
import pandas as pd

try:
    # Load the user-provided cleaned file
    file_path = "cleaned_Cust By Channel-Ext.csv"
    df = pd.read_csv(file_path)

    print(f"--- Analysis for {file_path} ---")

    # --- 1. Check Data Types and Null Values ---
    print("\n\n--- Data Types and Non-Null Counts ---")
    # Use a buffer to capture the info() output as a string
    from io import StringIO
    buffer = StringIO()
    df.info(buf=buffer)
    info_str = buffer.getvalue()
    print(info_str)

    # --- 2. Check for Duplicates ---
    duplicate_rows = df.duplicated().sum()
    print(f"\n--- Duplicates Check ---")
    print(f"Number of duplicate rows found: {duplicate_rows}")


    # --- 3. Examine Descriptive Statistics ---
    print("\n\n--- Descriptive Statistics ---")
    # The datetime_is_numeric=True is needed for newer pandas versions
    try:
        # Temporarily convert date for describe if it's object
        if df['date'].dtype == 'object':
            df['date'] = pd.to_datetime(df['date'])
        print(df.describe(include='all', datetime_is_numeric=True))
    except Exception as e:
        print(f"Could not generate full descriptive statistics. Error: {e}")


    # --- 4. Deep Dive into Specific Columns ---

    # Channel Name consistency
    print("\n\n--- Unique Channel Names ---")
    unique_channels = df['channel_name'].unique()
    print(unique_channels)

    # Check for negative values where they shouldn't be
    print("\n\n--- Negative Value Check ---")
    if 'value' in df.columns and pd.api.types.is_numeric_dtype(df['value']):
        negative_values = df[df['value'] < 0].shape[0]
        if negative_values > 0:
            print(f"MISTAKE FOUND: There are {negative_values} rows with negative customer counts.")
        else:
            print("No negative customer counts found. That's good.")
    else:
        print("Could not perform negative value check on 'value' column.")

    # Check the date range
    print("\n\n--- Date Range Check ---")
    if 'date' in df.columns:
        try:
            # Ensure 'date' column is datetime
            if df['date'].dtype != '<M8[ns]':
                 df['date'] = pd.to_datetime(df['date'])
            
            min_date = df['date'].min().strftime('%Y-%m-%d')
            max_date = df['date'].max().strftime('%Y-%m-%d')
            print(f"Date range: {min_date} to {max_date}")

            # Check if any dates are in the future
            # Using a fixed date for reproducibility based on conversation context
            future_dates = df[df['date'] > pd.to_datetime("2025-07-01")].shape[0]
            if future_dates > 0:
                 print(f"MISTAKE FOUND: There are {future_dates} dates in the future (after July 1, 2025).")
            else:
                 print("No future dates found. That's good.")
        except Exception as e:
            print(f"Could not perform date analysis. Error: {e}")


except FileNotFoundError:
    print(f"ERROR: The file '{file_path}' was not found. Please ensure it's in the same directory as your notebook.")
except Exception as e:
    print(f"An error occurred during analysis: {e}")

--- Analysis for cleaned_Cust By Channel-Ext.csv ---


--- Data Types and Non-Null Counts ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 648 entries, 0 to 647
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   channel_name  648 non-null    object
 1   value         648 non-null    int64 
 2   date          648 non-null    object
dtypes: int64(1), object(2)
memory usage: 15.3+ KB


--- Duplicates Check ---
Number of duplicate rows found: 0


--- Descriptive Statistics ---
Could not generate full descriptive statistics. Error: NDFrame.describe() got an unexpected keyword argument 'datetime_is_numeric'


--- Unique Channel Names ---
['Paid Search' 'Paid Social' 'Affiliate' 'Display' 'Email' 'SMS'
 'Organic Search' 'Direct' 'Unattributed' 'Other' 'Organic Social']


--- Negative Value Check ---
No negative customer counts found. That's good.


--- Date Range Check ---
Date range: 2021-01-01 to 2025-12-01
MISTAKE F

In [7]:
import pandas as pd
import glob
import re

def clean_all_data_files_final():
    """
    Finds all raw CSVs in the folder, then loads, cleans, reshapes, and saves them.
    This version includes specific fixes for the marketing file and date formats.
    """
    # Define all the expected raw filenames
    file_names = [
        "Cust By Channel-Ext.csv", "Cust By Channel-New.csv", "Email.csv",
        "Media Spend by Channel.csv",
        "Orders By Channel-Ext.csv", "Orders By Channel-New.csv",
        "Returns.csv", "TOPSHEET.csv", "Web Analytics.csv"
    ]
    
    cleaned_dataframes = {}
    errors = {}

    print(f"Starting the cleaning process for {len(file_names)} files...")

    # --- Helper Functions ---
    def clean_currency(s):
        if not isinstance(s, str): return pd.to_numeric(s, errors='coerce')
        s = s.replace('$', '').replace(',', '').strip()
        if '(' in s and ')' in s:
            s = '-' + s.replace('(', '').replace(')', '')
        return pd.to_numeric(s, errors='coerce')

    def clean_percentage(s):
        if not isinstance(s, str): return pd.to_numeric(s, errors='coerce')
        return pd.to_numeric(s.replace('%', '').replace('"', '').strip(), errors='coerce') / 100.0

    def clean_numeric(s):
        if not isinstance(s, str): return pd.to_numeric(s, errors='coerce')
        return pd.to_numeric(s.replace(',', '').strip(), errors='coerce')

    for file_name in file_names:
        try:
            print(f"--- Processing: {file_name} ---")
            
            # --- File Loading and Parsing ---
            if file_name == "Marketing Channel Breakdown.csv":
                # Manual parsing for the most problematic file
                with open(file_name, 'r', encoding='utf-8') as f:
                    lines = f.readlines()
                header = [h.strip() for h in lines[0].split('\t')]
                rows = [line.strip().split('\t') for line in lines[1:]]
                df = pd.DataFrame(rows, columns=header)
            else:
                df = pd.read_csv(file_name)

            # --- Cleaning and Transformation ---
            original_file_name = file_name
            df.columns = [str(col).lower().replace(' ', '_').replace('-', '_') for col in df.columns]

            if 'marketing_channel_breakdown' in file_name.lower():
                df.rename(columns={df.columns[0]: 'year_period'}, inplace=True)
                for col in ['ad_spend', 'gross_discount_(shopify)']: df[col] = df[col].apply(clean_currency)
                for col in ['sessions', 'clicks', 'orders', 'new_customers']: df[col] = df[col].apply(clean_numeric)
                for col in ['ctr', 'conversion_rate']: df[col] = df[col].apply(clean_percentage)
                df['year'] = df['year_period'].apply(lambda x: str(x).split(' ')[0])

            elif 'returns' in file_name.lower():
                for col in ['gross_sales', 'discounts', 'returns', 'net_sales']: df[col] = df[col].apply(clean_currency)
                df['month'] = pd.to_datetime(df['month'], format='%m/%d/%Y', errors='coerce')

            elif 'email' in file_name.lower():
                df.rename(columns={'type_(batch_marketing,_triggered,_transactional)': 'email_type'}, inplace=True)
                for col in ['sends', 'clicks']: df[col] = df[col].apply(clean_numeric)
                df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month'].astype(str) + '-01', errors='coerce')
                df = df[['date', 'email_type', 'sends', 'clicks']]

            elif 'web_analytics' in file_name.lower():
                for col in ['added_to_cart_rate', 'reached_checkout_rate', 'checkout_conversion_rate', 'conversion_rate']: df[col] = df[col].apply(clean_percentage)
                df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month'].astype(str) + '-01', errors='coerce')
                df.drop(columns=['year', 'month'], inplace=True, errors='ignore')

            elif 'channel' in file_name.lower() or 'topsheet' in file_name.lower():
                id_vars = []
                if 'topsheet' in file_name.lower():
                    df.rename(columns={'kpi': 'metric', 'data_source': 'source'}, inplace=True)
                    id_vars = [c for c in ['source', 'metric', 'year'] if c in df.columns]
                else:
                    df.rename(columns={'channel': 'channel_name'}, inplace=True)
                    id_vars = [c for c in ['channel_name', 'year'] if c in df.columns]

                month_cols = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
                value_vars = [col for col in month_cols if col in df.columns]
                
                for col in value_vars: df[col] = df[col].apply(clean_numeric)

                df = df.melt(id_vars=id_vars, value_vars=value_vars, var_name='month', value_name='value')
                
                month_map = {name: i+1 for i, name in enumerate(month_cols)}
                df['month_num'] = df['month'].str.lower().map(month_map)
                df['year'] = pd.to_numeric(df['year'], errors='coerce')
                df.dropna(subset=['year', 'month_num'], inplace=True)
                df['year'] = df['year'].astype(int)
                df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month_num'].astype(str) + '-01', errors='coerce')
                df.drop(columns=['year', 'month', 'month_num'], inplace=True, errors='ignore')

            cleaned_dataframes[original_file_name] = df
            print(f"✅ Success.")

        except Exception as e:
            errors[original_file_name] = str(e)
            print(f"❌ Error: {e}")

    print("\n\n--- Processing Summary ---")
    if cleaned_dataframes:
        print(f"Successfully processed: {len(cleaned_dataframes)} files")
        for f in cleaned_dataframes.keys():
            cleaned_file_name = f"cleaned_{f}"
            cleaned_dataframes[f].to_csv(cleaned_file_name, index=False)
            print(f"  - {f} -> Saved to {cleaned_file_name}")

    if errors:
        print(f"\nFailed to process: {len(errors)} files")
        for f, reason in errors.items():
            print(f"  - {f}: {reason}")
    
    return cleaned_dataframes, errors

# --- Run the entire cleaning process ---
if __name__ == '__main__':
    cleaned_data, error_log = clean_all_data_files_final()


Starting the cleaning process for 9 files...
--- Processing: Cust By Channel-Ext.csv ---
✅ Success.
--- Processing: Cust By Channel-New.csv ---
✅ Success.
--- Processing: Email.csv ---
✅ Success.
--- Processing: Media Spend by Channel.csv ---
✅ Success.
--- Processing: Orders By Channel-Ext.csv ---
✅ Success.
--- Processing: Orders By Channel-New.csv ---
✅ Success.
--- Processing: Returns.csv ---
✅ Success.
--- Processing: TOPSHEET.csv ---
✅ Success.
--- Processing: Web Analytics.csv ---
✅ Success.


--- Processing Summary ---
Successfully processed: 9 files
  - Cust By Channel-Ext.csv -> Saved to cleaned_Cust By Channel-Ext.csv
  - Cust By Channel-New.csv -> Saved to cleaned_Cust By Channel-New.csv
  - Email.csv -> Saved to cleaned_Email.csv
  - Media Spend by Channel.csv -> Saved to cleaned_Media Spend by Channel.csv
  - Orders By Channel-Ext.csv -> Saved to cleaned_Orders By Channel-Ext.csv
  - Orders By Channel-New.csv -> Saved to cleaned_Orders By Channel-New.csv
  - Returns.csv -

In [8]:
import pandas as pd
import glob
import re

def clean_all_data_files_final():
    """
    Finds all raw CSVs in the folder, then loads, cleans, reshapes, and saves them.
    This version includes specific fixes for the marketing file and date formats.
    """
    # Define all the expected raw filenames
    file_names = [
        "Media Spend by Channel.csv"
    ]
    
    cleaned_dataframes = {}
    errors = {}

    print(f"Starting the cleaning process for {len(file_names)} files...")

    # --- Helper Functions ---
    def clean_currency(s):
        if not isinstance(s, str): return pd.to_numeric(s, errors='coerce')
        s = s.replace('$', '').replace(',', '').strip()
        if '(' in s and ')' in s:
            s = '-' + s.replace('(', '').replace(')', '')
        return pd.to_numeric(s, errors='coerce')

    def clean_percentage(s):
        if not isinstance(s, str): return pd.to_numeric(s, errors='coerce')
        return pd.to_numeric(s.replace('%', '').replace('"', '').strip(), errors='coerce') / 100.0

    def clean_numeric(s):
        if not isinstance(s, str): return pd.to_numeric(s, errors='coerce')
        return pd.to_numeric(s.replace(',', '').strip(), errors='coerce')

    for file_name in file_names:
        try:
            print(f"--- Processing: {file_name} ---")
            
            # --- File Loading and Parsing ---
            if file_name == "Marketing Channel Breakdown.csv":
                # Manual parsing for the most problematic file
                with open(file_name, 'r', encoding='utf-8') as f:
                    lines = f.readlines()
                header = [h.strip() for h in lines[0].split('\t')]
                rows = [line.strip().split('\t') for line in lines[1:]]
                df = pd.DataFrame(rows, columns=header)
            else:
                df = pd.read_csv(file_name)

            # --- Cleaning and Transformation ---
            original_file_name = file_name
            df.columns = [str(col).lower().replace(' ', '_').replace('-', '_') for col in df.columns]

            if 'marketing_channel_breakdown' in file_name.lower():
                df.rename(columns={df.columns[0]: 'year_period'}, inplace=True)
                for col in ['ad_spend', 'gross_discount_(shopify)']: df[col] = df[col].apply(clean_currency)
                for col in ['sessions', 'clicks', 'orders', 'new_customers']: df[col] = df[col].apply(clean_numeric)
                for col in ['ctr', 'conversion_rate']: df[col] = df[col].apply(clean_percentage)
                df['year'] = df['year_period'].apply(lambda x: str(x).split(' ')[0])

            elif 'returns' in file_name.lower():
                for col in ['gross_sales', 'discounts', 'returns', 'net_sales']: df[col] = df[col].apply(clean_currency)
                df['month'] = pd.to_datetime(df['month'], format='%m/%d/%Y', errors='coerce')

            elif 'email' in file_name.lower():
                df.rename(columns={'type_(batch_marketing,_triggered,_transactional)': 'email_type'}, inplace=True)
                for col in ['sends', 'clicks']: df[col] = df[col].apply(clean_numeric)
                df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month'].astype(str) + '-01', errors='coerce')
                df = df[['date', 'email_type', 'sends', 'clicks']]

            elif 'web_analytics' in file_name.lower():
                for col in ['added_to_cart_rate', 'reached_checkout_rate', 'checkout_conversion_rate', 'conversion_rate']: df[col] = df[col].apply(clean_percentage)
                df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month'].astype(str) + '-01', errors='coerce')
                df.drop(columns=['year', 'month'], inplace=True, errors='ignore')

            elif 'channel' in file_name.lower() or 'topsheet' in file_name.lower():
                id_vars = []
                if 'topsheet' in file_name.lower():
                    df.rename(columns={'kpi': 'metric', 'data_source': 'source'}, inplace=True)
                    id_vars = [c for c in ['source', 'metric', 'year'] if c in df.columns]
                else:
                    df.rename(columns={'channel': 'channel_name'}, inplace=True)
                    id_vars = [c for c in ['channel_name', 'year'] if c in df.columns]

                month_cols = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
                value_vars = [col for col in month_cols if col in df.columns]
                
                # CORRECTED: Use clean_currency for files like media spend that have dollar values.
                # This is more robust than clean_numeric for these specific files.
                for col in value_vars: df[col] = df[col].apply(clean_currency)

                df = df.melt(id_vars=id_vars, value_vars=value_vars, var_name='month', value_name='value')
                
                month_map = {name: i+1 for i, name in enumerate(month_cols)}
                df['month_num'] = df['month'].str.lower().map(month_map)
                df['year'] = pd.to_numeric(df['year'], errors='coerce')
                df.dropna(subset=['year', 'month_num'], inplace=True)
                df['year'] = df['year'].astype(int)
                df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month_num'].astype(str) + '-01', errors='coerce')
                df.drop(columns=['year', 'month', 'month_num'], inplace=True, errors='ignore')

            cleaned_dataframes[original_file_name] = df
            print(f"✅ Success.")

        except Exception as e:
            errors[original_file_name] = str(e)
            print(f"❌ Error: {e}")

    print("\n\n--- Processing Summary ---")
    if cleaned_dataframes:
        print(f"Successfully processed: {len(cleaned_dataframes)} files")
        for f in cleaned_dataframes.keys():
            cleaned_file_name = f"cleaned_{f}"
            cleaned_dataframes[f].to_csv(cleaned_file_name, index=False)
            print(f"  - {f} -> Saved to {cleaned_file_name}")

    if errors:
        print(f"\nFailed to process: {len(errors)} files")
        for f, reason in errors.items():
            print(f"  - {f}: {reason}")
    
    return cleaned_dataframes, errors

# --- Run the entire cleaning process ---
if __name__ == '__main__':
    cleaned_data, error_log = clean_all_data_files_final()


Starting the cleaning process for 1 files...
--- Processing: Media Spend by Channel.csv ---
✅ Success.


--- Processing Summary ---
Successfully processed: 1 files
  - Media Spend by Channel.csv -> Saved to cleaned_Media Spend by Channel.csv


In [10]:
import pandas as pd
import glob
import re

def clean_all_data_files_final():
    """
    Finds all raw CSVs in the folder, then loads, cleans, reshapes, and saves them.
    This version includes specific fixes for the marketing file and date formats.
    """
    # Define all the expected raw filenames
    file_names = [
        "Email.csv"
    ]
    
    cleaned_dataframes = {}
    errors = {}

    print(f"Starting the cleaning process for {len(file_names)} files...")

    # --- Helper Functions ---
    def clean_currency(s):
        if not isinstance(s, str): return pd.to_numeric(s, errors='coerce')
        s = s.replace('$', '').replace(',', '').strip()
        if '(' in s and ')' in s:
            s = '-' + s.replace('(', '').replace(')', '')
        return pd.to_numeric(s, errors='coerce')

    def clean_percentage(s):
        if not isinstance(s, str): return pd.to_numeric(s, errors='coerce')
        return pd.to_numeric(s.replace('%', '').replace('"', '').strip(), errors='coerce') / 100.0

    def clean_numeric(s):
        if not isinstance(s, str): return pd.to_numeric(s, errors='coerce')
        return pd.to_numeric(s.replace(',', '').strip(), errors='coerce')

    for file_name in file_names:
        try:
            print(f"--- Processing: {file_name} ---")
            
            # --- File Loading and Parsing ---
            if file_name == "Marketing Channel Breakdown.csv":
                # Manual parsing for the most problematic file
                with open(file_name, 'r', encoding='utf-8') as f:
                    lines = f.readlines()
                header = [h.strip() for h in lines[0].split('\t')]
                rows = [line.strip().split('\t') for line in lines[1:]]
                df = pd.DataFrame(rows, columns=header)
            else:
                df = pd.read_csv(file_name)

            # --- Cleaning and Transformation ---
            original_file_name = file_name
            df.columns = [str(col).lower().replace(' ', '_').replace('-', '_') for col in df.columns]

            if 'marketing_channel_breakdown' in file_name.lower():
                df.rename(columns={df.columns[0]: 'year_period'}, inplace=True)
                for col in ['ad_spend', 'gross_discount_(shopify)']: df[col] = df[col].apply(clean_currency)
                for col in ['sessions', 'clicks', 'orders', 'new_customers']: df[col] = df[col].apply(clean_numeric)
                for col in ['ctr', 'conversion_rate']: df[col] = df[col].apply(clean_percentage)
                df['year'] = df['year_period'].apply(lambda x: str(x).split(' ')[0])

            elif 'returns' in file_name.lower():
                for col in ['gross_sales', 'discounts', 'returns', 'net_sales']: df[col] = df[col].apply(clean_currency)
                df['month'] = pd.to_datetime(df['month'], format='%m/%d/%Y', errors='coerce')

            elif 'email' in file_name.lower():
                df.rename(columns={'type_(batch_marketing,_triggered,_transactional)': 'email_type'}, inplace=True)
                for col in ['sends', 'clicks']: df[col] = df[col].apply(clean_numeric)
                
                # Remove pre-calculated "Total" rows to avoid double-counting
                df = df[df['email_type'] != 'Total'].copy()
                
                df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month'].astype(str) + '-01', errors='coerce')
                df = df[['date', 'email_type', 'sends', 'clicks']]

            elif 'web_analytics' in file_name.lower():
                for col in ['added_to_cart_rate', 'reached_checkout_rate', 'checkout_conversion_rate', 'conversion_rate']: df[col] = df[col].apply(clean_percentage)
                df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month'].astype(str) + '-01', errors='coerce')
                df.drop(columns=['year', 'month'], inplace=True, errors='ignore')

            elif 'channel' in file_name.lower() or 'topsheet' in file_name.lower():
                id_vars = []
                if 'topsheet' in file_name.lower():
                    df.rename(columns={'kpi': 'metric', 'data_source': 'source'}, inplace=True)
                    id_vars = [c for c in ['source', 'metric', 'year'] if c in df.columns]
                else:
                    df.rename(columns={'channel': 'channel_name'}, inplace=True)
                    id_vars = [c for c in ['channel_name', 'year'] if c in df.columns]

                month_cols = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
                value_vars = [col for col in month_cols if col in df.columns]
                
                # CORRECTED: Use clean_currency for files like media spend that have dollar values.
                # This is more robust than clean_numeric for these specific files.
                for col in value_vars: df[col] = df[col].apply(clean_currency)

                df = df.melt(id_vars=id_vars, value_vars=value_vars, var_name='month', value_name='value')
                
                month_map = {name: i+1 for i, name in enumerate(month_cols)}
                df['month_num'] = df['month'].str.lower().map(month_map)
                df['year'] = pd.to_numeric(df['year'], errors='coerce')
                df.dropna(subset=['year', 'month_num'], inplace=True)
                df['year'] = df['year'].astype(int)
                df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month_num'].astype(str) + '-01', errors='coerce')
                df.drop(columns=['year', 'month', 'month_num'], inplace=True, errors='ignore')

            cleaned_dataframes[original_file_name] = df
            print(f"✅ Success.")

        except Exception as e:
            errors[original_file_name] = str(e)
            print(f"❌ Error: {e}")

    print("\n\n--- Processing Summary ---")
    if cleaned_dataframes:
        print(f"Successfully processed: {len(cleaned_dataframes)} files")
        for f in cleaned_dataframes.keys():
            cleaned_file_name = f"cleaned_{f}"
            cleaned_dataframes[f].to_csv(cleaned_file_name, index=False)
            print(f"  - {f} -> Saved to {cleaned_file_name}")

    if errors:
        print(f"\nFailed to process: {len(errors)} files")
        for f, reason in errors.items():
            print(f"  - {f}: {reason}")
    
    return cleaned_dataframes, errors

# --- Run the entire cleaning process ---
if __name__ == '__main__':
    cleaned_data, error_log = clean_all_data_files_final()


Starting the cleaning process for 1 files...
--- Processing: Email.csv ---
✅ Success.


--- Processing Summary ---
Successfully processed: 1 files
  - Email.csv -> Saved to cleaned_Email.csv


In [11]:
import pandas as pd
import glob
import re

def clean_all_data_files_final():
    """
    Finds all raw CSVs in the folder, then loads, cleans, reshapes, and saves them.
    This version includes specific fixes for the marketing file and date formats.
    """
    # Define all the expected raw filenames
    file_names = [
        "Web Analytics.csv"
    ]
    
    cleaned_dataframes = {}
    errors = {}

    print(f"Starting the cleaning process for {len(file_names)} files...")

    # --- Helper Functions ---
    def clean_currency(s):
        if not isinstance(s, str): return pd.to_numeric(s, errors='coerce')
        s = s.replace('$', '').replace(',', '').strip()
        if '(' in s and ')' in s:
            s = '-' + s.replace('(', '').replace(')', '')
        return pd.to_numeric(s, errors='coerce')

    def clean_percentage(s):
        if not isinstance(s, str): return pd.to_numeric(s, errors='coerce')
        return pd.to_numeric(s.replace('%', '').replace('"', '').strip(), errors='coerce') / 100.0

    def clean_numeric(s):
        if not isinstance(s, str): return pd.to_numeric(s, errors='coerce')
        return pd.to_numeric(s.replace(',', '').strip(), errors='coerce')

    for file_name in file_names:
        try:
            print(f"--- Processing: {file_name} ---")
            
            # --- File Loading and Parsing ---
            if file_name == "Marketing Channel Breakdown.csv":
                # Manual parsing for the most problematic file
                with open(file_name, 'r', encoding='utf-8') as f:
                    lines = f.readlines()
                header = [h.strip() for h in lines[0].split('\t')]
                rows = [line.strip().split('\t') for line in lines[1:]]
                df = pd.DataFrame(rows, columns=header)
            else:
                df = pd.read_csv(file_name)

            # --- Cleaning and Transformation ---
            original_file_name = file_name
            df.columns = [str(col).lower().replace(' ', '_').replace('-', '_') for col in df.columns]

            if 'marketing_channel_breakdown' in file_name.lower():
                df.rename(columns={df.columns[0]: 'year_period'}, inplace=True)
                for col in ['ad_spend', 'gross_discount_(shopify)']: df[col] = df[col].apply(clean_currency)
                for col in ['sessions', 'clicks', 'orders', 'new_customers']: df[col] = df[col].apply(clean_numeric)
                for col in ['ctr', 'conversion_rate']: df[col] = df[col].apply(clean_percentage)
                df['year'] = df['year_period'].apply(lambda x: str(x).split(' ')[0])

            elif 'returns' in file_name.lower():
                for col in ['gross_sales', 'discounts', 'returns', 'net_sales']: df[col] = df[col].apply(clean_currency)
                df['month'] = pd.to_datetime(df['month'], format='%m/%d/%Y', errors='coerce')

            elif 'email' in file_name.lower():
                df.rename(columns={'type_(batch_marketing,_triggered,_transactional)': 'email_type'}, inplace=True)
                for col in ['sends', 'clicks']: df[col] = df[col].apply(clean_numeric)
                
                # Remove pre-calculated "Total" rows to avoid double-counting
                df = df[df['email_type'] != 'Total'].copy()
                
                df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month'].astype(str) + '-01', errors='coerce')
                df = df[['date', 'email_type', 'sends', 'clicks']]

            elif 'web_analytics' in file_name.lower():
                for col in ['added_to_cart_rate', 'reached_checkout_rate', 'checkout_conversion_rate', 'conversion_rate']: df[col] = df[col].apply(clean_percentage)
                df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month'].astype(str) + '-01', errors='coerce')
                df.drop(columns=['year', 'month'], inplace=True, errors='ignore')

            elif 'channel' in file_name.lower() or 'topsheet' in file_name.lower():
                id_vars = []
                if 'topsheet' in file_name.lower():
                    df.rename(columns={'kpi': 'metric', 'data_source': 'source'}, inplace=True)
                    id_vars = [c for c in ['source', 'metric', 'year'] if c in df.columns]
                else:
                    df.rename(columns={'channel': 'channel_name'}, inplace=True)
                    id_vars = [c for c in ['channel_name', 'year'] if c in df.columns]

                month_cols = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
                value_vars = [col for col in month_cols if col in df.columns]
                
                # CORRECTED: Use clean_currency for files like media spend that have dollar values.
                # This is more robust than clean_numeric for these specific files.
                for col in value_vars: df[col] = df[col].apply(clean_currency)

                df = df.melt(id_vars=id_vars, value_vars=value_vars, var_name='month', value_name='value')
                
                month_map = {name: i+1 for i, name in enumerate(month_cols)}
                df['month_num'] = df['month'].str.lower().map(month_map)
                df['year'] = pd.to_numeric(df['year'], errors='coerce')
                df.dropna(subset=['year', 'month_num'], inplace=True)
                df['year'] = df['year'].astype(int)
                df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month_num'].astype(str) + '-01', errors='coerce')
                df.drop(columns=['year', 'month', 'month_num'], inplace=True, errors='ignore')

            cleaned_dataframes[original_file_name] = df
            print(f"✅ Success.")

        except Exception as e:
            errors[original_file_name] = str(e)
            print(f"❌ Error: {e}")

    print("\n\n--- Processing Summary ---")
    if cleaned_dataframes:
        print(f"Successfully processed: {len(cleaned_dataframes)} files")
        for f in cleaned_dataframes.keys():
            cleaned_file_name = f"cleaned_{f}"
            cleaned_dataframes[f].to_csv(cleaned_file_name, index=False)
            print(f"  - {f} -> Saved to {cleaned_file_name}")

    if errors:
        print(f"\nFailed to process: {len(errors)} files")
        for f, reason in errors.items():
            print(f"  - {f}: {reason}")
    
    return cleaned_dataframes, errors

# --- Run the entire cleaning process ---
if __name__ == '__main__':
    cleaned_data, error_log = clean_all_data_files_final()


Starting the cleaning process for 1 files...
--- Processing: Web Analytics.csv ---
✅ Success.


--- Processing Summary ---
Successfully processed: 1 files
  - Web Analytics.csv -> Saved to cleaned_Web Analytics.csv


In [15]:
import pandas as pd
import numpy as np
import glob

def clean_and_process_all_files():
    """
    Finds all relevant CSVs in the folder, then loads, cleans, and processes them.
    - Cleans currency and percentage strings.
    - Reshapes wide-format data to long format.
    - Standardizes column names.
    - Handles file not found errors gracefully.
    - Saves cleaned files to new CSVs.
    """
    # Use glob to find the files in the current directory
    file_names = [f for f in glob.glob("*.csv") if not f.startswith('cleaned_')]
    
    cleaned_dataframes = {}
    errors = {}

    print(f"Found {len(file_names)} files to process...")

    # --- Helper Functions ---
    def clean_currency(s):
        if isinstance(s, str):
            s = s.replace('$', '').replace(',', '')
            if '(' in s and ')' in s:
                s = '-' + s.replace('(', '').replace(')', '')
            return pd.to_numeric(s, errors='coerce')
        return s

    def clean_numeric_string(s):
        if isinstance(s, str):
            return pd.to_numeric(s.replace(',', ''), errors='coerce')
        return s

    def clean_percentage(s):
        if isinstance(s, str):
            return pd.to_numeric(s.replace('%', ''), errors='coerce') / 100.0
        return s

    for file_name in file_names:
        try:
            print(f"--- Processing: {file_name} ---")
            
            separator = '\t' if file_name == "TOPSHEET.csv" else ','
            df = pd.read_csv(file_name, sep=separator)
            
            original_file_name = file_name # Keep track of the original name
            
            # Standardize column names
            df.columns = [str(col).lower().replace(' ', '_').replace('-', '_') for col in df.columns]

            if 'returns' in file_name.lower():
                for col in ['gross_sales', 'discounts', 'returns', 'net_sales']:
                    df[col] = df[col].apply(clean_currency)
                df['month'] = pd.to_datetime(df['month'], format='%m/%d/%Y', errors='coerce')

            elif 'email' in file_name.lower():
                df.rename(columns={'type_(batch_marketing,_triggered,_transactional)': 'email_type'}, inplace=True)
                df['sends'] = df['sends'].apply(clean_numeric_string)
                df['clicks'] = df['clicks'].apply(clean_numeric_string)
                df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month'].astype(str) + '-01', errors='coerce')
                df = df[['date', 'email_type', 'sends', 'clicks']]
            
            elif 'web_analytics' in file_name.lower():
                for col in ['added_to_cart_rate', 'reached_checkout_rate', 'checkout_conversion_rate', 'conversion_rate']:
                    df[col] = df[col].apply(clean_percentage)
                df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month'].astype(str) + '-01', errors='coerce')
                df.drop(columns=['year', 'month'], inplace=True, errors='ignore')

            elif 'channel' in file_name.lower() or 'topsheet' in file_name.lower():
                if 'topsheet' in file_name.lower():
                    df.rename(columns={'kpi': 'metric', 'data_source': 'source'}, inplace=True)
                    id_vars = ['source', 'metric', 'year']
                else:
                    df.rename(columns={'channel': 'channel_name'}, inplace=True)
                    id_vars = ['channel_name', 'year']
                
                month_cols = ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december']
                value_vars = [col for col in month_cols if col in df.columns]
                
                for col in value_vars:
                    df[col] = df[col].astype(str).str.replace(',', '').str.replace('$', '').str.replace('%', '')
                    df[col] = pd.to_numeric(df[col], errors='coerce')

                df = df.melt(id_vars=id_vars, value_vars=value_vars, var_name='month', value_name='value')
                
                month_map = {name: i+1 for i, name in enumerate(month_cols)}
                df['month_num'] = df['month'].str.lower().map(month_map)
                df['year'] = pd.to_numeric(df['year'], errors='coerce')
                df.dropna(subset=['year', 'month_num'], inplace=True)
                df['year'] = df['year'].astype(int)
                df['date'] = pd.to_datetime(df['year'].astype(str) + '-' + df['month_num'].astype(str) + '-01', errors='coerce')
                df.drop(columns=['year', 'month', 'month_num'], inplace=True, errors='ignore')
            
            elif 'marketing_channel_breakdown' in file_name.lower():
                 # This file might have a year as the first column, let's rename it
                df.rename(columns={df.columns[0]: 'year'}, inplace=True)
                # Add cleaning logic for its specific currency/numeric columns if needed
                for col in ['ad_spend', 'gross_discount_(shopify)', 'sessions', 'clicks', 'orders', 'new_customers']:
                    if col in df.columns:
                        df[col] = df[col].apply(clean_numeric_string)
                for col in ['ctr', 'conversion_rate']:
                     if col in df.columns:
                        df[col] = df[col].apply(clean_percentage)

            cleaned_dataframes[original_file_name] = df
            print(f"✅ Success.")

        except Exception as e:
            errors[original_file_name] = str(e)
            print(f"❌ Error: {e}")

    print("\n\n--- Processing Summary ---")
    if cleaned_dataframes:
        print(f"Successfully processed: {len(cleaned_dataframes)} files")
        for f in cleaned_dataframes.keys():
            cleaned_file_name = f"cleaned_{f}"
            cleaned_dataframes[f].to_csv(cleaned_file_name, index=False)
            print(f"  - {f} -> Saved to {cleaned_file_name}")

    if errors:
        print(f"\nFailed to process: {len(errors)} files")
        for f, reason in errors.items():
            print(f"  - {f}: {reason}")
    
    return cleaned_dataframes, errors

# --- Run the script ---
if __name__ == '__main__':
    cleaned_data, error_log = clean_and_process_all_files()

    # As an example, display the first 5 rows of a cleaned, reshaped file
    if 'Media Spend by Channel.csv' in cleaned_data:
        print("\n--- Sample of Cleaned Data (Media Spend by Channel.csv) ---")
        print(cleaned_data['Media Spend by Channel.csv'].head())

Found 11 files to process...
--- Processing: Cleaned_Marketing Channel Breakdown.csv ---
❌ Error: "The following id_vars or value_vars are not present in the DataFrame: ['channel_name']"
--- Processing: Cust By Channel-Ext.csv ---
✅ Success.
--- Processing: Cust By Channel-New.csv ---
✅ Success.
--- Processing: Email.csv ---
✅ Success.
--- Processing: Marketing Channel Breakdown.csv ---
❌ Error: "The following id_vars or value_vars are not present in the DataFrame: ['channel_name', 'year']"
--- Processing: Media Spend by Channel.csv ---
✅ Success.
--- Processing: Orders By Channel-Ext.csv ---
✅ Success.
--- Processing: Orders By Channel-New.csv ---
✅ Success.
--- Processing: Returns.csv ---
✅ Success.
--- Processing: TOPSHEET.csv ---
❌ Error: "The following id_vars or value_vars are not present in the DataFrame: ['source', 'metric', 'year']"
--- Processing: Web Analytics.csv ---
✅ Success.


--- Processing Summary ---
Successfully processed: 8 files


PermissionError: [Errno 13] Permission denied: 'cleaned_Cust By Channel-Ext.csv'