In [3]:
import pandas as pd

# Read the source CSV; the path is relative to the notebook's working directory
file_path = "../data/final_dataset.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Print the first five rows as a quick look at the columns and values
print(df.head(n=5))

  InvoiceNo  Quantity InvoiceDate  Price  CustomerID    Country
0    540267        96    1/6/2011   0.72       12415  Australia
1    567085        16   9/16/2011   0.83       12434  Australia
2    540267        36    1/6/2011   1.85       12415  Australia
3    558537        48   6/30/2011   1.25       12424  Australia
4    556917       144   6/15/2011   2.49       12415  Australia


In [5]:
import pandas as pd
import numpy as np

# Handle missing values
def handle_missing_values(df):
    """Drop rows without an invoice key and fill the remaining gaps.

    Rows whose ``InvoiceNo`` or ``InvoiceDate`` is missing are dropped.
    In the rows that remain, missing ``Price`` and ``Quantity`` values are
    replaced by that column's median (computed after the drop), and missing
    ``CustomerID`` values are replaced by ``-1``. The frame's shape is
    printed before and after.

    The result is built with ``DataFrame.assign``, so it is a new frame:
    the caller's frame is left untouched and the column writes cannot
    raise ``SettingWithCopyWarning``.

    Args:
        df: DataFrame with ``InvoiceNo``, ``InvoiceDate``, ``Price``,
            ``Quantity`` and ``CustomerID`` columns.

    Returns:
        A new DataFrame with the rows dropped and gaps filled as described.

    Raises:
        KeyError: if one of the columns listed above is absent.
    """
    print("Initial shape:", df.shape)

    # Rows without an invoice number or date cannot be used downstream
    kept = df.dropna(subset=['InvoiceNo', 'InvoiceDate'])

    # Medians are taken from the surviving rows only
    filled = kept.assign(
        Price=kept['Price'].fillna(kept['Price'].median()),
        Quantity=kept['Quantity'].fillna(kept['Quantity'].median()),
        CustomerID=kept['CustomerID'].fillna(-1),  # -1 marks a missing CustomerID
    )

    print("Shape after handling missing values:", filled.shape)
    return filled

# Remove duplicates
def remove_duplicates(df):
    """Return ``df`` without fully duplicated rows, keeping the first of each.

    Prints the frame's shape before and after the de-duplication.
    """
    shape_before = df.shape
    print("Shape before removing duplicates:", shape_before)

    deduplicated = df.drop_duplicates(keep="first")

    shape_after = deduplicated.shape
    print("Shape after removing duplicates:", shape_after)
    return deduplicated

# Handle outliers
def handle_outliers(df, upper_quantile=0.99):
    """Drop rows whose ``Price`` or ``Quantity`` exceeds an upper quantile.

    A separate threshold is computed for each of the two columns; a row is
    kept only when both of its values are at or below their thresholds.
    Only the upper tail is trimmed, so unusually low or negative values are
    kept. Rows where either value is NaN fail the comparison and are dropped.
    The resulting shape is printed.

    Args:
        df: DataFrame with numeric ``Price`` and ``Quantity`` columns.
        upper_quantile: Quantile in ``[0, 1]`` used as the cut-off for both
            columns. Defaults to ``0.99`` (the 99th percentile).

    Returns:
        An independent copy of the surviving rows, so later column
        assignments on it do not raise ``SettingWithCopyWarning``.
    """
    print("Handling outliers...")

    # Per-column cut-offs
    price_upper = df['Price'].quantile(upper_quantile)
    quantity_upper = df['Quantity'].quantile(upper_quantile)

    # Keep rows that are within both limits
    within_limits = (df['Price'] <= price_upper) & (df['Quantity'] <= quantity_upper)
    trimmed = df[within_limits].copy()

    print("Shape after handling outliers:", trimmed.shape)
    return trimmed

# Add new metrics
def add_new_metrics(df):
    """Return a copy of ``df`` with a ``TotalRevenue`` column added.

    ``TotalRevenue`` is the row-wise product ``Price * Quantity``. The
    column is attached with ``DataFrame.assign``, so the frame passed in is
    not modified and no ``SettingWithCopyWarning`` is raised when ``df`` is a
    filtered slice of another frame.

    Args:
        df: DataFrame with numeric ``Price`` and ``Quantity`` columns.

    Returns:
        A new DataFrame containing every column of ``df`` plus
        ``TotalRevenue``.
    """
    print("Adding new metrics...")
    return df.assign(TotalRevenue=df['Price'] * df['Quantity'])

# Aggregate data
def aggregate_data(df):
    """Summarise ``df`` per day, per week and per month of ``InvoiceDate``.

    ``InvoiceDate`` is first converted to datetime **in place** on ``df``;
    this side effect is kept on purpose because ``transform_data`` returns
    the same frame after calling this function. Each summary holds the
    grouping key in an ``InvoiceDate`` column, plus ``Quantity`` (sum),
    ``TotalRevenue`` (sum) and ``Transactions`` (count of non-null
    ``InvoiceNo`` rows).

    NOTE(review): ``Transactions`` counts rows, not distinct invoice
    numbers, so an invoice spanning several rows is counted once per row.
    Confirm that is the intended meaning.

    Args:
        df: DataFrame with ``InvoiceDate``, ``InvoiceNo``, ``Quantity`` and
            ``TotalRevenue`` columns.

    Returns:
        A ``(daily, weekly, monthly)`` tuple of DataFrames. The key is a
        ``datetime.date`` for daily, and a pandas Period with frequency
        ``'W'`` / ``'M'`` for weekly / monthly.
    """
    print("Aggregating data...")
    # Ensure InvoiceDate is datetime (written back onto the caller's frame)
    df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])
    invoice_dates = df['InvoiceDate']

    daily = _aggregate_by(df, invoice_dates.dt.date)
    print("Daily aggregation complete.")

    weekly = _aggregate_by(df, invoice_dates.dt.to_period('W'))
    print("Weekly aggregation complete.")

    monthly = _aggregate_by(df, invoice_dates.dt.to_period('M'))
    print("Monthly aggregation complete.")

    return daily, weekly, monthly


def _aggregate_by(df, key):
    """Sum Quantity/TotalRevenue and count InvoiceNo rows for each value of ``key``."""
    return (
        df.groupby(key)
        .agg({
            'Quantity': 'sum',
            'TotalRevenue': 'sum',
            'InvoiceNo': 'count',  # Total transactions
        })
        .reset_index()
        .rename(columns={'InvoiceNo': 'Transactions'})
    )

# Main transformation function
def transform_data(df):
    """Run the cleaning stages in order, then build the aggregations.

    The stages are applied one after another, each receiving the previous
    stage's result: ``handle_missing_values``, ``remove_duplicates``,
    ``handle_outliers``, ``add_new_metrics``. ``aggregate_data`` is then
    called on the final cleaned frame.

    Returns:
        A ``(cleaned, daily, weekly, monthly)`` tuple: the cleaned frame
        followed by the three frames returned by ``aggregate_data``.
    """
    cleaning_stages = (
        handle_missing_values,
        remove_duplicates,
        handle_outliers,
        add_new_metrics,
    )

    cleaned = df
    for stage in cleaning_stages:
        cleaned = stage(cleaned)

    daily, weekly, monthly = aggregate_data(cleaned)
    return cleaned, daily, weekly, monthly

# Execute the full pipeline on the loaded frame
df_transformed, daily_agg, weekly_agg, monthly_agg = transform_data(df)

# Show a heading followed by the first five rows of each result
print("Sample transformed data:", df_transformed.head(), sep="\n")
print("\nSample daily aggregation:", daily_agg.head(), sep="\n")
print("\nSample weekly aggregation:", weekly_agg.head(), sep="\n")
print("\nSample monthly aggregation:", monthly_agg.head(), sep="\n")

Initial shape: (5184, 6)
Shape after handling missing values: (5184, 6)
Shape before removing duplicates: (5184, 6)
Shape after removing duplicates: (4200, 6)
Handling outliers...
Shape after handling outliers: (4135, 6)
Adding new metrics...
Aggregating data...
Daily aggregation complete.
Weekly aggregation complete.
Monthly aggregation complete.
Sample transformed data:
  InvoiceNo  Quantity InvoiceDate  Price  CustomerID    Country  TotalRevenue
0    540267        96  2011-01-06   0.72       12415  Australia         69.12
1    567085        16  2011-09-16   0.83       12434  Australia         13.28
2    540267        36  2011-01-06   1.85       12415  Australia         66.60
3    558537        48  2011-06-30   1.25       12424  Australia         60.00
4    556917       144  2011-06-15   2.49       12415  Australia        358.56

Sample daily aggregation:
  InvoiceDate  Quantity  TotalRevenue  Transactions
0  2010-12-01       258        363.23            14
1  2010-12-02        22   

In [6]:
# NOTE(review): this writes `df` (the frame loaded from the CSV), not
# `df_transformed`, into the notebook's working directory — confirm that this
# is the intended frame and location.
# index=False keeps the default row index out of the file; without it the CSV
# gains an extra unnamed first column when read back.
df.to_csv("final_dataset.csv", index=False)
# Save the aggregated data to CSV files
import os

# Function to save aggregated data
def save_aggregated_data(daily, weekly, monthly, directory="../data/"):
    """Write the three aggregation frames to CSV files inside ``directory``.

    The directory is created if it does not exist. The files are named
    ``daily_aggregation.csv``, ``weekly_aggregation.csv`` and
    ``monthly_aggregation.csv``; each is written without the row index, and
    its path is printed once it has been saved.

    Args:
        daily: Frame written to ``daily_aggregation.csv``.
        weekly: Frame written to ``weekly_aggregation.csv``.
        monthly: Frame written to ``monthly_aggregation.csv``.
        directory: Target directory. Defaults to ``"../data/"``.
    """
    # Create the target directory up front; an existing one is fine
    os.makedirs(directory, exist_ok=True)

    # Resolve all three destination paths before writing anything
    daily_file, weekly_file, monthly_file = (
        os.path.join(directory, filename)
        for filename in (
            "daily_aggregation.csv",
            "weekly_aggregation.csv",
            "monthly_aggregation.csv",
        )
    )

    daily.to_csv(daily_file, index=False)
    print(f"Daily aggregation saved to: {daily_file}")

    weekly.to_csv(weekly_file, index=False)
    print(f"Weekly aggregation saved to: {weekly_file}")

    monthly.to_csv(monthly_file, index=False)
    print(f"Monthly aggregation saved to: {monthly_file}")

# Export the daily, weekly and monthly summaries to the default data directory
save_aggregated_data(daily=daily_agg, weekly=weekly_agg, monthly=monthly_agg)


Daily aggregation saved to: ../data/daily_aggregation.csv
Weekly aggregation saved to: ../data/weekly_aggregation.csv
Monthly aggregation saved to: ../data/monthly_aggregation.csv
