In [None]:
# 1. Imports
import sys
sys.path.append("../src")
from data_processing import *

import pandas as pd

# ✅ Toggle: fast mode skips plotting to save time on large datasets.
# While True, every plotting call guarded by `if not FAST_MODE` in this
# notebook (distribution plots and boxplots) is skipped.
FAST_MODE = True  # Set False to run full visualizations

# Define load_data function directly in notebook
def load_data(path="../data/raw/data.csv"):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    path : str or file-like, optional
        Location of the CSV to read; forwarded unchanged to
        ``pandas.read_csv``. Defaults to the raw dataset path relative to
        the notebook directory.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV.
    """
    frame = pd.read_csv(filepath_or_buffer=path)
    return frame

# -----------------------------------------------
# 🚀 Main Notebook Workflow
#
# Exploratory pass over the raw dataset. The helpers called below
# (overview, summary_statistics, plot_*, correlation_matrix,
# missing_values, detect_outliers) are not defined in this notebook;
# presumably they come from the `data_processing` star import -- verify
# against that module before relying on their exact behaviour.

# Load Data (uses the default path of the notebook-local load_data)
df = load_data()

# Overview
overview(df)

# Summary Stats: keep the result so it can be inspected later, then print it
summary_stats = summary_statistics(df)
print(summary_stats)

# Visualizations: skipped entirely while FAST_MODE is True
if not FAST_MODE:
    print("📊 Plotting distributions...")
    plot_numerical_distributions(df)
    plot_categorical_distributions(df)

# Correlation: the return value is discarded here, so any output must be
# produced by the helper itself
print("🔗 Correlation Matrix")
correlation_matrix(df)

# Missing values: only entries with a count greater than zero are printed
# (assumes `missing` supports boolean-mask indexing, e.g. a pandas Series)
missing = missing_values(df)
print("Missing Values:\n", missing[missing > 0])

# Outliers: result is stored in `outliers` and printed as-is
outliers = detect_outliers(df)

print("Outliers:\n", outliers)
def add_outlier_flags(df, numeric_cols=('Amount', 'Value', 'PricingStrategy')):
    """
    Return a new DataFrame with IQR-based outlier flag columns added.

    For every name in ``numeric_cols`` that exists in ``df``, a column
    ``"<name>_outlier_flag"`` is added holding 1 where the value lies
    strictly below ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR``
    and 0 otherwise (missing values compare False and are flagged 0).
    Names that are not present in ``df`` are silently skipped.

    The 'CountryCode' column is dropped when present, and a message listing
    the dropped column is printed.

    The input frame is never modified: the function always works on a
    copy, regardless of whether 'CountryCode' is present.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data.
    numeric_cols : iterable of str, optional
        Columns to flag. Defaults to ('Amount', 'Value', 'PricingStrategy').

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without 'CountryCode' and with the flag columns.
    """
    if 'CountryCode' in df.columns:
        # drop() already returns a new frame, so the caller's data is safe.
        df = df.drop(columns=['CountryCode'])
        print("Dropped columns: CountryCode")
    else:
        # Without this copy the flag columns below would be written straight
        # into the caller's DataFrame.
        df = df.copy()

    for col in numeric_cols:
        if col not in df.columns:
            continue

        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # 1 if the value falls outside the IQR fences, 0 otherwise
        is_outlier = (df[col] < lower_bound) | (df[col] > upper_bound)
        df[f"{col}_outlier_flag"] = is_outlier.astype(int)

    return df


# Rebind df to the frame returned by add_outlier_flags (flag columns added,
# 'CountryCode' removed when it was present).
df = add_outlier_flags(df)
# Count how many outliers were flagged per column and print.
# This list mirrors the columns hard-coded inside add_outlier_flags.
numeric_cols = ['Amount', 'Value', 'PricingStrategy']
for col in numeric_cols:
    flag_col = f"{col}_outlier_flag"
    # A flag column exists only if the source column was in the data
    if flag_col in df.columns:
        # Flags are 0/1 integers, so the sum is the number of flagged rows
        count_outliers = df[flag_col].sum()
        print(f"Number of outliers flagged in {col}: {count_outliers}")
# Boxplots: skipped while FAST_MODE is True
if not FAST_MODE:
    plot_boxplots(df)

# Save Cleaned Data (helper is not defined in this notebook; the output
# location is decided by it -- presumably from data_processing, confirm there)
save_processed_data(df)


🔎 Data Overview
Shape: (95662, 16)

Data Types:
 TransactionId            object
BatchId                  object
AccountId                object
SubscriptionId           object
CustomerId               object
CurrencyCode             object
CountryCode               int64
ProviderId               object
ProductId                object
ProductCategory          object
ChannelId                object
Amount                  float64
Value                     int64
TransactionStartTime     object
PricingStrategy           int64
FraudResult               int64
dtype: object

First 5 Rows:
          TransactionId         BatchId       AccountId       SubscriptionId  \
0  TransactionId_76871   BatchId_36123  AccountId_3957   SubscriptionId_887   
1  TransactionId_73770   BatchId_15642  AccountId_4841  SubscriptionId_3829   
2  TransactionId_26203   BatchId_53941  AccountId_4229   SubscriptionId_222   
3    TransactionId_380  BatchId_102363   AccountId_648  SubscriptionId_2185   
4  Transaction