In [None]:
# Exploratory Data Analysis
# Safe to re-run: the working directory is only moved when it is not already
# the project root.
import sys
import os

# NOTE(review): assumes the notebook sits one level below the project root and
# that the root (and only the root) contains a `src/` directory — confirm.
if os.path.isdir('src'):
    # Already at the project root (e.g. the cell was executed before).
    project_root = os.getcwd()
else:
    project_root = os.path.abspath('../')
    os.chdir(project_root)

# Make the modules under src/ importable, without stacking duplicate entries
# on sys.path when the cell is executed again.
src_path = os.path.abspath(os.path.join(os.getcwd(), 'src'))
if src_path not in sys.path:
    sys.path.insert(0, src_path)

# Setup

In [None]:
from load_data import DataLoader
from preprocess import Preprocessor, FeatureEngineering
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Shared project helpers used throughout the notebook: data loading (dl),
# preprocessing (pre) and feature engineering (fe).
dl, pre, fe = DataLoader(), Preprocessor(), FeatureEngineering()

In [None]:
# Train and test must come from two distinct files in data/raw.
# NOTE(review): the file names 'train.csv' / 'test.csv' are assumed (they
# mirror the 'train_clean.csv' / 'test_clean.csv' names used for the processed
# files) — confirm against the actual contents of data/raw.
train = dl.load_data('train.csv', 'data/raw')
test = dl.load_data('test.csv', 'data/raw')

# Alternative: start from the cleaned files written by the export cell below.
# train_clean = dl.load_data('train_clean.csv', 'data/processed')
# test_clean = dl.load_data('test_clean.csv', 'data/processed')

# Work on copies so the frames as loaded stay available for reference.
train_df = train.copy()
test_df = test.copy()

# train_df = train_clean.copy()
# test_df = test_clean.copy()

# EDA

## Info

In [None]:
# Summary of the raw training frame via the project DataLoader helper
# (presumably shape / dtypes — see load_data.py to confirm what is reported).
dl.data_info(train)

In [None]:
# Preview of the raw training frame via the project DataLoader helper
# (presumably the first rows — see load_data.py to confirm).
dl.preview_data(train)

In [None]:
# Same summary as above, for the raw test frame.
dl.data_info(test)

In [None]:
# Same preview as above, for the raw test frame.
dl.preview_data(test)

## Cleaning and validation

In [None]:
def validate_and_clean_data(df, name="data", is_test=False):
    """Print a data-quality report for ``df`` and return a checked copy.

    Five checks are reported: unparseable values in numeric columns, object
    columns that look numeric, constant columns, duplicated columns, and a
    dtype summary. The input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    name : str, default "data"
        Label used in the report header.
    is_test : bool, default False
        When False, ``'accident_risk'`` is skipped by the numeric check in
        addition to ``'id'``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df``. Numeric columns in which unparseable values were
        detected are replaced by their ``pd.to_numeric(..., errors='coerce')``
        version.
    """
    print(f"=== Data Validation and Cleaning for {name} ===")
    df_clean = df.copy()

    skip_cols = ['id']
    if not is_test:
        skip_cols.append('accident_risk')

    print("\n1. Checking numeric columns for non-numeric values:")
    numeric_cols = df_clean.select_dtypes(include=['int64', 'float64']).columns

    for col in numeric_cols:
        if col in skip_cols:
            continue

        # A value is "non-numeric" only if it is present but cannot be parsed.
        # Matching the string form against a digit regex would misreport NaN,
        # infinities and scientific notation (e.g. '1e-05') as bad values.
        # For columns that already have an int64/float64 dtype this is
        # expected to find nothing; it is kept as a sanity check.
        coerced = pd.to_numeric(df_clean[col], errors='coerce')
        non_numeric_mask = coerced.isna() & df_clean[col].notna()
        non_numeric_count = int(non_numeric_mask.sum())

        if non_numeric_count > 0:
            print(f"  - {col}: Found {non_numeric_count} non-numeric values")
            print(f"    Sample values: {df_clean.loc[non_numeric_mask, col].unique()[:5]}")

            df_clean[col] = coerced

    print("\n2. Checking categorical columns for data type issues:")
    categorical_cols = df_clean.select_dtypes(include=['object']).columns

    for col in categorical_cols:
        unique_vals = df_clean[col].unique()
        print(f"  - {col}: {len(unique_vals)} unique values")
        print(f"    Sample values: {unique_vals[:5]}")

        # Only high-cardinality object columns (> 50 distinct values) are
        # tested; they are flagged when more than 80% of the rows parse as
        # numbers.
        if len(unique_vals) > 50:
            try:
                numeric_test = pd.to_numeric(df_clean[col], errors='coerce')
            except (TypeError, ValueError) as exc:
                # Report instead of silently swallowing every exception.
                print(f"    Skipped numeric check for {col}: {exc}")
                continue
            if numeric_test.notna().sum() / len(df_clean) > 0.8:
                print(f"    WARNING: {col} might be numeric but stored as object")
                print("    Consider converting to numeric type")

    print("\n3. Checking for constant columns:")
    # nunique() ignores NaN, so an all-NaN column (0 distinct values) is also
    # reported here.
    constant_cols = [col for col in df_clean.columns if df_clean[col].nunique() <= 1]
    if constant_cols:
        print(f"  Found {len(constant_cols)} constant columns:")
        for col in constant_cols:
            print(f"    - {col}: {df_clean[col].iloc[0] if len(df_clean) > 0 else 'Empty'}")
    else:
        print("  No constant columns found")

    print("\n4. Checking for duplicate columns:")
    # Pairwise comparison of every column with each column to its right.
    duplicate_cols = []
    for i, col1 in enumerate(df_clean.columns):
        for col2 in df_clean.columns[i+1:]:
            if df_clean[col1].equals(df_clean[col2]):
                duplicate_cols.append((col1, col2))

    if duplicate_cols:
        print(f"  Found {len(duplicate_cols)} duplicate column pairs:")
        for col1, col2 in duplicate_cols:
            print(f"    - {col1} == {col2}")
    else:
        print("  No duplicate columns found")

    print("\n5. Data types summary after cleaning:")
    print(f"  - Numeric columns: {len(df_clean.select_dtypes(include=['int64', 'float64']).columns)}")
    print(f"  - Categorical columns: {len(df_clean.select_dtypes(include=['object', 'category']).columns)}")
    print(f"  - Other types: {len(df_clean.select_dtypes(exclude=['int64', 'float64', 'object', 'category']).columns)}")

    return df_clean

# Validate both splits; only the training frame carries the label column.
train_clean = validate_and_clean_data(df=train_df, name="train", is_test=False)
test_clean = validate_and_clean_data(df=test_df, name="test", is_test=True)

In [None]:
def update_column_definitions(df, is_test=False):
    """Split the feature columns of ``df`` into numerical and categorical sets.

    ``'id'`` is always excluded; ``'accident_risk'`` is excluded as well unless
    ``is_test`` is True. Excluded names that are absent from ``df`` are skipped.
    The two column sets are printed and returned.

    Returns
    -------
    tuple of pandas.Index
        ``(numerical_columns, categorical_columns)`` where numerical means
        float64/int64 and categorical means object/category dtype.
    """
    excluded = ['id'] if is_test else ['id', 'accident_risk']
    # errors='ignore' skips any excluded name that is not present in the frame.
    features = df.drop(columns=excluded, errors='ignore')

    numeric_names = features.select_dtypes(include=['float64', 'int64']).columns
    categorical_names = features.select_dtypes(include=['object', 'category']).columns

    print("Updated column definitions:")
    print(f"Numerical columns ({len(numeric_names)}): {list(numeric_names)}")
    print(f"Categorical columns ({len(categorical_names)}): {list(categorical_names)}")

    return numeric_names, categorical_names

# Feature column sets for the validated train and test frames.
num_cols_clean, cat_cols_clean = update_column_definitions(df=train_clean, is_test=False)
num_cols_test, cat_cols_test = update_column_definitions(df=test_clean, is_test=True)

In [None]:
def fix_data_types(df):
    """Return a copy of ``df`` with its numeric columns passed through
    ``pd.to_numeric(..., errors='coerce')``.

    Only int64/float64 columns are processed, and ``'id'`` and
    ``'accident_risk'`` are left alone. A message is printed for every column
    that ends up with more missing values than it had in ``df``.
    """
    result = df.copy()
    protected = {'id', 'accident_risk'}
    numeric_names = result.select_dtypes(include=['int64', 'float64']).columns

    for name in numeric_names:
        if name not in protected:
            nulls_before = df[name].isna().sum()
            result[name] = pd.to_numeric(result[name], errors='coerce')

            # More NaN than before means coercion replaced unparseable values.
            if result[name].isna().sum() > nulls_before:
                print(f"Fixed non-numeric values in column '{name}'")

    return result

# Re-coerce the numeric columns of the working frames.
print("Checking and fixing data type issues...")
train_df = fix_data_types(df=train_df)
test_df = fix_data_types(df=test_df)

In [None]:
# Persist the validated frames. The output folder is created when missing so
# the export also works on a fresh checkout; paths are relative to the working
# directory set in the first cell.
os.makedirs('data/processed', exist_ok=True)
train_clean.to_csv('data/processed/train_clean.csv', index=False)
test_clean.to_csv('data/processed/test_clean.csv', index=False)

In [None]:
# Feature column lists used by every plot below.
# Both 'target' and 'accident_risk' are excluded because earlier cells in this
# notebook treat 'accident_risk' as the label; errors='ignore' keeps the cell
# from raising KeyError when one of these names is absent.
# NOTE(review): the plots further down still reference a column literally
# named 'target' — confirm the actual label column name.
train_features = train_df.drop(columns=['id', 'target', 'accident_risk'], errors='ignore')
test_features = test_df.drop(columns=['id'], errors='ignore')

num_cols = train_features.select_dtypes(include=['float64', 'int64']).columns
cat_cols = train_features.select_dtypes(include=['object', 'category', 'bool']).columns
num_cols_test = test_features.select_dtypes(include=['float64', 'int64']).columns
cat_cols_test = test_features.select_dtypes(include=['object', 'category', 'bool']).columns

## Missing values

In [None]:
def show_missing(df, name="data"):
    """Report the columns of ``df`` that contain missing values.

    When at least one column has nulls, the per-column counts (largest first)
    are displayed and drawn as a bar chart; otherwise a single message is
    printed. ``name`` is only used to label the output.
    """
    null_counts = df.isnull().sum()
    missing = null_counts[null_counts > 0].sort_values(ascending=False)

    # Nothing to report: say so and stop before any plotting.
    if missing.empty:
        print(f"No missing values found in {name}.")
        return

    print(f"Missing values in {name}:")
    display(missing)
    missing.plot(kind='bar', title=f'Missing Values in {name}')
    plt.show()

In [None]:
# Missing-value report for the working training frame.
show_missing(df=train_df, name="train")

## Data distribution

In [None]:
# Histogram (with KDE) of every numerical feature, laid out two per row.
if len(num_cols) > 0:
    n_cols = 2
    n_rows = -(-len(num_cols) // n_cols)  # ceiling division

    # squeeze=False always yields a 2-D array of Axes, so flatten() gives a
    # flat sequence of Axes for any grid size. Wrapping the subplots() result
    # in a list for the single-feature case would put an array, not an Axes,
    # at axes[0].
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 4 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, num_cols):
        sns.histplot(data=train_df, x=col, kde=True, bins=30, ax=ax)
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

    # Remove the panels left over when the feature count is odd.
    for ax in axes[len(num_cols):]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()
else:
    print("No numerical columns found in the dataset.")

In [None]:
# Count plot of every categorical feature, laid out two per row.
if len(cat_cols) > 0:
    n_cols = 2
    n_rows = -(-len(cat_cols) // n_cols)  # ceiling division

    # squeeze=False always yields a 2-D array of Axes, so flatten() gives a
    # flat sequence of Axes for any grid size. Wrapping the subplots() result
    # in a list for the single-feature case would put an array, not an Axes,
    # at axes[0].
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 4 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, cat_cols):
        sns.countplot(data=train_df, x=col, ax=ax)
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)
        # Optional, for long category labels:
        # ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
        ax.set_ylabel('Frequency')

    # Remove the panels left over when the feature count is odd.
    for ax in axes[len(cat_cols):]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()
else:
    print("No categorical columns found in the dataset.")

## Outliers

In [None]:
# Box plot of every numerical feature, laid out two per row, to spot outliers.
if len(num_cols) > 0:
    n_cols = 2
    n_rows = -(-len(num_cols) // n_cols)  # ceiling division

    # squeeze=False always yields a 2-D array of Axes, so flatten() gives a
    # flat sequence of Axes for any grid size. Wrapping the subplots() result
    # in a list for the single-feature case would put an array, not an Axes,
    # at axes[0].
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 4 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, num_cols):
        sns.boxplot(data=train_df, y=col, ax=ax)
        ax.set_title(f'Boxplot of {col}')
        ax.set_ylabel(col)

    # Remove the panels left over when the feature count is odd.
    for ax in axes[len(num_cols):]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()
else:
    print("No numerical columns found in the dataset.")

## Feature-target relationship analysis

### Numerical and target

In [None]:
# Violin plot of every numerical feature split by the 'target' column.
# NOTE(review): earlier cells treat 'accident_risk' as the label; this cell
# needs a column literally named 'target' in train_df — confirm.
if len(num_cols) > 0:
    n_cols = 2
    n_rows = -(-len(num_cols) // n_cols)  # ceiling division

    # squeeze=False always yields a 2-D array of Axes, so flatten() gives a
    # flat sequence of Axes for any grid size. Wrapping the subplots() result
    # in a list for the single-feature case would put an array, not an Axes,
    # at axes[0].
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 4 * n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, num_cols):
        sns.violinplot(data=train_df, x='target', y=col, ax=ax)
        ax.set_title(f'Violin plot of {col} by target')
        ax.set_xlabel('Target')
        ax.set_ylabel(col)

    # Remove the panels left over when the feature count is odd.
    for ax in axes[len(num_cols):]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()
else:
    print("No numerical columns found for violin plot analysis.")

### Categorical and target

In [None]:
# One stacked bar chart per categorical feature: share of each 'target' value
# within every category (rows of the crosstab sum to 1).
if len(cat_cols) == 0:
    print("No categorical columns to analyze with target.")
else:
    for col in cat_cols:
        proportions = pd.crosstab(train_df[col], train_df['target'], normalize='index')
        ax = proportions.plot(kind='bar', stacked=True, figsize=(6, 4))
        ax.set_title(f'Proportion of target by {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Proportion')
        ax.legend(title='target', loc='upper right')
        plt.tight_layout()
        plt.show()

## Pairplot matrix

In [None]:
# Pairwise scatter matrix of the numerical features, coloured by 'target';
# corner=True draws only the lower triangle.
if len(num_cols) == 0:
    print("No numerical columns found for pairplot analysis.")
else:
    pairplot_cols = [*num_cols, 'target']
    pairplot_frame = train_df[pairplot_cols]

    sns.pairplot(pairplot_frame, hue='target', diag_kind='kde', palette="Set1", corner=True)
    plt.suptitle('Pairplot of Numerical Features with Target', y=1.02)
    plt.show()

## Multicollinearity analysis

In [None]:
# Encode a copy of the training frame and correlate every pair of columns.
# NOTE(review): pre.label_encode is a project helper; it is assumed to return
# a frame whose columns are all numeric — confirm in preprocess.py.
train_to_encode = train_df.copy()
train_encoded = pre.label_encode(train_to_encode)
train_corr = train_encoded.corr()

fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    train_corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    square=True,
    cbar_kws={"shrink": .8},
    ax=ax,
)
ax.set_title('Correlation Heatmap of Encoded Features')
plt.show()

In [None]:
def analyze_correlation(corr_matrix, top_n=5):
    """Display the most positively and most negatively correlated pairs.

    Parameters
    ----------
    corr_matrix : pandas.DataFrame
        Square correlation matrix (e.g. the output of ``DataFrame.corr()``).
    top_n : int, default 5
        Number of pairs shown in each of the two tables.

    Notes
    -----
    Each unordered pair is listed once and self-correlations are excluded.
    Pairs whose correlation is NaN are left out. Nothing is returned; the two
    tables are shown with ``display``.
    """
    # Strict upper triangle: one entry per pair, diagonal excluded.
    upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    corr_pairs = (
        corr_matrix.where(upper_mask)
        .stack()
        # Drop the masked-out cells explicitly: whether stack() removes NaN
        # depends on the pandas version, so do not rely on it.
        .dropna()
        .reset_index()
    )
    corr_pairs.columns = ['Feature 1', 'Feature 2', 'Correlation']

    top_pos = corr_pairs.sort_values(by='Correlation', ascending=False).head(top_n)
    top_neg = corr_pairs.sort_values(by='Correlation').head(top_n)

    print("Top positively correlated feature pairs:")
    display(top_pos)
    print("\nTop negatively correlated feature pairs:")
    display(top_neg)

# Strongest pairs from the correlation matrix computed above.
analyze_correlation(corr_matrix=train_corr, top_n=5)

## Skewness and kurtosis analysis

In [None]:
# Skewness and kurtosis of every numerical feature, as a table and bar charts.
if len(num_cols) == 0:
    print("No numerical columns found for skewness and kurtosis analysis.")
else:
    numeric_frame = train_df[num_cols]
    skew_kurt = pd.DataFrame({
        'skew': numeric_frame.skew(),
        'kurtosis': numeric_frame.kurt(),
    })

    display(skew_kurt)

    # One panel per statistic, stacked vertically.
    skew_kurt.plot(
        kind='bar',
        subplots=True,
        layout=(2, 1),
        figsize=(10, 6),
        title=['Skewness', 'Kurtosis'],
    )
    plt.tight_layout()
    plt.show()

## Unique values and category frequency analysis

In [None]:
# Per categorical feature: number of distinct categories and the most
# frequent one with its row count.
if len(cat_cols) == 0:
    print("No categorical columns found in the dataset.")
    print("All features appear to be numerical.")
else:
    for col in cat_cols:
        n_unique = train_df[col].nunique()
        print(f"Feature: {col}")
        print(f"  Unique category count: {n_unique}")
        # Skip columns with no non-null values: there is no mode to report.
        if n_unique > 0:
            counts = train_df[col].value_counts()
            top_cat, top_freq = counts.idxmax(), counts.max()
            print(f"  Most frequent categories: {top_cat} ({top_freq} data)")
        print("-" * 45)

## Constant value analysis

In [None]:
# Columns holding exactly one distinct non-null value carry no information.
constant_cols = [col for col in train_df.columns if train_df[col].nunique() == 1]
if constant_cols:
    print("Columns with constant values (only one unique value):")
    for col in constant_cols:
        # nunique() ignores NaN, so read the value from the non-null entries;
        # unique()[0] alone could be NaN when the column starts with a
        # missing value.
        print(f"- {col}: {train_df[col].dropna().unique()[0]}")
else:
    print("There are no columns with constant values.")