In [2]:
# =============================================================================
# US ACCIDENTS SEVERITY PREDICTION - OPTIMIZED VERSION
# =============================================================================
# Uses optimal sampling: 67K per class (268K total) with SMOTE for minorities
# FIXED: Now properly handles NaN values BEFORE determining sample sizes
# =============================================================================

import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Opening banner for the whole run.
banner_rule = "=" * 80
print(banner_rule)
print("US ACCIDENTS SEVERITY PREDICTION - OPTIMIZED PIPELINE (67K per class)")
print(banner_rule)

# =============================================================================
# PART 1: DATA RETRIEVAL FROM KAGGLE
# =============================================================================
print("\n[PART 1] RETRIEVING DATA FROM KAGGLE")
print("-" * 80)

import kagglehub
from kagglehub import KaggleDatasetAdapter

# Load the accidents CSV through kagglehub's pandas adapter; the result is
# bound to `df`, which every later section of the script works on.
print("Downloading dataset from Kaggle...")
dataset_handle = "sobhanmoosavi/us-accidents"
dataset_file = "US_Accidents_March23.csv"
df = kagglehub.load_dataset(KaggleDatasetAdapter.PANDAS, dataset_handle, dataset_file)

# Report the loaded shape and the deep (object-aware) memory footprint in MB.
row_total, column_total = df.shape
footprint_mb = df.memory_usage(deep=True).sum() / 1024**2
print(f"‚úì Dataset loaded: {row_total:,} rows √ó {column_total} columns")
print(f"‚úì Memory usage: {footprint_mb:.2f} MB")

# =============================================================================
# PART 2: INITIAL DATA EXPLORATION
# =============================================================================
print("\n[PART 2] INITIAL DATA EXPLORATION")
print("-" * 80)

# Basic shape report.
frame_shape = df.shape
print("\nDataset Info:")
print(f"  Shape: {frame_shape}")
print(f"  Columns: {frame_shape[1]}")

# Row count per severity level, ordered by level, with its share of all rows.
print("\nSeverity Distribution (Original):")
severity_dist = df['Severity'].value_counts().sort_index()
total = len(df)
for sev in severity_dist.index:
    count = severity_dist.loc[sev]
    pct = count / total * 100
    print(f"  Severity {sev}: {count:,} ({pct:.2f}%)")

# =============================================================================
# PART 3: FEATURE ENGINEERING
# =============================================================================
print("\n[PART 3] FEATURE ENGINEERING")
print("-" * 80)

print("\nCreating time-based features...")

# Convert datetime columns.
# errors='coerce' turns every value pandas cannot parse into NaT without any
# message. Values that fail the first pass are therefore parsed a second time
# after truncating them to their first 19 characters ('YYYY-MM-DD HH:MM:SS'),
# and the outcome is reported so lost timestamps are visible.
# NOTE(review): this matters when the column mixes timestamp layouts (e.g. with
# and without fractional seconds) -- confirm against the raw CSV.
raw_start_time = df['Start_Time']
parsed_start_time = pd.to_datetime(raw_start_time, errors='coerce')

failed_first_pass = parsed_start_time.isna() & raw_start_time.notna()
if failed_first_pass.any():
    second_pass = pd.to_datetime(
        raw_start_time[failed_first_pass].astype(str).str.slice(0, 19),
        errors='coerce'
    )
    parsed_start_time.loc[failed_first_pass] = second_pass
    recovered = int(second_pass.notna().sum())
    still_unparsed = int(second_pass.isna().sum())
    print(f"  Start_Time: {recovered:,} values recovered on second parse, "
          f"{still_unparsed:,} still unparseable (left as NaT)")

df['Start_Time'] = parsed_start_time

# Extract time features (NaN wherever Start_Time is NaT).
start_parts = df['Start_Time'].dt
df['year'] = start_parts.year
df['month'] = start_parts.month
df['day'] = start_parts.day
df['hour'] = start_parts.hour
df['dow'] = start_parts.dayofweek

# Create time-based flags. Comparisons against NaN evaluate to False, so rows
# without a usable timestamp get 0 for every flag.
df['is_weekend'] = (df['dow'] >= 5).astype(int)
df['is_morning_peak'] = ((df['hour'] >= 6) & (df['hour'] <= 9)).astype(int)
df['is_evening_peak'] = ((df['hour'] >= 16) & (df['hour'] <= 19)).astype(int)

print("‚úì Time features created")

# Create weather condition flags
print("\nCreating weather flags...")

def create_weather_flags(df):
    """Add 0/1 weather indicator columns derived from 'Weather_Condition'.

    The frame is modified in place and also returned. When the
    'Weather_Condition' column is absent the frame is returned untouched.
    Matching is done on the lower-cased text; missing descriptions are
    treated as empty strings and therefore yield 0 for every flag.
    """
    if 'Weather_Condition' in df.columns:
        conditions = df['Weather_Condition'].fillna('').str.lower()

        # Output column -> regex alternation that switches the flag on.
        flag_patterns = {
            'is_rain': 'rain|drizzle|shower',
            'is_snow': 'snow|sleet|ice|wintry',
            'is_fog': 'fog|mist|haze',
            'is_thunder': 'thunder|t-storm|storm',
            'is_wind': 'wind',
        }
        for flag_name, pattern in flag_patterns.items():
            hits = conditions.str.contains(pattern, regex=True)
            df[flag_name] = hits.astype(int)

    return df

df = create_weather_flags(df)

# Night indicator: use the dataset's own 'Sunrise_Sunset' label when the
# column exists, otherwise fall back to the clock (before 06:00 or from 20:00).
if 'Sunrise_Sunset' not in df.columns:
    late_evening = df['hour'] >= 20
    early_morning = df['hour'] < 6
    night_mask = early_morning | late_evening
else:
    night_mask = df['Sunrise_Sunset'].eq('Night')
df['is_night'] = night_mask.astype(int)

print("‚úì Weather flags created")

# =============================================================================
# PART 4: FEATURE SELECTION & DATA CLEANING
# =============================================================================
print("\n[PART 4] FEATURE SELECTION & DATA CLEANING")
print("-" * 80)

# Candidate model inputs, grouped by origin.
numeric_features = [
    'Temperature(F)', 'Humidity(%)', 'Pressure(in)',
    'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)',
    'Distance(mi)',
]

boolean_features = [
    'Amenity', 'Bump', 'Crossing', 'Give_Way', 'Junction',
    'No_Exit', 'Railway', 'Roundabout', 'Station', 'Stop',
    'Traffic_Calming', 'Traffic_Signal',
]

time_features = [
    'hour', 'dow', 'month', 'year', 'day',
    'is_weekend', 'is_morning_peak', 'is_evening_peak',
]

weather_flags = [
    'is_rain', 'is_snow', 'is_fog', 'is_thunder', 'is_wind', 'is_night',
]

# Keep only the candidates that actually exist in the frame (original order
# preserved) and warn about each one that is absent.
candidate_features = numeric_features + boolean_features + time_features + weather_flags
available_columns = df.columns
all_features = [name for name in candidate_features if name in available_columns]
for feature in candidate_features:
    if feature not in available_columns:
        print(f"‚ö†Ô∏è  Warning: Feature '{feature}' not found")

print(f"\nTotal features selected: {len(all_features)}")

# =============================================================================
# PART 4B: HANDLE MISSING VALUES STRATEGICALLY (NEW!)
# =============================================================================
print("\n[PART 4B] HANDLING MISSING VALUES STRATEGICALLY")
print("-" * 80)

# Create ML dataframe: the selected features plus the target column.
ml_df = df[all_features + ['Severity']].copy()

print(f"Initial dataset size: {len(ml_df):,} rows")

# A row without a label cannot be used for supervised training, and a label
# must never be invented by an imputer, so such rows are removed up front.
missing_target = ml_df['Severity'].isnull()
if missing_target.any():
    print(f"Dropping {int(missing_target.sum()):,} rows with a missing Severity label")
    ml_df = ml_df.loc[~missing_target].copy()

# Check missing values per column
print("\nMissing values per feature:")
missing_counts = ml_df.isnull().sum()
for col, count in missing_counts[missing_counts > 0].items():
    pct = count / len(ml_df) * 100
    print(f"  {col}: {count:,} ({pct:.1f}%)")

# STRATEGY: Fill numeric features with median instead of dropping rows
# This preserves many more samples, especially for minority classes
print("\nüìä Strategy: Impute missing values instead of dropping rows")
print("-" * 80)

from sklearn.impute import SimpleImputer

# Numeric *feature* columns only. The target is numeric too, but it is
# deliberately excluded: running it through the imputer would fabricate labels
# for missing values and cast the class column to float.
numeric_cols = [
    col for col in ml_df.select_dtypes(include=[np.number]).columns
    if col != 'Severity'
]

# Impute numeric columns with median.
# NOTE(review): the medians are fitted on the full dataset, i.e. before the
# train/test split performed later in this script.
print("Imputing numeric features with median...")
numeric_imputer = SimpleImputer(strategy='median')
if numeric_cols:
    ml_df[numeric_cols] = numeric_imputer.fit_transform(ml_df[numeric_cols])

# Check if any rows still have NaN (shouldn't be many)
remaining_nan = ml_df.isnull().sum().sum()
if remaining_nan > 0:
    print(f"Dropping {ml_df.isnull().any(axis=1).sum():,} rows with remaining NaN values")
    ml_df = ml_df.dropna()

print(f"\n‚úì After imputation: {len(ml_df):,} rows preserved")

# Show class distribution AFTER cleaning
print("\nClass distribution after data cleaning:")
for sev, count in ml_df['Severity'].value_counts().sort_index().items():
    pct = count / len(ml_df) * 100
    print(f"  Severity {sev}: {count:,} ({pct:.2f}%)")

# =============================================================================
# PART 5: HANDLE CLASS IMBALANCE - OPTIMIZED SAMPLING (67K per class)
# =============================================================================
print("\n[PART 5] HANDLING CLASS IMBALANCE - OPTIMIZED SAMPLING")
print("=" * 80)

# Rows per severity level, ordered by level; drives both the report and the
# sampling decisions below.
class_counts = ml_df['Severity'].value_counts().sort_index()
cleaned_row_total = len(ml_df)

print("\n‚ö†Ô∏è  CURRENT Distribution (IMBALANCED):")
print("-" * 80)
for sev, count in class_counts.items():
    pct = count / cleaned_row_total * 100
    bar_length = min(int(pct / 2), 50)  # one block per 2%, capped at 50 blocks
    bar = '‚ñà' * bar_length
    print(f"  Severity {sev}: {count:>10,} ({pct:>5.2f}%) {bar}")

# Analyze class sizes
min_class = class_counts.min()
min_severity = class_counts.idxmin()
largest_class = class_counts.max()
largest_severity = class_counts.idxmax()

print("\nüìä Class Distribution Analysis:")
print(f"  Smallest class: Severity {min_severity} with {min_class:,} samples")
print(f"  Largest class: Severity {largest_severity} with {largest_class:,} samples")
print(f"  Imbalance ratio: {largest_class / min_class:.0f}:1")

# =============================================================================
# STRATEGY: Target 67K per class using SMOTE for minorities
# =============================================================================
print("\n" + "=" * 80)
print("SAMPLING STRATEGY: Hybrid SMOTE + Undersampling (Target: 67K per class)")
print("=" * 80)

# Every class is brought to this many rows (the summary assumes four classes).
TARGET_PER_CLASS = 67000

print(f"\nüéØ Target: {TARGET_PER_CLASS:,} samples per class")
print(f"   Total after balancing: {TARGET_PER_CLASS * 4:,} samples")

# SMOTE is only needed when at least one class falls short of the target.
USE_SMOTE = bool(min_class < TARGET_PER_CLASS)
if USE_SMOTE:
    print(f"\n‚ö†Ô∏è  Smallest class ({min_class:,}) is smaller than target ({TARGET_PER_CLASS:,})")
    print("   Will use SMOTE to generate synthetic samples for minority classes")

# Feature matrix and label vector handed to the resampler.
y_for_sampling = ml_df['Severity']
X_for_sampling = ml_df.drop(columns='Severity')

if USE_SMOTE:
    print("\nApplying SMOTE + RandomUnderSampler...")
    print("-" * 80)

    # Only the imports are guarded: a missing package switches to the plain
    # resampling fallback, while genuine errors raised during resampling are
    # allowed to surface instead of being mistaken for a missing dependency.
    try:
        from imblearn.over_sampling import SMOTE
        from imblearn.under_sampling import RandomUnderSampler
        from imblearn.pipeline import Pipeline
    except ImportError:
        print("\n‚ö†Ô∏è  imbalanced-learn not installed. Using simple resampling with replacement...")
        print("   Install with: pip install imbalanced-learn")
        USE_SMOTE = False
    else:
        # Calculate sampling strategy
        # For classes smaller than target: oversample to target
        # For classes larger than target: undersample to target
        over_sampling_strategy = {}
        under_sampling_strategy = {}

        for severity in sorted(class_counts.index):
            current_count = class_counts[severity]
            if current_count < TARGET_PER_CLASS:
                # Need to oversample this class
                over_sampling_strategy[severity] = TARGET_PER_CLASS
                print(f"  Severity {severity}: SMOTE {current_count:,} ‚Üí {TARGET_PER_CLASS:,}")
            else:
                # Will undersample later
                under_sampling_strategy[severity] = TARGET_PER_CLASS
                print(f"  Severity {severity}: Undersample {current_count:,} ‚Üí {TARGET_PER_CLASS:,}")

        # Create pipeline: SMOTE first (for minorities), then undersample (for majorities)
        steps = []

        if over_sampling_strategy:
            # SMOTE needs k_neighbors < size of the class being oversampled, so
            # cap the default of 5 by the smallest oversampled class (minimum 1).
            min_samples = min(class_counts[severity] for severity in over_sampling_strategy)
            k_neighbors = int(max(1, min(5, min_samples - 1)))
            # `n_jobs` is intentionally not passed: it is deprecated on SMOTE
            # since imbalanced-learn 0.10, and once removed it raises a
            # TypeError that the ImportError handler above would not catch.
            steps.append(('smote', SMOTE(
                sampling_strategy=over_sampling_strategy,
                random_state=42,
                k_neighbors=k_neighbors
            )))

        if under_sampling_strategy:
            steps.append(('undersample', RandomUnderSampler(
                sampling_strategy=under_sampling_strategy,
                random_state=42
            )))

        resampling_pipeline = Pipeline(steps=steps)

        print("\nExecuting resampling pipeline...")
        X_resampled, y_resampled = resampling_pipeline.fit_resample(X_for_sampling, y_for_sampling)

        # Create balanced dataframe
        ml_df_balanced = pd.DataFrame(X_resampled, columns=X_for_sampling.columns)
        ml_df_balanced['Severity'] = y_resampled

        print("‚úì SMOTE resampling complete!")

if not USE_SMOTE:
    # Fallback path: bring each class to TARGET_PER_CLASS rows with plain
    # random sampling -- without replacement when the class is large enough,
    # with replacement when it is not.
    from sklearn.utils import resample

    print("\nApplying balanced resampling (with replacement for minorities)...")
    print("-" * 80)

    balanced_dfs = []
    severity_levels = sorted(ml_df['Severity'].unique())

    for severity in severity_levels:
        severity_df = ml_df[ml_df['Severity'] == severity]
        current_size = len(severity_df)
        draw_with_replacement = current_size < TARGET_PER_CLASS

        if draw_with_replacement:
            oversample_factor = TARGET_PER_CLASS / current_size
            print(f"Severity {severity}: Oversampling  {current_size:>10,} ‚Üí {TARGET_PER_CLASS:,} ({oversample_factor:.1f}x)")
        else:
            print(f"Severity {severity}: Undersampling {current_size:>10,} ‚Üí {TARGET_PER_CLASS:,}")

        sampled = resample(
            severity_df,
            n_samples=TARGET_PER_CLASS,
            random_state=42,
            replace=draw_with_replacement
        )
        balanced_dfs.append(sampled)

    # Stack the per-class samples into one frame with a fresh 0..n-1 index.
    ml_df_balanced = pd.concat(balanced_dfs, ignore_index=True)

# Put the balanced rows in a reproducible random order and renumber them.
shuffled_rows = ml_df_balanced.sample(frac=1, random_state=42)
ml_df_balanced = shuffled_rows.reset_index(drop=True)

print("\n" + "=" * 80)
print("‚úÖ BALANCED Distribution (AFTER SAMPLING):")
print("=" * 80)
balanced_counts = ml_df_balanced['Severity'].value_counts().sort_index()
balanced_total = len(ml_df_balanced)
for sev, count in balanced_counts.items():
    pct = count / balanced_total * 100
    bar = '‚ñà' * int(pct / 2)  # one block per 2%
    print(f"  Severity {sev}: {count:>10,} ({pct:>5.1f}%) {bar}")

print("\nüìä Summary:")
print(f"  Total samples: {balanced_total:,}")
print(f"  From original: {len(ml_df):,}")
print(f"  Target per class: {TARGET_PER_CLASS:,}")
print("  Perfect balance achieved: All classes at 25.0%")

# =============================================================================
# PART 6: PREPARE DATA FOR MULTI-CLASS CLASSIFICATION
# =============================================================================
print("\n[PART 6] PREPARING MULTI-CLASS CLASSIFICATION")
print("=" * 80)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Split the balanced frame into the feature matrix and the severity labels.
multiclass_y = ml_df_balanced['Severity']
multiclass_X = ml_df_balanced.drop(columns='Severity')

# Map the severity labels onto consecutive integer codes starting at 0.
label_encoder = LabelEncoder()
multiclass_y_encoded = label_encoder.fit_transform(multiclass_y)

print("\nDataset prepared:")
print(f"  Features: {multiclass_X.shape[1]}")
print(f"  Samples: {len(multiclass_y_encoded):,}")
print(f"  Classes: {label_encoder.classes_}")

# Stratified 80/20 hold-out split.
# NOTE(review): ml_df_balanced has already been resampled at this point, so
# whenever a class was oversampled (SMOTE or sampling with replacement) the
# test part can contain synthetic or duplicated rows derived from training
# rows. Consider splitting before resampling.
split_parts = train_test_split(
    multiclass_X,
    multiclass_y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=multiclass_y_encoded
)
multiclass_X_train, multiclass_X_test, multiclass_y_train, multiclass_y_test = split_parts

n_all = len(multiclass_X)
n_train = len(multiclass_X_train)
n_test = len(multiclass_X_test)
print("\nTrain-test split:")
print(f"  Training: {n_train:,} samples ({n_train/n_all*100:.0f}%)")
print(f"  Testing:  {n_test:,} samples ({n_test/n_all*100:.0f}%)")

# Standardise features: statistics are learned on the training part only and
# then applied unchanged to the test part.
multiclass_scaler = StandardScaler()
multiclass_X_train_scaled = multiclass_scaler.fit_transform(multiclass_X_train)
multiclass_X_test_scaled = multiclass_scaler.transform(multiclass_X_test)

# Column names in model-input order, kept for later reference.
multiclass_feature_names = list(multiclass_X.columns)

print("‚úì Data scaled and ready for training")

# =============================================================================
# PART 7: PREPARE DATA FOR BINARY CLASSIFICATION
# =============================================================================
print("\n[PART 7] PREPARING BINARY CLASSIFICATION")
print("=" * 80)

# Collapse the four severity levels into two groups:
# severity 1 and 2 -> LOW (0), severity 3 and 4 -> HIGH (1).
severity_to_binary = {1: 0, 2: 0, 3: 1, 4: 1}
binary_y = ml_df_balanced['Severity'].map(severity_to_binary)

print("\nBinary mapping:")
print("  Severity 1, 2 ‚Üí LOW  (0)")
print("  Severity 3, 4 ‚Üí HIGH (1)")

print("\nBinary distribution:")
binary_counts = binary_y.value_counts().sort_index()
binary_total = len(binary_y)
for cls, count in binary_counts.items():
    if cls == 0:
        label = "LOW"
    else:
        label = "HIGH"
    pct = count / binary_total * 100
    print(f"  {label:4} ({cls}): {count:,} ({pct:.1f}%)")

# Stratified 80/20 split on the same feature matrix used for the multi-class
# task, this time stratified on the binary label.
binary_split = train_test_split(
    multiclass_X,
    binary_y,
    test_size=0.2,
    random_state=42,
    stratify=binary_y
)
binary_X_train, binary_X_test, binary_y_train, binary_y_test = binary_split

print("\nBinary split:")
print(f"  Training: {len(binary_X_train):,} samples")
print(f"  Testing:  {len(binary_X_test):,} samples")

# Standardise with statistics learned on the binary training part only.
binary_scaler = StandardScaler()
binary_X_train_scaled = binary_scaler.fit_transform(binary_X_train)
binary_X_test_scaled = binary_scaler.transform(binary_X_test)

# Column names in model-input order.
binary_feature_names = list(binary_X_train.columns)

print("‚úì Binary data scaled and ready")

# =============================================================================
# PART 8: TRAIN MULTI-CLASS MODELS
# =============================================================================
print("\n[PART 8] TRAINING MULTI-CLASS MODELS")
print("=" * 80)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import time

# Check for optional libraries. The handler is kept broad on purpose (as in the
# original), presumably because a broken native install can fail with errors
# other than ImportError; `Exception` already covers ImportError.
try:
    from xgboost import XGBClassifier
    XGBOOST_AVAILABLE = True
except Exception:
    XGBOOST_AVAILABLE = False
    print("‚ö†Ô∏è  XGBoost not available")

try:
    from lightgbm import LGBMClassifier
    LIGHTGBM_AVAILABLE = True
except Exception:
    LIGHTGBM_AVAILABLE = False
    print("‚ö†Ô∏è  LightGBM not available")

# Define models
multiclass_models = {
    # `multi_class='multinomial'` is not passed: the parameter is deprecated in
    # scikit-learn 1.5, and the default solver already fits a multinomial model
    # when the target has more than two classes.
    'Logistic Regression': LogisticRegression(
        max_iter=1000,
        random_state=42,
        n_jobs=-1
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=200,
        max_depth=20,
        min_samples_split=5,
        random_state=42,
        n_jobs=-1
    ),
}

# Add XGBoost if available
if XGBOOST_AVAILABLE:
    multiclass_models['XGBoost'] = XGBClassifier(
        n_estimators=200,
        max_depth=10,
        learning_rate=0.1,
        random_state=42,
        n_jobs=-1,
        eval_metric='mlogloss'
    )

# Add LightGBM if available
if LIGHTGBM_AVAILABLE:
    multiclass_models['LightGBM'] = LGBMClassifier(
        n_estimators=200,
        max_depth=15,
        learning_rate=0.1,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    )

# Per-model outputs, keyed by model name.
multiclass_results = {}
multiclass_predictions = {}
multiclass_probabilities = {}
multiclass_trained_models = {}

# Models whose name contains one of these tags are trained on the standardised
# matrices; all others use the raw feature frames.
scaled_model_tags = ('Logistic', 'KNN', 'Neural', 'Naive')

print(f"\nTraining {len(multiclass_models)} models on {len(multiclass_X_train):,} samples...")
print("-" * 80)

for name, model in multiclass_models.items():
    print(f"\n{name}")
    print("  " + "-" * 40)

    # Determine if scaling is needed
    needs_scaling = any(tag in name for tag in scaled_model_tags)
    if needs_scaling:
        X_fit, X_eval = multiclass_X_train_scaled, multiclass_X_test_scaled
    else:
        X_fit, X_eval = multiclass_X_train, multiclass_X_test

    # Time the fit only, so that 'Training Time' does not include inference.
    start_time = time.perf_counter()
    model.fit(X_fit, multiclass_y_train)
    train_time = time.perf_counter() - start_time

    y_pred = model.predict(X_eval)
    y_prob = model.predict_proba(X_eval)

    # Store model and predictions
    multiclass_trained_models[name] = model
    multiclass_predictions[name] = y_pred
    multiclass_probabilities[name] = y_prob

    # Calculate metrics (support-weighted averages across the classes).
    multiclass_results[name] = {
        'Accuracy': accuracy_score(multiclass_y_test, y_pred),
        'Precision': precision_score(multiclass_y_test, y_pred, average='weighted', zero_division=0),
        'Recall': recall_score(multiclass_y_test, y_pred, average='weighted', zero_division=0),
        'F1-Score': f1_score(multiclass_y_test, y_pred, average='weighted', zero_division=0),
        'Training Time': train_time
    }

    print(f"  Accuracy:  {multiclass_results[name]['Accuracy']:.4f}")
    print(f"  Precision: {multiclass_results[name]['Precision']:.4f}")
    print(f"  Recall:    {multiclass_results[name]['Recall']:.4f}")
    print(f"  F1-Score:  {multiclass_results[name]['F1-Score']:.4f}")
    print(f"  Time:      {train_time:.2f}s")

print("\n" + "=" * 80)
print("‚úÖ MULTI-CLASS TRAINING COMPLETE!")
print("=" * 80)

# =============================================================================
# PART 9: TRAIN BINARY MODELS
# =============================================================================
print("\n[PART 9] TRAINING BINARY CLASSIFICATION MODELS")
print("=" * 80)

from sklearn.metrics import roc_auc_score

# Define binary models
binary_models = {
    'Logistic Regression': LogisticRegression(
        max_iter=1000,
        random_state=42,
        n_jobs=-1
    ),
    'Random Forest': RandomForestClassifier(
        n_estimators=200,
        max_depth=20,
        min_samples_split=5,
        random_state=42,
        n_jobs=-1
    ),
}

# Add XGBoost if available
if XGBOOST_AVAILABLE:
    binary_models['XGBoost'] = XGBClassifier(
        n_estimators=200,
        max_depth=10,
        learning_rate=0.1,
        random_state=42,
        n_jobs=-1,
        eval_metric='logloss'
    )

# Add LightGBM if available
if LIGHTGBM_AVAILABLE:
    binary_models['LightGBM'] = LGBMClassifier(
        n_estimators=200,
        max_depth=15,
        learning_rate=0.1,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    )

# Per-model outputs, keyed by model name.
binary_results = {}
binary_predictions = {}
binary_probabilities = {}
binary_trained_models = {}

# Models whose name contains one of these tags are trained on the standardised
# matrices; all others use the raw feature frames. The multi-class loop checked
# 'Neural' where this loop checked 'Naive'; both are listed so the rule reads
# the same in both places.
scaled_model_tags = ('Logistic', 'KNN', 'Neural', 'Naive')

print(f"\nTraining {len(binary_models)} models...")
print("-" * 80)

for name, model in binary_models.items():
    print(f"\n{name}")
    print("  " + "-" * 40)

    # Determine if scaling is needed
    needs_scaling = any(tag in name for tag in scaled_model_tags)
    if needs_scaling:
        X_fit, X_eval = binary_X_train_scaled, binary_X_test_scaled
    else:
        X_fit, X_eval = binary_X_train, binary_X_test

    # Time the fit only, so that 'Training Time' does not include inference.
    start_time = time.perf_counter()
    model.fit(X_fit, binary_y_train)
    train_time = time.perf_counter() - start_time

    y_pred = model.predict(X_eval)
    # Second probability column = score for the positive (HIGH) class.
    y_prob = model.predict_proba(X_eval)[:, 1]

    # Store model and predictions
    binary_trained_models[name] = model
    binary_predictions[name] = y_pred
    binary_probabilities[name] = y_prob

    # Calculate metrics
    binary_results[name] = {
        'Accuracy': accuracy_score(binary_y_test, y_pred),
        'Precision': precision_score(binary_y_test, y_pred, zero_division=0),
        'Recall': recall_score(binary_y_test, y_pred, zero_division=0),
        'F1-Score': f1_score(binary_y_test, y_pred, zero_division=0),
        'AUC-ROC': roc_auc_score(binary_y_test, y_prob),
        'Training Time': train_time
    }

    print(f"  Accuracy:  {binary_results[name]['Accuracy']:.4f}")
    print(f"  Precision: {binary_results[name]['Precision']:.4f}")
    print(f"  Recall:    {binary_results[name]['Recall']:.4f}")
    print(f"  F1-Score:  {binary_results[name]['F1-Score']:.4f}")
    print(f"  AUC-ROC:   {binary_results[name]['AUC-ROC']:.4f}")
    print(f"  Time:      {train_time:.2f}s")

print("\n" + "=" * 80)
print("‚úÖ BINARY CLASSIFICATION TRAINING COMPLETE!")
print("=" * 80)

# =============================================================================
# PART 10: SUMMARY & MODEL RANKINGS
# =============================================================================
print("\n[PART 10] FINAL SUMMARY")
print("=" * 80)

# Prefix shown for the first three ranks; lower ranks get two spaces.
rank_prefix = {1: "ü•á", 2: "ü•à", 3: "ü•â"}

print("\nüèÜ MULTI-CLASS MODEL RANKINGS (by Accuracy):")
print("-" * 80)
sorted_multiclass = sorted(multiclass_results.items(), key=lambda item: item[1]['Accuracy'], reverse=True)
for rank, (model, metrics) in enumerate(sorted_multiclass, 1):
    emoji = rank_prefix.get(rank, "  ")
    print(f"{emoji} {rank}. {model:25} ‚Üí Acc: {metrics['Accuracy']:.4f} | F1: {metrics['F1-Score']:.4f} | Time: {metrics['Training Time']:.1f}s")

print("\nüèÜ BINARY MODEL RANKINGS (by AUC-ROC):")
print("-" * 80)
sorted_binary = sorted(binary_results.items(), key=lambda item: item[1]['AUC-ROC'], reverse=True)
for rank, (model, metrics) in enumerate(sorted_binary, 1):
    emoji = rank_prefix.get(rank, "  ")
    print(f"{emoji} {rank}. {model:25} ‚Üí AUC: {metrics['AUC-ROC']:.4f} | F1: {metrics['F1-Score']:.4f} | Time: {metrics['Training Time']:.1f}s")

print("\n" + "=" * 80)
print("üéâ TRAINING COMPLETE - OPTIMIZED WITH 268K SAMPLES (67K per class)!")
print("=" * 80)

print("\nüìà Key Improvements in this version:")
print(f"  ‚úì Target: {TARGET_PER_CLASS:,} samples per class ({TARGET_PER_CLASS * 4:,} total)")
print(f"  ‚úì Missing values imputed (not dropped) to preserve minority class samples")
# Only claim SMOTE when the SMOTE path actually ran. USE_SMOTE is False when
# every class already met the target or when imbalanced-learn was unavailable,
# in which case plain random resampling produced the balanced set.
if USE_SMOTE:
    print(f"  ‚úì SMOTE used for synthetic oversampling of minority classes")
else:
    print("  - SMOTE not applied: classes were balanced by plain random resampling")
print(f"  ‚úì Better representation of rare severity levels (1 and 4)")
print(f"  ‚úì More robust model performance and generalization")

print("\n" + "=" * 80)

US ACCIDENTS SEVERITY PREDICTION - OPTIMIZED PIPELINE (67K per class)

[PART 1] RETRIEVING DATA FROM KAGGLE
--------------------------------------------------------------------------------
Downloading dataset from Kaggle...
✓ Dataset loaded: 7,728,394 rows × 46 columns
✓ Memory usage: 10870.28 MB

[PART 2] INITIAL DATA EXPLORATION
--------------------------------------------------------------------------------

Dataset Info:
  Shape: (7728394, 46)
  Columns: 46

Severity Distribution (Original):
  Severity 1: 67,366 (0.87%)
  Severity 2: 6,156,981 (79.67%)
  Severity 3: 1,299,337 (16.81%)
  Severity 4: 204,710 (2.65%)

[PART 3] FEATURE ENGINEERING
--------------------------------------------------------------------------------

Creating time-based features...
✓ Time features created

Creating weather flags...
✓ Weather flags created

[PART 4] FEATURE SELECTION & DATA CLEANING
--------------------------------------------------------------------------------

Total features selec