## 1. Configuration & Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import gc
from tqdm.auto import tqdm
import xgboost as xgb
tqdm.pandas()

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

print("Libraries imported successfully.")

Libraries imported successfully.


In [2]:
USE_GPU = True  

gpu_config = {
    'xgboost_gpu': USE_GPU
}

In [3]:
class Config:
    DATA_PATH = '/kaggle/input/ride-hailing-trip-classification-dataset/'
    
    N_FOLDS = 5
    RANDOM_STATE = 42
    TARGET_COL = 'Trip_Label'
    ID_COL = 'Trip_ID'
    
    @staticmethod
    def get_catboost_params(use_gpu=False):
        params = {
            'iterations': 1000,
            'learning_rate': 0.05,
            'depth': 6,
            'loss_function': 'MultiClass',
            'eval_metric': 'TotalF1:average=Macro',
            'auto_class_weights': 'Balanced',
            'random_seed': 42,
            'verbose': 100,
            'early_stopping_rounds': 50
        }
        if use_gpu:
            params['task_type'] = 'GPU'
            params['devices'] = '0'
            print("  CatBoost: GPU mode activated")
        else:
            params['task_type'] = 'CPU'
            print("  CatBoost: CPU mode")
        return params
    
    @staticmethod
    def get_lightgbm_params(use_gpu=False):
        params = {
            'objective': 'multiclass',
            'metric': 'multi_logloss',
            'boosting_type': 'gbdt',
            'num_leaves': 31,
            'learning_rate': 0.05,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'random_state': 42,
            'verbose': -1,
            'min_data_in_leaf': 100,
            'min_sum_hessian_in_leaf': 1e-2
        }
        if use_gpu:
            params['device'] = 'gpu'
            print("  LightGBM: GPU mode activated")
        else:
            params['device'] = 'cpu'
            print("  LightGBM: CPU mode")
        return params
    
    @staticmethod
    def get_xgboost_params(use_gpu=False):
        params = {
            'objective': 'multi:softprob',
            'eval_metric': 'mlogloss',
            'max_depth': 6,
            'learning_rate': 0.05,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': 42,
            'verbosity': 1
        }
        if use_gpu:
            params['device'] = 'cuda'
            print("  XGBoost: GPU mode activated")
        else:
            params['device'] = 'cpu'
            print("  XGBoost: CPU mode")
        return params

config = Config()
print("\nConfiguration loaded successfully.")


Configuration loaded successfully.


## 2. Data Loading & Validation

In [4]:
def load_data():
    print("Loading data...")
    files = ['train.csv', 'test.csv', 'sample_submission.csv']
    data = {}
    
    for file in tqdm(files, desc="Loading files"):
        data[file.replace('.csv', '')] = pd.read_csv(config.DATA_PATH + file)
    
    train = data['train']
    test = data['test']
    sample_submission = data['sample_submission']
    
    print(f"Train shape: {train.shape}")
    print(f"Test shape: {test.shape}")
    print(f"Sample submission shape: {sample_submission.shape}")
    
    if config.TARGET_COL in train.columns:
        print(f"\nTarget distribution:")
        print(train[config.TARGET_COL].value_counts())
    
    return train, test, sample_submission

train, test, sample_submission = load_data()

Loading data...


Loading files:   0%|          | 0/3 [00:00<?, ?it/s]

Train shape: (8000000, 25)
Test shape: (4000000, 24)
Sample submission shape: (4000000, 2)

Target distribution:
Trip_Label
Perfect_Trip         4397607
Safety_Violation     1601595
Navigation_Issue      801790
Service_Complaint     798695
Fraud_Indication      400313
Name: count, dtype: int64


In [5]:
train

Unnamed: 0,Trip_ID,Timestamp,Pickup_Lat,Pickup_Long,Dropoff_Lat,Dropoff_Long,GPS_Accuracy_M,Distance_KM,Est_Price_IDR,Surge_Multiplier,Accel_X,Accel_Y,Accel_Z,Gyro_Z,Pickup_Zone,Dropoff_Zone,Device_FP,Promo_Code,Car_Model,Payment_Method,Weather,Traffic,Battery_Level,Signal_Strength,Trip_Label
0,TRIP-00383699,,-6.171481,106.890715,,106.961226,7.602804,21.586724,328518.276846,3.093531,-3.637907,0.304190,,0.235675,JKT-ZONE-06399,JKT-ZONE-03083,Apple-iPhone-7-v3.0,PCP6AQAY,Honda Brio,Cash,Rain,Lancar,14%,3G,Safety_Violation
1,TRIP-11839677,2024-05-17 00:47:56,,106.962939,-6.317811,106.936982,3.567463,2.900550,57636.166476,,0.398592,0.595000,9.918528,0.105604,JKT-ZONE-18157,JKT-ZONE-00734,Oppo-Reno-6-v4.0,NO_VOUCHER,,Credit_Card,Cloudy,Lancar,25%,3G,Perfect_Trip
2,TRIP-00401267,2024-01-05 15:27:46,-6.103016,106.716991,-6.002158,106.614401,12.665372,16.015038,141390.172639,1.726858,-0.125994,0.424621,9.867769,-0.034304,JKT-ZONE-08919,JKT-ZONE-17755,Samsung-iPhone-5-v7.0,NO_VOUCHER,Toyota Avanza,,Clear,,46%,3G,Perfect_Trip
3,TRIP-07296718,2024-03-25 10:51:57,-6.128631,,-6.025428,106.972658,24.973850,11.497623,169474.278890,3.031589,0.656659,-0.211875,9.891046,-0.076039,JKT-ZONE-02891,JKT-ZONE-16716,Vivo-Hot-10-v7.0,,Honda Brio,OVO,Clear,Padat,,Edge,Perfect_Trip
4,TRIP-10098925,2024-04-26 21:15:24,,106.930359,-5.958007,,13.835665,19.666645,102339.029906,1.055553,1.105691,0.245058,9.658218,-0.120996,JKT-ZONE-04617,JKT-ZONE-11542,Apple-Hot-8-v4.0,NO_VOUCHER,Honda Brio,Credit_Card,Storm,Lancar,5%,3G,Perfect_Trip
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7999995,TRIP-01353088,,-6.112390,107.044023,-6.226453,106.919490,21.854880,18.799213,109668.178265,1.143535,-2.939686,-0.132234,10.060896,0.676360,JKT-ZONE-03505,JKT-ZONE-02747,Oppo-iPhone-6-v6.0,V8MIQRKA,Honda Brio,Credit_Card,Cloudy,Lancar,22%,Edge,Safety_Violation
7999996,TRIP-02587105,,,106.834473,-6.170596,106.831777,3.460803,2.883191,75433.883346,3.125755,,1.179822,9.997362,-0.122138,,JKT-ZONE-07653,Apple-Reno-11-v5.0,NO_VOUCHER,Daihatsu Sigra,Gopay,Storm,Padat,,Edge,Perfect_Trip
7999997,TRIP-03695100,2024-02-12 18:24:59,-6.312129,106.750740,,,11.436401,16.256813,88926.496956,1.245230,,,,0.126295,JKT-ZONE-05303,JKT-ZONE-10856,Apple-Reno-11-v6.0,NO_VOUCHER,,Credit_Card,Cloudy,,,Edge,Service_Complaint
7999998,TRIP-05040403,2024-02-28 08:06:42,-6.239998,106.763947,-6.318290,106.835430,18.077910,0.001180,20404.680619,1.699778,-1.353521,0.398377,9.948944,0.241210,,JKT-ZONE-12772,Oppo-iPhone-6-v3.0,NO_VOUCHER,Honda Brio,,Storm,Padat,6%,3G,Fraud_Indication


In [6]:
def optimize_memory(df):
    print(f"Optimizing memory for dataframe with {len(df.columns)} columns...")
    for col in tqdm(df.columns, desc="Optimizing columns"):
        col_type = df[col].dtype
        if col_type != 'object':
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
    return df

print("Optimizing memory...")
train = optimize_memory(train)
test = optimize_memory(test)
print("Memory optimization completed.")

Optimizing memory...
Optimizing memory for dataframe with 25 columns...


Optimizing columns:   0%|          | 0/25 [00:00<?, ?it/s]

Optimizing memory for dataframe with 24 columns...


Optimizing columns:   0%|          | 0/24 [00:00<?, ?it/s]

Memory optimization completed.


## 3. Feature Engineering Experiments

In [7]:
def add_temporal_features(df):
    """Temporal features from Timestamp"""
    df = df.copy()
    if 'Timestamp' in df.columns:
        df['Timestamp_parsed'] = pd.to_datetime(df['Timestamp'], errors='coerce')
        df['Hour'] = df['Timestamp_parsed'].dt.hour
        df['DayOfWeek'] = df['Timestamp_parsed'].dt.dayofweek
        df['Month'] = df['Timestamp_parsed'].dt.month
        df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype(np.int8)
        df['IsRushHour'] = ((df['Hour'] >= 7) & (df['Hour'] <= 9) | 
                            (df['Hour'] >= 17) & (df['Hour'] <= 19)).astype(np.int8)
        df['IsLateNight'] = ((df['Hour'] >= 22) | (df['Hour'] <= 5)).astype(np.int8)
        df.drop('Timestamp_parsed', axis=1, inplace=True)
    return df

def add_distance_features(df):
    """Distance-based features"""
    df = df.copy()
    
    # Haversine distance
    if all(col in df.columns for col in ['Pickup_Lat', 'Pickup_Long', 'Dropoff_Lat', 'Dropoff_Long']):
        def haversine(lat1, lon1, lat2, lon2):
            R = 6371
            lat1_rad = np.radians(lat1)
            lat2_rad = np.radians(lat2)
            delta_lat = np.radians(lat2 - lat1)
            delta_lon = np.radians(lon2 - lon1)
            a = np.sin(delta_lat/2)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(delta_lon/2)**2
            c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
            return R * c
        
        df['Haversine_Distance'] = haversine(
            df['Pickup_Lat'], df['Pickup_Long'],
            df['Dropoff_Lat'], df['Dropoff_Long']
        )
        
        if 'Distance_KM' in df.columns:
            df['Distance_Ratio'] = df['Distance_KM'] / (df['Haversine_Distance'] + 1e-6)
            df['Distance_Difference'] = np.abs(df['Distance_KM'] - df['Haversine_Distance'])
    
    # Same zone indicator
    if 'Pickup_Zone' in df.columns and 'Dropoff_Zone' in df.columns:
        df['Is_Same_Zone'] = (df['Pickup_Zone'] == df['Dropoff_Zone']).astype(np.int8)
    
    return df

def add_sensor_features(df):
    """Sensor aggregation features"""
    df = df.copy()
    
    if all(col in df.columns for col in ['Accel_X', 'Accel_Y', 'Accel_Z']):
        df['Accel_Magnitude'] = np.sqrt(df['Accel_X']**2 + df['Accel_Y']**2 + df['Accel_Z']**2)
        df['Accel_Max'] = df[['Accel_X', 'Accel_Y', 'Accel_Z']].max(axis=1)
        df['Accel_Min'] = df[['Accel_X', 'Accel_Y', 'Accel_Z']].min(axis=1)
        df['Accel_Std'] = df[['Accel_X', 'Accel_Y', 'Accel_Z']].std(axis=1)
        df['Accel_Range'] = df['Accel_Max'] - df['Accel_Min']
    
    if 'Gyro_Z' in df.columns:
        df['Gyro_Abs'] = np.abs(df['Gyro_Z'])
    
    return df

def add_economic_features(df):
    """Economic/pricing features"""
    df = df.copy()
    
    if 'Est_Price_IDR' in df.columns and 'Distance_KM' in df.columns:
        df['Price_per_KM'] = df['Est_Price_IDR'] / (df['Distance_KM'] + 1e-6)
    
    if 'Promo_Code' in df.columns:
        df['Has_Promo'] = (df['Promo_Code'].notna() & (df['Promo_Code'] != 'NO_VOUCHER')).astype(np.int8)
    
    if 'Surge_Multiplier' in df.columns:
        surge_filled = df['Surge_Multiplier'].fillna(1.0)
        df['Surge_Category'] = pd.cut(surge_filled, 
                                      bins=[0, 1, 1.5, 2, 10], 
                                      labels=[0, 1, 2, 3]).astype(np.int8)
    
    return df

def add_interaction_features(df):
    """Interaction features between different domains"""
    df = df.copy()
    
    # Surge × Hour
    if 'Surge_Multiplier' in df.columns and 'Hour' in df.columns:
        df['Surge_Hour_Interaction'] = df['Surge_Multiplier'] * df['Hour']
    
    # Distance × Traffic
    if 'Distance_KM' in df.columns and 'Traffic' in df.columns:
        traffic_map = {'Lancar': 1, 'Moderate': 2, 'Padat': 3}
        df['Traffic_Numeric'] = df['Traffic'].map(traffic_map).fillna(0).astype(np.int8)
        df['Distance_Traffic'] = df['Distance_KM'] * df['Traffic_Numeric']
    
    return df

def add_advanced_temporal_features(df):
    """Advanced temporal features with cyclical encoding"""
    df = df.copy()
    
    if 'Timestamp' in df.columns:
        df['Timestamp_parsed'] = pd.to_datetime(df['Timestamp'], errors='coerce')
        df['Hour'] = df['Timestamp_parsed'].dt.hour
        
        # Cyclical encoding for hour
        df['Hour_Sin'] = np.sin(2 * np.pi * df['Hour'] / 24)
        df['Hour_Cos'] = np.cos(2 * np.pi * df['Hour'] / 24)
        
        # Day of week cyclical
        df['DayOfWeek'] = df['Timestamp_parsed'].dt.dayofweek
        df['DayOfWeek_Sin'] = np.sin(2 * np.pi * df['DayOfWeek'] / 7)
        df['DayOfWeek_Cos'] = np.cos(2 * np.pi * df['DayOfWeek'] / 7)
        
        # Time of day categories
        df['TimeOfDay'] = pd.cut(df['Hour'], bins=[0, 6, 12, 18, 24], 
                                  labels=['Night', 'Morning', 'Afternoon', 'Evening'],
                                  include_lowest=True)
        
        df.drop('Timestamp_parsed', axis=1, inplace=True)
    
    return df

def add_zone_frequency_features(df, reference_df=None):
    """Zone popularity and frequency features"""
    df = df.copy()
    
    # Use reference_df (train) to calculate frequencies if provided
    ref = reference_df if reference_df is not None else df
    
    if 'Pickup_Zone' in df.columns:
        pickup_freq = ref['Pickup_Zone'].value_counts().to_dict()
        df['Pickup_Zone_Frequency'] = df['Pickup_Zone'].map(pickup_freq).fillna(0).astype(np.int32)
        df['Pickup_Zone_Popularity'] = pd.cut(df['Pickup_Zone_Frequency'], 
                                              bins=[0, 100, 500, 1000, float('inf')],
                                              labels=[0, 1, 2, 3]).astype(np.int8)
    
    if 'Dropoff_Zone' in df.columns:
        dropoff_freq = ref['Dropoff_Zone'].value_counts().to_dict()
        df['Dropoff_Zone_Frequency'] = df['Dropoff_Zone'].map(dropoff_freq).fillna(0).astype(np.int32)
        df['Dropoff_Zone_Popularity'] = pd.cut(df['Dropoff_Zone_Frequency'],
                                               bins=[0, 100, 500, 1000, float('inf')],
                                               labels=[0, 1, 2, 3]).astype(np.int8)
    
    return df

def add_device_environment_features(df):
    """Device and environment binning features"""
    df = df.copy()
    
    # Battery level bins
    if 'Battery_Level' in df.columns:
        # Remove % sign if exists and convert to numeric
        if df['Battery_Level'].dtype == 'object':
            df['Battery_Level_Numeric'] = df['Battery_Level'].str.replace('%', '').astype(float)
        else:
            df['Battery_Level_Numeric'] = df['Battery_Level']
        
        df['Battery_Category'] = pd.cut(df['Battery_Level_Numeric'],
                                        bins=[0, 20, 50, 80, 100],
                                        labels=[0, 1, 2, 3]).astype(np.int8)
        df['Is_Low_Battery'] = (df['Battery_Level_Numeric'] < 20).astype(np.int8)
    
    # Signal strength encoding
    if 'Signal_Strength' in df.columns:
        signal_map = {'Edge': 1, '3G': 2, '4G': 3, '5G': 4}
        df['Signal_Strength_Numeric'] = df['Signal_Strength'].map(signal_map).fillna(0).astype(np.int8)
    
    # GPS accuracy bins
    if 'GPS_Accuracy_M' in df.columns:
        df['GPS_Accuracy_Category'] = pd.cut(df['GPS_Accuracy_M'],
                                             bins=[0, 5, 10, 20, float('inf')],
                                             labels=[0, 1, 2, 3]).astype(np.int8)
    
    # Weather severity
    if 'Weather' in df.columns:
        weather_severity = {'Clear': 0, 'Cloudy': 1, 'Rain': 2, 'Storm': 3}
        df['Weather_Severity'] = df['Weather'].map(weather_severity).fillna(0).astype(np.int8)
    
    return df

def add_price_binning_features(df):
    """Price and economic binning features"""
    df = df.copy()
    
    # Price bins
    if 'Est_Price_IDR' in df.columns:
        df['Price_Category'] = pd.qcut(df['Est_Price_IDR'].fillna(df['Est_Price_IDR'].median()), 
                                       q=5, labels=[0, 1, 2, 3, 4], duplicates='drop').astype(np.int8)
    
    # Distance bins
    if 'Distance_KM' in df.columns:
        df['Distance_Category'] = pd.cut(df['Distance_KM'],
                                         bins=[0, 2, 5, 10, 20, float('inf')],
                                         labels=[0, 1, 2, 3, 4]).astype(np.int8)
    
    # Surge bins (more granular)
    if 'Surge_Multiplier' in df.columns:
        surge_filled = df['Surge_Multiplier'].fillna(1.0)
        df['Surge_Level'] = pd.cut(surge_filled,
                                   bins=[0, 1.0, 1.2, 1.5, 2.0, float('inf')],
                                   labels=[0, 1, 2, 3, 4]).astype(np.int8)
    
    return df

def add_ratio_features(df):
    """Various ratio and proportion features"""
    df = df.copy()
    
    # Speed (distance/time estimate based on price)
    if 'Distance_KM' in df.columns and 'Est_Price_IDR' in df.columns:
        df['Distance_Price_Ratio'] = df['Distance_KM'] / (df['Est_Price_IDR'] + 1)
    
    # Sensor ratios
    if all(col in df.columns for col in ['Accel_X', 'Accel_Y', 'Accel_Z']):
        accel_sum = df['Accel_X'].abs() + df['Accel_Y'].abs() + df['Accel_Z'].abs()
        df['Accel_X_Ratio'] = df['Accel_X'].abs() / (accel_sum + 1e-6)
        df['Accel_Y_Ratio'] = df['Accel_Y'].abs() / (accel_sum + 1e-6)
        df['Accel_Z_Ratio'] = df['Accel_Z'].abs() / (accel_sum + 1e-6)
    
    # GPS accuracy to distance ratio
    if 'GPS_Accuracy_M' in df.columns and 'Distance_KM' in df.columns:
        df['GPS_Distance_Ratio'] = df['GPS_Accuracy_M'] / ((df['Distance_KM'] * 1000) + 1)
    
    return df

def add_advanced_interaction_features(df):
    """More complex interaction features"""
    df = df.copy()
    
    # Price × Surge × Distance
    if all(col in df.columns for col in ['Est_Price_IDR', 'Surge_Multiplier', 'Distance_KM']):
        df['Price_Surge_Distance'] = (df['Est_Price_IDR'] * 
                                      df['Surge_Multiplier'].fillna(1.0) / 
                                      (df['Distance_KM'] + 1))
    
    # Weather × Traffic
    if 'Weather' in df.columns and 'Traffic' in df.columns:
        df['Weather_Traffic'] = df['Weather'].astype(str) + '_' + df['Traffic'].astype(str)
    
    # Hour × Weekend
    if 'Hour' in df.columns and 'DayOfWeek' in df.columns:
        df['Hour_Weekend'] = df['Hour'] * (df['DayOfWeek'] >= 5).astype(int)
    
    # Battery × Signal (connectivity score)
    if 'Battery_Level' in df.columns and 'Signal_Strength' in df.columns:
        battery_num = df['Battery_Level']
        if battery_num.dtype == 'object':
            battery_num = df['Battery_Level'].str.replace('%', '').astype(float)
        
        signal_map = {'Edge': 1, '3G': 2, '4G': 3, '5G': 4}
        signal_num = df['Signal_Strength'].map(signal_map).fillna(2)
        df['Connectivity_Score'] = battery_num * signal_num
    
    return df

def add_statistical_features(df):
    """Statistical aggregation features"""
    df = df.copy()
    
    # Coordinate statistics
    if all(col in df.columns for col in ['Pickup_Lat', 'Pickup_Long', 'Dropoff_Lat', 'Dropoff_Long']):
        df['Lat_Range'] = np.abs(df['Dropoff_Lat'] - df['Pickup_Lat'])
        df['Long_Range'] = np.abs(df['Dropoff_Long'] - df['Pickup_Long'])
        df['Coord_Range_Sum'] = df['Lat_Range'] + df['Long_Range']
    
    # Sensor variance indicators
    if all(col in df.columns for col in ['Accel_X', 'Accel_Y', 'Accel_Z']):
        df['Accel_CV'] = df[['Accel_X', 'Accel_Y', 'Accel_Z']].std(axis=1) / (df[['Accel_X', 'Accel_Y', 'Accel_Z']].mean(axis=1).abs() + 1e-6)
    
    return df

In [8]:
# def test_feature_engineering_impact(train_data, test_data, sample_frac=0.3, exp_subset=None):
#     """
#     Test impact of each FE group individually against baseline
    
#     Args:
#         train_data: Training dataset
#         test_data: Test dataset
#         sample_frac: Fraction of data to use (default 0.3 = 30% for memory efficiency)
#         exp_subset: List of experiment names to run (None = run all)
#     """
#     print("\n" + "="*80)
#     print("FEATURE ENGINEERING IMPACT EXPERIMENT")
#     print(f"Sample Size: {sample_frac*100:.0f}% of data (memory optimization)")
#     print("="*80)
    
#     # Sample data to reduce memory usage
#     if sample_frac < 1.0:
#         print(f"\nSampling {sample_frac*100:.0f}% of data for faster testing...")
#         train_sample = train_data.sample(frac=sample_frac, random_state=42).reset_index(drop=True)
#         test_sample = test_data.sample(frac=sample_frac, random_state=42).reset_index(drop=True)
#         print(f"Train sample: {len(train_sample):,} rows, Test sample: {len(test_sample):,} rows")
#     else:
#         train_sample = train_data.copy()
#         test_sample = test_data.copy()
    
#     fe_experiments = {
#         'baseline': ('Baseline (No FE)', None),
#         'temporal': ('Temporal Features', add_temporal_features),
#         'distance': ('Distance Features', add_distance_features),
#         'sensor': ('Sensor Aggregation', add_sensor_features),
#         'economic': ('Economic Features', add_economic_features),
#         'interaction': ('Interaction Features', add_interaction_features),
#         'advanced_temporal': ('Advanced Temporal + Cyclical', add_advanced_temporal_features),
#         'zone_frequency': ('Zone Popularity Features', add_zone_frequency_features),
#         'device_environment': ('Device & Environment Bins', add_device_environment_features),
#         'price_binning': ('Price & Distance Bins', add_price_binning_features),
#         'ratio_features': ('Various Ratio Features', add_ratio_features),
#         'advanced_interaction': ('Complex Interactions', add_advanced_interaction_features),
#         'statistical': ('Statistical Aggregations', add_statistical_features)
#     }
    
#     # Filter experiments if subset specified
#     if exp_subset:
#         fe_experiments = {k: v for k, v in fe_experiments.items() if k in exp_subset or k == 'baseline'}
#         print(f"\nRunning {len(fe_experiments)} experiments: {list(fe_experiments.keys())}")
    
#     results = []
    
#     for exp_name, (exp_desc, fe_function) in fe_experiments.items():
#         print(f"\n{'='*60}")
#         print(f"Testing: {exp_name.upper()} - {exp_desc}")
#         print(f"{'='*60}")
        
#         try:
#             # Apply FE if not baseline
#             if fe_function is not None:
#                 train_fe = fe_function(train_sample.copy())
#                 test_fe = fe_function(test_sample.copy())
#             else:
#                 train_fe = train_sample.copy()
#                 test_fe = test_sample.copy()
            
#             # Preprocessing
#             cols_to_drop = [config.ID_COL, 'Timestamp', config.TARGET_COL]
#             cols_to_drop = [col for col in cols_to_drop if col in train_fe.columns]
#             y = train_fe[config.TARGET_COL].copy()
            
#             X_tr = train_fe.drop(cols_to_drop, axis=1).copy()
#             X_te = test_fe.drop([col for col in cols_to_drop if col in test_fe.columns], axis=1).copy()
            
#             numeric_cols = X_tr.select_dtypes(include=[np.number]).columns.tolist()
#             categorical_cols = X_tr.select_dtypes(include=['object']).columns.tolist()
            
#             # Imputation (Median + Mode)
#             for col in numeric_cols:
#                 if X_tr[col].isnull().sum() > 0:
#                     val = X_tr[col].median()
#                     X_tr[col].fillna(val, inplace=True)
#                     X_te[col].fillna(val, inplace=True)
            
#             for col in categorical_cols:
#                 if X_tr[col].isnull().sum() > 0:
#                     val = X_tr[col].mode()[0] if len(X_tr[col].mode()) > 0 else 'Unknown'
#                     X_tr[col].fillna(val, inplace=True)
#                     X_te[col].fillna(val, inplace=True)
            
#             # Outlier clipping
#             for col in numeric_cols:
#                 q99 = X_tr[col].quantile(0.99)
#                 q01 = X_tr[col].quantile(0.01)
#                 X_tr[col] = X_tr[col].clip(q01, q99)
#                 X_te[col] = X_te[col].clip(q01, q99)
            
#             # Target encoding
#             le_temp = LabelEncoder()
#             y_temp = le_temp.fit_transform(y)
            
#             if len(categorical_cols) > 0:
#                 X_tr, X_te, _ = encode_categorical_target(X_tr, X_te, categorical_cols, y_temp, smoothing=10)
            
#             # Clear FE data immediately
#             del train_fe, test_fe
#             gc.collect()
            
#             # Encode target
#             le_target = LabelEncoder()
#             y_encoded = le_target.fit_transform(y)
            
#             # Train with 3-fold CV (faster)
#             params = config.get_xgboost_params(use_gpu=gpu_config['xgboost_gpu'])
#             params['num_class'] = len(np.unique(y_encoded))
            
#             skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=config.RANDOM_STATE)
#             oof_predictions = np.zeros((len(X_tr), len(np.unique(y_encoded))))
#             fold_scores = []
            
#             for fold, (train_idx, val_idx) in enumerate(skf.split(X_tr, y_encoded), 1):
#                 X_train_fold = X_tr.iloc[train_idx]
#                 X_val_fold = X_tr.iloc[val_idx]
#                 y_train_fold = y_encoded[train_idx]
#                 y_val_fold = y_encoded[val_idx]
                
#                 dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold)
#                 dval = xgb.DMatrix(X_val_fold, label=y_val_fold)
                
#                 model = xgb.train(
#                     params,
#                     dtrain,
#                     num_boost_round=300,
#                     evals=[(dval, 'valid')],
#                     early_stopping_rounds=30,
#                     verbose_eval=False
#                 )
                
#                 oof_predictions[val_idx] = model.predict(dval)
#                 oof_pred_labels = np.argmax(oof_predictions[val_idx], axis=1)
#                 fold_score = f1_score(y_val_fold, oof_pred_labels, average='macro')
#                 fold_scores.append(fold_score)
                
#                 # Aggressive cleanup per fold
#                 del dtrain, dval, model, X_train_fold, X_val_fold
#                 gc.collect()
            
#             oof_pred_labels = np.argmax(oof_predictions, axis=1)
#             cv_score = f1_score(y_encoded, oof_pred_labels, average='macro')
            
#             # Calculate features added
#             original_features = train_sample.shape[1]
#             if fe_function is not None:
#                 temp_fe = fe_function(train_sample.head(100))
#                 new_features = temp_fe.shape[1]
#                 del temp_fe
#             else:
#                 new_features = original_features
#             features_added = new_features - original_features
            
#             results.append({
#                 'experiment': exp_name,
#                 'description': exp_desc,
#                 'cv_score': cv_score,
#                 'std': np.std(fold_scores),
#                 'features_added': features_added
#             })
            
#             print(f"  CV Score (Macro F1): {cv_score:.6f} (+/- {np.std(fold_scores):.6f})")
#             print(f"  Features Added: {features_added}")
            
        
#             del X_tr, X_te, y, y_temp, y_encoded, oof_predictions, le_temp, le_target
#             gc.collect()
            
#         except Exception as e:
#             print(f"  Error: {str(e)}")
#             results.append({
#                 'experiment': exp_name,
#                 'description': exp_desc,
#                 'cv_score': 0.0,
#                 'std': 0.0,
#                 'features_added': 0
#             })
#             gc.collect()
    
  
#     del train_sample, test_sample
#     gc.collect()
    
#     # Summary
#     print(f"\n{'='*80}")
#     print("EXPERIMENT RESULTS SUMMARY")
#     print(f"{'='*80}\n")
    
#     results_df = pd.DataFrame(results)
    
#     # Calculate improvement vs baseline
#     baseline_score = results_df[results_df['experiment'] == 'baseline']['cv_score'].values[0]
#     results_df['improvement'] = ((results_df['cv_score'] - baseline_score) / baseline_score * 100).round(2)
#     results_df['abs_improvement'] = (results_df['cv_score'] - baseline_score).round(6)
    
#     # Sort by score
#     results_df = results_df.sort_values('cv_score', ascending=False).reset_index(drop=True)
    
#     print(results_df[['experiment', 'description', 'cv_score', 'std', 'features_added', 
#                       'abs_improvement', 'improvement']].to_string(index=False))
    
#     # Identify improvements
#     improvements = results_df[results_df['improvement'] > 0]
#     if len(improvements) > 1: 
#         print(f"\n{'='*80}")
#         print(f"POSITIVE IMPACT FEATURES ({len(improvements)-1} groups):")
#         print(f"{'='*80}")
#         for _, row in improvements[improvements['experiment'] != 'baseline'].iterrows():
#             print(f"  ✓ {row['description']}: +{row['abs_improvement']:.6f} ({row['improvement']:+.2f}%)")
#     else:
#         print(f"\n{'='*80}")
#         print("⚠ NO FEATURE ENGINEERING GROUPS IMPROVED PERFORMANCE")
#         print(f"{'='*80}")
    
#     print(f"\n{'='*80}")
#     print(f"BASELINE SCORE: {baseline_score:.6f}")
#     print(f"BEST SCORE: {results_df.iloc[0]['cv_score']:.6f} ({results_df.iloc[0]['description']})")
#     print(f"{'='*80}")
    
#     return results_df

# batch1 = ['baseline', 'temporal', 'distance', 'sensor', 'economic', 'interaction']
# fe_results_1 = test_feature_engineering_impact(train, test, sample_frac=0.3, exp_subset=batch1)

In [10]:
# def test_fe_combinations(train_data, test_data, sample_frac=0.3):
#     """
#     Test various combinations of validated FE functions to find optimal setup
#     Args:
#         train_data: Training dataset
#         test_data: Test dataset
#         sample_frac: Fraction of data to use (default 0.3 for memory efficiency)
#     """
#     print("\n" + "="*80)
#     print("FE COMBINATION OPTIMIZATION EXPERIMENT")
#     print(f"Sample Size: {sample_frac*100:.0f}% of data")
#     print("="*80)
    
#     # Sample data
#     if sample_frac < 1.0:
#         print(f"\nSampling {sample_frac*100:.0f}% of data...")
#         train_sample = train_data.sample(frac=sample_frac, random_state=42).reset_index(drop=True)
#         test_sample = test_data.sample(frac=sample_frac, random_state=42).reset_index(drop=True)
#         print(f"Train: {len(train_sample):,} rows, Test: {len(test_sample):,} rows")
#     else:
#         train_sample = train_data.copy()
#         test_sample = test_data.copy()
    
#     # Define FE combinations to test
#     combinations = {
#         'baseline': {
#             'name': 'Baseline (No FE)',
#             'functions': []
#         },
#         'all_validated': {
#             'name': 'All 8 Validated FE',
#             'functions': [
#                 add_temporal_features,
#                 add_distance_features,
#                 add_sensor_features,
#                 add_economic_features,
#                 add_advanced_temporal_features,
#                 add_zone_frequency_features,
#                 add_ratio_features,
#                 add_statistical_features
#             ]
#         },
#         'top_2': {
#             'name': 'Top 2 Best (Statistical + Ratio)',
#             'functions': [
#                 add_statistical_features,
#                 add_ratio_features
#             ]
#         },
#         'top_3': {
#             'name': 'Top 3 (Statistical + Ratio + Advanced Temporal)',
#             'functions': [
#                 add_statistical_features,
#                 add_ratio_features,
#                 add_advanced_temporal_features
#             ]
#         },
#         'top_5': {
#             'name': 'Top 5 (+ Zone + Economic)',
#             'functions': [
#                 add_statistical_features,
#                 add_ratio_features,
#                 add_advanced_temporal_features,
#                 add_zone_frequency_features,
#                 add_economic_features
#             ]
#         },
#         'domain_mix': {
#             'name': 'Mixed Domains (Temporal + Distance + Economic + Statistical)',
#             'functions': [
#                 add_temporal_features,
#                 add_distance_features,
#                 add_economic_features,
#                 add_statistical_features
#             ]
#         },
#         'advanced_only': {
#             'name': 'Advanced Features Only (Cyclical + Zone + Ratio)',
#             'functions': [
#                 add_advanced_temporal_features,
#                 add_zone_frequency_features,
#                 add_ratio_features
#             ]
#         },
#         'basic_features': {
#             'name': 'Basic Features (Temporal + Distance + Sensor)',
#             'functions': [
#                 add_temporal_features,
#                 add_distance_features,
#                 add_sensor_features
#             ]
#         }
#     }
    
#     results = []
    
#     for combo_id, combo_info in combinations.items():
#         combo_name = combo_info['name']
#         fe_functions = combo_info['functions']
        
#         print(f"\n{'='*70}")
#         print(f"Testing: {combo_name}")
#         print(f"FE Functions: {len(fe_functions)}")
#         print(f"{'='*70}")
        
#         try:
#             # Apply FE combination
#             train_fe = train_sample.copy()
#             test_fe = test_sample.copy()
            
#             for fe_func in fe_functions:
#                 train_fe = fe_func(train_fe)
#                 test_fe = fe_func(test_fe)
            
#             # Preprocessing
#             cols_to_drop = [config.ID_COL, 'Timestamp', config.TARGET_COL]
#             cols_to_drop = [col for col in cols_to_drop if col in train_fe.columns]
#             y = train_fe[config.TARGET_COL].copy()
            
#             X_tr = train_fe.drop(cols_to_drop, axis=1).copy()
#             X_te = test_fe.drop([col for col in cols_to_drop if col in test_fe.columns], axis=1).copy()
            
#             numeric_cols = X_tr.select_dtypes(include=[np.number]).columns.tolist()
#             categorical_cols = X_tr.select_dtypes(include=['object']).columns.tolist()
            
#             # Imputation
#             for col in numeric_cols:
#                 if X_tr[col].isnull().sum() > 0:
#                     val = X_tr[col].median()
#                     X_tr[col].fillna(val, inplace=True)
#                     X_te[col].fillna(val, inplace=True)
            
#             for col in categorical_cols:
#                 if X_tr[col].isnull().sum() > 0:
#                     val = X_tr[col].mode()[0] if len(X_tr[col].mode()) > 0 else 'Unknown'
#                     X_tr[col].fillna(val, inplace=True)
#                     X_te[col].fillna(val, inplace=True)
            
#             # Outlier clipping
#             for col in numeric_cols:
#                 q99 = X_tr[col].quantile(0.99)
#                 q01 = X_tr[col].quantile(0.01)
#                 X_tr[col] = X_tr[col].clip(q01, q99)
#                 X_te[col] = X_te[col].clip(q01, q99)
            
#             # Target encoding
#             le_temp = LabelEncoder()
#             y_temp = le_temp.fit_transform(y)
            
#             if len(categorical_cols) > 0:
#                 X_tr, X_te, _ = encode_categorical_target(X_tr, X_te, categorical_cols, y_temp, smoothing=10)
            
#             # Clear FE data
#             del train_fe, test_fe
#             gc.collect()
            
#             # Encode target
#             le_target = LabelEncoder()
#             y_encoded = le_target.fit_transform(y)
            
#             # 3-fold CV training
#             params = config.get_xgboost_params(use_gpu=gpu_config['xgboost_gpu'])
#             params['num_class'] = len(np.unique(y_encoded))
            
#             skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=config.RANDOM_STATE)
#             oof_predictions = np.zeros((len(X_tr), len(np.unique(y_encoded))))
#             fold_scores = []
            
#             for fold, (train_idx, val_idx) in enumerate(skf.split(X_tr, y_encoded), 1):
#                 X_train_fold = X_tr.iloc[train_idx]
#                 X_val_fold = X_tr.iloc[val_idx]
#                 y_train_fold = y_encoded[train_idx]
#                 y_val_fold = y_encoded[val_idx]
                
#                 dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold)
#                 dval = xgb.DMatrix(X_val_fold, label=y_val_fold)
                
#                 model = xgb.train(
#                     params,
#                     dtrain,
#                     num_boost_round=300,
#                     evals=[(dval, 'valid')],
#                     early_stopping_rounds=30,
#                     verbose_eval=False
#                 )
                
#                 oof_predictions[val_idx] = model.predict(dval)
#                 oof_pred_labels = np.argmax(oof_predictions[val_idx], axis=1)
#                 fold_score = f1_score(y_val_fold, oof_pred_labels, average='macro')
#                 fold_scores.append(fold_score)
                
#                 del dtrain, dval, model, X_train_fold, X_val_fold
#                 gc.collect()
            
#             oof_pred_labels = np.argmax(oof_predictions, axis=1)
#             cv_score = f1_score(y_encoded, oof_pred_labels, average='macro')
            
#             # Calculate features
#             features_added = X_tr.shape[1] - train_sample.shape[1]
            
#             results.append({
#                 'combination': combo_id,
#                 'name': combo_name,
#                 'cv_score': cv_score,
#                 'std': np.std(fold_scores),
#                 'num_fe_functions': len(fe_functions),
#                 'features_added': features_added
#             })
            
#             print(f"  CV Score: {cv_score:.6f} (+/- {np.std(fold_scores):.6f})")
#             print(f"  Features Added: {features_added}")
            
#             # Aggressive cleanup
#             del X_tr, X_te, y, y_temp, y_encoded, oof_predictions, le_temp, le_target
#             gc.collect()
            
#         except Exception as e:
#             print(f"  Error: {str(e)}")
#             results.append({
#                 'combination': combo_id,
#                 'name': combo_name,
#                 'cv_score': 0.0,
#                 'std': 0.0,
#                 'num_fe_functions': len(fe_functions),
#                 'features_added': 0
#             })
#             gc.collect()
    
#     # Cleanup
#     del train_sample, test_sample
#     gc.collect()
    
#     # Results summary
#     print(f"\n{'='*80}")
#     print("COMBINATION RESULTS SUMMARY")
#     print(f"{'='*80}\n")
    
#     results_df = pd.DataFrame(results)
    
#     # Calculate improvement vs baseline
#     baseline_score = results_df[results_df['combination'] == 'baseline']['cv_score'].values[0]
#     results_df['improvement'] = ((results_df['cv_score'] - baseline_score) / baseline_score * 100).round(2)
#     results_df['abs_improvement'] = (results_df['cv_score'] - baseline_score).round(6)
    
#     # Sort by score
#     results_df = results_df.sort_values('cv_score', ascending=False).reset_index(drop=True)
    
#     print(results_df[['name', 'cv_score', 'std', 'num_fe_functions', 'features_added', 
#                       'abs_improvement', 'improvement']].to_string(index=False))
    
#     # Highlight best combinations
#     print(f"\n{'='*80}")
#     print("TOP 3 COMBINATIONS:")
#     print(f"{'='*80}")
#     for i, row in results_df.head(3).iterrows():
#         if row['combination'] != 'baseline':
#             print(f"\n{i+1}. {row['name']}")
#             print(f"   Score: {row['cv_score']:.6f} ({row['improvement']:+.2f}%)")
#             print(f"   FE Functions: {row['num_fe_functions']}, Features Added: {row['features_added']}")
    
#     print(f"\n{'='*80}")
#     print(f"BASELINE: {baseline_score:.6f}")
#     print(f"BEST COMBINATION: {results_df.iloc[0]['cv_score']:.6f} ({results_df.iloc[0]['name']})")
#     print(f"{'='*80}")
    
#     return results_df

# combo_results = test_fe_combinations(train, test, sample_frac=0.3)

## 3. Preprocessing

In [11]:
def encode_categorical_target(train, test, categorical_cols, y_train, smoothing=10):
    """
    Encodes based on target mean, with smoothing to prevent overfitting
    """
    print("\n" + "="*80)
    print("TARGET ENCODING")
    print("="*80)
    
    encoders = {}
    global_mean = y_train.mean()
    
    for col in tqdm(categorical_cols, desc="Target encoding"):
        temp_df = pd.DataFrame({col: train[col], 'target': y_train})
        
        agg = temp_df.groupby(col)['target'].agg(['mean', 'count'])
        smoothed_mean = (agg['mean'] * agg['count'] + global_mean * smoothing) / (agg['count'] + smoothing)
        
        encoding_map = smoothed_mean.to_dict()
        
        train[col] = train[col].map(encoding_map).fillna(global_mean).astype(np.float32)
        test[col] = test[col].map(encoding_map).fillna(global_mean).astype(np.float32)
        
        encoders[col] = {
            'type': 'target',
            'unique_values': len(encoding_map),
            'global_mean': global_mean
        }
    
    print(f"\nEncoded {len(categorical_cols)} categorical features")
    return train, test, encoders

In [12]:
def apply_feature_engineering(train, test, feature_groups=None):
    """
    Apply validated feature engineering transformations
    
    Parameters:
    -----------
    train : pd.DataFrame
        Training dataset
    test : pd.DataFrame
        Test dataset
    feature_groups : list, optional
        List of FE functions to apply. If None, applies all validated FE.
        
    Returns:
    --------
    train_fe, test_fe : pd.DataFrame
        Datasets with engineered features
    """
    validated_fe_functions = [
        add_statistical_features,
        add_ratio_features        
    ]
    
    if feature_groups is not None:
        fe_functions = feature_groups
    else:
        fe_functions = validated_fe_functions
    
    train_fe = train.copy()
    test_fe = test.copy()
    
    print("\n" + "="*80)
    print("FEATURE ENGINEERING")
    print("="*80)
    print(f"Applying {len(fe_functions)} feature engineering transformations...")
    
    for fe_func in fe_functions:
        func_name = fe_func.__name__.replace('add_', '').replace('_', ' ').title()
        print(f"  • {func_name}")
        train_fe = fe_func(train_fe)
        test_fe = fe_func(test_fe)
    
    features_added = train_fe.shape[1] - train.shape[1]
    print(f"\n✓ Feature engineering completed")
    print(f"  Original features: {train.shape[1]}")
    print(f"  New features: {train_fe.shape[1]}")
    print(f"  Features added: {features_added}")
    
    return train_fe, test_fe


def preprocess_data(train, test, apply_fe=True):
    """
    Main preprocessing pipeline with best configuration:
    - Feature Engineering (optional)
    - Missing Value Imputation (Median + Mode)
    - Outlier Clipping
    - Target Encoding for categoricals
    - Target Label Encoding
    
    Parameters:
    -----------
    train : pd.DataFrame
        Training dataset
    test : pd.DataFrame
        Test dataset
    apply_fe : bool, default=True
        Whether to apply feature engineering
        
    Returns:
    --------
    X_train, X_test, y_train, le_target, encoders
    """
    print("\n" + "="*80)
    print("DATA PREPROCESSING PIPELINE")
    print("="*80)
    
    # Apply Feature Engineering
    if apply_fe:
        train, test = apply_feature_engineering(train, test)
    
    cols_to_drop = [config.ID_COL, 'Timestamp']
    if config.TARGET_COL in train.columns:
        y = train[config.TARGET_COL].copy()
        cols_to_drop.append(config.TARGET_COL)
    else:
        y = None
    
    cols_to_drop = [col for col in cols_to_drop if col in train.columns]
    X_train = train.drop(cols_to_drop, axis=1).copy()
    X_test = test.drop([col for col in cols_to_drop if col in test.columns], axis=1).copy()
    
    print(f"\nInitial shapes:")
    print(f"  X_train: {X_train.shape}")
    print(f"  X_test: {X_test.shape}")
    
    print(f"\nMissing values before imputation:")
    train_missing = X_train.isnull().sum()
    if train_missing.sum() > 0:
        print(train_missing[train_missing > 0])
    else:
        print("  No missing values found")
    
    numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
    
    print(f"\nFeature types detected:")
    print(f"  Numeric features: {len(numeric_cols)}")
    print(f"  Categorical features: {len(categorical_cols)}")
    
    print("\n" + "="*80)
    print("STEP 1: Missing Value Imputation (Median + Mode)")
    print("="*80)
    
    for col in tqdm(numeric_cols, desc="Imputing numeric features"):
        if X_train[col].isnull().sum() > 0:
            median_val = X_train[col].median()
            X_train[col].fillna(median_val, inplace=True)
            X_test[col].fillna(median_val, inplace=True)
    
    for col in tqdm(categorical_cols, desc="Imputing categorical features"):
        if X_train[col].isnull().sum() > 0:
            mode_val = X_train[col].mode()[0] if len(X_train[col].mode()) > 0 else 'Unknown'
            X_train[col].fillna(mode_val, inplace=True)
            X_test[col].fillna(mode_val, inplace=True)
    
    print("\n" + "="*80)
    print("STEP 2: Outlier Clipping")
    print("="*80)
    
    for col in tqdm(numeric_cols, desc="Clipping outliers"):
        q99 = X_train[col].quantile(0.99)
        q01 = X_train[col].quantile(0.01)
        X_train[col] = X_train[col].clip(q01, q99)
        X_test[col] = X_test[col].clip(q01, q99)
    
    print("\n" + "="*80)
    print("STEP 3: Categorical Encoding (Target Encoding)")
    print("="*80)
    
    if len(categorical_cols) > 0:
        if y is None:
            raise ValueError("Target encoding requires target variable")
        le_target_temp = LabelEncoder()
        y_temp = le_target_temp.fit_transform(y)
        X_train, X_test, encoders = encode_categorical_target(
            X_train, X_test, categorical_cols, y_temp
        )
    else:
        encoders = {}
        print("No categorical features to encode")
    
    print("\n" + "="*80)
    print("STEP 4: Target Label Encoding")
    print("="*80)
    
    if y is not None:
        le_target = LabelEncoder()
        y_encoded = le_target.fit_transform(y)
        print(f"\nTarget classes encoded:")
        for i, label in enumerate(le_target.classes_):
            count = (y_encoded == i).sum()
            print(f"  {i}: {label:20s} - {count:,} samples ({count/len(y_encoded)*100:.2f}%)")
    else:
        y_encoded = None
        le_target = None
    
    print("\n" + "="*80)
    print("PREPROCESSING COMPLETED")
    print("="*80)
    print(f"Final shapes:")
    print(f"  X_train: {X_train.shape}")
    print(f"  X_test: {X_test.shape}")
    if y_encoded is not None:
        print(f"  y_train: {y_encoded.shape}")
    print("="*80)
    
    return X_train, X_test, y_encoded, le_target, encoders

X_train, X_test, y_train, le_target, encoders = preprocess_data(train, test, apply_fe=False)
gc.collect()


DATA PREPROCESSING PIPELINE

Initial shapes:
  X_train: (8000000, 22)
  X_test: (4000000, 22)

Missing values before imputation:
Pickup_Lat           929348
Pickup_Long          611083
Dropoff_Lat         1914440
Dropoff_Long        1556029
GPS_Accuracy_M      1504090
Distance_KM          941317
Est_Price_IDR       1151401
Surge_Multiplier     612307
Accel_X             1608442
Accel_Y             1081337
Accel_Z             1840229
Gyro_Z               701714
Pickup_Zone         1381274
Dropoff_Zone         765445
Device_FP           1499497
Promo_Code          1107810
Car_Model           1721777
Payment_Method      1141190
Weather              514649
Traffic             1808403
Battery_Level       1746399
Signal_Strength      408588
dtype: int64

Feature types detected:
  Numeric features: 12
  Categorical features: 10

STEP 1: Missing Value Imputation (Median + Mode)


Imputing numeric features:   0%|          | 0/12 [00:00<?, ?it/s]

Imputing categorical features:   0%|          | 0/10 [00:00<?, ?it/s]


STEP 2: Outlier Clipping


Clipping outliers:   0%|          | 0/12 [00:00<?, ?it/s]


STEP 3: Categorical Encoding (Target Encoding)

TARGET ENCODING


Target encoding:   0%|          | 0/10 [00:00<?, ?it/s]


Encoded 10 categorical features

STEP 4: Target Label Encoding

Target classes encoded:
  0: Fraud_Indication     - 400,313 samples (5.00%)
  1: Navigation_Issue     - 801,790 samples (10.02%)
  2: Perfect_Trip         - 4,397,607 samples (54.97%)
  3: Safety_Violation     - 1,601,595 samples (20.02%)
  4: Service_Complaint    - 798,695 samples (9.98%)

PREPROCESSING COMPLETED
Final shapes:
  X_train: (8000000, 22)
  X_test: (4000000, 22)
  y_train: (8000000,)


107

## 5. Model Training

In [13]:
def macro_f1_eval(preds, dtrain):
    """
    Custom evaluation function for XGBoost to calculate Macro F1
    """
    labels = dtrain.get_label()
    preds_reshaped = preds.reshape(len(labels), -1)
    pred_labels = np.argmax(preds_reshaped, axis=1)
    # Calculate macro F1
    score = f1_score(labels, pred_labels, average='macro')
    return 'macro_f1', score

In [15]:
import json

best_params = {
    'max_depth': 4,
    'min_child_weight': 6,
    'max_delta_step': 2,
    'gamma': 0.288030339461526,
    'learning_rate': 0.2576527098472486,
    'subsample': 0.8550487035478368,
    'colsample_bytree': 0.5862536459445281,
    'colsample_bylevel': 0.8511513462275047,
    'colsample_bynode': 0.9714460526171997,
    'reg_alpha': 1.9234102324777442,
    'reg_lambda': 0.07276868503501757
}

## 5.2. Score Improvement Experiments

In [None]:
# ============================================================================
# EXPERIMENT FUNCTIONS DEFINITION
# ============================================================================

from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score
from scipy.optimize import minimize
from sklearn.feature_selection import SelectFromModel

def add_zone_pair_features(train, test):
    """
    Experiment 1: Zone Pair Encoding
    Creates features based on pickup-dropoff zone combinations (route popularity)
    """
    train_copy = train.copy()
    test_copy = test.copy()
    
    if 'Pickup_Zone' in train.columns and 'Dropoff_Zone' in train.columns:
        # Create zone pair identifier
        train_copy['Zone_Pair'] = train_copy['Pickup_Zone'].astype(str) + '_to_' + train_copy['Dropoff_Zone'].astype(str)
        test_copy['Zone_Pair'] = test_copy['Pickup_Zone'].astype(str) + '_to_' + test_copy['Dropoff_Zone'].astype(str)
        
        # Zone pair frequency (route popularity)
        zone_pair_freq = train_copy['Zone_Pair'].value_counts().to_dict()
        train_copy['Zone_Pair_Frequency'] = train_copy['Zone_Pair'].map(zone_pair_freq).fillna(0).astype(np.int32)
        test_copy['Zone_Pair_Frequency'] = test_copy['Zone_Pair'].map(zone_pair_freq).fillna(0).astype(np.int32)
        
        # Zone pair popularity category
        train_copy['Zone_Pair_Popularity'] = pd.cut(
            train_copy['Zone_Pair_Frequency'], 
            bins=[0, 10, 50, 200, float('inf')],
            labels=[0, 1, 2, 3]
        ).astype(np.int8)
        test_copy['Zone_Pair_Popularity'] = pd.cut(
            test_copy['Zone_Pair_Frequency'], 
            bins=[0, 10, 50, 200, float('inf')],
            labels=[0, 1, 2, 3]
        ).astype(np.int8)
        
        # Average metrics per zone pair
        if 'Est_Price_IDR' in train.columns:
            zone_pair_avg_price = train_copy.groupby('Zone_Pair')['Est_Price_IDR'].mean().to_dict()
            train_copy['Zone_Pair_Avg_Price'] = train_copy['Zone_Pair'].map(zone_pair_avg_price).fillna(train_copy['Est_Price_IDR'].mean()).astype(np.float32)
            test_copy['Zone_Pair_Avg_Price'] = test_copy['Zone_Pair'].map(zone_pair_avg_price).fillna(train_copy['Est_Price_IDR'].mean()).astype(np.float32)
        
        if 'Distance_KM' in train.columns:
            zone_pair_avg_dist = train_copy.groupby('Zone_Pair')['Distance_KM'].mean().to_dict()
            train_copy['Zone_Pair_Avg_Distance'] = train_copy['Zone_Pair'].map(zone_pair_avg_dist).fillna(train_copy['Distance_KM'].mean()).astype(np.float32)
            test_copy['Zone_Pair_Avg_Distance'] = test_copy['Zone_Pair'].map(zone_pair_avg_dist).fillna(train_copy['Distance_KM'].mean()).astype(np.float32)
        
        # Drop the Zone_Pair string column (already encoded as features)
        train_copy.drop('Zone_Pair', axis=1, inplace=True)
        test_copy.drop('Zone_Pair', axis=1, inplace=True)
    
    return train_copy, test_copy


def compute_class_weights_dict(y):
    """
    Experiment 2: Class Weights
    Compute class weights to handle imbalanced classes
    """
    classes = np.unique(y)
    weights = compute_class_weight('balanced', classes=classes, y=y)
    weight_dict = {int(cls): float(weight) for cls, weight in zip(classes, weights)}
    
    print("\nClass Distribution:")
    for cls in classes:
        count = np.sum(y == cls)
        pct = count / len(y) * 100
        print(f"  Class {cls}: {count:,} samples ({pct:.2f}%) -> Weight: {weight_dict[cls]:.4f}")
    
    return weight_dict


def optimize_thresholds(y_true, y_pred_proba, n_classes=5):
    """
    Experiment 3: Threshold Optimization
    Find optimal decision thresholds for each class to maximize F1 score
    """
    from sklearn.preprocessing import label_binarize
    
    y_true_bin = label_binarize(y_true, classes=range(n_classes))
    
    def objective(thresholds):
        y_pred = np.argmax(y_pred_proba - thresholds, axis=1)
        return -f1_score(y_true, y_pred, average='macro')
    
    # Initial thresholds (equal for all classes)
    initial_thresholds = np.zeros(n_classes)
    
    # Optimize
    result = minimize(
        objective, 
        initial_thresholds, 
        method='Nelder-Mead',
        options={'maxiter': 1000, 'disp': False}
    )
    
    optimal_thresholds = result.x
    
    # Apply optimal thresholds
    y_pred_optimized = np.argmax(y_pred_proba - optimal_thresholds, axis=1)
    optimized_score = f1_score(y_true, y_pred_optimized, average='macro')
    baseline_score = f1_score(y_true, np.argmax(y_pred_proba, axis=1), average='macro')
    
    print(f"\nThreshold Optimization Results:")
    print(f"  Baseline F1 (default thresholds): {baseline_score:.6f}")
    print(f"  Optimized F1: {optimized_score:.6f}")
    print(f"  Improvement: {optimized_score - baseline_score:.6f}")
    print(f"\n  Optimal Thresholds: {optimal_thresholds}")
    
    return optimal_thresholds, optimized_score


def select_important_features(X_train, y_train, X_test, threshold='mean'):
    """
    Experiment 4: Feature Selection
    Remove low-importance features to reduce noise
    """
    from xgboost import XGBClassifier
    
    print(f"\nOriginal feature count: {X_train.shape[1]}")
    
    # Train a quick model to get feature importances
    model = XGBClassifier(
        n_estimators=50,
        max_depth=4,
        learning_rate=0.1,
        random_state=42,
        n_jobs=-1,
        tree_method='hist'
    )
    model.fit(X_train, y_train)
    
    # Select features
    selector = SelectFromModel(model, threshold=threshold, prefit=True)
    X_train_selected = selector.transform(X_train)
    X_test_selected = selector.transform(X_test)
    
    selected_features = X_train.columns[selector.get_support()].tolist()
    removed_features = X_train.columns[~selector.get_support()].tolist()
    
    print(f"Selected feature count: {X_train_selected.shape[1]}")
    print(f"Removed feature count: {len(removed_features)}")
    print(f"Removed features (top 10): {removed_features[:10]}")
    
    return pd.DataFrame(X_train_selected, columns=selected_features), pd.DataFrame(X_test_selected, columns=selected_features)


print("✓ Experiment functions defined:")
print("  1. add_zone_pair_features() - Zone pair route popularity encoding")
print("  2. compute_class_weights_dict() - Balanced class weights")
print("  3. optimize_thresholds() - Optimal decision thresholds")
print("  4. select_important_features() - Feature selection based on importance")

In [None]:
# ============================================================================
# RUN EXPERIMENTS AND COMPARE SCORES
# Baseline CV Score: 0.576275
# ============================================================================

def run_single_experiment(X_train, y_train, experiment_name, 
                          use_zone_pairs=False, 
                          use_class_weights=False,
                          use_threshold_opt=False,
                          use_feature_selection=False):
    """
    Run a single experiment with specified configurations
    """
    print(f"\n{'='*80}")
    print(f"EXPERIMENT: {experiment_name}")
    print(f"{'='*80}")
    
    # Make copies
    X_tr = X_train.copy()
    y_tr = y_train.copy()
    
    # Experiment 1: Zone Pair Features
    if use_zone_pairs:
        print("\n[1/4] Applying Zone Pair Features...")
        # Need access to original train/test with zone columns
        # This will be handled in the main experiment loop
        pass
    
    # Experiment 4: Feature Selection (do this before training)
    if use_feature_selection:
        print("\n[4/4] Applying Feature Selection...")
        X_tr_selected, _ = select_important_features(X_tr, y_tr, X_tr, threshold='median')
        X_tr = X_tr_selected
    
    # Setup XGBoost parameters
    params = config.get_xgboost_params(use_gpu=gpu_config['xgboost_gpu'])
    params.update(best_params)  # Use tuned parameters
    params['num_class'] = len(np.unique(y_tr))
    
    # Experiment 2: Class Weights
    if use_class_weights:
        print("\n[2/4] Computing Class Weights...")
        class_weights = compute_class_weights_dict(y_tr)
        # Convert to XGBoost sample weights
        sample_weights = np.array([class_weights[int(y)] for y in y_tr])
    else:
        sample_weights = None
    
    # Cross-validation with same settings as baseline
    skf = StratifiedKFold(n_splits=config.N_FOLDS, shuffle=True, random_state=config.RANDOM_STATE)
    
    oof_predictions = np.zeros((len(X_tr), len(np.unique(y_tr))))
    fold_scores = []
    
    print(f"\n[Training] Running {config.N_FOLDS}-Fold Cross-Validation...")
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_tr, y_tr), 1):
        X_tr_fold, X_val_fold = X_tr.iloc[train_idx], X_tr.iloc[val_idx]
        y_tr_fold, y_val_fold = y_tr.iloc[train_idx], y_tr.iloc[val_idx]
        
        dtrain = xgb.DMatrix(X_tr_fold, label=y_tr_fold)
        dval = xgb.DMatrix(X_val_fold, label=y_val_fold)
        
        # Apply sample weights if using class weights
        if sample_weights is not None:
            dtrain.set_weight(sample_weights[train_idx])
        
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=1000,
            evals=[(dval, 'valid')],
            early_stopping_rounds=50,
            verbose_eval=False
        )
        
        oof_predictions[val_idx] = model.predict(dval)
        
        oof_pred_labels = np.argmax(oof_predictions[val_idx], axis=1)
        fold_score = f1_score(y_val_fold, oof_pred_labels, average='macro')
        fold_scores.append(fold_score)
        
        print(f"  Fold {fold}: F1 = {fold_score:.6f}")
    
    # Calculate OOF score
    oof_pred_labels = np.argmax(oof_predictions, axis=1)
    base_cv_score = f1_score(y_tr, oof_pred_labels, average='macro')
    
    # Experiment 3: Threshold Optimization
    if use_threshold_opt:
        print("\n[3/4] Optimizing Decision Thresholds...")
        optimal_thresholds, optimized_cv_score = optimize_thresholds(y_tr, oof_predictions)
        final_cv_score = optimized_cv_score
    else:
        final_cv_score = base_cv_score
    
    print(f"\n{'='*80}")
    print(f"RESULTS: {experiment_name}")
    print(f"{'='*80}")
    print(f"  CV Score (Base): {base_cv_score:.6f}")
    if use_threshold_opt:
        print(f"  CV Score (Threshold Optimized): {final_cv_score:.6f}")
    print(f"  Mean ± Std: {np.mean(fold_scores):.6f} ± {np.std(fold_scores):.6f}")
    print(f"{'='*80}\n")
    
    return final_cv_score, fold_scores


# ============================================================================
# EXPERIMENT CONFIGURATIONS
# ============================================================================

BASELINE_SCORE = 0.576275  # Current best score

experiments = {
    'Baseline (Current)': {
        'use_zone_pairs': False,
        'use_class_weights': False,
        'use_threshold_opt': False,
        'use_feature_selection': False
    },
    'Exp1: Class Weights': {
        'use_zone_pairs': False,
        'use_class_weights': True,
        'use_threshold_opt': False,
        'use_feature_selection': False
    },
    'Exp2: Zone Pair Encoding': {
        'use_zone_pairs': True,
        'use_class_weights': False,
        'use_threshold_opt': False,
        'use_feature_selection': False
    },
    'Exp3: Threshold Optimization': {
        'use_zone_pairs': False,
        'use_class_weights': False,
        'use_threshold_opt': True,
        'use_feature_selection': False
    },
    'Exp4: Feature Selection': {
        'use_zone_pairs': False,
        'use_class_weights': False,
        'use_threshold_opt': False,
        'use_feature_selection': True
    },
    'Exp5: Best Combination': {
        'use_zone_pairs': True,
        'use_class_weights': True,
        'use_threshold_opt': True,
        'use_feature_selection': False
    }
}

# ============================================================================
# RUN ALL EXPERIMENTS
# ============================================================================

print("="*80)
print("SCORE IMPROVEMENT EXPERIMENTS")
print(f"Baseline Score to Beat: {BASELINE_SCORE:.6f}")
print("="*80)

experiment_results = {}

for exp_name, exp_config in experiments.items():
    # Prepare data for this experiment
    train_exp = train.copy()
    test_exp = test.copy()
    
    # Apply Zone Pair Features if needed (before preprocessing)
    if exp_config['use_zone_pairs']:
        print(f"\n[Pre-processing] Adding Zone Pair Features for {exp_name}...")
        train_exp, test_exp = add_zone_pair_features(train_exp, test_exp)
    
    # Apply same preprocessing as baseline
    cols_to_drop = [config.ID_COL, 'Timestamp', config.TARGET_COL]
    cols_to_drop = [col for col in cols_to_drop if col in train_exp.columns]
    
    y = train_exp[config.TARGET_COL].copy()
    X_tr = train_exp.drop(cols_to_drop, axis=1).copy()
    
    # Handle missing values (same as baseline)
    numeric_cols = X_tr.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = X_tr.select_dtypes(include=['object']).columns.tolist()
    
    for col in numeric_cols:
        if X_tr[col].isnull().sum() > 0:
            X_tr[col].fillna(X_tr[col].median(), inplace=True)
    
    for col in categorical_cols:
        if X_tr[col].isnull().sum() > 0:
            X_tr[col].fillna(X_tr[col].mode()[0] if len(X_tr[col].mode()) > 0 else 'Unknown', inplace=True)
    
    # Outlier clipping (same as baseline)
    for col in numeric_cols:
        q99 = X_tr[col].quantile(0.99)
        q01 = X_tr[col].quantile(0.01)
        X_tr[col] = X_tr[col].clip(q01, q99)
    
    # Target encoding (same as baseline)
    le_temp = LabelEncoder()
    y_temp = le_temp.fit_transform(y)
    
    if len(categorical_cols) > 0:
        # Create dummy test for encoding
        X_te_dummy = test_exp.drop([col for col in cols_to_drop if col in test_exp.columns], axis=1).copy()
        for col in numeric_cols:
            if col in X_te_dummy.columns and X_te_dummy[col].isnull().sum() > 0:
                X_te_dummy[col].fillna(X_tr[col].median(), inplace=True)
        for col in categorical_cols:
            if col in X_te_dummy.columns and X_te_dummy[col].isnull().sum() > 0:
                X_te_dummy[col].fillna(X_tr[col].mode()[0] if len(X_tr[col].mode()) > 0 else 'Unknown', inplace=True)
        
        X_tr, X_te_dummy, _ = encode_categorical_target(X_tr, X_te_dummy, categorical_cols, y_temp, smoothing=10)
    
    # Run experiment
    cv_score, fold_scores = run_single_experiment(
        X_tr, y_temp, exp_name, **exp_config
    )
    
    experiment_results[exp_name] = {
        'cv_score': cv_score,
        'fold_scores': fold_scores,
        'improvement': cv_score - BASELINE_SCORE
    }
    
    gc.collect()

# ============================================================================
# FINAL SUMMARY
# ============================================================================

print("\n" + "="*80)
print("EXPERIMENT SUMMARY")
print("="*80)
print(f"{'Experiment':<35} {'CV Score':<12} {'Improvement':<12} {'Status'}")
print("-"*80)

for exp_name, results in experiment_results.items():
    cv_score = results['cv_score']
    improvement = results['improvement']
    status = "✓ BETTER" if improvement > 0 else ("✗ WORSE" if improvement < 0 else "= SAME")
    
    print(f"{exp_name:<35} {cv_score:<12.6f} {improvement:+.6f}     {status}")

print("="*80)

# Find best experiment
best_exp = max(experiment_results.items(), key=lambda x: x[1]['cv_score'])
print(f"\n🏆 BEST EXPERIMENT: {best_exp[0]}")
print(f"   Score: {best_exp[1]['cv_score']:.6f}")
print(f"   Improvement over baseline: {best_exp[1]['improvement']:+.6f}")
print("="*80)

In [16]:
def train_xgboost(X_train, y_train, X_test, n_folds=5, use_gpu=False, best_params=None):
    print("\n" + "="*80)
    print("Training XGBoost Models")
    print("="*80)
    
    if best_params is not None:
        print("Using tuned best parameters")
        params = config.get_xgboost_params(use_gpu=use_gpu)
        params.update(best_params)
    else:
        params = config.get_xgboost_params(use_gpu=use_gpu)
    
    params['num_class'] = len(np.unique(y_train))
    
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=config.RANDOM_STATE)
    
    oof_predictions = np.zeros((len(X_train), len(np.unique(y_train))))
    test_predictions = np.zeros((len(X_test), len(np.unique(y_train))))
    
    fold_scores = []
    models = []
    
    pbar = tqdm(enumerate(skf.split(X_train, y_train), 1), total=n_folds, desc="XGBoost Folds")
    for fold, (train_idx, val_idx) in pbar:
        pbar.set_description(f"XGBoost Fold {fold}/{n_folds}")
        
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]
        
        dtrain = xgb.DMatrix(X_tr, label=y_tr)
        dval = xgb.DMatrix(X_val, label=y_val)
        dtest = xgb.DMatrix(X_test)
        
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=1000,
            evals=[(dtrain, 'train'), (dval, 'valid')],
            custom_metric=macro_f1_eval,
            early_stopping_rounds=50,
            verbose_eval=100
        )
        
        oof_predictions[val_idx] = model.predict(dval)
        test_predictions += model.predict(dtest) / n_folds
        
        oof_pred_labels = np.argmax(oof_predictions[val_idx], axis=1)
        fold_score = f1_score(y_val, oof_pred_labels, average='macro')
        fold_scores.append(fold_score)

        print(f"\n{'='*60}")
        print(f"FOLD {fold} SUMMARY:")
        print(f"{'='*60}")
        print(f"  Validation F1 (Macro): {fold_score:.6f}")
        print(f"  Best Iteration: {model.best_iteration}")
        print(f"  Best Score (mlogloss): {model.best_score:.6f}")
        
        # Detailed classification report
        oof_fold_pred = np.argmax(oof_predictions[val_idx], axis=1)
        print(f"\n  Per-Class F1 Scores:")
        from sklearn.metrics import classification_report
        print(classification_report(y_val, oof_fold_pred, 
                                   target_names=[f"Class_{i}" for i in range(len(np.unique(y_train)))],
                                   digits=4))
        print(f"{'='*60}\n")
        
        pbar.set_postfix({'F1': f'{fold_score:.6f}'})
        
        models.append(model)
        gc.collect()
    
    oof_pred_labels = np.argmax(oof_predictions, axis=1)
    overall_score = f1_score(y_train, oof_pred_labels, average='macro')
    
        
    print("\n" + "="*80)
    print("XGBOOST TRAINING SUMMARY")
    print("="*80)
    print(f"Overall CV Score (Macro F1): {overall_score:.6f}")
    print(f"Standard Deviation: {np.std(fold_scores):.6f}")
    print(f"Min F1 Score: {np.min(fold_scores):.6f}")
    print(f"Max F1 Score: {np.max(fold_scores):.6f}")
    print(f"\nFold-by-Fold Scores:")
    for i, score in enumerate(fold_scores, 1):
        print(f"  Fold {i}: {score:.6f}")
    
    # Overall classification report pada OOF predictions
    print(f"\n" + "="*80)
    print("OVERALL OUT-OF-FOLD PREDICTIONS REPORT")
    print("="*80)
    print(classification_report(y_train, oof_pred_labels, 
                               target_names=[f"Class_{i}" for i in range(len(np.unique(y_train)))],
                               digits=4))
    print("="*80)

    return test_predictions, models, overall_score


xgboost_test_pred, xgboost_models, xgboost_cv_score = train_xgboost(
    X_train, y_train, X_test,
    n_folds=config.N_FOLDS,
    use_gpu=gpu_config['xgboost_gpu'],
    best_params=best_params
)


Training XGBoost Models
Using tuned best parameters
  XGBoost: GPU mode activated


XGBoost Folds:   0%|          | 0/5 [00:00<?, ?it/s]

[0]	train-mlogloss:1.27964	train-macro_f1:0.34425	valid-mlogloss:1.27966	valid-macro_f1:0.34399
[49]	train-mlogloss:0.59828	train-macro_f1:0.57634	valid-mlogloss:0.59871	valid-macro_f1:0.57644

FOLD 1 SUMMARY:
  Validation F1 (Macro): 0.576754
  Best Iteration: 0
  Best Score (mlogloss): 0.343988

  Per-Class F1 Scores:
              precision    recall  f1-score   support

     Class_0     0.9791    0.9760    0.9776     80062
     Class_1     0.9898    0.0048    0.0096    160358
     Class_2     0.7266    0.9926    0.8390    879522
     Class_3     0.9950    0.9502    0.9721    320319
     Class_4     0.6104    0.0460    0.0855    159739

    accuracy                         0.7898   1600000
   macro avg     0.8602    0.5939    0.5768   1600000
weighted avg     0.8077    0.7898    0.7142   1600000


[0]	train-mlogloss:1.27951	train-macro_f1:0.34391	valid-mlogloss:1.27954	valid-macro_f1:0.34384
[50]	train-mlogloss:0.59782	train-macro_f1:0.57551	valid-mlogloss:0.59768	valid-macro_f1:0.5

### Hyperparameter Tuning

In [None]:
# import optuna
# from optuna.samplers import TPESampler
# from optuna.pruners import MedianPruner

# def objective_xgboost(trial, X_train, y_train, use_gpu=True, stage='coarse'):
#     if stage == 'coarse':
#         param = {
#             'max_depth': trial.suggest_int('max_depth', 3, 12),
#             'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
#             'max_delta_step': trial.suggest_int('max_delta_step', 0, 5),
#             'gamma': trial.suggest_float('gamma', 0.0, 1.0),
#             'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
#             'subsample': trial.suggest_float('subsample', 0.5, 1.0),
#             'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
#             'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
#             'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0),
#             'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 2.0),
#             'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 2.0),
#         }
#     elif stage == 'refined':
#         param = {
#             'max_depth': trial.suggest_int('max_depth', 5, 9),
#             'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
#             'max_delta_step': trial.suggest_int('max_delta_step', 0, 3),
#             'gamma': trial.suggest_float('gamma', 0.0, 0.5),
#             'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
#             'subsample': trial.suggest_float('subsample', 0.6, 1.0),
#             'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
#             'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.6, 1.0),
#             'colsample_bynode': trial.suggest_float('colsample_bynode', 0.6, 1.0),
#             'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.5),
#             'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.5),
#         }
#     else:
#         param = {
#             'max_depth': trial.suggest_int('max_depth', 6, 8),
#             'min_child_weight': trial.suggest_int('min_child_weight', 2, 5),
#             'max_delta_step': trial.suggest_int('max_delta_step', 0, 2),
#             'gamma': trial.suggest_float('gamma', 0.0, 0.3),
#             'learning_rate': trial.suggest_float('learning_rate', 0.02, 0.15, log=True),
#             'subsample': trial.suggest_float('subsample', 0.7, 0.95),
#             'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 0.95),
#             'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.7, 0.95),
#             'colsample_bynode': trial.suggest_float('colsample_bynode', 0.7, 0.95),
#             'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),
#             'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 1.0),
#         }
    
#     param.update({
#         'objective': 'multi:softprob',
#         'num_class': len(np.unique(y_train)),
#         'eval_metric': 'mlogloss',
#         'device': 'cuda' if use_gpu else 'cpu',
#         'verbosity': 0,
#         'random_state': 42,
#     })
    
#     skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
#     cv_scores = []
    
#     for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train)):
#         X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
#         y_tr, y_val = y_train[train_idx], y_train[val_idx]
        
#         dtrain = xgb.DMatrix(X_tr, label=y_tr)
#         dval = xgb.DMatrix(X_val, label=y_val)
        
#         model = xgb.train(
#             param,
#             dtrain,
#             num_boost_round=500,
#             evals=[(dval, 'valid')],
#             early_stopping_rounds=30,
#             verbose_eval=False
#         )
        
#         preds = model.predict(dval)
#         pred_labels = np.argmax(preds, axis=1)
#         fold_score = f1_score(y_val, pred_labels, average='macro')
#         cv_scores.append(fold_score)
        
#         trial.report(fold_score, fold_idx)
        
#         if trial.should_prune():
#             raise optuna.TrialPruned()
        
#         del dtrain, dval, model
#         gc.collect()
    
#     return np.mean(cv_scores)


# def run_tuning_pipeline(X_train, y_train, use_gpu=True):
#     print("="*80)
#     print("HYPERPARAMETER TUNING")
#     print("="*80)
    
#     all_studies = {}
#     stage_info = {
#         'coarse': {'trials': 150, 'desc': 'Wide parameter exploration'},
#         'refined': {'trials': 150, 'desc': 'Focused search around best regions'},
#         'fine': {'trials': 150, 'desc': 'Precision tuning'}
#     }
    
#     for stage_idx, (stage_name, info) in enumerate(stage_info.items(), 1):
#         print(f"\n{'='*80}")
#         print(f"STAGE {stage_idx}/3: {stage_name.upper()} SEARCH")
#         print(f"{'='*80}")
#         print(f"Description: {info['desc']}")
#         print(f"Trials: {info['trials']}")
#         print(f"{'='*80}\n")
        
#         sampler = TPESampler(seed=42 + stage_idx)
#         pruner = MedianPruner(n_startup_trials=5, n_warmup_steps=3)
        
#         study = optuna.create_study(
#             direction='maximize',
#             sampler=sampler,
#             pruner=pruner,
#             study_name=f'xgboost_{stage_name}'
#         )
        
#         study.optimize(
#             lambda trial: objective_xgboost(trial, X_train, y_train, use_gpu, stage_name),
#             n_trials=info['trials'],
#             show_progress_bar=True,
#             callbacks=[
#                 lambda study, trial: print(
#                     f"  Trial {trial.number+1}/{info['trials']}: "
#                     f"F1={trial.value:.6f} | Best={study.best_value:.6f}"
#                 ) if trial.value else None
#             ]
#         )
        
#         all_studies[stage_name] = study
        
#         print(f"\n{'='*80}")
#         print(f"STAGE {stage_idx} COMPLETED")
#         print(f"{'='*80}")
#         print(f"Best F1 Score: {study.best_value:.6f}")
#         print(f"Best Trial: #{study.best_trial.number}")
#         print(f"\nBest Parameters:")
#         for key, value in study.best_params.items():
#             print(f"  {key:20s}: {value}")
        
#         print(f"\nStage Statistics:")
#         print(f"  Complete: {len([t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE])}")
#         print(f"  Pruned:   {len([t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED])}")
#         print(f"  Failed:   {len([t for t in study.trials if t.state == optuna.trial.TrialState.FAIL])}")
#         print(f"{'='*80}")
        
#         import time
#         time.sleep(2)
    
#     print(f"\n{'='*80}")
#     print("FINAL RESULTS")
#     print(f"{'='*80}\n")
    
#     best_stage = None
#     best_score = -1
#     best_params = None
    
#     for stage_name, study in all_studies.items():
#         print(f"{stage_name.upper():10s} Best: {study.best_value:.6f}")
#         if study.best_value > best_score:
#             best_score = study.best_value
#             best_stage = stage_name
#             best_params = study.best_params
    
#     print(f"\n{'='*80}")
#     print(f"OVERALL BEST: {best_stage.upper()} Stage")
#     print(f"{'='*80}")
#     print(f"Best F1 Score: {best_score:.6f}")
#     print(f"\nFinal Best Parameters:")
#     for key, value in best_params.items():
#         print(f"  {key:20s}: {value}")
    
#     print(f"\n{'='*80}")
#     print("STAGE PROGRESSION:")
#     print(f"{'='*80}")
#     scores = [all_studies[s].best_value for s in ['coarse', 'refined', 'fine']]
#     for i, (stage, score) in enumerate(zip(['Coarse', 'Refined', 'Fine'], scores), 1):
#         improvement = (score - scores[0]) if i > 1 else 0
#         print(f"Stage {i} ({stage:7s}): {score:.6f} ({improvement:+.6f})")
    
#     print(f"\nTotal Improvement: {scores[-1] - scores[0]:+.6f} ({(scores[-1] - scores[0])/scores[0]*100:+.2f}%)")
#     print(f"{'='*80}")
    
#     return best_params, all_studies, best_score


# def train_with_best_params(X_train, y_train, X_test, best_params, n_folds=5, use_gpu=True):
#     print("\n" + "="*80)
#     print("TRAINING FINAL MODEL WITH BEST PARAMETERS")
#     print("="*80)
    
#     params = {
#         'objective': 'multi:softprob',
#         'num_class': len(np.unique(y_train)),
#         'eval_metric': 'mlogloss',
#         'device': 'cuda' if use_gpu else 'cpu',
#         'verbosity': 1,
#         'random_state': 42,
#         **best_params
#     }
    
#     print("\nFinal Parameters:")
#     for key, value in params.items():
#         print(f"  {key:20s}: {value}")
#     print("="*80)
    
#     skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    
#     oof_predictions = np.zeros((len(X_train), len(np.unique(y_train))))
#     test_predictions = np.zeros((len(X_test), len(np.unique(y_train))))
#     fold_scores = []
#     models = []
    
#     for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
#         print(f"\n{'='*60}")
#         print(f"Training Fold {fold}/{n_folds}")
#         print(f"{'='*60}")
        
#         X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
#         y_tr, y_val = y_train[train_idx], y_train[val_idx]
        
#         dtrain = xgb.DMatrix(X_tr, label=y_tr)
#         dval = xgb.DMatrix(X_val, label=y_val)
#         dtest = xgb.DMatrix(X_test)
        
#         model = xgb.train(
#             params,
#             dtrain,
#             num_boost_round=1000,
#             evals=[(dtrain, 'train'), (dval, 'valid')],
#             custom_metric=macro_f1_eval,
#             early_stopping_rounds=50,
#             verbose_eval=100
#         )
        
#         oof_predictions[val_idx] = model.predict(dval)
#         test_predictions += model.predict(dtest) / n_folds
        
#         oof_pred_labels = np.argmax(oof_predictions[val_idx], axis=1)
#         fold_score = f1_score(y_val, oof_pred_labels, average='macro')
#         fold_scores.append(fold_score)
        
#         print(f"\nFold {fold} F1 Score: {fold_score:.6f}")
        
#         models.append(model)
#         del dtrain, dval, dtest
#         gc.collect()
    
#     oof_pred_labels = np.argmax(oof_predictions, axis=1)
#     overall_score = f1_score(y_train, oof_pred_labels, average='macro')
    
#     print(f"\n{'='*80}")
#     print("TUNED MODEL TRAINING SUMMARY")
#     print(f"{'='*80}")
#     print(f"Overall CV Score: {overall_score:.6f}")
#     print(f"Standard Deviation: {np.std(fold_scores):.6f}")
#     print(f"Min F1: {np.min(fold_scores):.6f}")
#     print(f"Max F1: {np.max(fold_scores):.6f}")
#     print("\nFold Scores:")
#     for i, score in enumerate(fold_scores, 1):
#         print(f"  Fold {i}: {score:.6f}")
#     print("="*80)
    
#     return test_predictions, models, overall_score

In [None]:
# best_params, all_studies, tuning_best_score = run_tuning_pipeline(
#     X_train, y_train, 
#     use_gpu=gpu_config['xgboost_gpu']
# )

# import json
# with open('best_params.json', 'w') as f:
#     json.dump(best_params, f, indent=2)
# print(f"\nBest parameters saved to best_params.json")

# tuned_test_pred, tuned_models, tuned_cv_score = train_with_best_params(
#     X_train, y_train, X_test, 
#     best_params, 
#     n_folds=config.N_FOLDS,
#     use_gpu=gpu_config['xgboost_gpu']
# )

# xgboost_test_pred = tuned_test_pred
# xgboost_cv_score = tuned_cv_score

## 6. Model Evaluation and Inference

In [17]:
print("\n" + "="*80)
print("MODEL EVALUATION")
print("="*80)

if xgboost_test_pred is not None:
    print(f"\n✓ XGBoost Model Successfully Trained")
    print(f"  Cross-Validation Score: {xgboost_cv_score:.6f}")
    print(f"  Model Type: XGBoost with GPU acceleration")
    
    print("\n" + "="*80)
    print("GENERATING PREDICTIONS ON TEST SET")
    print("="*80)
    
    final_predictions = xgboost_test_pred
    final_pred_labels = np.argmax(final_predictions, axis=1)
    
    print(f"\n✓ Predictions Generated Successfully")
    print(f"  Total test samples: {len(final_pred_labels):,}")
    print(f"  Prediction shape: {final_predictions.shape}")
    print(f"  Classes predicted: {len(np.unique(final_pred_labels))}")
    
    print("\n" + "="*80)
    print("PREDICTION DISTRIBUTION")
    print("="*80)
    unique, counts = np.unique(final_pred_labels, return_counts=True)
    for class_idx, count in zip(unique, counts):
        percentage = (count / len(final_pred_labels)) * 100
        print(f"  Class {class_idx}: {count:,} samples ({percentage:.2f}%)")
    
else:
    raise ValueError("XGBoost model training failed! Cannot generate predictions.")

print("\n" + "="*80)


MODEL EVALUATION

✓ XGBoost Model Successfully Trained
  Cross-Validation Score: 0.576275
  Model Type: XGBoost with GPU acceleration

GENERATING PREDICTIONS ON TEST SET

✓ Predictions Generated Successfully
  Total test samples: 4,000,000
  Prediction shape: (4000000, 5)
  Classes predicted: 5

PREDICTION DISTRIBUTION
  Class 0: 201,062 samples (5.03%)
  Class 1: 5,528 samples (0.14%)
  Class 2: 2,971,685 samples (74.29%)
  Class 3: 789,746 samples (19.74%)
  Class 4: 31,979 samples (0.80%)



## 7. Generate Submission

In [18]:
def create_submission(test_ids, predictions, le_target, filename='submission.csv'):
    pred_labels = le_target.inverse_transform(predictions)
    
    submission = pd.DataFrame({
        config.ID_COL: test_ids,
        config.TARGET_COL: pred_labels
    })
    
    submission.to_csv(filename, index=False)
    
    print(f"\nSubmission saved to: {filename}")
    print(f"Submission shape: {submission.shape}")
    print(f"\nPrediction distribution:")
    print(submission[config.TARGET_COL].value_counts())
    
    return submission

test_ids = test[config.ID_COL].values
submission = create_submission(test_ids, final_pred_labels, le_target, 'submission.csv')
submission.head(10)


Submission saved to: submission.csv
Submission shape: (4000000, 2)

Prediction distribution:
Trip_Label
Perfect_Trip         2971685
Safety_Violation      789746
Fraud_Indication      201062
Service_Complaint      31979
Navigation_Issue        5528
Name: count, dtype: int64


Unnamed: 0,Trip_ID,Trip_Label
0,TRIP-06583736,Perfect_Trip
1,TRIP-11356251,Perfect_Trip
2,TRIP-03320505,Perfect_Trip
3,TRIP-07188814,Perfect_Trip
4,TRIP-06994869,Perfect_Trip
5,TRIP-03232331,Perfect_Trip
6,TRIP-03536120,Perfect_Trip
7,TRIP-06411895,Perfect_Trip
8,TRIP-00132176,Perfect_Trip
9,TRIP-00298208,Perfect_Trip


## 8. Validation & Analysis

In [19]:
print("\n" + "="*80)
print("Final Validation Checks")
print("="*80)

assert submission.shape[0] == test.shape[0], "Submission size mismatch!"
assert submission.columns.tolist() == [config.ID_COL, config.TARGET_COL], "Column names mismatch!"
assert submission[config.TARGET_COL].isnull().sum() == 0, "Null predictions found!"

expected_labels = set(le_target.classes_)
submission_labels = set(submission[config.TARGET_COL].unique())
assert submission_labels.issubset(expected_labels), "Invalid labels in submission!"
print("All validation checks passed!")
print("="*80)


Final Validation Checks
All validation checks passed!
