## 1. Configuration & Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import gc
from tqdm.auto import tqdm
tqdm.pandas()

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

print("Libraries imported successfully.")

Libraries imported successfully.


In [2]:
USE_GPU = True  

gpu_config = {
    'xgboost_gpu': USE_GPU
}

In [3]:
class Config:
    DATA_PATH = '/kaggle/input/ride-hailing-trip-classification-dataset/'
    
    N_FOLDS = 5
    RANDOM_STATE = 42
    TARGET_COL = 'Trip_Label'
    ID_COL = 'Trip_ID'
    
    USE_TEMPORAL = False
    USE_DISTANCE = False
    USE_SENSOR_AGG = False
    USE_ECONOMIC = False
    USE_INTERACTION = False
    
    @staticmethod
    def get_catboost_params(use_gpu=False):
        params = {
            'iterations': 1000,
            'learning_rate': 0.05,
            'depth': 6,
            'loss_function': 'MultiClass',
            'eval_metric': 'TotalF1:average=Macro',
            'auto_class_weights': 'Balanced',
            'random_seed': 42,
            'verbose': 100,
            'early_stopping_rounds': 50
        }
        if use_gpu:
            params['task_type'] = 'GPU'
            params['devices'] = '0'
            print("  CatBoost: GPU mode activated")
        else:
            params['task_type'] = 'CPU'
            print("  CatBoost: CPU mode")
        return params
    
    @staticmethod
    def get_lightgbm_params(use_gpu=False):
        params = {
            'objective': 'multiclass',
            'metric': 'multi_logloss',
            'boosting_type': 'gbdt',
            'num_leaves': 31,
            'learning_rate': 0.05,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'random_state': 42,
            'verbose': -1,
            'min_data_in_leaf': 100,
            'min_sum_hessian_in_leaf': 1e-2
        }
        if use_gpu:
            params['device'] = 'gpu'
            print("  LightGBM: GPU mode activated")
        else:
            params['device'] = 'cpu'
            print("  LightGBM: CPU mode")
        return params
    
    @staticmethod
    def get_xgboost_params(use_gpu=False):
        params = {
            'objective': 'multi:softprob',
            'eval_metric': 'mlogloss',
            'max_depth': 6,
            'learning_rate': 0.05,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': 42,
            'verbosity': 1
        }
        if use_gpu:
            params['device'] = 'cuda'
            print("  XGBoost: GPU mode activated")
        else:
            params['device'] = 'cpu'
            print("  XGBoost: CPU mode")
        return params

config = Config()
print("\nConfiguration loaded successfully.")


Configuration loaded successfully.


## 2. Data Loading & Validation

In [4]:
def load_data():
    print("Loading data...")
    files = ['train.csv', 'test.csv', 'sample_submission.csv']
    data = {}
    
    for file in tqdm(files, desc="Loading files"):
        data[file.replace('.csv', '')] = pd.read_csv(config.DATA_PATH + file)
    
    train = data['train']
    test = data['test']
    sample_submission = data['sample_submission']
    
    print(f"Train shape: {train.shape}")
    print(f"Test shape: {test.shape}")
    print(f"Sample submission shape: {sample_submission.shape}")
    
    if config.TARGET_COL in train.columns:
        print(f"\nTarget distribution:")
        print(train[config.TARGET_COL].value_counts())
    
    return train, test, sample_submission

train, test, sample_submission = load_data()

Loading data...


Loading files:   0%|          | 0/3 [00:00<?, ?it/s]

Train shape: (8000000, 25)
Test shape: (4000000, 24)
Sample submission shape: (4000000, 2)

Target distribution:
Trip_Label
Perfect_Trip         4397607
Safety_Violation     1601595
Navigation_Issue      801790
Service_Complaint     798695
Fraud_Indication      400313
Name: count, dtype: int64


In [5]:
train

Unnamed: 0,Trip_ID,Timestamp,Pickup_Lat,Pickup_Long,Dropoff_Lat,Dropoff_Long,GPS_Accuracy_M,Distance_KM,Est_Price_IDR,Surge_Multiplier,Accel_X,Accel_Y,Accel_Z,Gyro_Z,Pickup_Zone,Dropoff_Zone,Device_FP,Promo_Code,Car_Model,Payment_Method,Weather,Traffic,Battery_Level,Signal_Strength,Trip_Label
0,TRIP-00383699,,-6.171481,106.890715,,106.961226,7.602804,21.586724,328518.276846,3.093531,-3.637907,0.304190,,0.235675,JKT-ZONE-06399,JKT-ZONE-03083,Apple-iPhone-7-v3.0,PCP6AQAY,Honda Brio,Cash,Rain,Lancar,14%,3G,Safety_Violation
1,TRIP-11839677,2024-05-17 00:47:56,,106.962939,-6.317811,106.936982,3.567463,2.900550,57636.166476,,0.398592,0.595000,9.918528,0.105604,JKT-ZONE-18157,JKT-ZONE-00734,Oppo-Reno-6-v4.0,NO_VOUCHER,,Credit_Card,Cloudy,Lancar,25%,3G,Perfect_Trip
2,TRIP-00401267,2024-01-05 15:27:46,-6.103016,106.716991,-6.002158,106.614401,12.665372,16.015038,141390.172639,1.726858,-0.125994,0.424621,9.867769,-0.034304,JKT-ZONE-08919,JKT-ZONE-17755,Samsung-iPhone-5-v7.0,NO_VOUCHER,Toyota Avanza,,Clear,,46%,3G,Perfect_Trip
3,TRIP-07296718,2024-03-25 10:51:57,-6.128631,,-6.025428,106.972658,24.973850,11.497623,169474.278890,3.031589,0.656659,-0.211875,9.891046,-0.076039,JKT-ZONE-02891,JKT-ZONE-16716,Vivo-Hot-10-v7.0,,Honda Brio,OVO,Clear,Padat,,Edge,Perfect_Trip
4,TRIP-10098925,2024-04-26 21:15:24,,106.930359,-5.958007,,13.835665,19.666645,102339.029906,1.055553,1.105691,0.245058,9.658218,-0.120996,JKT-ZONE-04617,JKT-ZONE-11542,Apple-Hot-8-v4.0,NO_VOUCHER,Honda Brio,Credit_Card,Storm,Lancar,5%,3G,Perfect_Trip
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7999995,TRIP-01353088,,-6.112390,107.044023,-6.226453,106.919490,21.854880,18.799213,109668.178265,1.143535,-2.939686,-0.132234,10.060896,0.676360,JKT-ZONE-03505,JKT-ZONE-02747,Oppo-iPhone-6-v6.0,V8MIQRKA,Honda Brio,Credit_Card,Cloudy,Lancar,22%,Edge,Safety_Violation
7999996,TRIP-02587105,,,106.834473,-6.170596,106.831777,3.460803,2.883191,75433.883346,3.125755,,1.179822,9.997362,-0.122138,,JKT-ZONE-07653,Apple-Reno-11-v5.0,NO_VOUCHER,Daihatsu Sigra,Gopay,Storm,Padat,,Edge,Perfect_Trip
7999997,TRIP-03695100,2024-02-12 18:24:59,-6.312129,106.750740,,,11.436401,16.256813,88926.496956,1.245230,,,,0.126295,JKT-ZONE-05303,JKT-ZONE-10856,Apple-Reno-11-v6.0,NO_VOUCHER,,Credit_Card,Cloudy,,,Edge,Service_Complaint
7999998,TRIP-05040403,2024-02-28 08:06:42,-6.239998,106.763947,-6.318290,106.835430,18.077910,0.001180,20404.680619,1.699778,-1.353521,0.398377,9.948944,0.241210,,JKT-ZONE-12772,Oppo-iPhone-6-v3.0,NO_VOUCHER,Honda Brio,,Storm,Padat,6%,3G,Fraud_Indication


In [6]:
def optimize_memory(df):
    print(f"Optimizing memory for dataframe with {len(df.columns)} columns...")
    for col in tqdm(df.columns, desc="Optimizing columns"):
        col_type = df[col].dtype
        if col_type != 'object':
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
    return df

print("Optimizing memory...")
train = optimize_memory(train)
test = optimize_memory(test)
print("Memory optimization completed.")

Optimizing memory...
Optimizing memory for dataframe with 25 columns...


Optimizing columns:   0%|          | 0/25 [00:00<?, ?it/s]

Optimizing memory for dataframe with 24 columns...


Optimizing columns:   0%|          | 0/24 [00:00<?, ?it/s]

Memory optimization completed.


## 3. Feature Engineering

In [7]:
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371
    lat1_rad = np.radians(lat1)
    lat2_rad = np.radians(lat2)
    delta_lat = np.radians(lat2 - lat1)
    delta_lon = np.radians(lon2 - lon1)
    
    a = np.sin(delta_lat/2)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(delta_lon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    
    return R * c

def engineer_features(df, is_train=True):
    print(f"Engineering features for {'train' if is_train else 'test'} set...")
    df = df.copy()
    
    if config.USE_TEMPORAL and 'Timestamp' in df.columns:
        print("  - Creating temporal features...")
        df['Timestamp_parsed'] = pd.to_datetime(df['Timestamp'], errors='coerce')
        df['Hour'] = df['Timestamp_parsed'].dt.hour
        df['DayOfWeek'] = df['Timestamp_parsed'].dt.dayofweek
        df['Month'] = df['Timestamp_parsed'].dt.month
        df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype(float)
        df['IsRushHour'] = ((df['Hour'] >= 7) & (df['Hour'] <= 9) | 
                            (df['Hour'] >= 17) & (df['Hour'] <= 19)).astype(float)
        df['IsLateNight'] = ((df['Hour'] >= 22) | (df['Hour'] <= 5)).astype(float)
        df.drop('Timestamp_parsed', axis=1, inplace=True)
    
    if config.USE_DISTANCE:
        print("  - Creating distance features...")
        if all(col in df.columns for col in ['Pickup_Lat', 'Pickup_Long', 'Dropoff_Lat', 'Dropoff_Long']):
            df['Haversine_Distance'] = haversine_distance(
                df['Pickup_Lat'], df['Pickup_Long'],
                df['Dropoff_Lat'], df['Dropoff_Long']
            )
            
            if 'Distance_KM' in df.columns:
                df['Distance_Ratio'] = df['Distance_KM'] / (df['Haversine_Distance'] + 1e-6)
                df['Distance_Difference'] = np.abs(df['Distance_KM'] - df['Haversine_Distance'])
        
        if 'Pickup_Zone' in df.columns and 'Dropoff_Zone' in df.columns:
            df['Is_Same_Zone'] = (df['Pickup_Zone'] == df['Dropoff_Zone']).astype(np.int8)
    
    if config.USE_SENSOR_AGG:
        print("  - Creating sensor aggregation features...")
        if all(col in df.columns for col in ['Accel_X', 'Accel_Y', 'Accel_Z']):
            df['Accel_Magnitude'] = np.sqrt(df['Accel_X']**2 + df['Accel_Y']**2 + df['Accel_Z']**2)
            df['Accel_Max'] = df[['Accel_X', 'Accel_Y', 'Accel_Z']].max(axis=1)
            df['Accel_Min'] = df[['Accel_X', 'Accel_Y', 'Accel_Z']].min(axis=1)
            df['Accel_Std'] = df[['Accel_X', 'Accel_Y', 'Accel_Z']].std(axis=1)
            df['Accel_Range'] = df['Accel_Max'] - df['Accel_Min']
        
        if 'Gyro_Z' in df.columns:
            df['Gyro_Abs'] = np.abs(df['Gyro_Z'])
    
    if config.USE_ECONOMIC:
        print("  - Creating economic features...")
        if 'Est_Price_IDR' in df.columns and 'Distance_KM' in df.columns:
            df['Price_per_KM'] = df['Est_Price_IDR'] / (df['Distance_KM'] + 1e-6)
        
        if 'Promo_Code' in df.columns:
            df['Has_Promo'] = (df['Promo_Code'].notna()).astype(np.int8)
        
        if 'Surge_Multiplier' in df.columns:
            surge_filled = df['Surge_Multiplier'].fillna(1.0)
            df['Surge_Category'] = pd.cut(surge_filled, 
                                          bins=[0, 1, 1.5, 2, 10], 
                                          labels=[0, 1, 2, 3]).astype(np.int8)
    
    if config.USE_INTERACTION:
        print("  - Creating interaction features...")
        if 'Surge_Multiplier' in df.columns and 'Hour' in df.columns:
            df['Surge_Hour_Interaction'] = df['Surge_Multiplier'] * df['Hour']
        
        if 'Distance_KM' in df.columns and 'Traffic' in df.columns:
            traffic_map = {'Light': 1, 'Moderate': 2, 'Heavy': 3}
            df['Traffic_Numeric'] = df['Traffic'].map(traffic_map).fillna(0).astype(np.int8)
            df['Distance_Traffic'] = df['Distance_KM'] * df['Traffic_Numeric']
    
    print(f"Feature engineering completed. Shape: {df.shape}")
    return df

train = engineer_features(train, is_train=True)
test = engineer_features(test, is_train=False)
gc.collect()

Engineering features for train set...
Feature engineering completed. Shape: (8000000, 25)
Engineering features for test set...
Feature engineering completed. Shape: (4000000, 24)


0

## 4. Preprocessing

In [8]:
def encode_categorical_frequency(train, test, categorical_cols):
    """
    Encodes based on value frequency in training data
    """
    print("\n" + "="*80)
    print("FREQUENCY ENCODING")
    print("="*80)
    
    encoders = {}
    
    for col in tqdm(categorical_cols, desc="Frequency encoding"):
        freq_map = train[col].value_counts(dropna=False).to_dict()
        
        train[col] = train[col].map(freq_map).fillna(0).astype(np.int32)
        test[col] = test[col].map(freq_map).fillna(0).astype(np.int32)
        
        encoders[col] = {
            'type': 'frequency',
            'unique_values': len(freq_map),
            'unseen_in_test': (test[col] == 0).sum()
        }
    
    print(f"\nEncoded {len(categorical_cols)} categorical features")
    return train, test, encoders

In [9]:
def encode_categorical_target(train, test, categorical_cols, y_train, smoothing=10):
    """
    Encodes based on target mean, with smoothing to prevent overfitting
    """
    print("\n" + "="*80)
    print("TARGET ENCODING")
    print("="*80)
    
    encoders = {}
    global_mean = y_train.mean()
    
    for col in tqdm(categorical_cols, desc="Target encoding"):
        temp_df = pd.DataFrame({col: train[col], 'target': y_train})
        
        agg = temp_df.groupby(col)['target'].agg(['mean', 'count'])
        smoothed_mean = (agg['mean'] * agg['count'] + global_mean * smoothing) / (agg['count'] + smoothing)
        
        encoding_map = smoothed_mean.to_dict()
        
        train[col] = train[col].map(encoding_map).fillna(global_mean).astype(np.float32)
        test[col] = test[col].map(encoding_map).fillna(global_mean).astype(np.float32)
        
        encoders[col] = {
            'type': 'target',
            'unique_values': len(encoding_map),
            'global_mean': global_mean
        }
    
    print(f"\nEncoded {len(categorical_cols)} categorical features")
    return train, test, encoders

In [10]:
def encode_categorical_label_optimized(train, test, categorical_cols):
    """
    Uses map() for vectorized operations
    """
    print("\n" + "="*80)
    print("LABEL ENCODING ")
    print("="*80)
    
    encoders = {}
    
    for col in tqdm(categorical_cols, desc="Label encoding"):
        train[col] = train[col].astype(str)
        test[col] = test[col].astype(str)
        
        le = LabelEncoder()
        train[col] = le.fit_transform(train[col])
        
        mapping = {label: idx for idx, label in enumerate(le.classes_)}
        test[col] = test[col].map(mapping).fillna(-1).astype(np.int32)
        
        encoders[col] = {
            'type': 'label',
            'encoder': le,
            'unique_values': len(le.classes_),
            'unseen_in_test': (test[col] == -1).sum()
        }
    
    print(f"\nEncoded {len(categorical_cols)} categorical features")
    return train, test, encoders

In [11]:
def preprocess_data(train, test, encoding_method='frequency'):
    """
    Main preprocessing pipeline
    
    Parameters:
    -----------
    train : pd.DataFrame
        Training dataset
    test : pd.DataFrame
        Test dataset
    encoding_method : str
        Encoding method to use: 'frequency', 'target', or 'label'
    Returns:
    --------
    X_train, X_test, y_train, le_target, encoders
    """
    print("\n" + "="*80)
    print("DATA PREPROCESSING PIPELINE")
    print("="*80)
    print(f"Encoding method: {encoding_method.upper()}")
    
    cols_to_drop = [config.ID_COL, 'Timestamp']
    if config.TARGET_COL in train.columns:
        y = train[config.TARGET_COL].copy()
        cols_to_drop.append(config.TARGET_COL)
    else:
        y = None
    
    cols_to_drop = [col for col in cols_to_drop if col in train.columns]
    X_train = train.drop(cols_to_drop, axis=1).copy()
    X_test = test.drop([col for col in cols_to_drop if col in test.columns], axis=1).copy()
    
    print(f"\nInitial shapes:")
    print(f"  X_train: {X_train.shape}")
    print(f"  X_test: {X_test.shape}")
    
    print(f"\nMissing values before imputation:")
    train_missing = X_train.isnull().sum()
    if train_missing.sum() > 0:
        print(train_missing[train_missing > 0])
    else:
        print("  No missing values found")
    
    numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
    
    print(f"\nFeature types detected:")
    print(f"  Numeric features: {len(numeric_cols)}")
    print(f"  Categorical features: {len(categorical_cols)}")
    
    print("\n" + "="*80)
    print("STEP 1: Missing Value Imputation")
    print("="*80)
    
    for col in tqdm(numeric_cols, desc="Imputing numeric features"):
        if X_train[col].isnull().sum() > 0:
            median_val = X_train[col].median()
            X_train[col].fillna(median_val, inplace=True)
            X_test[col].fillna(median_val, inplace=True)
    
    for col in tqdm(categorical_cols, desc="Imputing categorical features"):
        if X_train[col].isnull().sum() > 0:
            X_train[col].fillna('Unknown', inplace=True)
            X_test[col].fillna('Unknown', inplace=True)
    
    print("\n" + "="*80)
    print("STEP 2: Outlier Clipping")
    print("="*80)
    
    for col in tqdm(numeric_cols, desc="Clipping outliers"):
        q99 = X_train[col].quantile(0.99)
        q01 = X_train[col].quantile(0.01)
        X_train[col] = X_train[col].clip(q01, q99)
        X_test[col] = X_test[col].clip(q01, q99)
    
    print("\n" + "="*80)
    print("STEP 3: Categorical Encoding")
    print("="*80)
    
    if len(categorical_cols) > 0:
        if encoding_method == 'frequency':
            X_train, X_test, encoders = encode_categorical_frequency(
                X_train, X_test, categorical_cols
            )
        elif encoding_method == 'target':
            if y is None:
                raise ValueError("Target encoding requires target variable")
            le_target_temp = LabelEncoder()
            y_temp = le_target_temp.fit_transform(y)
            X_train, X_test, encoders = encode_categorical_target(
                X_train, X_test, categorical_cols, y_temp
            )
        elif encoding_method == 'label':
            X_train, X_test, encoders = encode_categorical_label_optimized(
                X_train, X_test, categorical_cols
            )
        else:
            raise ValueError(f"Unknown encoding method: {encoding_method}")
    else:
        encoders = {}
        print("No categorical features to encode")
    
    print("\n" + "="*80)
    print("STEP 4: Target Encoding")
    print("="*80)
    
    if y is not None:
        le_target = LabelEncoder()
        y_encoded = le_target.fit_transform(y)
        print(f"\nTarget classes encoded:")
        for i, label in enumerate(le_target.classes_):
            count = (y_encoded == i).sum()
            print(f"  {i}: {label:20s} - {count:,} samples ({count/len(y_encoded)*100:.2f}%)")
    else:
        y_encoded = None
        le_target = None
    
    print("\n" + "="*80)
    print("PREPROCESSING COMPLETED")
    print("="*80)
    print(f"Final shapes:")
    print(f"  X_train: {X_train.shape}")
    print(f"  X_test: {X_test.shape}")
    if y_encoded is not None:
        print(f"  y_train: {y_encoded.shape}")
    print("="*80)
    
    return X_train, X_test, y_encoded, le_target, encoders

X_train, X_test, y_train, le_target, encoders = preprocess_data(train, test, encoding_method='target')
gc.collect()


DATA PREPROCESSING PIPELINE
Encoding method: TARGET

Initial shapes:
  X_train: (8000000, 22)
  X_test: (4000000, 22)

Missing values before imputation:
Pickup_Lat           929348
Pickup_Long          611083
Dropoff_Lat         1914440
Dropoff_Long        1556029
GPS_Accuracy_M      1504090
Distance_KM          941317
Est_Price_IDR       1151401
Surge_Multiplier     612307
Accel_X             1608442
Accel_Y             1081337
Accel_Z             1840229
Gyro_Z               701714
Pickup_Zone         1381274
Dropoff_Zone         765445
Device_FP           1499497
Promo_Code          1107810
Car_Model           1721777
Payment_Method      1141190
Weather              514649
Traffic             1808403
Battery_Level       1746399
Signal_Strength      408588
dtype: int64

Feature types detected:
  Numeric features: 12
  Categorical features: 10

STEP 1: Missing Value Imputation


Imputing numeric features:   0%|          | 0/12 [00:00<?, ?it/s]

Imputing categorical features:   0%|          | 0/10 [00:00<?, ?it/s]


STEP 2: Outlier Clipping


Clipping outliers:   0%|          | 0/12 [00:00<?, ?it/s]


STEP 3: Categorical Encoding

TARGET ENCODING


Target encoding:   0%|          | 0/10 [00:00<?, ?it/s]


Encoded 10 categorical features

STEP 4: Target Encoding

Target classes encoded:
  0: Fraud_Indication     - 400,313 samples (5.00%)
  1: Navigation_Issue     - 801,790 samples (10.02%)
  2: Perfect_Trip         - 4,397,607 samples (54.97%)
  3: Safety_Violation     - 1,601,595 samples (20.02%)
  4: Service_Complaint    - 798,695 samples (9.98%)

PREPROCESSING COMPLETED
Final shapes:
  X_train: (8000000, 22)
  X_test: (4000000, 22)
  y_train: (8000000,)


88

## 5. Model Training

In [12]:
def macro_f1_eval(preds, dtrain):
    """
    Custom evaluation function for XGBoost to calculate Macro F1
    """
    labels = dtrain.get_label()
    preds_reshaped = preds.reshape(len(labels), -1)
    pred_labels = np.argmax(preds_reshaped, axis=1)
    # Calculate macro F1
    score = f1_score(labels, pred_labels, average='macro')
    return 'macro_f1', score

In [13]:
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    print("XGBoost not available. Install with: pip install xgboost")
    XGBOOST_AVAILABLE = False

def train_xgboost(X_train, y_train, X_test, n_folds=5, use_gpu=False):
    if not XGBOOST_AVAILABLE:
        return None, None, None
    
    print("\n" + "="*80)
    print("Training XGBoost Models")
    print("="*80)
    
    params = config.get_xgboost_params(use_gpu=use_gpu)
    params['num_class'] = len(np.unique(y_train))
    
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=config.RANDOM_STATE)
    
    oof_predictions = np.zeros((len(X_train), len(np.unique(y_train))))
    test_predictions = np.zeros((len(X_test), len(np.unique(y_train))))
    
    fold_scores = []
    models = []
    
    pbar = tqdm(enumerate(skf.split(X_train, y_train), 1), total=n_folds, desc="XGBoost Folds")
    for fold, (train_idx, val_idx) in pbar:
        pbar.set_description(f"XGBoost Fold {fold}/{n_folds}")
        
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]
        
        dtrain = xgb.DMatrix(X_tr, label=y_tr)
        dval = xgb.DMatrix(X_val, label=y_val)
        dtest = xgb.DMatrix(X_test)
        
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=1000,
            evals=[(dtrain, 'train'), (dval, 'valid')],
            custom_metric=macro_f1_eval,
            early_stopping_rounds=50,
            verbose_eval=100
        )
        
        oof_predictions[val_idx] = model.predict(dval)
        test_predictions += model.predict(dtest) / n_folds
        
        oof_pred_labels = np.argmax(oof_predictions[val_idx], axis=1)
        fold_score = f1_score(y_val, oof_pred_labels, average='macro')
        fold_scores.append(fold_score)

        print(f"\n{'='*60}")
        print(f"FOLD {fold} SUMMARY:")
        print(f"{'='*60}")
        print(f"  Validation F1 (Macro): {fold_score:.6f}")
        print(f"  Best Iteration: {model.best_iteration}")
        print(f"  Best Score (mlogloss): {model.best_score:.6f}")
        
        # Detailed classification report
        oof_fold_pred = np.argmax(oof_predictions[val_idx], axis=1)
        print(f"\n  Per-Class F1 Scores:")
        from sklearn.metrics import classification_report
        print(classification_report(y_val, oof_fold_pred, 
                                   target_names=[f"Class_{i}" for i in range(len(np.unique(y_train)))],
                                   digits=4))
        print(f"{'='*60}\n")
        
        pbar.set_postfix({'F1': f'{fold_score:.6f}'})
        
        models.append(model)
        gc.collect()
    
    oof_pred_labels = np.argmax(oof_predictions, axis=1)
    overall_score = f1_score(y_train, oof_pred_labels, average='macro')
    
        
    print("\n" + "="*80)
    print("XGBOOST TRAINING SUMMARY")
    print("="*80)
    print(f"Overall CV Score (Macro F1): {overall_score:.6f}")
    print(f"Standard Deviation: {np.std(fold_scores):.6f}")
    print(f"Min F1 Score: {np.min(fold_scores):.6f}")
    print(f"Max F1 Score: {np.max(fold_scores):.6f}")
    print(f"\nFold-by-Fold Scores:")
    for i, score in enumerate(fold_scores, 1):
        print(f"  Fold {i}: {score:.6f}")
    
    # Overall classification report pada OOF predictions
    print(f"\n" + "="*80)
    print("OVERALL OUT-OF-FOLD PREDICTIONS REPORT")
    print("="*80)
    print(classification_report(y_train, oof_pred_labels, 
                               target_names=[f"Class_{i}" for i in range(len(np.unique(y_train)))],
                               digits=4))
    print("="*80)

    return test_predictions, models, overall_score


if XGBOOST_AVAILABLE:
    xgboost_test_pred, xgboost_models, xgboost_cv_score = train_xgboost(
        X_train, y_train, X_test,
        n_folds=config.N_FOLDS,
        use_gpu=gpu_config['xgboost_gpu']
    )
else:
    xgboost_test_pred, xgboost_models, xgboost_cv_score = None, None, 0.0


Training XGBoost Models
  XGBoost: GPU mode activated


XGBoost Folds:   0%|          | 0/5 [00:00<?, ?it/s]

[0]	train-mlogloss:1.40859	train-macro_f1:0.14189	valid-mlogloss:1.40861	valid-macro_f1:0.14189
[49]	train-mlogloss:0.69303	train-macro_f1:0.57215	valid-mlogloss:0.69332	valid-macro_f1:0.57213

FOLD 1 SUMMARY:
  Validation F1 (Macro): 0.572468
  Best Iteration: 0
  Best Score (mlogloss): 0.141886

  Per-Class F1 Scores:
              precision    recall  f1-score   support

     Class_0     0.9950    0.9657    0.9801     80062
     Class_1     1.0000    0.0039    0.0078    160358
     Class_2     0.7245    0.9964    0.8390    879522
     Class_3     0.9974    0.9470    0.9716    320319
     Class_4     0.6775    0.0335    0.0638    159739

    accuracy                         0.7894   1600000
   macro avg     0.8789    0.5893    0.5725   1600000
weighted avg     0.8156    0.7894    0.7119   1600000


[0]	train-mlogloss:1.40857	train-macro_f1:0.14189	valid-mlogloss:1.40859	valid-macro_f1:0.14189
[50]	train-mlogloss:0.68972	train-macro_f1:0.57195	valid-mlogloss:0.68968	valid-macro_f1:0.5

### 5.1 Experiment Suite

In [14]:
EXPERIMENTS = [
    {'name': 'freq_baseline', 'encoding': 'frequency', 'temporal': False, 'distance': False, 'sensor': False, 'economic': False, 'interaction': False},
    {'name': 'freq_distance', 'encoding': 'frequency', 'temporal': False, 'distance': True, 'sensor': False, 'economic': False, 'interaction': False},
    {'name': 'freq_sensor', 'encoding': 'frequency', 'temporal': False, 'distance': False, 'sensor': True, 'economic': False, 'interaction': False},
    {'name': 'freq_temporal', 'encoding': 'frequency', 'temporal': True, 'distance': False, 'sensor': False, 'economic': False, 'interaction': False},
    {'name': 'freq_dist_sens', 'encoding': 'frequency', 'temporal': False, 'distance': True, 'sensor': True, 'economic': False, 'interaction': False},
    {'name': 'freq_all', 'encoding': 'frequency', 'temporal': True, 'distance': True, 'sensor': True, 'economic': True, 'interaction': True},
    
    {'name': 'target_baseline', 'encoding': 'target', 'temporal': False, 'distance': False, 'sensor': False, 'economic': False, 'interaction': False},
    {'name': 'target_all', 'encoding': 'target', 'temporal': True, 'distance': True, 'sensor': True, 'economic': True, 'interaction': True},
    
    {'name': 'label_baseline', 'encoding': 'label', 'temporal': False, 'distance': False, 'sensor': False, 'economic': False, 'interaction': False},
    {'name': 'label_all', 'encoding': 'label', 'temporal': True, 'distance': True, 'sensor': True, 'economic': True, 'interaction': True},
]

def run_experiments(experiments, train_data, test_data, n_folds=5):
    original_state = {
        'USE_TEMPORAL': config.USE_TEMPORAL,
        'USE_DISTANCE': config.USE_DISTANCE,
        'USE_SENSOR_AGG': config.USE_SENSOR_AGG,
        'USE_ECONOMIC': config.USE_ECONOMIC,
        'USE_INTERACTION': config.USE_INTERACTION,
        'N_FOLDS': config.N_FOLDS
    }
    
    config.N_FOLDS = n_folds
    results = []
    
    for exp in experiments:
        print(f"\n{'='*80}")
        print(f"Running: {exp['name']}")
        print(f"{'='*80}")
        
        config.USE_TEMPORAL = exp['temporal']
        config.USE_DISTANCE = exp['distance']
        config.USE_SENSOR_AGG = exp['sensor']
        config.USE_ECONOMIC = exp['economic']
        config.USE_INTERACTION = exp['interaction']
        
        try:
            train_fe = engineer_features(train_data.copy(), is_train=True)
            test_fe = engineer_features(test_data.copy(), is_train=False)
            X_tr, X_te, y_tr, le_tgt, enc = preprocess_data(train_fe, test_fe, encoding_method=exp['encoding'])
            _, _, cv_score = train_xgboost(X_tr, y_tr, X_te, n_folds=config.N_FOLDS, use_gpu=gpu_config['xgboost_gpu'])
            
            results.append({'name': exp['name'], 'encoding': exp['encoding'], 'temporal': exp['temporal'], 
                          'distance': exp['distance'], 'sensor': exp['sensor'], 'economic': exp['economic'],
                          'interaction': exp['interaction'], 'macro_f1': cv_score})
            
            del X_tr, X_te, y_tr, train_fe, test_fe
            gc.collect()
            
        except Exception as e:
            results.append({'name': exp['name'], 'encoding': exp['encoding'], 'temporal': exp['temporal'],
                          'distance': exp['distance'], 'sensor': exp['sensor'], 'economic': exp['economic'],
                          'interaction': exp['interaction'], 'macro_f1': 0.0})
            print(f"Error: {str(e)}")
    
    config.USE_TEMPORAL = original_state['USE_TEMPORAL']
    config.USE_DISTANCE = original_state['USE_DISTANCE']
    config.USE_SENSOR_AGG = original_state['USE_SENSOR_AGG']
    config.USE_ECONOMIC = original_state['USE_ECONOMIC']
    config.USE_INTERACTION = original_state['USE_INTERACTION']
    config.N_FOLDS = original_state['N_FOLDS']
    
    results_df = pd.DataFrame(results).sort_values('macro_f1', ascending=False).reset_index(drop=True)
    
    print(f"\n{'='*80}")
    print("RESULTS")
    print(f"{'='*80}\n")
    print(results_df.to_string(index=False))
    
    print(f"\n{'='*80}")
    print(f"Best: {results_df.iloc[0]['name']} with F1={results_df.iloc[0]['macro_f1']:.6f}")
    print(f"{'='*80}")
    
    results_df.to_csv('experiment_results.csv', index=False)
    return results_df

# experiment_results = run_experiments(EXPERIMENTS, train, test, n_folds=5)

## 6. Model Evaluation and Inference

In [15]:
print("\n" + "="*80)
print("MODEL EVALUATION")
print("="*80)

if xgboost_test_pred is not None:
    print(f"\n✓ XGBoost Model Successfully Trained")
    print(f"  Cross-Validation Score: {xgboost_cv_score:.6f}")
    print(f"  Model Type: XGBoost with GPU acceleration")
    
    print("\n" + "="*80)
    print("GENERATING PREDICTIONS ON TEST SET")
    print("="*80)
    
    final_predictions = xgboost_test_pred
    final_pred_labels = np.argmax(final_predictions, axis=1)
    
    print(f"\n✓ Predictions Generated Successfully")
    print(f"  Total test samples: {len(final_pred_labels):,}")
    print(f"  Prediction shape: {final_predictions.shape}")
    print(f"  Classes predicted: {len(np.unique(final_pred_labels))}")
    
    print("\n" + "="*80)
    print("PREDICTION DISTRIBUTION")
    print("="*80)
    unique, counts = np.unique(final_pred_labels, return_counts=True)
    for class_idx, count in zip(unique, counts):
        percentage = (count / len(final_pred_labels)) * 100
        print(f"  Class {class_idx}: {count:,} samples ({percentage:.2f}%)")
    
else:
    raise ValueError("XGBoost model training failed! Cannot generate predictions.")

print("\n" + "="*80)


MODEL EVALUATION

✓ XGBoost Model Successfully Trained
  Cross-Validation Score: 0.572712
  Model Type: XGBoost with GPU acceleration

GENERATING PREDICTIONS ON TEST SET

✓ Predictions Generated Successfully
  Total test samples: 4,000,000
  Prediction shape: (4000000, 5)
  Classes predicted: 5

PREDICTION DISTRIBUTION
  Class 0: 196,754 samples (4.92%)
  Class 1: 5,461 samples (0.14%)
  Class 2: 2,986,758 samples (74.67%)
  Class 3: 787,533 samples (19.69%)
  Class 4: 23,494 samples (0.59%)



## 7. Generate Submission

In [16]:
def create_submission(test_ids, predictions, le_target, filename='submission.csv'):
    pred_labels = le_target.inverse_transform(predictions)
    
    submission = pd.DataFrame({
        config.ID_COL: test_ids,
        config.TARGET_COL: pred_labels
    })
    
    submission.to_csv(filename, index=False)
    
    print(f"\nSubmission saved to: {filename}")
    print(f"Submission shape: {submission.shape}")
    print(f"\nPrediction distribution:")
    print(submission[config.TARGET_COL].value_counts())
    
    return submission

test_ids = test[config.ID_COL].values
submission = create_submission(test_ids, final_pred_labels, le_target, 'submission.csv')
submission.head(10)


Submission saved to: submission.csv
Submission shape: (4000000, 2)

Prediction distribution:
Trip_Label
Perfect_Trip         2986758
Safety_Violation      787533
Fraud_Indication      196754
Service_Complaint      23494
Navigation_Issue        5461
Name: count, dtype: int64


Unnamed: 0,Trip_ID,Trip_Label
0,TRIP-06583736,Perfect_Trip
1,TRIP-11356251,Perfect_Trip
2,TRIP-03320505,Perfect_Trip
3,TRIP-07188814,Perfect_Trip
4,TRIP-06994869,Perfect_Trip
5,TRIP-03232331,Perfect_Trip
6,TRIP-03536120,Perfect_Trip
7,TRIP-06411895,Perfect_Trip
8,TRIP-00132176,Perfect_Trip
9,TRIP-00298208,Perfect_Trip


## 8. Validation & Analysis

In [17]:
print("\n" + "="*80)
print("Final Validation Checks")
print("="*80)

assert submission.shape[0] == test.shape[0], "Submission size mismatch!"
assert submission.columns.tolist() == [config.ID_COL, config.TARGET_COL], "Column names mismatch!"
assert submission[config.TARGET_COL].isnull().sum() == 0, "Null predictions found!"

expected_labels = set(le_target.classes_)
submission_labels = set(submission[config.TARGET_COL].unique())
assert submission_labels.issubset(expected_labels), "Invalid labels in submission!"
print("All validation checks passed!")
print("="*80)


Final Validation Checks
All validation checks passed!
