## 1. Configuration & Setup

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import gc
from tqdm.auto import tqdm
tqdm.pandas()

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

print("Libraries imported successfully.")

Libraries imported successfully.


In [2]:
USE_GPU = True  

gpu_config = {
    'xgboost_gpu': USE_GPU
}

In [3]:
class Config:
    DATA_PATH = '/kaggle/input/ride-hailing-trip-classification-dataset/'
    
    N_FOLDS = 5
    RANDOM_STATE = 42
    TARGET_COL = 'Trip_Label'
    ID_COL = 'Trip_ID'
    
    USE_TEMPORAL = False
    USE_DISTANCE = True
    USE_SENSOR_AGG = True
    USE_ECONOMIC = False
    USE_INTERACTION = True
    
    @staticmethod
    def get_catboost_params(use_gpu=False):
        params = {
            'iterations': 1000,
            'learning_rate': 0.05,
            'depth': 6,
            'loss_function': 'MultiClass',
            'eval_metric': 'TotalF1:average=Macro',
            'auto_class_weights': 'Balanced',
            'random_seed': 42,
            'verbose': 100,
            'early_stopping_rounds': 50
        }
        if use_gpu:
            params['task_type'] = 'GPU'
            params['devices'] = '0'
            print("  CatBoost: GPU mode activated")
        else:
            params['task_type'] = 'CPU'
            print("  CatBoost: CPU mode")
        return params
    
    @staticmethod
    def get_lightgbm_params(use_gpu=False):
        params = {
            'objective': 'multiclass',
            'metric': 'multi_logloss',
            'boosting_type': 'gbdt',
            'num_leaves': 31,
            'learning_rate': 0.05,
            'feature_fraction': 0.8,
            'bagging_fraction': 0.8,
            'bagging_freq': 5,
            'random_state': 42,
            'verbose': -1,
            'min_data_in_leaf': 100,
            'min_sum_hessian_in_leaf': 1e-2
        }
        if use_gpu:
            params['device'] = 'gpu'
            print("  LightGBM: GPU mode activated")
        else:
            params['device'] = 'cpu'
            print("  LightGBM: CPU mode")
        return params
    
    @staticmethod
    def get_xgboost_params(use_gpu=False):
        params = {
            'objective': 'multi:softprob',
            'eval_metric': 'mlogloss',
            'max_depth': 6,
            'learning_rate': 0.05,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': 42,
            'verbosity': 1
        }
        if use_gpu:
            params['device'] = 'cuda'
            print("  XGBoost: GPU mode activated")
        else:
            params['device'] = 'cpu'
            print("  XGBoost: CPU mode")
        return params

config = Config()
print("\nConfiguration loaded successfully.")


Configuration loaded successfully.


## 2. Data Loading & Validation

In [4]:
def load_data():
    print("Loading data...")
    files = ['train.csv', 'test.csv', 'sample_submission.csv']
    data = {}
    
    for file in tqdm(files, desc="Loading files"):
        data[file.replace('.csv', '')] = pd.read_csv(config.DATA_PATH + file)
    
    train = data['train']
    test = data['test']
    sample_submission = data['sample_submission']
    
    print(f"Train shape: {train.shape}")
    print(f"Test shape: {test.shape}")
    print(f"Sample submission shape: {sample_submission.shape}")
    
    if config.TARGET_COL in train.columns:
        print(f"\nTarget distribution:")
        print(train[config.TARGET_COL].value_counts())
    
    return train, test, sample_submission

train, test, sample_submission = load_data()

Loading data...


Loading files:   0%|          | 0/3 [00:00<?, ?it/s]

Train shape: (8000000, 25)
Test shape: (4000000, 24)
Sample submission shape: (4000000, 2)

Target distribution:
Trip_Label
Perfect_Trip         4397607
Safety_Violation     1601595
Navigation_Issue      801790
Service_Complaint     798695
Fraud_Indication      400313
Name: count, dtype: int64


In [5]:
def optimize_memory(df):
    print(f"Optimizing memory for dataframe with {len(df.columns)} columns...")
    for col in tqdm(df.columns, desc="Optimizing columns"):
        col_type = df[col].dtype
        if col_type != 'object':
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
    return df

print("Optimizing memory...")
train = optimize_memory(train)
test = optimize_memory(test)
print("Memory optimization completed.")

Optimizing memory...
Optimizing memory for dataframe with 25 columns...


Optimizing columns:   0%|          | 0/25 [00:00<?, ?it/s]

Optimizing memory for dataframe with 24 columns...


Optimizing columns:   0%|          | 0/24 [00:00<?, ?it/s]

Memory optimization completed.


## 3. Feature Engineering

In [6]:
def haversine_distance(lat1, lon1, lat2, lon2):
    R = 6371
    lat1_rad = np.radians(lat1)
    lat2_rad = np.radians(lat2)
    delta_lat = np.radians(lat2 - lat1)
    delta_lon = np.radians(lon2 - lon1)
    
    a = np.sin(delta_lat/2)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(delta_lon/2)**2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    
    return R * c

def engineer_features(df, is_train=True):
    print(f"Engineering features for {'train' if is_train else 'test'} set...")
    df = df.copy()
    
    if config.USE_TEMPORAL and 'Timestamp' in df.columns:
        print("  - Creating temporal features...")
        df['Timestamp_parsed'] = pd.to_datetime(df['Timestamp'])
        df['Hour'] = df['Timestamp_parsed'].dt.hour.astype(np.int8)
        df['DayOfWeek'] = df['Timestamp_parsed'].dt.dayofweek.astype(np.int8)
        df['Month'] = df['Timestamp_parsed'].dt.month.astype(np.int8)
        df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype(np.int8)
        df['IsRushHour'] = ((df['Hour'] >= 7) & (df['Hour'] <= 9) | 
                            (df['Hour'] >= 17) & (df['Hour'] <= 19)).astype(np.int8)
        df['IsLateNight'] = ((df['Hour'] >= 22) | (df['Hour'] <= 5)).astype(np.int8)
        df.drop('Timestamp_parsed', axis=1, inplace=True)
    
    if config.USE_DISTANCE:
        print("  - Creating distance features...")
        if all(col in df.columns for col in ['Pickup_Lat', 'Pickup_Long', 'Dropoff_Lat', 'Dropoff_Long']):
            df['Haversine_Distance'] = haversine_distance(
                df['Pickup_Lat'], df['Pickup_Long'],
                df['Dropoff_Lat'], df['Dropoff_Long']
            )
            
            if 'Distance_KM' in df.columns:
                df['Distance_Ratio'] = df['Distance_KM'] / (df['Haversine_Distance'] + 1e-6)
                df['Distance_Difference'] = np.abs(df['Distance_KM'] - df['Haversine_Distance'])
        
        if 'Pickup_Zone' in df.columns and 'Dropoff_Zone' in df.columns:
            df['Is_Same_Zone'] = (df['Pickup_Zone'] == df['Dropoff_Zone']).astype(np.int8)
    
    if config.USE_SENSOR_AGG:
        print("  - Creating sensor aggregation features...")
        if all(col in df.columns for col in ['Accel_X', 'Accel_Y', 'Accel_Z']):
            df['Accel_Magnitude'] = np.sqrt(df['Accel_X']**2 + df['Accel_Y']**2 + df['Accel_Z']**2)
            df['Accel_Max'] = df[['Accel_X', 'Accel_Y', 'Accel_Z']].max(axis=1)
            df['Accel_Min'] = df[['Accel_X', 'Accel_Y', 'Accel_Z']].min(axis=1)
            df['Accel_Std'] = df[['Accel_X', 'Accel_Y', 'Accel_Z']].std(axis=1)
            df['Accel_Range'] = df['Accel_Max'] - df['Accel_Min']
        
        if 'Gyro_Z' in df.columns:
            df['Gyro_Abs'] = np.abs(df['Gyro_Z'])
    
    if config.USE_ECONOMIC:
        print("  - Creating economic features...")
        if 'Est_Price_IDR' in df.columns and 'Distance_KM' in df.columns:
            df['Price_per_KM'] = df['Est_Price_IDR'] / (df['Distance_KM'] + 1e-6)
        
        if 'Promo_Code' in df.columns:
            df['Has_Promo'] = (df['Promo_Code'].notna()).astype(np.int8)
        
        if 'Surge_Multiplier' in df.columns:
            df['Surge_Category'] = pd.cut(df['Surge_Multiplier'], 
                                          bins=[0, 1, 1.5, 2, 10], 
                                          labels=[0, 1, 2, 3]).astype(np.int8)
    
    if config.USE_INTERACTION:
        print("  - Creating interaction features...")
        if 'Surge_Multiplier' in df.columns and 'Hour' in df.columns:
            df['Surge_Hour_Interaction'] = df['Surge_Multiplier'] * df['Hour']
        
        if 'Distance_KM' in df.columns and 'Traffic' in df.columns:
            traffic_map = {'Light': 1, 'Moderate': 2, 'Heavy': 3}
            df['Traffic_Numeric'] = df['Traffic'].map(traffic_map).fillna(0).astype(np.int8)
            df['Distance_Traffic'] = df['Distance_KM'] * df['Traffic_Numeric']
    
    print(f"Feature engineering completed. Shape: {df.shape}")
    return df

train = engineer_features(train, is_train=True)
test = engineer_features(test, is_train=False)
gc.collect()

Engineering features for train set...
  - Creating distance features...
  - Creating sensor aggregation features...
  - Creating interaction features...
Feature engineering completed. Shape: (8000000, 37)
Engineering features for test set...
  - Creating distance features...
  - Creating sensor aggregation features...
  - Creating interaction features...
Feature engineering completed. Shape: (4000000, 36)


72

## 4. Preprocessing

In [7]:
def encode_categorical_frequency(train, test, categorical_cols):
    """
    Encodes based on value frequency in training data
    """
    print("\n" + "="*80)
    print("FREQUENCY ENCODING")
    print("="*80)
    
    encoders = {}
    
    for col in tqdm(categorical_cols, desc="Frequency encoding"):
        freq_map = train[col].value_counts(dropna=False).to_dict()
        
        train[col] = train[col].map(freq_map).fillna(0).astype(np.int32)
        test[col] = test[col].map(freq_map).fillna(0).astype(np.int32)
        
        encoders[col] = {
            'type': 'frequency',
            'unique_values': len(freq_map),
            'unseen_in_test': (test[col] == 0).sum()
        }
    
    print(f"\nEncoded {len(categorical_cols)} categorical features")
    return train, test, encoders

In [8]:
def encode_categorical_target(train, test, categorical_cols, y_train, smoothing=10):
    """
    Encodes based on target mean, with smoothing to prevent overfitting
    """
    print("\n" + "="*80)
    print("TARGET ENCODING")
    print("="*80)
    
    encoders = {}
    global_mean = y_train.mean()
    
    for col in tqdm(categorical_cols, desc="Target encoding"):
        temp_df = pd.DataFrame({col: train[col], 'target': y_train})
        
        agg = temp_df.groupby(col)['target'].agg(['mean', 'count'])
        smoothed_mean = (agg['mean'] * agg['count'] + global_mean * smoothing) / (agg['count'] + smoothing)
        
        encoding_map = smoothed_mean.to_dict()
        
        train[col] = train[col].map(encoding_map).fillna(global_mean).astype(np.float32)
        test[col] = test[col].map(encoding_map).fillna(global_mean).astype(np.float32)
        
        encoders[col] = {
            'type': 'target',
            'unique_values': len(encoding_map),
            'global_mean': global_mean
        }
    
    print(f"\nEncoded {len(categorical_cols)} categorical features")
    return train, test, encoders

In [9]:
def encode_categorical_label_optimized(train, test, categorical_cols):
    """
    Uses map() for vectorized operations
    """
    print("\n" + "="*80)
    print("LABEL ENCODING ")
    print("="*80)
    
    encoders = {}
    
    for col in tqdm(categorical_cols, desc="Label encoding"):
        train[col] = train[col].astype(str)
        test[col] = test[col].astype(str)
        
        le = LabelEncoder()
        train[col] = le.fit_transform(train[col])
        
        mapping = {label: idx for idx, label in enumerate(le.classes_)}
        test[col] = test[col].map(mapping).fillna(-1).astype(np.int32)
        
        encoders[col] = {
            'type': 'label',
            'encoder': le,
            'unique_values': len(le.classes_),
            'unseen_in_test': (test[col] == -1).sum()
        }
    
    print(f"\nEncoded {len(categorical_cols)} categorical features")
    return train, test, encoders

In [10]:
def preprocess_data(train, test, encoding_method='frequency'):
    """
    Main preprocessing pipeline
    
    Parameters:
    -----------
    train : pd.DataFrame
        Training dataset
    test : pd.DataFrame
        Test dataset
    encoding_method : str
        Encoding method to use: 'frequency', 'target', or 'label'
    Returns:
    --------
    X_train, X_test, y_train, le_target, encoders
    """
    print("\n" + "="*80)
    print("DATA PREPROCESSING PIPELINE")
    print("="*80)
    print(f"Encoding method: {encoding_method.upper()}")
    
    cols_to_drop = [config.ID_COL, 'Timestamp']
    if config.TARGET_COL in train.columns:
        y = train[config.TARGET_COL].copy()
        cols_to_drop.append(config.TARGET_COL)
    else:
        y = None
    
    cols_to_drop = [col for col in cols_to_drop if col in train.columns]
    X_train = train.drop(cols_to_drop, axis=1).copy()
    X_test = test.drop([col for col in cols_to_drop if col in test.columns], axis=1).copy()
    
    print(f"\nInitial shapes:")
    print(f"  X_train: {X_train.shape}")
    print(f"  X_test: {X_test.shape}")
    
    print(f"\nMissing values before imputation:")
    train_missing = X_train.isnull().sum()
    if train_missing.sum() > 0:
        print(train_missing[train_missing > 0])
    else:
        print("  No missing values found")
    
    numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
    
    print(f"\nFeature types detected:")
    print(f"  Numeric features: {len(numeric_cols)}")
    print(f"  Categorical features: {len(categorical_cols)}")
    
    print("\n" + "="*80)
    print("STEP 1: Missing Value Imputation")
    print("="*80)
    
    for col in tqdm(numeric_cols, desc="Imputing numeric features"):
        if X_train[col].isnull().sum() > 0:
            median_val = X_train[col].median()
            X_train[col].fillna(median_val, inplace=True)
            X_test[col].fillna(median_val, inplace=True)
    
    for col in tqdm(categorical_cols, desc="Imputing categorical features"):
        if X_train[col].isnull().sum() > 0:
            X_train[col].fillna('Unknown', inplace=True)
            X_test[col].fillna('Unknown', inplace=True)
    
    print("\n" + "="*80)
    print("STEP 2: Outlier Clipping")
    print("="*80)
    
    for col in tqdm(numeric_cols, desc="Clipping outliers"):
        q99 = X_train[col].quantile(0.99)
        q01 = X_train[col].quantile(0.01)
        X_train[col] = X_train[col].clip(q01, q99)
        X_test[col] = X_test[col].clip(q01, q99)
    
    print("\n" + "="*80)
    print("STEP 3: Categorical Encoding")
    print("="*80)
    
    if len(categorical_cols) > 0:
        if encoding_method == 'frequency':
            X_train, X_test, encoders = encode_categorical_frequency(
                X_train, X_test, categorical_cols
            )
        elif encoding_method == 'target':
            if y is None:
                raise ValueError("Target encoding requires target variable")
            le_target_temp = LabelEncoder()
            y_temp = le_target_temp.fit_transform(y)
            X_train, X_test, encoders = encode_categorical_target(
                X_train, X_test, categorical_cols, y_temp
            )
        elif encoding_method == 'label':
            X_train, X_test, encoders = encode_categorical_label_optimized(
                X_train, X_test, categorical_cols
            )
        else:
            raise ValueError(f"Unknown encoding method: {encoding_method}")
    else:
        encoders = {}
        print("No categorical features to encode")
    
    print("\n" + "="*80)
    print("STEP 4: Target Encoding")
    print("="*80)
    
    if y is not None:
        le_target = LabelEncoder()
        y_encoded = le_target.fit_transform(y)
        print(f"\nTarget classes encoded:")
        for i, label in enumerate(le_target.classes_):
            count = (y_encoded == i).sum()
            print(f"  {i}: {label:20s} - {count:,} samples ({count/len(y_encoded)*100:.2f}%)")
    else:
        y_encoded = None
        le_target = None
    
    print("\n" + "="*80)
    print("PREPROCESSING COMPLETED")
    print("="*80)
    print(f"Final shapes:")
    print(f"  X_train: {X_train.shape}")
    print(f"  X_test: {X_test.shape}")
    if y_encoded is not None:
        print(f"  y_train: {y_encoded.shape}")
    print("="*80)
    
    return X_train, X_test, y_encoded, le_target, encoders

X_train, X_test, y_train, le_target, encoders = preprocess_data(train, test, encoding_method='frequency')
gc.collect()


DATA PREPROCESSING PIPELINE
Encoding method: FREQUENCY

Initial shapes:
  X_train: (8000000, 34)
  X_test: (4000000, 34)

Missing values before imputation:
Pickup_Lat              929348
Pickup_Long             611083
Dropoff_Lat            1914440
Dropoff_Long           1556029
GPS_Accuracy_M         1504090
Distance_KM             941317
Est_Price_IDR          1151401
Surge_Multiplier        612307
Accel_X                1608442
Accel_Y                1081337
Accel_Z                1840229
Gyro_Z                  701714
Pickup_Zone            1381274
Dropoff_Zone            765445
Device_FP              1499497
Promo_Code             1107810
Car_Model              1721777
Payment_Method         1141190
Weather                 514649
Traffic                1808403
Battery_Level          1746399
Signal_Strength         408588
Haversine_Distance     3998467
Distance_Ratio         4469127
Distance_Difference    4469127
Accel_Magnitude        3744751
Accel_Max                50209
Accel_

Imputing numeric features:   0%|          | 0/24 [00:00<?, ?it/s]

Imputing categorical features:   0%|          | 0/10 [00:00<?, ?it/s]


STEP 2: Outlier Clipping


Clipping outliers:   0%|          | 0/24 [00:00<?, ?it/s]


STEP 3: Categorical Encoding

FREQUENCY ENCODING


Frequency encoding:   0%|          | 0/10 [00:00<?, ?it/s]


Encoded 10 categorical features

STEP 4: Target Encoding

Target classes encoded:
  0: Fraud_Indication     - 400,313 samples (5.00%)
  1: Navigation_Issue     - 801,790 samples (10.02%)
  2: Perfect_Trip         - 4,397,607 samples (54.97%)
  3: Safety_Violation     - 1,601,595 samples (20.02%)
  4: Service_Complaint    - 798,695 samples (9.98%)

PREPROCESSING COMPLETED
Final shapes:
  X_train: (8000000, 34)
  X_test: (4000000, 34)
  y_train: (8000000,)


122

## 5. Model Training

In [11]:
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    print("XGBoost not available. Install with: pip install xgboost")
    XGBOOST_AVAILABLE = False

def train_xgboost(X_train, y_train, X_test, n_folds=5, use_gpu=False):
    if not XGBOOST_AVAILABLE:
        return None, None, None
    
    print("\n" + "="*80)
    print("Training XGBoost Models")
    print("="*80)
    
    params = config.get_xgboost_params(use_gpu=use_gpu)
    params['num_class'] = len(np.unique(y_train))
    
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=config.RANDOM_STATE)
    
    oof_predictions = np.zeros((len(X_train), len(np.unique(y_train))))
    test_predictions = np.zeros((len(X_test), len(np.unique(y_train))))
    
    fold_scores = []
    models = []
    
    pbar = tqdm(enumerate(skf.split(X_train, y_train), 1), total=n_folds, desc="XGBoost Folds")
    for fold, (train_idx, val_idx) in pbar:
        pbar.set_description(f"XGBoost Fold {fold}/{n_folds}")
        
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]
        
        dtrain = xgb.DMatrix(X_tr, label=y_tr)
        dval = xgb.DMatrix(X_val, label=y_val)
        dtest = xgb.DMatrix(X_test)
        
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=1000,
            evals=[(dtrain, 'train'), (dval, 'valid')],
            early_stopping_rounds=50,
            verbose_eval=100
        )
        
        oof_predictions[val_idx] = model.predict(dval)
        test_predictions += model.predict(dtest) / n_folds
        
        oof_pred_labels = np.argmax(oof_predictions[val_idx], axis=1)
        fold_score = f1_score(y_val, oof_pred_labels, average='macro')
        fold_scores.append(fold_score)
        
        pbar.set_postfix({'F1': f'{fold_score:.6f}'})
        
        models.append(model)
        gc.collect()
    
    oof_pred_labels = np.argmax(oof_predictions, axis=1)
    overall_score = f1_score(y_train, oof_pred_labels, average='macro')
    
    print("\n" + "="*80)
    print(f"XGBoost Overall CV Score: {overall_score:.6f} (+/- {np.std(fold_scores):.6f})")
    print("="*80)
    
    return test_predictions, models, overall_score

if XGBOOST_AVAILABLE:
    xgboost_test_pred, xgboost_models, xgboost_cv_score = train_xgboost(
        X_train, y_train, X_test,
        n_folds=config.N_FOLDS,
        use_gpu=gpu_config['xgboost_gpu']
    )
else:
    xgboost_test_pred, xgboost_models, xgboost_cv_score = None, None, 0.0


Training XGBoost Models
  XGBoost: GPU mode activated


XGBoost Folds:   0%|          | 0/5 [00:00<?, ?it/s]

[0]	train-mlogloss:1.40560	valid-mlogloss:1.40563
[100]	train-mlogloss:0.63047	valid-mlogloss:0.63111
[200]	train-mlogloss:0.61239	valid-mlogloss:0.61311
[300]	train-mlogloss:0.61031	valid-mlogloss:0.61148
[400]	train-mlogloss:0.60945	valid-mlogloss:0.61122
[500]	train-mlogloss:0.60860	valid-mlogloss:0.61108
[600]	train-mlogloss:0.60777	valid-mlogloss:0.61103
[700]	train-mlogloss:0.60697	valid-mlogloss:0.61101
[790]	train-mlogloss:0.60627	valid-mlogloss:0.61101
[0]	train-mlogloss:1.40559	valid-mlogloss:1.40559
[100]	train-mlogloss:0.63068	valid-mlogloss:0.63058
[200]	train-mlogloss:0.61259	valid-mlogloss:0.61250
[300]	train-mlogloss:0.61049	valid-mlogloss:0.61088
[400]	train-mlogloss:0.60963	valid-mlogloss:0.61062
[500]	train-mlogloss:0.60876	valid-mlogloss:0.61048
[600]	train-mlogloss:0.60791	valid-mlogloss:0.61040
[700]	train-mlogloss:0.60710	valid-mlogloss:0.61039
[731]	train-mlogloss:0.60686	valid-mlogloss:0.61039
[0]	train-mlogloss:1.40560	valid-mlogloss:1.40560
[100]	train-mloglo

## 6. Model Evaluation and Inference

In [12]:
print("\n" + "="*80)
print("MODEL EVALUATION")
print("="*80)

if xgboost_test_pred is not None:
    print(f"\n✓ XGBoost Model Successfully Trained")
    print(f"  Cross-Validation Score: {xgboost_cv_score:.6f}")
    print(f"  Model Type: XGBoost with GPU acceleration")
    
    print("\n" + "="*80)
    print("GENERATING PREDICTIONS ON TEST SET")
    print("="*80)
    
    final_predictions = xgboost_test_pred
    final_pred_labels = np.argmax(final_predictions, axis=1)
    
    print(f"\n✓ Predictions Generated Successfully")
    print(f"  Total test samples: {len(final_pred_labels):,}")
    print(f"  Prediction shape: {final_predictions.shape}")
    print(f"  Classes predicted: {len(np.unique(final_pred_labels))}")
    
    print("\n" + "="*80)
    print("PREDICTION DISTRIBUTION")
    print("="*80)
    unique, counts = np.unique(final_pred_labels, return_counts=True)
    for class_idx, count in zip(unique, counts):
        percentage = (count / len(final_pred_labels)) * 100
        print(f"  Class {class_idx}: {count:,} samples ({percentage:.2f}%)")
    
else:
    raise ValueError("XGBoost model training failed! Cannot generate predictions.")

print("\n" + "="*80)


MODEL EVALUATION

✓ XGBoost Model Successfully Trained
  Cross-Validation Score: 0.560085
  Model Type: XGBoost with GPU acceleration

GENERATING PREDICTIONS ON TEST SET

✓ Predictions Generated Successfully
  Total test samples: 4,000,000
  Prediction shape: (4000000, 5)
  Classes predicted: 3

PREDICTION DISTRIBUTION
  Class 0: 198,432 samples (4.96%)
  Class 2: 3,010,341 samples (75.26%)
  Class 3: 791,227 samples (19.78%)



## 7. Generate Submission

In [13]:
def create_submission(test_ids, predictions, le_target, filename='submission.csv'):
    pred_labels = le_target.inverse_transform(predictions)
    
    submission = pd.DataFrame({
        config.ID_COL: test_ids,
        config.TARGET_COL: pred_labels
    })
    
    submission.to_csv(filename, index=False)
    
    print(f"\nSubmission saved to: {filename}")
    print(f"Submission shape: {submission.shape}")
    print(f"\nPrediction distribution:")
    print(submission[config.TARGET_COL].value_counts())
    
    return submission

test_ids = test[config.ID_COL].values
submission = create_submission(test_ids, final_pred_labels, le_target, 'submission.csv')
submission.head(10)


Submission saved to: submission.csv
Submission shape: (4000000, 2)

Prediction distribution:
Trip_Label
Perfect_Trip        3010341
Safety_Violation     791227
Fraud_Indication     198432
Name: count, dtype: int64


Unnamed: 0,Trip_ID,Trip_Label
0,TRIP-06583736,Perfect_Trip
1,TRIP-11356251,Perfect_Trip
2,TRIP-03320505,Perfect_Trip
3,TRIP-07188814,Perfect_Trip
4,TRIP-06994869,Perfect_Trip
5,TRIP-03232331,Perfect_Trip
6,TRIP-03536120,Perfect_Trip
7,TRIP-06411895,Perfect_Trip
8,TRIP-00132176,Perfect_Trip
9,TRIP-00298208,Perfect_Trip


## 8. Validation & Analysis

In [14]:
print("\n" + "="*80)
print("Final Validation Checks")
print("="*80)

assert submission.shape[0] == test.shape[0], "Submission size mismatch!"
assert submission.columns.tolist() == [config.ID_COL, config.TARGET_COL], "Column names mismatch!"
assert submission[config.TARGET_COL].isnull().sum() == 0, "Null predictions found!"

expected_labels = set(le_target.classes_)
submission_labels = set(submission[config.TARGET_COL].unique())
assert submission_labels.issubset(expected_labels), "Invalid labels in submission!"

print("All validation checks passed!")
print("="*80)


Final Validation Checks
All validation checks passed!
