## 1. Configuration & Setup

In [1]:
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.metrics import f1_score, classification_report
import gc
from tqdm.auto import tqdm
import xgboost as xgb
from catboost import CatBoostClassifier, Pool

tqdm.pandas()
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

print("Libraries imported.")

Libraries imported.


In [2]:
class Config:
    DATA_PATH = '/kaggle/input/ride-hailing-trip-classification-dataset/'
    N_FOLDS = 5
    RANDOM_STATE = 42
    TARGET_COL = 'Trip_Label'
    ID_COL = 'Trip_ID'
    USE_GPU = True
    
    XGBOOST_PARAMS = {
        "max_depth": 7,
        "min_child_weight": 7,
        "max_delta_step": 5,
        "gamma": 0.1737507723343592,
        "learning_rate": 0.034002141615166376,
        "subsample": 0.9516920090100376,
        "colsample_bytree": 0.5993238619880857,
        "colsample_bylevel": 0.9963742019132593,
        "colsample_bynode": 0.9228789464520328,
        "reg_alpha": 1.0572914540483875,
        "reg_lambda": 2.1685276721731586
    }
    
    CLASS_WEIGHTS = {
        0: 1.0,
        1: 4.509194597384958,
        2: 1.0,
        3: 1.0,
        4: 4.96455487505969
    }
    
    THRESHOLDS = np.array([2.1188, 1.8953, 1.6760, 1.7246, 1.6181])

config = Config()
print("Configuration loaded.")

Configuration loaded.


## 2. Data Loading

In [3]:
def load_data():
    print("Loading data...")
    train = pd.read_csv(config.DATA_PATH + 'train.csv')
    test = pd.read_csv(config.DATA_PATH + 'test.csv')
    
    print(f"Train shape: {train.shape}")
    print(f"Test shape: {test.shape}")
    print(f"\nTarget distribution:")
    print(train[config.TARGET_COL].value_counts())
    
    return train, test

train, test = load_data()
train_distribution = train[config.TARGET_COL].value_counts(normalize=True).sort_index()
test_ids = test[config.ID_COL].values

Loading data...
Train shape: (8000000, 25)
Test shape: (4000000, 24)

Target distribution:
Trip_Label
Perfect_Trip         4397607
Safety_Violation     1601595
Navigation_Issue      801790
Service_Complaint     798695
Fraud_Indication      400313
Name: count, dtype: int64


In [4]:
def optimize_memory(df):
    for col in df.columns:
        col_type = df[col].dtype
        if col_type != 'object':
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
    return df

train = optimize_memory(train)
test = optimize_memory(test)
print("Memory optimization completed.")

Memory optimization completed.


## 3. Feature Engineering

In [5]:
ROUTE_FREQ = {}
PICKUP_ZONE_FREQ = {}
DROPOFF_ZONE_FREQ = {}

def compute_bearing(lat1, lon1, lat2, lon2):
    lat1, lat2 = np.radians(lat1), np.radians(lat2)
    diff = np.radians(lon2 - lon1)
    x = np.sin(diff) * np.cos(lat2)
    y = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(diff)
    return np.degrees(np.arctan2(x, y))

def engineer_features(df, is_train=True):
    global ROUTE_FREQ, PICKUP_ZONE_FREQ, DROPOFF_ZONE_FREQ
    df = df.copy()
    
    # Temporal features
    if 'Timestamp' in df.columns:
        df['Timestamp_parsed'] = pd.to_datetime(df['Timestamp'], errors='coerce')
        df['Hour'] = df['Timestamp_parsed'].dt.hour
        df['DayOfWeek'] = df['Timestamp_parsed'].dt.dayofweek
        df['Month'] = df['Timestamp_parsed'].dt.month
        df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype(np.int8)
        df['IsRushHour'] = ((df['Hour'] >= 7) & (df['Hour'] <= 9) | 
                           (df['Hour'] >= 17) & (df['Hour'] <= 19)).astype(np.int8)
        df['IsLateNight'] = ((df['Hour'] >= 22) | (df['Hour'] <= 5)).astype(np.int8)
        df.drop('Timestamp_parsed', axis=1, inplace=True)
    
    # Distance and bearing features
    coord_cols = ['Pickup_Lat', 'Pickup_Long', 'Dropoff_Lat', 'Dropoff_Long']
    if all(col in df.columns for col in coord_cols):
        R = 6371
        lat1_rad = np.radians(df['Pickup_Lat'])
        lat2_rad = np.radians(df['Dropoff_Lat'])
        delta_lat = np.radians(df['Dropoff_Lat'] - df['Pickup_Lat'])
        delta_lon = np.radians(df['Dropoff_Long'] - df['Pickup_Long'])
        a = np.sin(delta_lat/2)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(delta_lon/2)**2
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
        df['Haversine_Distance'] = R * c
        
        df['Bearing'] = compute_bearing(df['Pickup_Lat'], df['Pickup_Long'], 
                                        df['Dropoff_Lat'], df['Dropoff_Long'])
        df['bearing_sin'] = np.sin(np.radians(df['Bearing']))
        df['bearing_cos'] = np.cos(np.radians(df['Bearing']))
        
        df['Delta_Lat'] = df['Dropoff_Lat'] - df['Pickup_Lat']
        df['Delta_Long'] = df['Dropoff_Long'] - df['Pickup_Long']
        
        if 'Distance_KM' in df.columns:
            df['Distance_Ratio'] = df['Distance_KM'] / (df['Haversine_Distance'] + 1e-6)
            df['Distance_Difference'] = np.abs(df['Distance_KM'] - df['Haversine_Distance'])
            df['Is_Ultra_Short'] = (df['Distance_KM'] < 0.024).astype(np.int8)
    
    # Zone features
    if 'Pickup_Zone' in df.columns and 'Dropoff_Zone' in df.columns:
        df['Is_Same_Zone'] = (df['Pickup_Zone'] == df['Dropoff_Zone']).astype(np.int8)
        
        route = df['Pickup_Zone'].astype(str) + '__' + df['Dropoff_Zone'].astype(str)
        if is_train:
            ROUTE_FREQ = route.value_counts().to_dict()
            PICKUP_ZONE_FREQ = df['Pickup_Zone'].value_counts().to_dict()
            DROPOFF_ZONE_FREQ = df['Dropoff_Zone'].value_counts().to_dict()
        
        df['route_count'] = route.map(ROUTE_FREQ).fillna(0).astype(np.int32)
        df['pickup_zone_count'] = df['Pickup_Zone'].map(PICKUP_ZONE_FREQ).fillna(0).astype(np.int32)
        df['dropoff_zone_count'] = df['Dropoff_Zone'].map(DROPOFF_ZONE_FREQ).fillna(0).astype(np.int32)
    
    # Sensor features
    accel_cols = ['Accel_X', 'Accel_Y', 'Accel_Z']
    if all(col in df.columns for col in accel_cols):
        df['Accel_Magnitude'] = np.sqrt(df['Accel_X']**2 + df['Accel_Y']**2 + df['Accel_Z']**2)
        df['Accel_Max'] = df[accel_cols].max(axis=1)
        df['Accel_Min'] = df[accel_cols].min(axis=1)
        df['Accel_Range'] = df['Accel_Max'] - df['Accel_Min']
        df['Accel_Std'] = df[accel_cols].std(axis=1)
    
    if 'Gyro_Z' in df.columns:
        df['Gyro_Abs'] = np.abs(df['Gyro_Z'])
        df['Is_Gyro_Z_Outlier'] = (df['Gyro_Z'].abs() >= 0.672).astype(np.int8)
    
    # Economic features
    if 'Est_Price_IDR' in df.columns and 'Distance_KM' in df.columns:
        df['Price_per_KM'] = df['Est_Price_IDR'] / (df['Distance_KM'] + 1e-6)
    
    if 'Surge_Multiplier' in df.columns:
        df['Surge_Category'] = pd.cut(
            df['Surge_Multiplier'].fillna(1.0),
            bins=[0, 1, 1.5, 2, 10],
            labels=[0, 1, 2, 3]
        ).astype(np.int8)
    
    print(f"Feature engineering completed. Shape: {df.shape}")
    return df

train = engineer_features(train, is_train=True)
test = engineer_features(test, is_train=False)
gc.collect()

Feature engineering completed. Shape: (8000000, 53)
Feature engineering completed. Shape: (4000000, 52)


40

## 4. Preprocessing

In [6]:
def encode_target(train, test, categorical_cols, y_train, smoothing=10):
    global_mean = y_train.mean()
    
    for col in categorical_cols:
        temp_df = pd.DataFrame({col: train[col], 'target': y_train})
        agg = temp_df.groupby(col)['target'].agg(['mean', 'count'])
        smoothed_mean = (agg['mean'] * agg['count'] + global_mean * smoothing) / (agg['count'] + smoothing)
        encoding_map = smoothed_mean.to_dict()
        
        train[col] = train[col].map(encoding_map).fillna(global_mean).astype(np.float32)
        test[col] = test[col].map(encoding_map).fillna(global_mean).astype(np.float32)
    
    return train, test

def preprocess_data(train, test):
    print("Preprocessing data...")
    
    cols_to_drop = [config.ID_COL, 'Timestamp']
    y = train[config.TARGET_COL].copy()
    cols_to_drop.append(config.TARGET_COL)
    
    cols_to_drop = [col for col in cols_to_drop if col in train.columns]
    X_train = train.drop(cols_to_drop, axis=1).copy()
    X_test = test.drop([col for col in cols_to_drop if col in test.columns], axis=1).copy()
    
    numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
    
    # Imputation
    mean_impute_cols = ['Accel_Y', 'Accel_Z', 'Dropoff_Lat', 'Dropoff_Long', 'Gyro_Z']
    mean_impute_cols = [c for c in mean_impute_cols if c in numeric_cols]
    
    for col in mean_impute_cols:
        val = X_train[col].mean()
        X_train[col].fillna(val, inplace=True)
        X_test[col].fillna(val, inplace=True)
    
    for col in numeric_cols:
        if X_train[col].isnull().sum() > 0:
            val = X_train[col].median()
            X_train[col].fillna(val, inplace=True)
            X_test[col].fillna(val, inplace=True)
    
    for col in categorical_cols:
        if X_train[col].isnull().sum() > 0:
            X_train[col].fillna('Unknown', inplace=True)
            X_test[col].fillna('Unknown', inplace=True)
    
    # Outlier clipping
    for col in numeric_cols:
        q01, q99 = X_train[col].quantile([0.01, 0.99])
        X_train[col] = X_train[col].clip(q01, q99)
        X_test[col] = X_test[col].clip(q01, q99)
    
    # Ordinal encoding
    ordinal_cols = ['Weather', 'Traffic', 'Payment_Method', 'Signal_Strength']
    ordinal_cols = [c for c in ordinal_cols if c in categorical_cols]
    
    if ordinal_cols:
        oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        X_train[ordinal_cols] = oe.fit_transform(X_train[ordinal_cols])
        X_test[ordinal_cols] = oe.transform(X_test[ordinal_cols])
        categorical_cols = [c for c in categorical_cols if c not in ordinal_cols]
    
    # Target encoding
    if categorical_cols:
        le_temp = LabelEncoder()
        y_temp = le_temp.fit_transform(y)
        X_train, X_test = encode_target(X_train, X_test, categorical_cols, y_temp, smoothing=10)
    
    # Target label encoding
    le_target = LabelEncoder()
    y_encoded = le_target.fit_transform(y)
    
    print(f"Final shapes: X_train={X_train.shape}, X_test={X_test.shape}")
    return X_train, X_test, y_encoded, le_target

X_train, X_test, y_train, le_target = preprocess_data(train, test)
gc.collect()

Preprocessing data...
Final shapes: X_train=(8000000, 50), X_test=(4000000, 50)


44

In [7]:
# Class weights
sample_weights = np.ones(len(y_train), dtype=np.float32)
sample_weights[y_train == 1] = config.CLASS_WEIGHTS[1]
sample_weights[y_train == 4] = config.CLASS_WEIGHTS[4]

print(f"Class weights applied:")
for cls, weight in config.CLASS_WEIGHTS.items():
    print(f"  Class {cls}: {weight}")

Class weights applied:
  Class 0: 1.0
  Class 1: 4.509194597384958
  Class 2: 1.0
  Class 3: 1.0
  Class 4: 4.96455487505969


## 5. Model Training

In [8]:
def macro_f1_eval(preds, dtrain):
    labels = dtrain.get_label()
    preds_reshaped = preds.reshape(len(labels), -1)
    pred_labels = np.argmax(preds_reshaped, axis=1)
    score = f1_score(labels, pred_labels, average='macro')
    return 'macro_f1', score

def train_xgboost(X_train, y_train, X_test, sample_weights):
    print("Training XGBoost...")
    
    params = {
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'verbosity': 1,
        'num_class': len(np.unique(y_train)),
        'random_state': config.RANDOM_STATE
    }
    params.update(config.XGBOOST_PARAMS)
    
    if config.USE_GPU:
        params['device'] = 'cuda'
    
    skf = StratifiedKFold(n_splits=config.N_FOLDS, shuffle=True, random_state=config.RANDOM_STATE)
    
    oof_predictions = np.zeros((len(X_train), len(np.unique(y_train))))
    test_predictions = np.zeros((len(X_test), len(np.unique(y_train))))
    fold_scores = []
    
    for fold, (train_idx, val_idx) in enumerate(tqdm(skf.split(X_train, y_train), total=config.N_FOLDS, desc="XGBoost Folds"), 1):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]
        
        dtrain = xgb.DMatrix(X_tr, label=y_tr, weight=sample_weights[train_idx])
        dval = xgb.DMatrix(X_val, label=y_val, weight=sample_weights[val_idx])
        dtest = xgb.DMatrix(X_test)
        
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=1000,
            evals=[(dtrain, 'train'), (dval, 'valid')],
            custom_metric=macro_f1_eval,
            early_stopping_rounds=50,
            verbose_eval=100
        )
        
        oof_predictions[val_idx] = model.predict(dval)
        test_predictions += model.predict(dtest) / config.N_FOLDS
        
        fold_score = f1_score(y_val, np.argmax(oof_predictions[val_idx], axis=1), average='macro')
        fold_scores.append(fold_score)
        print(f"Fold {fold}: Macro F1 = {fold_score:.6f}")
        
        gc.collect()
    
    cv_score = f1_score(y_train, np.argmax(oof_predictions, axis=1), average='macro')
    print(f"\nXGBoost CV Score: {cv_score:.6f}")
    
    return test_predictions, cv_score, oof_predictions

In [9]:
xgb_test_pred, xgb_cv_score, xgb_oof = train_xgboost(
    X_train, y_train, X_test, sample_weights
)

Training XGBoost...


XGBoost Folds:   0%|          | 0/5 [00:00<?, ?it/s]

[0]	train-mlogloss:1.53274	train-macro_f1:0.14554	valid-mlogloss:1.53276	valid-macro_f1:0.14548
[49]	train-mlogloss:1.08152	train-macro_f1:0.63226	valid-mlogloss:1.08156	valid-macro_f1:0.63241
Fold 1: Macro F1 = 0.632421
[0]	train-mlogloss:1.53131	train-macro_f1:0.18079	valid-mlogloss:1.53130	valid-macro_f1:0.18063
[50]	train-mlogloss:1.07612	train-macro_f1:0.63228	valid-mlogloss:1.07602	valid-macro_f1:0.63245
Fold 2: Macro F1 = 0.632447
[0]	train-mlogloss:1.53246	train-macro_f1:0.14499	valid-mlogloss:1.53248	valid-macro_f1:0.14493
[50]	train-mlogloss:1.07647	train-macro_f1:0.63218	valid-mlogloss:1.07716	valid-macro_f1:0.63145
Fold 3: Macro F1 = 0.631453
[0]	train-mlogloss:1.53114	train-macro_f1:0.17616	valid-mlogloss:1.53113	valid-macro_f1:0.17616
[49]	train-mlogloss:1.08012	train-macro_f1:0.63232	valid-mlogloss:1.08016	valid-macro_f1:0.63204
Fold 4: Macro F1 = 0.632003
[0]	train-mlogloss:1.53179	train-macro_f1:0.18143	valid-mlogloss:1.53178	valid-macro_f1:0.18146
[49]	train-mlogloss:

In [10]:
def train_catboost(X_train, y_train, X_test):
    print("Training CatBoost...")
    
    cb_params = {
        'iterations': 1000,
        'learning_rate': 0.05,
        'depth': 7,
        'l2_leaf_reg': 3,
        'loss_function': 'MultiClass',
        'eval_metric': 'TotalF1:average=Macro',
        'random_seed': config.RANDOM_STATE,
        'verbose': 100,
        'early_stopping_rounds': 50,
        'class_weights': config.CLASS_WEIGHTS
    }
    
    if config.USE_GPU:
        cb_params['task_type'] = 'GPU'
    
    skf = StratifiedKFold(n_splits=config.N_FOLDS, shuffle=True, random_state=config.RANDOM_STATE)
    
    oof_predictions = np.zeros((len(X_train), 5))
    test_predictions = np.zeros((len(X_test), 5))
    fold_scores = []
    
    for fold, (train_idx, val_idx) in enumerate(tqdm(skf.split(X_train, y_train), total=config.N_FOLDS, desc="CatBoost Folds"), 1):
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]
        
        train_pool = Pool(X_tr, y_tr)
        val_pool = Pool(X_val, y_val)
        
        model = CatBoostClassifier(**cb_params)
        model.fit(train_pool, eval_set=val_pool, use_best_model=True)
        
        oof_predictions[val_idx] = model.predict_proba(X_val)
        test_predictions += model.predict_proba(X_test) / config.N_FOLDS
        
        fold_score = f1_score(y_val, np.argmax(oof_predictions[val_idx], axis=1), average='macro')
        fold_scores.append(fold_score)
        print(f"Fold {fold}: Macro F1 = {fold_score:.6f}")
        
        del model, train_pool, val_pool
        gc.collect()
    
    cv_score = f1_score(y_train, np.argmax(oof_predictions, axis=1), average='macro')
    print(f"\nCatBoost CV Score: {cv_score:.6f}")
    
    return test_predictions, cv_score, oof_predictions

In [11]:
cb_test_pred, cb_cv_score, cb_oof = train_catboost(
    X_train, y_train, X_test
)

Training CatBoost...


CatBoost Folds:   0%|          | 0/5 [00:00<?, ?it/s]

0:	learn: 0.5763012	test: 0.5768562	best: 0.5768562 (0)	total: 17.1s	remaining: 4h 44m 18s
100:	learn: 0.6097476	test: 0.6100038	best: 0.6100038 (100)	total: 24s	remaining: 3m 33s
200:	learn: 0.6146405	test: 0.6147245	best: 0.6147245 (200)	total: 30.7s	remaining: 2m 1s
300:	learn: 0.6163473	test: 0.6162462	best: 0.6162462 (300)	total: 37.5s	remaining: 1m 27s
400:	learn: 0.6173809	test: 0.6168042	best: 0.6168042 (400)	total: 44.7s	remaining: 1m 6s
500:	learn: 0.6181920	test: 0.6170780	best: 0.6171027 (496)	total: 52.1s	remaining: 51.9s
600:	learn: 0.6189663	test: 0.6174807	best: 0.6174966 (589)	total: 59.6s	remaining: 39.6s
700:	learn: 0.6196536	test: 0.6176417	best: 0.6176956 (683)	total: 1m 7s	remaining: 28.7s
bestTest = 0.6176956144
bestIteration = 683
Shrink model to first 684 iterations.
Fold 1: Macro F1 = 0.625378
0:	learn: 0.5765774	test: 0.5765635	best: 0.5765635 (0)	total: 69.5ms	remaining: 1m 9s
100:	learn: 0.6089163	test: 0.6089590	best: 0.6089708 (99)	total: 6.97s	remaining:

## 6. Ensemble

In [12]:
print("Finding optimal ensemble weights...")
print(f"\nIndividual Model CV Scores:")
print(f"  XGBoost:  {xgb_cv_score:.6f}")
print(f"  CatBoost: {cb_cv_score:.6f}")

best_ensemble_score = 0
best_weights = None

for w_xgb in np.arange(0.3, 0.8, 0.05):
    w_cb = 1.0 - w_xgb
    ensemble_oof = w_xgb * xgb_oof + w_cb * cb_oof
    ensemble_pred = np.argmax(ensemble_oof, axis=1)
    ensemble_score = f1_score(y_train, ensemble_pred, average='macro')
    
    if ensemble_score > best_ensemble_score:
        best_ensemble_score = ensemble_score
        best_weights = (w_xgb, w_cb)
        print(f"  XGB={w_xgb:.2f}, CB={w_cb:.2f} -> F1={ensemble_score:.6f}")

print(f"\nBest Ensemble Weights: XGB={best_weights[0]:.2f}, CB={best_weights[1]:.2f}")
print(f"Best Ensemble CV Score: {best_ensemble_score:.6f}")

Finding optimal ensemble weights...

Individual Model CV Scores:
  XGBoost:  0.632045
  CatBoost: 0.625193
  XGB=0.30, CB=0.70 -> F1=0.627336
  XGB=0.35, CB=0.65 -> F1=0.627775
  XGB=0.40, CB=0.60 -> F1=0.628208
  XGB=0.45, CB=0.55 -> F1=0.628668
  XGB=0.50, CB=0.50 -> F1=0.629127
  XGB=0.55, CB=0.45 -> F1=0.629562
  XGB=0.60, CB=0.40 -> F1=0.630016
  XGB=0.65, CB=0.35 -> F1=0.630495
  XGB=0.70, CB=0.30 -> F1=0.630927
  XGB=0.75, CB=0.25 -> F1=0.631381

Best Ensemble Weights: XGB=0.75, CB=0.25
Best Ensemble CV Score: 0.631381


In [13]:
w_xgb, w_cb = best_weights

# OOF ensemble with thresholds
ensemble_oof = w_xgb * xgb_oof + w_cb * cb_oof
ensemble_oof_thresh = ensemble_oof * config.THRESHOLDS
ensemble_pred_thresh = np.argmax(ensemble_oof_thresh, axis=1)
ensemble_score_thresh = f1_score(y_train, ensemble_pred_thresh, average='macro')

print(f"Ensemble + Thresholds CV Score: {ensemble_score_thresh:.6f}")
print(f"Improvement from thresholds: {ensemble_score_thresh - best_ensemble_score:+.6f}")

# Test set ensemble with thresholds
ensemble_test = w_xgb * xgb_test_pred + w_cb * cb_test_pred
ensemble_test_thresh = ensemble_test * config.THRESHOLDS
ensemble_test_pred_labels = np.argmax(ensemble_test_thresh, axis=1)

Ensemble + Thresholds CV Score: 0.633935
Improvement from thresholds: +0.002554


## 7. Generate Submission

In [14]:
def create_submission(test_ids, predictions, le_target, filename):
    pred_labels = le_target.inverse_transform(predictions)
    
    submission = pd.DataFrame({
        config.ID_COL: test_ids,
        config.TARGET_COL: pred_labels
    })
    
    submission.to_csv(filename, index=False)
    
    print(f"Submission saved to: {filename}")
    print(f"Shape: {submission.shape}")
    print(f"\nPrediction distribution:")
    print(submission[config.TARGET_COL].value_counts())
    
    return submission

submission = create_submission(
    test_ids,
    ensemble_test_pred_labels,
    le_target,
    'submission-14.csv'
)

Submission saved to: submission-14.csv
Shape: (4000000, 2)

Prediction distribution:
Trip_Label
Perfect_Trip         2308408
Safety_Violation      790259
Navigation_Issue      362295
Service_Complaint     340911
Fraud_Indication      198127
Name: count, dtype: int64


In [15]:
!zip submission-14.zip submission-14.csv
from IPython.display import FileLink
FileLink('submission-14.zip')

  adding: submission-14.csv (deflated 82%)


In [16]:
# Distribution comparison
print("Prediction Distribution vs Training Distribution:")
print("-" * 60)

pred_dist = submission[config.TARGET_COL].value_counts(normalize=True).sort_index()

for label in train_distribution.index:
    pred_pct = pred_dist.get(label, 0) * 100
    train_pct = train_distribution.get(label, 0) * 100
    diff = pred_pct - train_pct
    print(f"  {label}: {pred_pct:.2f}% (train: {train_pct:.2f}%, diff: {diff:+.2f}%)")

Prediction Distribution vs Training Distribution:
------------------------------------------------------------
  Fraud_Indication: 4.95% (train: 5.00%, diff: -0.05%)
  Navigation_Issue: 9.06% (train: 10.02%, diff: -0.97%)
  Perfect_Trip: 57.71% (train: 54.97%, diff: +2.74%)
  Safety_Violation: 19.76% (train: 20.02%, diff: -0.26%)
  Service_Complaint: 8.52% (train: 9.98%, diff: -1.46%)


## 8. Validation

In [17]:
print("Validation Checks")
print("=" * 60)

assert submission.shape[0] == len(test_ids), "Submission size mismatch"
assert submission.columns.tolist() == [config.ID_COL, config.TARGET_COL], "Column names mismatch"
assert submission[config.TARGET_COL].isnull().sum() == 0, "Null predictions found"

expected_labels = set(le_target.classes_)
submission_labels = set(submission[config.TARGET_COL].unique())
assert submission_labels.issubset(expected_labels), "Invalid labels in submission"

print("All validation checks passed.")

Validation Checks
All validation checks passed.


In [18]:
# Compare with highest latest submission
print("Comparing highest latest submission")
print("=" * 60)

try:
    original_submission = pd.read_csv('/kaggle/input/highest-score/submission-14.csv')
    
    # Check if both have same shape
    print(f"Original shape: {original_submission.shape}")
    print(f"New shape: {submission.shape}")
    
    # Merge and compare
    comparison = submission.merge(
        original_submission, 
        on=config.ID_COL, 
        suffixes=('_new', '_original')
    )
    
    matches = (comparison[f'{config.TARGET_COL}_new'] == comparison[f'{config.TARGET_COL}_original']).sum()
    total = len(comparison)
    match_pct = matches / total * 100
    
    print(f"\nMatching predictions: {matches:,} / {total:,} ({match_pct:.2f}%)")
    
    if matches == total:
        print("\nSUCCESS: New pipeline produces identical results.")
    else:
        print(f"\nDifferences found: {total - matches:,} predictions differ.")
        
        # Show distribution of differences
        diff_mask = comparison[f'{config.TARGET_COL}_new'] != comparison[f'{config.TARGET_COL}_original']
        if diff_mask.sum() > 0:
            print("\nDifference breakdown:")
            diff_df = comparison[diff_mask]
            print(diff_df[[f'{config.TARGET_COL}_new', f'{config.TARGET_COL}_original']].value_counts().head(10))

except FileNotFoundError:
    print("Original submission file not found at /kaggle/working/submission-14.csv")
    print("Skipping comparison.")

Comparing highest latest submission
Original shape: (4000000, 2)
New shape: (4000000, 2)

Matching predictions: 3,974,687 / 4,000,000 (99.37%)

Differences found: 25,313 predictions differ.

Difference breakdown:
Trip_Label_new     Trip_Label_original
Navigation_Issue   Perfect_Trip           11388
Perfect_Trip       Service_Complaint       7543
                   Navigation_Issue        5806
Service_Complaint  Perfect_Trip             297
Perfect_Trip       Safety_Violation          94
Safety_Violation   Perfect_Trip              52
Navigation_Issue   Safety_Violation          33
Fraud_Indication   Perfect_Trip              28
Safety_Violation   Navigation_Issue          16
                   Service_Complaint         14
Name: count, dtype: int64


In [19]:
print("\nPipeline Summary")
print("=" * 60)
print(f"XGBoost CV Score:           {xgb_cv_score:.6f}")
print(f"CatBoost CV Score:          {cb_cv_score:.6f}")
print(f"Ensemble CV Score:          {best_ensemble_score:.6f}")
print(f"Ensemble + Thresholds CV:   {ensemble_score_thresh:.6f}")
print(f"\nEnsemble Weights: XGB={best_weights[0]:.2f}, CB={best_weights[1]:.2f}")
print(f"Thresholds: {config.THRESHOLDS}")
print(f"\nSubmission saved to: submission-ensemble.csv")


Pipeline Summary
XGBoost CV Score:           0.632045
CatBoost CV Score:          0.625193
Ensemble CV Score:          0.631381
Ensemble + Thresholds CV:   0.633935

Ensemble Weights: XGB=0.75, CB=0.25
Thresholds: [2.1188 1.8953 1.676  1.7246 1.6181]

Submission saved to: submission-ensemble.csv
