## 1. Configuration & Setup

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import gc
from tqdm.auto import tqdm
import xgboost as xgb
tqdm.pandas()

warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

print("Libraries imported successfully.")

Libraries imported successfully.


In [3]:
USE_GPU = True  

gpu_config = {
    'xgboost_gpu': USE_GPU,
}

In [4]:
class Config:
    DATA_PATH = '/kaggle/input/ride-hailing-trip-classification-dataset/'
    
    N_FOLDS = 5
    RANDOM_STATE = 42
    TARGET_COL = 'Trip_Label'
    ID_COL = 'Trip_ID'
    
    @staticmethod
    def get_xgboost_params(use_gpu=False):
        params = {
            'objective': 'multi:softprob',
            'eval_metric': 'mlogloss',
            'max_depth': 6,
            'learning_rate': 0.05,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': 42,
            'verbosity': 1
        }
        if use_gpu:
            params['device'] = 'cuda'
            print("  XGBoost: GPU mode activated")
        else:
            params['device'] = 'cpu'
            print("  XGBoost: CPU mode")
        return params

config = Config()
print("\nConfiguration loaded successfully.")


Configuration loaded successfully.


## 2. Data Loading & Validation

In [5]:
def load_data():
    print("Loading data...")
    files = ['train.csv', 'test.csv', 'sample_submission.csv']
    data = {}
    
    for file in tqdm(files, desc="Loading files"):
        data[file.replace('.csv', '')] = pd.read_csv(config.DATA_PATH + file)
    
    train = data['train']
    test = data['test']
    sample_submission = data['sample_submission']
    
    print(f"Train shape: {train.shape}")
    print(f"Test shape: {test.shape}")
    print(f"Sample submission shape: {sample_submission.shape}")
    
    if config.TARGET_COL in train.columns:
        print(f"\nTarget distribution:")
        print(train[config.TARGET_COL].value_counts())
    
    return train, test, sample_submission

train, test, sample_submission = load_data()

Loading data...


Loading files:   0%|          | 0/3 [00:00<?, ?it/s]

Train shape: (8000000, 25)
Test shape: (4000000, 24)
Sample submission shape: (4000000, 2)

Target distribution:
Trip_Label
Perfect_Trip         4397607
Safety_Violation     1601595
Navigation_Issue      801790
Service_Complaint     798695
Fraud_Indication      400313
Name: count, dtype: int64


In [6]:
train_df = train.copy()

In [7]:
def optimize_memory(df):
    print(f"Optimizing memory for dataframe with {len(df.columns)} columns...")
    for col in tqdm(df.columns, desc="Optimizing columns"):
        col_type = df[col].dtype
        if col_type != 'object':
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
    return df

print("Optimizing memory...")
train = optimize_memory(train)
test = optimize_memory(test)
print("Memory optimization completed.")

Optimizing memory...
Optimizing memory for dataframe with 25 columns...


Optimizing columns:   0%|          | 0/25 [00:00<?, ?it/s]

Optimizing memory for dataframe with 24 columns...


Optimizing columns:   0%|          | 0/24 [00:00<?, ?it/s]

Memory optimization completed.


## 3. Feature Engineering Experiments

In [8]:
def bearing(lat1, lon1, lat2, lon2):
    lat1, lat2 = np.radians(lat1), np.radians(lat2)
    diff = np.radians(lon2 - lon1)
    x = np.sin(diff) * np.cos(lat2)
    y = np.cos(lat1)*np.sin(lat2) - np.sin(lat1)*np.cos(lat2)*np.cos(diff)
    return np.degrees(np.arctan2(x, y))

# Global storage 
ROUTE_FREQ = {}
PICKUP_ZONE_FREQ = {}
DROPOFF_ZONE_FREQ = {}

def engineer_features(df, is_train=True):
    """
    Feature engineering dengan null_count, bearing, route features
    """
    global ROUTE_FREQ, PICKUP_ZONE_FREQ, DROPOFF_ZONE_FREQ
    df = df.copy()
    
    # 1. Null count features
    df['null_count'] = df.isnull().sum(axis=1)
    
    temporal_cols = ['Timestamp']
    df['null_count_temporal'] = df[[c for c in temporal_cols if c in df.columns]].isnull().sum(axis=1)
    
    distance_cols = ['Pickup_Lat', 'Pickup_Long', 'Dropoff_Lat', 'Dropoff_Long', 'Distance_KM']
    df['null_count_distance'] = df[[c for c in distance_cols if c in df.columns]].isnull().sum(axis=1)
    
    sensor_cols = ['Accel_X', 'Accel_Y', 'Accel_Z', 'Gyro_Z']
    df['null_count_sensor'] = df[[c for c in sensor_cols if c in df.columns]].isnull().sum(axis=1)
    
    econ_cols = ['Est_Price_IDR', 'Promo_Code', 'Surge_Multiplier']
    df['null_count_economic'] = df[[c for c in econ_cols if c in df.columns]].isnull().sum(axis=1)
    
    # 2. Temporal features
    if 'Timestamp' in df.columns:
        df['Timestamp_parsed'] = pd.to_datetime(df['Timestamp'], errors='coerce')
        df['Hour'] = df['Timestamp_parsed'].dt.hour
        df['DayOfWeek'] = df['Timestamp_parsed'].dt.dayofweek
        df['Month'] = df['Timestamp_parsed'].dt.month
        df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype(np.int8)
        df['IsRushHour'] = ((df['Hour'] >= 7) & (df['Hour'] <= 9) | 
                           (df['Hour'] >= 17) & (df['Hour'] <= 19)).astype(np.int8)
        df['IsLateNight'] = ((df['Hour'] >= 22) | (df['Hour'] <= 5)).astype(np.int8)
        df.drop('Timestamp_parsed', axis=1, inplace=True)
    
    # 3. Distance & bearing features
    coord_cols = ['Pickup_Lat', 'Pickup_Long', 'Dropoff_Lat', 'Dropoff_Long']
    if all(col in df.columns for col in coord_cols):
        R = 6371
        lat1_rad = np.radians(df['Pickup_Lat'])
        lat2_rad = np.radians(df['Dropoff_Lat'])
        delta_lat = np.radians(df['Dropoff_Lat'] - df['Pickup_Lat'])
        delta_lon = np.radians(df['Dropoff_Long'] - df['Pickup_Long'])
        a = np.sin(delta_lat/2)**2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(delta_lon/2)**2
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
        df['Haversine_Distance'] = R * c
        
        df['Bearing'] = bearing(df['Pickup_Lat'], df['Pickup_Long'], 
                                df['Dropoff_Lat'], df['Dropoff_Long'])
        df['bearing_sin'] = np.sin(np.radians(df['Bearing']))
        df['bearing_cos'] = np.cos(np.radians(df['Bearing']))
        
        df['Delta_Lat'] = df['Dropoff_Lat'] - df['Pickup_Lat']
        df['Delta_Long'] = df['Dropoff_Long'] - df['Pickup_Long']
        
        if 'Distance_KM' in df.columns:
            df['Distance_Ratio'] = df['Distance_KM'] / (df['Haversine_Distance'] + 1e-6)
            df['Distance_Difference'] = np.abs(df['Distance_KM'] - df['Haversine_Distance'])
            df['Is_Ultra_Short'] = (df['Distance_KM'] < 0.024).astype(np.int8)
    
    # 4. Zone features
    if 'Pickup_Zone' in df.columns and 'Dropoff_Zone' in df.columns:
        df['Is_Same_Zone'] = (df['Pickup_Zone'] == df['Dropoff_Zone']).astype(np.int8)
        
        # Route frequency
        route = df['Pickup_Zone'].astype(str) + '__' + df['Dropoff_Zone'].astype(str)
        if is_train:
            ROUTE_FREQ = route.value_counts().to_dict()
            PICKUP_ZONE_FREQ = df['Pickup_Zone'].value_counts().to_dict()
            DROPOFF_ZONE_FREQ = df['Dropoff_Zone'].value_counts().to_dict()
        
        df['route_count'] = route.map(ROUTE_FREQ).fillna(0).astype(np.int32)
        df['pickup_zone_count'] = df['Pickup_Zone'].map(PICKUP_ZONE_FREQ).fillna(0).astype(np.int32)
        df['dropoff_zone_count'] = df['Dropoff_Zone'].map(DROPOFF_ZONE_FREQ).fillna(0).astype(np.int32)
    
    # 5. Sensor features
    accel_cols = ['Accel_X', 'Accel_Y', 'Accel_Z']
    if all(col in df.columns for col in accel_cols):
        df['Accel_Magnitude'] = np.sqrt(df['Accel_X']**2 + df['Accel_Y']**2 + df['Accel_Z']**2)
        df['Accel_Max'] = df[accel_cols].max(axis=1)
        df['Accel_Min'] = df[accel_cols].min(axis=1)
        df['Accel_Range'] = df['Accel_Max'] - df['Accel_Min']
        # Accel_Std
        df['Accel_Std'] = df[accel_cols].std(axis=1)
    
    if 'Gyro_Z' in df.columns:
        df['Gyro_Abs'] = np.abs(df['Gyro_Z'])
        # Is_Gyro_Z_Outlier
        df['Is_Gyro_Z_Outlier'] = (df['Gyro_Z'].abs() >= 0.672).astype(np.int8)
    
    # 6. Economic features
    if 'Est_Price_IDR' in df.columns and 'Distance_KM' in df.columns:
        df['Price_per_KM'] = df['Est_Price_IDR'] / (df['Distance_KM'] + 1e-6)
    
    if 'Surge_Multiplier' in df.columns:
        df['Surge_Category'] = pd.cut(
            df['Surge_Multiplier'].fillna(1.0),
            bins=[0, 1, 1.5, 2, 10],
            labels=[0, 1, 2, 3]
        ).astype(np.int8)    
    
    print(f"Feature engineering completed. Shape: {df.shape}")
    return df

# Apply feature engineering
print("Applying feature engineering...")
train = engineer_features(train, is_train=True)
test = engineer_features(test, is_train=False)
gc.collect()

Applying feature engineering...
Feature engineering completed. Shape: (8000000, 58)
Feature engineering completed. Shape: (4000000, 57)


0

## 4. Preprocessing

In [9]:
def encode_categorical_target(train, test, categorical_cols, y_train, smoothing=10):
    """
    Encodes based on target mean, with smoothing to prevent overfitting
    """
    print("\n" + "="*80)
    print("TARGET ENCODING")
    print("="*80)
    
    encoders = {}
    global_mean = y_train.mean()
    
    for col in tqdm(categorical_cols, desc="Target encoding"):
        temp_df = pd.DataFrame({col: train[col], 'target': y_train})
        
        agg = temp_df.groupby(col)['target'].agg(['mean', 'count'])
        smoothed_mean = (agg['mean'] * agg['count'] + global_mean * smoothing) / (agg['count'] + smoothing)
        
        encoding_map = smoothed_mean.to_dict()
        
        train[col] = train[col].map(encoding_map).fillna(global_mean).astype(np.float32)
        test[col] = test[col].map(encoding_map).fillna(global_mean).astype(np.float32)
        
        encoders[col] = {
            'type': 'target',
            'unique_values': len(encoding_map),
            'global_mean': global_mean
        }
    
    print(f"\nEncoded {len(categorical_cols)} categorical features")
    return train, test, encoders

In [10]:

def preprocess_data(train, test):
    """
    Preprocessing data
    """
    print("\n" + "="*80)
    print("DATA PREPROCESSING PIPELINE")
    print("="*80)
    
    cols_to_drop = [config.ID_COL, 'Timestamp']
    if config.TARGET_COL in train.columns:
        y = train[config.TARGET_COL].copy()
        cols_to_drop.append(config.TARGET_COL)
    else:
        y = None
    
    cols_to_drop = [col for col in cols_to_drop if col in train.columns]
    X_train = train.drop(cols_to_drop, axis=1).copy()
    X_test = test.drop([col for col in cols_to_drop if col in test.columns], axis=1).copy()
    
    print(f"Initial shapes: X_train={X_train.shape}, X_test={X_test.shape}")
    
    numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
    
    # Hybrid imputation
    mean_impute_cols = ['Accel_Y', 'Accel_Z', 'Dropoff_Lat', 'Dropoff_Long', 'Gyro_Z']
    mean_impute_cols = [c for c in mean_impute_cols if c in numeric_cols]
    
    for col in mean_impute_cols:
        val = X_train[col].mean()
        X_train[col].fillna(val, inplace=True)
        X_test[col].fillna(val, inplace=True)
    
    for col in numeric_cols:
        if X_train[col].isnull().sum() > 0:
            val = X_train[col].median()
            X_train[col].fillna(val, inplace=True)
            X_test[col].fillna(val, inplace=True)
    
    for col in categorical_cols:
        if X_train[col].isnull().sum() > 0:
            X_train[col].fillna('Unknown', inplace=True)
            X_test[col].fillna('Unknown', inplace=True)
    
    # Outlier clipping
    for col in numeric_cols:
        q01, q99 = X_train[col].quantile([0.01, 0.99])
        X_train[col] = X_train[col].clip(q01, q99)
        X_test[col] = X_test[col].clip(q01, q99)
    
    # Ordinal encoding 
    from sklearn.preprocessing import OrdinalEncoder
    ordinal_cols = ['Weather', 'Traffic', 'Payment_Method', 'Signal_Strength']
    ordinal_cols = [c for c in ordinal_cols if c in categorical_cols]
    
    if ordinal_cols:
        oe = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        X_train[ordinal_cols] = oe.fit_transform(X_train[ordinal_cols])
        X_test[ordinal_cols] = oe.transform(X_test[ordinal_cols])
        categorical_cols = [c for c in categorical_cols if c not in ordinal_cols]
    
    # Target encoding
    if categorical_cols and y is not None:
        le_temp = LabelEncoder()
        y_temp = le_temp.fit_transform(y)
        X_train, X_test, _ = encode_categorical_target(X_train, X_test, categorical_cols, y_temp, smoothing=10)
    
    # Target label encoding
    if y is not None:
        le_target = LabelEncoder()
        y_encoded = le_target.fit_transform(y)
        print(f"\nTarget classes: {dict(zip(range(len(le_target.classes_)), le_target.classes_))}")
    else:
        y_encoded, le_target = None, None
    
    print(f"Final shapes: X_train={X_train.shape}, X_test={X_test.shape}")
    return X_train, X_test, y_encoded, le_target

X_train, X_test, y_train, le_target = preprocess_data(train, test)
gc.collect()


DATA PREPROCESSING PIPELINE
Initial shapes: X_train=(8000000, 55), X_test=(4000000, 55)

TARGET ENCODING


Target encoding:   0%|          | 0/6 [00:00<?, ?it/s]


Encoded 6 categorical features

Target classes: {0: 'Fraud_Indication', 1: 'Navigation_Issue', 2: 'Perfect_Trip', 3: 'Safety_Violation', 4: 'Service_Complaint'}
Final shapes: X_train=(8000000, 55), X_test=(4000000, 55)


20

### Class Imbalance Handling

In [11]:
def analyze_class_distribution(y, label_encoder=None):
    """Analyze and report class distribution"""
    print("\n" + "="*80)
    print("CLASS DISTRIBUTION ANALYSIS")
    print("="*80)
    
    unique, counts = np.unique(y, return_counts=True)
    total = len(y)
    
    print(f"\nTotal samples: {total:,}")
    print(f"Number of classes: {len(unique)}")
    print("\nClass distribution:")
    
    for cls, count in zip(unique, counts):
        percentage = (count / total) * 100
        if label_encoder is not None:
            original_label = label_encoder.classes_[cls]
            print(f"  Class {cls} ({original_label}): {count:,} samples ({percentage:.2f}%)")
        else:
            print(f"  Class {cls}: {count:,} samples ({percentage:.2f}%)")
    
    imbalance_ratio = counts.max() / counts.min()
    print(f"\nImbalance ratio (max/min): {imbalance_ratio:.2f}:1")
    
    if imbalance_ratio > 1.5:
        print("\nWARNING: Significant class imbalance detected!")
        print("Recommendation: Apply class balancing techniques")
    
    print("="*80)
    
    return unique, counts, imbalance_ratio
    
unique_classes, class_counts, imbalance_ratio = analyze_class_distribution(y_train, le_target)


CLASS DISTRIBUTION ANALYSIS

Total samples: 8,000,000
Number of classes: 5

Class distribution:
  Class 0 (Fraud_Indication): 400,313 samples (5.00%)
  Class 1 (Navigation_Issue): 801,790 samples (10.02%)
  Class 2 (Perfect_Trip): 4,397,607 samples (54.97%)
  Class 3 (Safety_Violation): 1,601,595 samples (20.02%)
  Class 4 (Service_Complaint): 798,695 samples (9.98%)

Imbalance ratio (max/min): 10.99:1

Recommendation: Apply class balancing techniques


In [12]:
sample_weights = np.ones(len(y_train), dtype=np.float32)

WEIGHT_CLASS_1 = 4.509194597384958
WEIGHT_CLASS_4 = 4.96455487505969

sample_weights[y_train == 1] = WEIGHT_CLASS_1
sample_weights[y_train == 4] = WEIGHT_CLASS_4

print(f"\nWeights applied:")
print(f"  Class 0 (Fraud): 1.0 (no adjustment)")
print(f"  Class 1 (Navigation): {WEIGHT_CLASS_1} ({(y_train == 1).sum():,} samples)")
print(f"  Class 2 (Perfect): 1.0 (no adjustment)")
print(f"  Class 3 (Safety): 1.0 (no adjustment)")
print(f"  Class 4 (Service): {WEIGHT_CLASS_4} ({(y_train == 4).sum():,} samples)")

scale_weights = sample_weights
X_train_balanced = X_train
y_train_balanced = y_train

print(f"\nFinal training set shape: {X_train_balanced.shape}")
print(f"Sample weights shape: {scale_weights.shape}")
print("="*80)

gc.collect()


Weights applied:
  Class 0 (Fraud): 1.0 (no adjustment)
  Class 1 (Navigation): 4.509194597384958 (801,790 samples)
  Class 2 (Perfect): 1.0 (no adjustment)
  Class 3 (Safety): 1.0 (no adjustment)
  Class 4 (Service): 4.96455487505969 (798,695 samples)

Final training set shape: (8000000, 55)
Sample weights shape: (8000000,)


0

## 5. Model Training

In [13]:
def macro_f1_eval(preds, dtrain):
    """
    Custom evaluation function for XGBoost to calculate Macro F1
    """
    labels = dtrain.get_label()
    preds_reshaped = preds.reshape(len(labels), -1)
    pred_labels = np.argmax(preds_reshaped, axis=1)
    # Calculate macro F1
    score = f1_score(labels, pred_labels, average='macro')
    return 'macro_f1', score

In [14]:
import json

best_params = {
    "max_depth": 7,
    "min_child_weight": 7,
    "max_delta_step": 5,
    "gamma": 0.1737507723343592,
    "learning_rate": 0.034002141615166376,
    "subsample": 0.9516920090100376,
    "colsample_bytree": 0.5993238619880857,
    "colsample_bylevel": 0.9963742019132593,
    "colsample_bynode": 0.9228789464520328,
    "reg_alpha": 1.0572914540483875,
    "reg_lambda": 2.1685276721731586
}

In [15]:
def train_xgboost(X_train, y_train, X_test, n_folds=5, use_gpu=False, best_params=None, scale_weights=None):
    print("\n" + "="*80)
    print("Training XGBoost Models")
    print("="*80)
    
    if best_params is not None:
        print("Using tuned best parameters")
        params = config.get_xgboost_params(use_gpu=use_gpu)
        params.update(best_params)
    else:
        params = config.get_xgboost_params(use_gpu=use_gpu)
    
    params['num_class'] = len(np.unique(y_train))
    
    if scale_weights is not None:
        print(f"\nApplying sample weights for class balancing")
        print(f"Sample weights shape: {scale_weights.shape}")
        print(f"Unique weights: {np.unique(scale_weights)}")
    
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=config.RANDOM_STATE)
    
    oof_predictions = np.zeros((len(X_train), len(np.unique(y_train))))
    test_predictions = np.zeros((len(X_test), len(np.unique(y_train))))
    
    fold_scores = []
    models = []
    
    pbar = tqdm(enumerate(skf.split(X_train, y_train), 1), total=n_folds, desc="XGBoost Folds")
    for fold, (train_idx, val_idx) in pbar:
        pbar.set_description(f"XGBoost Fold {fold}/{n_folds}")
        
        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]
        
        if scale_weights is not None:
            dtrain = xgb.DMatrix(X_tr, label=y_tr, weight=scale_weights[train_idx])
            dval = xgb.DMatrix(X_val, label=y_val, weight=scale_weights[val_idx])
        else:
            dtrain = xgb.DMatrix(X_tr, label=y_tr)
            dval = xgb.DMatrix(X_val, label=y_val)
        
        dtest = xgb.DMatrix(X_test)
        
        if fold == 1:
            if scale_weights is not None:
                print(f"\nDEBUG - Fold {fold}:")
                print(f"  Train weights shape: {scale_weights[train_idx].shape}")
                print(f"  Train weights unique: {np.unique(scale_weights[train_idx])}")
                print(f"  Samples with weight {WEIGHT_CLASS_1}: {(scale_weights[train_idx] == WEIGHT_CLASS_1).sum()}")
                print(f"  Samples with weight {WEIGHT_CLASS_4}: {(scale_weights[train_idx] == WEIGHT_CLASS_4).sum()}")
                print(f"  Expected class 1 samples: {(y_tr == 1).sum()}")
                print(f"  Expected class 4 samples: {(y_tr == 4).sum()}")
        
        model = xgb.train(
            params,
            dtrain,
            num_boost_round=1000,
            evals=[(dtrain, 'train'), (dval, 'valid')],
            custom_metric=macro_f1_eval,
            early_stopping_rounds=50,
            verbose_eval=100
        )
        
        oof_predictions[val_idx] = model.predict(dval)
        test_predictions += model.predict(dtest) / n_folds
        
        oof_pred_labels = np.argmax(oof_predictions[val_idx], axis=1)
        fold_score = f1_score(y_val, oof_pred_labels, average='macro')
        fold_scores.append(fold_score)

        print(f"\n{'='*60}")
        print(f"FOLD {fold} SUMMARY:")
        print(f"{'='*60}")
        print(f"  Validation F1 (Macro): {fold_score:.6f}")
        print(f"  Best Iteration: {model.best_iteration}")
        print(f"  Best Score (mlogloss): {model.best_score:.6f}")
        
        oof_fold_pred = np.argmax(oof_predictions[val_idx], axis=1)
        print(f"\n  Per-Class F1 Scores:")
        from sklearn.metrics import classification_report
        print(classification_report(y_val, oof_fold_pred, 
                                   target_names=[f"Class_{i}" for i in range(len(np.unique(y_train)))],
                                   digits=4))
        print(f"{'='*60}\n")
        
        pbar.set_postfix({'F1': f'{fold_score:.6f}'})
        
        models.append(model)
        gc.collect()
    
    oof_pred_labels = np.argmax(oof_predictions, axis=1)
    overall_score = f1_score(y_train, oof_pred_labels, average='macro')
    
    print("\n" + "="*80)
    print("XGBOOST TRAINING SUMMARY")
    print("="*80)
    print(f"Overall CV Score (Macro F1): {overall_score:.6f}")
    print(f"Standard Deviation: {np.std(fold_scores):.6f}")
    print(f"Min F1 Score: {np.min(fold_scores):.6f}")
    print(f"Max F1 Score: {np.max(fold_scores):.6f}")
    print(f"\nFold-by-Fold Scores:")
    for i, score in enumerate(fold_scores, 1):
        print(f"  Fold {i}: {score:.6f}")
    
    print(f"\n" + "="*80)
    print("OVERALL OUT-OF-FOLD PREDICTIONS REPORT")
    print("="*80)
    print(classification_report(y_train, oof_pred_labels, 
                               target_names=[f"Class_{i}" for i in range(len(np.unique(y_train)))],
                               digits=4))
    print("="*80)

    return test_predictions, models, overall_score, oof_predictions

### Hyperparameter Tuning

In [16]:
import optuna

def objective_xgboost(trial):
    """
    Optuna objective function for XGBoost hyperparameter tuning with class balancing
    """
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'max_delta_step': trial.suggest_int('max_delta_step', 0, 5),
        'gamma': trial.suggest_float('gamma', 0.0, 2.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 5.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 5.0)
    }
    
    base_params = config.get_xgboost_params(use_gpu=gpu_config['xgboost_gpu'])
    base_params.update(params)
    base_params['num_class'] = len(np.unique(y_train_balanced))
    
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=config.RANDOM_STATE)
    fold_scores = []
    
    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_balanced, y_train_balanced), 1):
        X_tr, X_val = X_train_balanced.iloc[train_idx], X_train_balanced.iloc[val_idx]
        y_tr, y_val = y_train_balanced[train_idx], y_train_balanced[val_idx]
        
        if scale_weights is not None:
            dtrain = xgb.DMatrix(X_tr, label=y_tr, weight=scale_weights[train_idx])
            dval = xgb.DMatrix(X_val, label=y_val, weight=scale_weights[val_idx])
        else:
            dtrain = xgb.DMatrix(X_tr, label=y_tr)
            dval = xgb.DMatrix(X_val, label=y_val)
        
        model = xgb.train(
            base_params,
            dtrain,
            num_boost_round=1000,
            evals=[(dtrain, 'train'), (dval, 'valid')],
            custom_metric=macro_f1_eval,
            early_stopping_rounds=50,
            verbose_eval=False
        )
        
        val_pred = model.predict(dval)
        val_pred_labels = np.argmax(val_pred, axis=1)
        fold_score = f1_score(y_val, val_pred_labels, average='macro')
        fold_scores.append(fold_score)
        
        del dtrain, dval, model
        gc.collect()
    
    mean_score = np.mean(fold_scores)
    return mean_score


# print("="*80)
# print("OPTUNA HYPERPARAMETER TUNING FOR XGBOOST")
# print("="*80)
# print(f"Dataset: {X_train_balanced.shape[0]:,} samples, {X_train_balanced.shape[1]} features")
# print(f"Cross-validation: 3-fold StratifiedKFold")
# print(f"Optimization metric: Macro F1 Score")
# print(f"Number of trials: 50")
# print(f"Class balancing: {'Enabled' if scale_weights is not None else 'Disabled'}")
# print("="*80)

# study = optuna.create_study(
#     direction='maximize',
#     study_name='xgboost_class_balanced',
#     sampler=optuna.samplers.TPESampler(seed=config.RANDOM_STATE)
# )

# study.optimize(
#     objective_xgboost,
#     n_trials=50,
#     timeout=None,
#     show_progress_bar=True,
#     n_jobs=1
# )

# print("\n" + "="*80)
# print("OPTIMIZATION RESULTS")
# print("="*80)
# print(f"Best trial number: {study.best_trial.number}")
# print(f"Best Macro F1 Score: {study.best_value:.6f}")
# print("\nBest hyperparameters:")
# for key, value in study.best_params.items():
#     print(f"  {key}: {value}")

# print("\n" + "="*80)
# print("TOP 5 TRIALS")
# print("="*80)
# trials_df = study.trials_dataframe()
# trials_df = trials_df.sort_values('value', ascending=False).head(5)
# print(trials_df[['number', 'value', 'params_max_depth', 'params_learning_rate', 
#                   'params_subsample', 'params_colsample_bytree']].to_string(index=False))

# xgboost_tuned_params = study.best_params
# print("\n" + "="*80)
# print("Best parameters saved to variable: xgboost_tuned_params")
# print("You can now use these parameters for final training:")
# print("train_xgboost(..., best_params=xgboost_tuned_params, ...)")
# print("="*80)

In [17]:
xgboost_test_pred, xgboost_models, xgboost_cv_score, oof_predictions = train_xgboost(
    X_train_balanced, y_train_balanced, X_test,
    n_folds=config.N_FOLDS,
    use_gpu=gpu_config['xgboost_gpu'],
    best_params=best_params,
    scale_weights=scale_weights
)


Training XGBoost Models
Using tuned best parameters
  XGBoost: GPU mode activated

Applying sample weights for class balancing
Sample weights shape: (8000000,)
Unique weights: [1.        4.5091944 4.964555 ]


XGBoost Folds:   0%|          | 0/5 [00:00<?, ?it/s]


DEBUG - Fold 1:
  Train weights shape: (6400000,)
  Train weights unique: [1.        4.5091944 4.964555 ]
  Samples with weight 4.509194597384958: 641432
  Samples with weight 4.96455487505969: 638956
  Expected class 1 samples: 641432
  Expected class 4 samples: 638956
[0]	train-mlogloss:1.53233	train-macro_f1:0.14640	valid-mlogloss:1.53235	valid-macro_f1:0.14629
[50]	train-mlogloss:1.07735	train-macro_f1:0.63233	valid-mlogloss:1.07743	valid-macro_f1:0.63237

FOLD 1 SUMMARY:
  Validation F1 (Macro): 0.632371
  Best Iteration: 0
  Best Score (mlogloss): 0.146292

  Per-Class F1 Scores:
              precision    recall  f1-score   support

     Class_0     0.9986    0.9770    0.9877     80062
     Class_1     0.2753    0.1206    0.1678    160358
     Class_2     0.7346    0.8615    0.7930    879522
     Class_3     0.9966    0.9472    0.9713    320319
     Class_4     0.2884    0.2086    0.2421    159739

    accuracy                         0.7450   1600000
   macro avg     0.6587   

In [18]:
# ============================================================================
# THRESHOLD OPTIMIZATION 
# ============================================================================

print("="*80)
print("THRESHOLD OPTIMIZATION")
print("="*80)

# Baseline score
baseline_pred = np.argmax(oof_predictions, axis=1)
baseline_score = f1_score(y_train_balanced, baseline_pred, average='macro')
print(f"Baseline OOF Macro F1: {baseline_score:.6f}")

optimal_thresholds = np.array([2.1188, 1.8953, 1.6760, 1.7246, 1.6181])

print(f"\nUsing pre-computed DE optimal thresholds:")
class_names = {0: 'Fraud', 1: 'Navigation', 2: 'Perfect', 3: 'Safety', 4: 'Service'}
for i, t in enumerate(optimal_thresholds):
    print(f"  {class_names[i]}: {t:.4f}")

# Apply optimal thresholds to OOF predictions
adjusted_oof = oof_predictions * optimal_thresholds
optimized_pred = np.argmax(adjusted_oof, axis=1)
optimized_score = f1_score(y_train_balanced, optimized_pred, average='macro')

print(f"\nBaseline OOF Macro F1:  {baseline_score:.6f}")
print(f"Optimized OOF Macro F1: {optimized_score:.6f}")
print(f"Improvement: +{optimized_score - baseline_score:.6f}")
print("="*80)

THRESHOLD OPTIMIZATION
Baseline OOF Macro F1: 0.632077

Using pre-computed DE optimal thresholds:
  Fraud: 2.1188
  Navigation: 1.8953
  Perfect: 1.6760
  Safety: 1.7246
  Service: 1.6181

Baseline OOF Macro F1:  0.632077
Optimized OOF Macro F1: 0.633060
Improvement: +0.000982


## 6. Model Evaluation and Inference

In [19]:
from collections import defaultdict
import pandas as pd

importance_dict = defaultdict(list)

for model in xgboost_models:
    score = model.get_score(importance_type='gain')
    for feature_name, value in score.items():
        importance_dict[feature_name].append(value)

# Rata-rata across folds
importance_avg = {
    feature: sum(values) / len(values)
    for feature, values in importance_dict.items()
}

importance_df = (
    pd.DataFrame(importance_avg.items(), columns=['feature', 'avg_gain'])
    .sort_values(by='avg_gain', ascending=False)
)

print(importance_df)

                feature       avg_gain
50    Is_Gyro_Z_Outlier  125775.182813
40       Is_Ultra_Short   51406.382812
49             Gyro_Abs   45268.291406
5           Distance_KM   18217.317969
46            Accel_Min   14866.108008
51         Price_per_KM   11643.235352
8               Accel_X   11622.069141
11               Gyro_Z    8468.288477
15           Promo_Code    6906.493359
39  Distance_Difference    4396.950146
47          Accel_Range    4083.097852
6         Est_Price_IDR    3691.481787
44      Accel_Magnitude    2649.036011
38       Distance_Ratio    2510.972656
52       Surge_Category    2501.676245
7      Surge_Multiplier    1980.656592
48            Accel_Std    1270.068176
13         Dropoff_Zone    1260.597485
45            Accel_Max    1104.171777
12          Pickup_Zone    1089.455432
32   Haversine_Distance    1086.097632
25    null_count_sensor    1020.644531
10              Accel_Z     715.841156
9               Accel_Y     412.187891
24  null_count_distance  

In [20]:
print("\n" + "="*80)
print("MODEL EVALUATION ")
print("="*80)

if xgboost_test_pred is not None:
    print(f"\n✓ XGBoost Model Successfully Trained")
    print(f"  Cross-Validation Score (Baseline): {xgboost_cv_score:.6f}")
    print(f"  Cross-Validation Score (Optimized): {optimized_score:.6f}")
    print(f"  Model Type: XGBoost with GPU acceleration")
    
    print("\n" + "="*80)
    print("APPLYING DE THRESHOLDS TO TEST SET")
    print("="*80)
    
    # Apply optimized thresholds 
    print(f"\nUsing DE Optimized Thresholds:")
    class_names = {0: 'Fraud', 1: 'Navigation', 2: 'Perfect', 3: 'Safety', 4: 'Service'}
    for i, t in enumerate(optimal_thresholds):
        print(f"  {class_names.get(i, f'Class {i}')}: {t:.4f}")
    
    # Apply thresholds
    final_predictions = xgboost_test_pred * optimal_thresholds
    final_pred_labels = np.argmax(final_predictions, axis=1)
    
    print(f"\n✓ Optimized Predictions Generated Successfully")
    print(f"  Total test samples: {len(final_pred_labels):,}")
    print(f"  Prediction shape: {final_predictions.shape}")
    print(f"  Classes predicted: {len(np.unique(final_pred_labels))}")
    
    print("\n" + "="*80)
    print("PREDICTION DISTRIBUTION (WITH THRESHOLDS)")
    print("="*80)
    unique, counts = np.unique(final_pred_labels, return_counts=True)
    for class_idx, count in zip(unique, counts):
        percentage = (count / len(final_pred_labels)) * 100
        print(f"  {class_names.get(class_idx, f'Class {class_idx}')}: {count:,} samples ({percentage:.2f}%)")
    
else:
    raise ValueError("XGBoost model training failed! Cannot generate predictions.")

print("\n" + "="*80)


MODEL EVALUATION 

✓ XGBoost Model Successfully Trained
  Cross-Validation Score (Baseline): 0.632077
  Cross-Validation Score (Optimized): 0.633060
  Model Type: XGBoost with GPU acceleration

APPLYING DE THRESHOLDS TO TEST SET

Using DE Optimized Thresholds:
  Fraud: 2.1188
  Navigation: 1.8953
  Perfect: 1.6760
  Safety: 1.7246
  Service: 1.6181

✓ Optimized Predictions Generated Successfully
  Total test samples: 4,000,000
  Prediction shape: (4000000, 5)
  Classes predicted: 5

PREDICTION DISTRIBUTION (WITH THRESHOLDS)
  Fraud: 197,693 samples (4.94%)
  Navigation: 258,784 samples (6.47%)
  Perfect: 2,526,001 samples (63.15%)
  Safety: 788,794 samples (19.72%)
  Service: 228,728 samples (5.72%)



## 7. Generate Submission

In [21]:
def create_submission(test_ids, predictions, le_target, filename='submission.csv'):
    pred_labels = le_target.inverse_transform(predictions)
    
    submission = pd.DataFrame({
        config.ID_COL: test_ids,
        config.TARGET_COL: pred_labels
    })
    
    submission.to_csv(filename, index=False)
    
    print(f"\nSubmission saved to: {filename}")
    print(f"Submission shape: {submission.shape}")
    print(f"\nPrediction distribution:")
    print(submission[config.TARGET_COL].value_counts())
    
    return submission

test_ids = test[config.ID_COL].values
submission = create_submission(test_ids, final_pred_labels, le_target, 'submission-9.csv')
submission.head(10)


Submission saved to: submission-9.csv
Submission shape: (4000000, 2)

Prediction distribution:
Trip_Label
Perfect_Trip         2526001
Safety_Violation      788794
Navigation_Issue      258784
Service_Complaint     228728
Fraud_Indication      197693
Name: count, dtype: int64


Unnamed: 0,Trip_ID,Trip_Label
0,TRIP-06583736,Perfect_Trip
1,TRIP-11356251,Perfect_Trip
2,TRIP-03320505,Service_Complaint
3,TRIP-07188814,Perfect_Trip
4,TRIP-06994869,Perfect_Trip
5,TRIP-03232331,Perfect_Trip
6,TRIP-03536120,Perfect_Trip
7,TRIP-06411895,Perfect_Trip
8,TRIP-00132176,Service_Complaint
9,TRIP-00298208,Perfect_Trip


In [43]:
# !zip submission-13.zip submission-13.csv
# from IPython.display import FileLink
# FileLink('submission-13.zip')

  adding: submission-13.csv (deflated 82%)


## 8. Validation & Analysis

In [23]:
print("\n" + "="*80)
print("Final Validation Checks")
print("="*80)

assert submission.shape[0] == test.shape[0], "Submission size mismatch!"
assert submission.columns.tolist() == [config.ID_COL, config.TARGET_COL], "Column names mismatch!"
assert submission[config.TARGET_COL].isnull().sum() == 0, "Null predictions found!"

expected_labels = set(le_target.classes_)
submission_labels = set(submission[config.TARGET_COL].unique())
assert submission_labels.issubset(expected_labels), "Invalid labels in submission!"
print("All validation checks passed!")
print("="*80)


Final Validation Checks
All validation checks passed!


## Adversarial Validation

In [24]:
# ============================================================================
# STRATEGY 2: ADVERSARIAL VALIDATION
# ============================================================================
# Check if train and test distributions differ significantly

print("="*80)
print("STRATEGY 2: ADVERSARIAL VALIDATION")
print("="*80)

# Use the already processed X_train and X_test (numeric features)
# Create adversarial labels: 0 = train, 1 = test

print(f"\nDataset shapes:")
print(f"  X_train: {X_train.shape}")
print(f"  X_test: {X_test.shape}")

# Sample for faster computation (use 10% of data)
sample_size = min(100000, len(X_train), len(X_test))
np.random.seed(42)

train_sample_idx = np.random.choice(len(X_train), sample_size, replace=False)
test_sample_idx = np.random.choice(len(X_test), sample_size, replace=False)

X_adv_train = X_train.iloc[train_sample_idx].copy()
X_adv_test = X_test.iloc[test_sample_idx].copy()

# Create combined dataset
X_combined = pd.concat([X_adv_train, X_adv_test], axis=0, ignore_index=True)
y_adversarial = np.array([0] * len(X_adv_train) + [1] * len(X_adv_test))

print(f"\nAdversarial dataset:")
print(f"  Combined: {X_combined.shape}")
print(f"  Train samples (label=0): {(y_adversarial == 0).sum()}")
print(f"  Test samples (label=1): {(y_adversarial == 1).sum()}")

STRATEGY 2: ADVERSARIAL VALIDATION

Dataset shapes:
  X_train: (8000000, 55)
  X_test: (4000000, 55)

Adversarial dataset:
  Combined: (200000, 55)
  Train samples (label=0): 100000
  Test samples (label=1): 100000


In [25]:
# ============================================================================
# 2A. TRAIN ADVERSARIAL CLASSIFIER
# ============================================================================
print("\n" + "="*80)
print("2A. TRAINING ADVERSARIAL CLASSIFIER (XGBoost)")
print("="*80)

from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score

# Simple XGBoost for adversarial validation
adv_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 5,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'verbosity': 0
}

if gpu_config['xgboost_gpu']:
    adv_params['device'] = 'cuda'

# 5-fold CV for adversarial validation
from sklearn.model_selection import StratifiedKFold

skf_adv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
adv_scores = []
adv_feature_importance = []

print("\nRunning 5-fold adversarial validation...")

for fold, (train_idx, val_idx) in enumerate(skf_adv.split(X_combined, y_adversarial), 1):
    X_tr, X_val = X_combined.iloc[train_idx], X_combined.iloc[val_idx]
    y_tr, y_val = y_adversarial[train_idx], y_adversarial[val_idx]
    
    dtrain = xgb.DMatrix(X_tr, label=y_tr)
    dval = xgb.DMatrix(X_val, label=y_val)
    
    model = xgb.train(
        adv_params,
        dtrain,
        num_boost_round=100,
        evals=[(dval, 'val')],
        early_stopping_rounds=20,
        verbose_eval=False
    )
    
    val_pred = model.predict(dval)
    auc = roc_auc_score(y_val, val_pred)
    adv_scores.append(auc)
    
    # Feature importance
    importance = model.get_score(importance_type='gain')
    adv_feature_importance.append(importance)
    
    print(f"  Fold {fold}: AUC = {auc:.6f}")

mean_auc = np.mean(adv_scores)
std_auc = np.std(adv_scores)

print(f"\n" + "-"*60)
print(f"ADVERSARIAL VALIDATION RESULTS:")
print(f"-"*60)
print(f"  Mean AUC: {mean_auc:.6f} ± {std_auc:.6f}")

if mean_auc < 0.52:
    print(f"\n✓ EXCELLENT! Train and test distributions are VERY SIMILAR")
    print(f"  Model cannot distinguish train from test (AUC ≈ 0.5)")
    print(f"  → No significant data leakage or distribution shift")
elif mean_auc < 0.55:
    print(f"\n✓ GOOD! Train and test distributions are SIMILAR")
    print(f"  Minor differences exist but not significant")
elif mean_auc < 0.60:
    print(f"\n⚠ WARNING! Some distribution differences detected")
    print(f"  Check which features differ between train/test")
else:
    print(f"\n❌ ALERT! Significant distribution shift detected!")
    print(f"  Train and test sets come from different distributions")
    print(f"  This may cause poor generalization")


2A. TRAINING ADVERSARIAL CLASSIFIER (XGBoost)

Running 5-fold adversarial validation...
  Fold 1: AUC = 0.863996
  Fold 2: AUC = 0.863461
  Fold 3: AUC = 0.864303
  Fold 4: AUC = 0.863111
  Fold 5: AUC = 0.861848

------------------------------------------------------------
ADVERSARIAL VALIDATION RESULTS:
------------------------------------------------------------
  Mean AUC: 0.863344 ± 0.000855

❌ ALERT! Significant distribution shift detected!
  Train and test sets come from different distributions
  This may cause poor generalization


In [26]:
# ============================================================================
# 2B. ANALYZE FEATURES THAT DIFFER BETWEEN TRAIN AND TEST
# ============================================================================
print("\n" + "="*80)
print("2B. FEATURES THAT DISTINGUISH TRAIN vs TEST")
print("="*80)

# Aggregate feature importance across folds
from collections import defaultdict

combined_importance = defaultdict(list)
for imp_dict in adv_feature_importance:
    for feat, score in imp_dict.items():
        combined_importance[feat].append(score)

avg_importance = {
    feat: np.mean(scores) 
    for feat, scores in combined_importance.items()
}

# Sort by importance
sorted_importance = sorted(avg_importance.items(), key=lambda x: x[1], reverse=True)

print("\nTop 15 features that differ between train and test:")
print("-"*60)
for i, (feat, imp) in enumerate(sorted_importance[:15], 1):
    print(f"  {i:2d}. {feat}: {imp:.4f}")

# Visualize distribution comparison for top features
print("\n" + "-"*60)
print("Distribution comparison for top differentiating features:")
print("-"*60)

top_features = [f[0] for f in sorted_importance[:5]]

for feat in top_features:
    if feat in X_train.columns and feat in X_test.columns:
        train_mean = X_train[feat].mean()
        train_std = X_train[feat].std()
        test_mean = X_test[feat].mean()
        test_std = X_test[feat].std()
        
        diff_pct = abs(train_mean - test_mean) / (train_mean + 1e-6) * 100
        
        print(f"\n{feat}:")
        print(f"  Train: {train_mean:.4f} ± {train_std:.4f}")
        print(f"  Test:  {test_mean:.4f} ± {test_std:.4f}")
        print(f"  Difference: {diff_pct:.2f}%")


2B. FEATURES THAT DISTINGUISH TRAIN vs TEST

Top 15 features that differ between train and test:
------------------------------------------------------------
   1. null_count: 952.1405
   2. null_count_temporal: 156.7713
   3. Traffic: 148.9987
   4. Accel_Range: 112.7895
   5. Payment_Method: 95.2621
   6. Accel_Max: 78.7231
   7. Dropoff_Lat: 69.5727
   8. Pickup_Long: 63.4655
   9. null_count_distance: 60.3300
  10. Surge_Category: 54.2494
  11. Signal_Strength: 51.8417
  12. Is_Ultra_Short: 46.8492
  13. Car_Model: 46.6359
  14. Dropoff_Long: 45.4111
  15. Accel_X: 37.2924

------------------------------------------------------------
Distribution comparison for top differentiating features:
------------------------------------------------------------

null_count:
  Train: 3.5176 ± 1.6882
  Test:  1.5953 ± 1.2150
  Difference: 54.65%

null_count_temporal:
  Train: 0.2047 ± 0.4035
  Test:  0.0408 ± 0.1977
  Difference: 80.09%

Traffic:
  Train: 1.4519 ± 1.1026
  Test:  1.0936 ± 0.90

In [27]:
# ============================================================================
# 2C. STATISTICAL TEST FOR DISTRIBUTION SHIFT
# ============================================================================
print("\n" + "="*80)
print("2C. KOLMOGOROV-SMIRNOV TEST FOR DISTRIBUTION SHIFT")
print("="*80)

from scipy.stats import ks_2samp

print("\nKS Test Results (H0: train and test have same distribution):")
print("-"*60)

ks_results = []

for col in X_train.columns:
    if col in X_test.columns:
        # Sample for faster computation
        train_sample = X_train[col].sample(min(10000, len(X_train)), random_state=42)
        test_sample = X_test[col].sample(min(10000, len(X_test)), random_state=42)
        
        stat, p_value = ks_2samp(train_sample, test_sample)
        ks_results.append({
            'feature': col,
            'ks_statistic': stat,
            'p_value': p_value,
            'significant': p_value < 0.05
        })

ks_df = pd.DataFrame(ks_results).sort_values('ks_statistic', ascending=False)

# Show features with significant distribution shift
significant_shift = ks_df[ks_df['significant']]
print(f"\nFeatures with SIGNIFICANT distribution shift (p < 0.05): {len(significant_shift)}/{len(ks_df)}")

if len(significant_shift) > 0:
    print("\nTop 10 features with highest KS statistic:")
    print(ks_df.head(10).to_string(index=False))
else:
    print("\n✓ No features show significant distribution shift!")

# Summary
print("\n" + "="*80)
print("ADVERSARIAL VALIDATION SUMMARY")
print("="*80)
print(f"  Mean AUC: {mean_auc:.6f}")
print(f"  Features with significant shift: {len(significant_shift)}/{len(ks_df)}")

if mean_auc < 0.52 and len(significant_shift) < len(ks_df) * 0.1:
    print("\n✓ CONCLUSION: Train and test are from SAME distribution")
    print("  → Cross-validation scores should generalize well to test set")
else:
    print("\n⚠ CONCLUSION: Some distribution differences exist")
    print("  → May need to adjust for distribution shift")


2C. KOLMOGOROV-SMIRNOV TEST FOR DISTRIBUTION SHIFT

KS Test Results (H0: train and test have same distribution):
------------------------------------------------------------

Features with SIGNIFICANT distribution shift (p < 0.05): 46/55

Top 10 features with highest KS statistic:
            feature  ks_statistic       p_value  significant
         null_count        0.5056  0.000000e+00         True
  null_count_sensor        0.2764 9.881313e-324         True
null_count_distance        0.2399 8.208532e-253         True
        Accel_Range        0.2035 1.572991e-181         True
          Accel_Max        0.1853 2.106527e-150         True
            Traffic        0.1762 5.866928e-136         True
null_count_temporal        0.1663 4.366767e-121         True
    Accel_Magnitude        0.1570 6.501779e-108         True
 Haversine_Distance        0.1321  1.980228e-76         True
        bearing_cos        0.1290  6.791088e-73         True

ADVERSARIAL VALIDATION SUMMARY
  Mean AUC: 0.

## Force Distribution Matching

In [28]:
# ============================================================================
# STRATEGY 3: FORCE DISTRIBUTION MATCHING
# ============================================================================
print("="*80)
print("STRATEGY 3: FORCE DISTRIBUTION MATCHING")
print("="*80)

# Training distribution
train_distribution = train_df['Trip_Label'].value_counts(normalize=True).sort_index()
print("\nTraining Label Distribution:")
for label, pct in train_distribution.items():
    print(f"  {label}: {pct*100:.2f}%")

# Current test prediction distribution (from model)
current_pred_dist = pd.Series(le_target.inverse_transform(final_pred_labels)).value_counts(normalize=True).sort_index()
print("\nCurrent Test Prediction Distribution:")
for label, pct in current_pred_dist.items():
    print(f"  {label}: {pct*100:.2f}%")

# Calculate difference
print("\n" + "-"*60)
print("Distribution Difference (Train - Predicted):")
print("-"*60)
for label in train_distribution.index:
    train_pct = train_distribution.get(label, 0) * 100
    pred_pct = current_pred_dist.get(label, 0) * 100
    diff = train_pct - pred_pct
    print(f"  {label}: {diff:+.2f}%")

STRATEGY 3: FORCE DISTRIBUTION MATCHING

Training Label Distribution:
  Fraud_Indication: 5.00%
  Navigation_Issue: 10.02%
  Perfect_Trip: 54.97%
  Safety_Violation: 20.02%
  Service_Complaint: 9.98%

Current Test Prediction Distribution:
  Fraud_Indication: 4.94%
  Navigation_Issue: 6.47%
  Perfect_Trip: 63.15%
  Safety_Violation: 19.72%
  Service_Complaint: 5.72%

------------------------------------------------------------
Distribution Difference (Train - Predicted):
------------------------------------------------------------
  Fraud_Indication: +0.06%
  Navigation_Issue: +3.55%
  Perfect_Trip: -8.18%
  Safety_Violation: +0.30%
  Service_Complaint: +4.27%


In [29]:
# ============================================================================
# 3A. PROBABILITY-BASED DISTRIBUTION MATCHING
# ============================================================================
print("\n" + "="*80)
print("3A. PROBABILITY-BASED DISTRIBUTION MATCHING")
print("="*80)

def match_distribution_probabilistic(predictions_proba, target_distribution, le_target):
    """
    Adjust predictions to match target distribution using probability ranking.
    
    For each class, select top-N samples based on probability where N matches
    the expected count from target distribution.
    """
    n_samples = len(predictions_proba)
    n_classes = predictions_proba.shape[1]
    
    # Calculate expected counts per class
    expected_counts = {}
    for i, label in enumerate(le_target.classes_):
        expected_counts[i] = int(target_distribution.get(label, 0) * n_samples)
    
    print("\nExpected counts per class:")
    for i, label in enumerate(le_target.classes_):
        print(f"  {label} (class {i}): {expected_counts[i]:,}")
    
    # Initialize predictions
    final_predictions = np.full(n_samples, -1, dtype=int)
    assigned = np.zeros(n_samples, dtype=bool)
    
    # Sort classes by how "distinctive" they are (Fraud and Safety first)
    # These are the classes we can identify well
    class_order = [0, 3, 1, 4, 2]  # Fraud, Safety, Navigation, Service, Perfect
    
    for class_idx in class_order:
        if expected_counts[class_idx] == 0:
            continue
            
        # Get probability for this class
        class_probs = predictions_proba[:, class_idx].copy()
        
        # Mask already assigned samples
        class_probs[assigned] = -np.inf
        
        # Get top N indices
        n_to_assign = expected_counts[class_idx]
        top_indices = np.argsort(class_probs)[-n_to_assign:]
        
        # Assign
        final_predictions[top_indices] = class_idx
        assigned[top_indices] = True
        
        class_name = le_target.classes_[class_idx]
        print(f"  Assigned {n_to_assign:,} samples to {class_name}")
    
    # Assign remaining samples to most likely class
    remaining = ~assigned
    if remaining.sum() > 0:
        remaining_idx = np.where(remaining)[0]
        for idx in remaining_idx:
            available_classes = [c for c in range(n_classes) if not assigned[idx]]
            probs = predictions_proba[idx, available_classes]
            final_predictions[idx] = available_classes[np.argmax(probs)]
        print(f"  Assigned {remaining.sum():,} remaining samples")
    
    return final_predictions

# Apply distribution matching
# Use raw probabilities (before threshold adjustment)
matched_predictions = match_distribution_probabilistic(
    xgboost_test_pred,  # Raw probabilities
    train_distribution,
    le_target
)

# Check new distribution
matched_labels = le_target.inverse_transform(matched_predictions)
matched_dist = pd.Series(matched_labels).value_counts(normalize=True).sort_index()

print("\n" + "-"*60)
print("NEW Distribution After Matching:")
print("-"*60)
for label in train_distribution.index:
    train_pct = train_distribution.get(label, 0) * 100
    new_pct = matched_dist.get(label, 0) * 100
    diff = abs(train_pct - new_pct)
    print(f"  {label}: {new_pct:.2f}% (target: {train_pct:.2f}%, diff: {diff:.2f}%)")


3A. PROBABILITY-BASED DISTRIBUTION MATCHING

Expected counts per class:
  Fraud_Indication (class 0): 200,156
  Navigation_Issue (class 1): 400,895
  Perfect_Trip (class 2): 2,198,803
  Safety_Violation (class 3): 800,797
  Service_Complaint (class 4): 399,347
  Assigned 200,156 samples to Fraud_Indication
  Assigned 800,797 samples to Safety_Violation
  Assigned 400,895 samples to Navigation_Issue
  Assigned 399,347 samples to Service_Complaint
  Assigned 2,198,803 samples to Perfect_Trip
  Assigned 2 remaining samples

------------------------------------------------------------
NEW Distribution After Matching:
------------------------------------------------------------
  Fraud_Indication: 5.00% (target: 5.00%, diff: 0.00%)
  Navigation_Issue: 10.02% (target: 10.02%, diff: 0.00%)
  Perfect_Trip: 54.97% (target: 54.97%, diff: 0.00%)
  Safety_Violation: 20.02% (target: 20.02%, diff: 0.00%)
  Service_Complaint: 9.98% (target: 9.98%, diff: 0.00%)


In [30]:
# ============================================================================
# 3B. CREATE DISTRIBUTION-MATCHED SUBMISSION
# ============================================================================
print("\n" + "="*80)
print("3B. CREATE DISTRIBUTION-MATCHED SUBMISSION")
print("="*80)

# Compare with original predictions
original_labels = le_target.inverse_transform(final_pred_labels)
changes = (matched_labels != original_labels).sum()
print(f"\nTotal predictions changed: {changes:,} ({changes/len(matched_labels)*100:.2f}%)")

# Breakdown of changes
print("\nChange breakdown:")
for orig_class in le_target.classes_:
    orig_mask = original_labels == orig_class
    for new_class in le_target.classes_:
        if orig_class != new_class:
            changed = ((original_labels == orig_class) & (matched_labels == new_class)).sum()
            if changed > 0:
                print(f"  {orig_class} → {new_class}: {changed:,}")

# Create submission
submission_matched = pd.DataFrame({
    config.ID_COL: test_ids,
    config.TARGET_COL: matched_labels
})

submission_matched.to_csv('submission-10.csv', index=False)

print(f"\n✓ Submission saved to: submission-distribution-matched.csv")
print(f"  Shape: {submission_matched.shape}")

print("\n" + "-"*60)
print("Distribution Comparison:")
print("-"*60)
print("\n                    Original    Matched    Training")
for label in train_distribution.index:
    orig_pct = current_pred_dist.get(label, 0) * 100
    match_pct = matched_dist.get(label, 0) * 100
    train_pct = train_distribution.get(label, 0) * 100
    print(f"  {label:20s} {orig_pct:6.2f}%    {match_pct:6.2f}%    {train_pct:6.2f}%")


3B. CREATE DISTRIBUTION-MATCHED SUBMISSION

Total predictions changed: 347,009 (8.68%)

Change breakdown:
  Navigation_Issue → Fraud_Indication: 495
  Navigation_Issue → Perfect_Trip: 7,131
  Navigation_Issue → Safety_Violation: 3,305
  Navigation_Issue → Service_Complaint: 1
  Perfect_Trip → Fraud_Indication: 1,492
  Perfect_Trip → Navigation_Issue: 153,043
  Perfect_Trip → Safety_Violation: 7,676
  Perfect_Trip → Service_Complaint: 172,214
  Safety_Violation → Fraud_Indication: 57
  Service_Complaint → Fraud_Indication: 419
  Service_Complaint → Perfect_Trip: 97
  Service_Complaint → Safety_Violation: 1,079

✓ Submission saved to: submission-distribution-matched.csv
  Shape: (4000000, 2)

------------------------------------------------------------
Distribution Comparison:
------------------------------------------------------------

                    Original    Matched    Training
  Fraud_Indication       4.94%      5.00%      5.00%
  Navigation_Issue       6.47%     10.02%     

## Remove Adversarial Feature & Retrain

In [31]:
# ============================================================================
# STRATEGY 4: REMOVE ADVERSARIAL FEATURES & RETRAIN
# ============================================================================
print("="*80)
print("STRATEGY 4: REMOVE ADVERSARIAL FEATURES")
print("="*80)

# Features to remove based on adversarial validation
# These have high distribution shift between train and test
ADVERSARIAL_FEATURES = [
    'null_count',
    'null_count_temporal', 
    'null_count_distance',
    'null_count_sensor',
    'null_count_economic',
]

print(f"\nFeatures to REMOVE (high distribution shift):")
for f in ADVERSARIAL_FEATURES:
    if f in X_train.columns:
        print(f"  ✗ {f}")

# Create new datasets without adversarial features
features_to_keep = [c for c in X_train.columns if c not in ADVERSARIAL_FEATURES]

X_train_clean = X_train[features_to_keep].copy()
X_test_clean = X_test[features_to_keep].copy()

print(f"\nOriginal features: {X_train.shape[1]}")
print(f"Cleaned features: {X_train_clean.shape[1]}")
print(f"Removed: {X_train.shape[1] - X_train_clean.shape[1]} features")

STRATEGY 4: REMOVE ADVERSARIAL FEATURES

Features to REMOVE (high distribution shift):
  ✗ null_count
  ✗ null_count_temporal
  ✗ null_count_distance
  ✗ null_count_sensor
  ✗ null_count_economic

Original features: 55
Cleaned features: 50
Removed: 5 features


In [32]:
# ============================================================================
# 4A. VERIFY ADVERSARIAL IMPROVEMENT
# ============================================================================
print("\n" + "="*80)
print("4A. VERIFY DISTRIBUTION SHIFT REDUCTION")
print("="*80)

# Quick adversarial validation on cleaned data
sample_size = min(50000, len(X_train_clean), len(X_test_clean))
np.random.seed(42)

train_idx = np.random.choice(len(X_train_clean), sample_size, replace=False)
test_idx = np.random.choice(len(X_test_clean), sample_size, replace=False)

X_adv_train_clean = X_train_clean.iloc[train_idx]
X_adv_test_clean = X_test_clean.iloc[test_idx]

X_combined_clean = pd.concat([X_adv_train_clean, X_adv_test_clean], axis=0, ignore_index=True)
y_adv_clean = np.array([0] * len(X_adv_train_clean) + [1] * len(X_adv_test_clean))

# Quick 3-fold CV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

rf_adv = RandomForestClassifier(n_estimators=50, max_depth=5, random_state=42, n_jobs=-1)
adv_scores_clean = cross_val_score(rf_adv, X_combined_clean, y_adv_clean, cv=3, scoring='roc_auc')

print(f"\nAdversarial AUC BEFORE removing features: 0.863")
print(f"Adversarial AUC AFTER removing features:  {adv_scores_clean.mean():.3f}")
print(f"\nImprovement: {(0.863 - adv_scores_clean.mean())*100:.1f}% closer to 0.5 (ideal)")

if adv_scores_clean.mean() < 0.80:
    print("\n✓ Good improvement! Distribution shift reduced.")
else:
    print("\n⚠ Still significant shift. May need to remove more features.")


4A. VERIFY DISTRIBUTION SHIFT REDUCTION

Adversarial AUC BEFORE removing features: 0.863
Adversarial AUC AFTER removing features:  0.785

Improvement: 7.8% closer to 0.5 (ideal)

✓ Good improvement! Distribution shift reduced.


In [33]:
# ============================================================================
# 4B. RETRAIN XGBOOST WITHOUT ADVERSARIAL FEATURES
# ============================================================================
print("\n" + "="*80)
print("4B. RETRAIN XGBOOST (WITHOUT ADVERSARIAL FEATURES)")
print("="*80)

# Use the same training function but with cleaned data
xgboost_test_pred_clean, xgboost_models_clean, xgboost_cv_score_clean, oof_predictions_clean = train_xgboost(
    X_train_clean, y_train_balanced, X_test_clean,
    n_folds=config.N_FOLDS,
    use_gpu=gpu_config['xgboost_gpu'],
    best_params=best_params,
    scale_weights=scale_weights
)

print(f"\n" + "="*80)
print("COMPARISON: WITH vs WITHOUT ADVERSARIAL FEATURES")
print("="*80)
print(f"  Original CV Score:    {xgboost_cv_score:.6f}")
print(f"  Clean CV Score:       {xgboost_cv_score_clean:.6f}")
print(f"  Difference:           {xgboost_cv_score_clean - xgboost_cv_score:+.6f}")


4B. RETRAIN XGBOOST (WITHOUT ADVERSARIAL FEATURES)

Training XGBoost Models
Using tuned best parameters
  XGBoost: GPU mode activated

Applying sample weights for class balancing
Sample weights shape: (8000000,)
Unique weights: [1.        4.5091944 4.964555 ]


XGBoost Folds:   0%|          | 0/5 [00:00<?, ?it/s]


DEBUG - Fold 1:
  Train weights shape: (6400000,)
  Train weights unique: [1.        4.5091944 4.964555 ]
  Samples with weight 4.509194597384958: 641432
  Samples with weight 4.96455487505969: 638956
  Expected class 1 samples: 641432
  Expected class 4 samples: 638956
[0]	train-mlogloss:1.53274	train-macro_f1:0.14554	valid-mlogloss:1.53276	valid-macro_f1:0.14548
[49]	train-mlogloss:1.08232	train-macro_f1:0.63168	valid-mlogloss:1.08238	valid-macro_f1:0.63187

FOLD 1 SUMMARY:
  Validation F1 (Macro): 0.631813
  Best Iteration: 0
  Best Score (mlogloss): 0.145476

  Per-Class F1 Scores:
              precision    recall  f1-score   support

     Class_0     0.9989    0.9708    0.9847     80062
     Class_1     0.2750    0.1207    0.1677    160358
     Class_2     0.7347    0.8545    0.7901    879522
     Class_3     0.9967    0.9471    0.9713    320319
     Class_4     0.2800    0.2182    0.2453    159739

    accuracy                         0.7418   1600000
   macro avg     0.6571   

In [34]:
# ============================================================================
# 4C. APPLY DE THRESHOLDS & CREATE SUBMISSION
# ============================================================================
print("\n" + "="*80)
print("4C. APPLY THRESHOLDS & CREATE SUBMISSION")
print("="*80)

# Apply the same DE thresholds
adjusted_oof_clean = oof_predictions_clean * optimal_thresholds
optimized_pred_clean = np.argmax(adjusted_oof_clean, axis=1)
optimized_score_clean = f1_score(y_train_balanced, optimized_pred_clean, average='macro')

print(f"\nBaseline OOF (clean): {xgboost_cv_score_clean:.6f}")
print(f"Optimized OOF (clean): {optimized_score_clean:.6f}")

# Apply to test
final_predictions_clean = xgboost_test_pred_clean * optimal_thresholds
final_pred_labels_clean = np.argmax(final_predictions_clean, axis=1)

# Check distribution
pred_dist_clean = pd.Series(le_target.inverse_transform(final_pred_labels_clean)).value_counts(normalize=True).sort_index()

print("\n" + "-"*60)
print("Prediction Distribution (Clean Model):")
print("-"*60)
for label in train_distribution.index:
    pred_pct = pred_dist_clean.get(label, 0) * 100
    train_pct = train_distribution.get(label, 0) * 100
    diff = pred_pct - train_pct
    print(f"  {label}: {pred_pct:.2f}% (train: {train_pct:.2f}%, diff: {diff:+.2f}%)")

# Create submission
submission_clean = create_submission(
    test_ids, 
    final_pred_labels_clean, 
    le_target, 
    'submission-no-adversarial.csv'
)

print(f"\n✓ Submission saved: submission-no-adversarial.csv")


4C. APPLY THRESHOLDS & CREATE SUBMISSION

Baseline OOF (clean): 0.631891
Optimized OOF (clean): 0.632993

------------------------------------------------------------
Prediction Distribution (Clean Model):
------------------------------------------------------------
  Fraud_Indication: 4.94% (train: 5.00%, diff: -0.06%)
  Navigation_Issue: 7.02% (train: 10.02%, diff: -3.00%)
  Perfect_Trip: 62.52% (train: 54.97%, diff: +7.55%)
  Safety_Violation: 19.73% (train: 20.02%, diff: -0.29%)
  Service_Complaint: 5.79% (train: 9.98%, diff: -4.19%)

Submission saved to: submission-no-adversarial.csv
Submission shape: (4000000, 2)

Prediction distribution:
Trip_Label
Perfect_Trip         2500873
Safety_Violation      789064
Navigation_Issue      280744
Service_Complaint     231579
Fraud_Indication      197740
Name: count, dtype: int64

✓ Submission saved: submission-no-adversarial.csv


In [35]:
# ============================================================================
# 4D. ALSO CREATE DISTRIBUTION-MATCHED VERSION (CLEAN MODEL)
# ============================================================================
print("\n" + "="*80)
print("4D. DISTRIBUTION MATCHING ON CLEAN MODEL")
print("="*80)

# Apply distribution matching to clean model predictions
matched_predictions_clean = match_distribution_probabilistic(
    xgboost_test_pred_clean,
    train_distribution,
    le_target
)

matched_labels_clean = le_target.inverse_transform(matched_predictions_clean)

# Create submission
submission_clean_matched = pd.DataFrame({
    config.ID_COL: test_ids,
    config.TARGET_COL: matched_labels_clean
})

submission_clean_matched.to_csv('submission-no-adversarial-matched.csv', index=False)

print(f"\n✓ Submission saved: submission-no-adversarial-matched.csv")

# Summary
print("\n" + "="*80)
print("STRATEGY 4 SUMMARY - SUBMISSIONS CREATED")
print("="*80)
print("""
Files created:
  1. submission-no-adversarial.csv
     → Model tanpa null_count features + DE thresholds
     
  2. submission-no-adversarial-matched.csv
     → Model tanpa null_count features + distribution matching

Rekomendasi submit:
  - Coba keduanya untuk bandingkan dengan submission sebelumnya
  - Jika LB score naik, berarti removing adversarial features membantu
""")


4D. DISTRIBUTION MATCHING ON CLEAN MODEL

Expected counts per class:
  Fraud_Indication (class 0): 200,156
  Navigation_Issue (class 1): 400,895
  Perfect_Trip (class 2): 2,198,803
  Safety_Violation (class 3): 800,797
  Service_Complaint (class 4): 399,347
  Assigned 200,156 samples to Fraud_Indication
  Assigned 800,797 samples to Safety_Violation
  Assigned 400,895 samples to Navigation_Issue
  Assigned 399,347 samples to Service_Complaint
  Assigned 2,198,803 samples to Perfect_Trip
  Assigned 2 remaining samples

✓ Submission saved: submission-no-adversarial-matched.csv

STRATEGY 4 SUMMARY - SUBMISSIONS CREATED

Files created:
  1. submission-no-adversarial.csv
     → Model tanpa null_count features + DE thresholds
     
  2. submission-no-adversarial-matched.csv
     → Model tanpa null_count features + distribution matching

Rekomendasi submit:
  - Coba keduanya untuk bandingkan dengan submission sebelumnya
  - Jika LB score naik, berarti removing adversarial features membantu



## Stacking Model

In [36]:
# ============================================================================
# STRATEGY 5: XGBOOST + CATBOOST ENSEMBLE
# ============================================================================
print("="*80)
print("STRATEGY 5: XGBOOST + CATBOOST ENSEMBLE")
print("="*80)

# Install CatBoost if needed
try:
    import catboost as cb
    print("✓ CatBoost imported")
except ImportError:
    print("Installing CatBoost...")
    !pip install catboost -q
    import catboost as cb
    print("✓ CatBoost installed and imported")

print("\nAll libraries ready for ensemble!")

# Use clean features (without adversarial features)
X_stack = X_train_clean.copy()
X_stack_test = X_test_clean.copy()

STRATEGY 5: XGBOOST + CATBOOST ENSEMBLE
✓ CatBoost imported

All libraries ready for ensemble!


In [37]:
# ============================================================================
# 5A. TRAIN CATBOOST WITH GPU
# ============================================================================
print("\n" + "="*80)
print("5A. TRAINING CATBOOST (GPU)")
print("="*80)

from catboost import CatBoostClassifier, Pool

cb_params = {
    'iterations': 1000,
    'learning_rate': 0.05,
    'depth': 7,
    'l2_leaf_reg': 3,
    'loss_function': 'MultiClass',
    'eval_metric': 'TotalF1:average=Macro',
    'random_seed': 42,
    'verbose': 100,
    'early_stopping_rounds': 50,
    'class_weights': {0: 1.0, 1: WEIGHT_CLASS_1, 2: 1.0, 3: 1.0, 4: WEIGHT_CLASS_4}
}

if gpu_config['xgboost_gpu']:
    cb_params['task_type'] = 'GPU'
    print("  CatBoost: GPU mode")
else:
    print("  CatBoost: CPU mode")

skf = StratifiedKFold(n_splits=config.N_FOLDS, shuffle=True, random_state=config.RANDOM_STATE)

cb_oof = np.zeros((len(X_stack), 5))
cb_test_pred = np.zeros((len(X_stack_test), 5))
cb_fold_scores = []

for fold, (train_idx, val_idx) in enumerate(tqdm(skf.split(X_stack, y_train_balanced), total=config.N_FOLDS, desc="CatBoost Folds"), 1):
    X_tr, X_val = X_stack.iloc[train_idx], X_stack.iloc[val_idx]
    y_tr, y_val = y_train_balanced[train_idx], y_train_balanced[val_idx]
    
    train_pool = Pool(X_tr, y_tr)
    val_pool = Pool(X_val, y_val)
    
    model = CatBoostClassifier(**cb_params)
    model.fit(train_pool, eval_set=val_pool, use_best_model=True)
    
    cb_oof[val_idx] = model.predict_proba(X_val)
    cb_test_pred += model.predict_proba(X_stack_test) / config.N_FOLDS
    
    fold_pred = np.argmax(cb_oof[val_idx], axis=1)
    fold_score = f1_score(y_val, fold_pred, average='macro')
    cb_fold_scores.append(fold_score)
    print(f"  Fold {fold}: Macro F1 = {fold_score:.6f}")
    
    del model, train_pool, val_pool
    gc.collect()

cb_cv_score = f1_score(y_train_balanced, np.argmax(cb_oof, axis=1), average='macro')
print(f"\n✓ CatBoost CV Score: {cb_cv_score:.6f}")


5A. TRAINING CATBOOST (GPU)
  CatBoost: GPU mode


CatBoost Folds:   0%|          | 0/5 [00:00<?, ?it/s]

0:	learn: 0.5763012	test: 0.5768562	best: 0.5768562 (0)	total: 15.2s	remaining: 4h 13m 4s
100:	learn: 0.6097476	test: 0.6100038	best: 0.6100038 (100)	total: 22.1s	remaining: 3m 16s
200:	learn: 0.6146405	test: 0.6147245	best: 0.6147245 (200)	total: 28.8s	remaining: 1m 54s
300:	learn: 0.6163473	test: 0.6162462	best: 0.6162462 (300)	total: 35.6s	remaining: 1m 22s
400:	learn: 0.6173809	test: 0.6168042	best: 0.6168042 (400)	total: 42.8s	remaining: 1m 3s
500:	learn: 0.6181920	test: 0.6170780	best: 0.6171027 (496)	total: 50.1s	remaining: 49.9s
600:	learn: 0.6189663	test: 0.6174807	best: 0.6174966 (589)	total: 57.6s	remaining: 38.3s
700:	learn: 0.6196536	test: 0.6176417	best: 0.6176956 (683)	total: 1m 5s	remaining: 27.8s
bestTest = 0.6176956144
bestIteration = 683
Shrink model to first 684 iterations.
  Fold 1: Macro F1 = 0.625378
0:	learn: 0.5765774	test: 0.5765635	best: 0.5765635 (0)	total: 69.6ms	remaining: 1m 9s
100:	learn: 0.6089163	test: 0.6089590	best: 0.6089708 (99)	total: 6.97s	remain

In [38]:
# ============================================================================
# 5B. ENSEMBLE XGBOOST + CATBOOST
# ============================================================================
print("\n" + "="*80)
print("5B. ENSEMBLE PREDICTIONS (XGBoost + CatBoost)")
print("="*80)

# Individual model scores
print("\nIndividual Model CV Scores:")
print(f"  XGBoost (clean):  {xgboost_cv_score_clean:.6f}")
print(f"  CatBoost:         {cb_cv_score:.6f}")

# Use XGBoost OOF from clean model
xgb_oof_clean = oof_predictions_clean

# Try different ensemble weights
print("\n" + "-"*60)
print("Testing different XGBoost:CatBoost weights:")
print("-"*60)

best_ensemble_score = 0
best_weights = None

# Grid search for optimal weights (XGB:CB ratio)
for w_xgb in np.arange(0.3, 0.8, 0.05):
    w_cb = 1.0 - w_xgb
    
    # Weighted average of probabilities
    ensemble_oof = w_xgb * xgb_oof_clean + w_cb * cb_oof
    ensemble_pred = np.argmax(ensemble_oof, axis=1)
    ensemble_score = f1_score(y_train_balanced, ensemble_pred, average='macro')
    
    if ensemble_score > best_ensemble_score:
        best_ensemble_score = ensemble_score
        best_weights = (w_xgb, w_cb)
        print(f"  New best: XGB={w_xgb:.2f}, CB={w_cb:.2f} → F1={ensemble_score:.6f}")

print(f"\n✓ Best Ensemble Weights: XGB={best_weights[0]:.2f}, CB={best_weights[1]:.2f}")
print(f"✓ Best Ensemble CV Score: {best_ensemble_score:.6f}")

# Improvement over single models
print("\n" + "-"*60)
print("Improvement over single models:")
print("-"*60)
print(f"  vs XGBoost: {best_ensemble_score - xgboost_cv_score_clean:+.6f}")
print(f"  vs CatBoost: {best_ensemble_score - cb_cv_score:+.6f}")


5B. ENSEMBLE PREDICTIONS (XGBoost + CatBoost)

Individual Model CV Scores:
  XGBoost (clean):  0.631891
  CatBoost:         0.625193

------------------------------------------------------------
Testing different XGBoost:CatBoost weights:
------------------------------------------------------------
  New best: XGB=0.30, CB=0.70 → F1=0.627299
  New best: XGB=0.35, CB=0.65 → F1=0.627717
  New best: XGB=0.40, CB=0.60 → F1=0.628150
  New best: XGB=0.45, CB=0.55 → F1=0.628569
  New best: XGB=0.50, CB=0.50 → F1=0.629045
  New best: XGB=0.55, CB=0.45 → F1=0.629485
  New best: XGB=0.60, CB=0.40 → F1=0.629928
  New best: XGB=0.65, CB=0.35 → F1=0.630393
  New best: XGB=0.70, CB=0.30 → F1=0.630831
  New best: XGB=0.75, CB=0.25 → F1=0.631253

✓ Best Ensemble Weights: XGB=0.75, CB=0.25
✓ Best Ensemble CV Score: 0.631253

------------------------------------------------------------
Improvement over single models:
------------------------------------------------------------
  vs XGBoost: -0.000638
 

In [39]:
# ============================================================================
# 5C. APPLY THRESHOLD TO ENSEMBLE & CREATE SUBMISSION
# ============================================================================
print("\n" + "="*80)
print("5C. ENSEMBLE + THRESHOLD OPTIMIZATION")
print("="*80)

# Apply best weights to get ensemble predictions
w_xgb, w_cb = best_weights

# For OOF
ensemble_oof_final = w_xgb * xgb_oof_clean + w_cb * cb_oof

# Apply DE thresholds to ensemble
ensemble_oof_thresh = ensemble_oof_final * optimal_thresholds
ensemble_pred_thresh = np.argmax(ensemble_oof_thresh, axis=1)
ensemble_score_thresh = f1_score(y_train_balanced, ensemble_pred_thresh, average='macro')

print(f"\nEnsemble + Thresholds CV Score: {ensemble_score_thresh:.6f}")
print(f"Improvement from thresholds: {ensemble_score_thresh - best_ensemble_score:+.6f}")

# For test set
ensemble_test_final = w_xgb * xgboost_test_pred_clean + w_cb * cb_test_pred

# Apply thresholds
ensemble_test_thresh = ensemble_test_final * optimal_thresholds
ensemble_test_pred_labels = np.argmax(ensemble_test_thresh, axis=1)

# Check distribution
ensemble_dist = pd.Series(le_target.inverse_transform(ensemble_test_pred_labels)).value_counts(normalize=True).sort_index()

print("\n" + "-"*60)
print("Ensemble Prediction Distribution:")
print("-"*60)
for label in train_distribution.index:
    pred_pct = ensemble_dist.get(label, 0) * 100
    train_pct = train_distribution.get(label, 0) * 100
    diff = pred_pct - train_pct
    print(f"  {label}: {pred_pct:.2f}% (train: {train_pct:.2f}%, diff: {diff:+.2f}%)")

# Create submission
submission_ensemble = create_submission(
    test_ids,
    ensemble_test_pred_labels,
    le_target,
    'submission-ensemble.csv'
)

print(f"\n✓ Submission saved: submission-ensemble.csv")


5C. ENSEMBLE + THRESHOLD OPTIMIZATION

Ensemble + Thresholds CV Score: 0.633850
Improvement from thresholds: +0.002597

------------------------------------------------------------
Ensemble Prediction Distribution:
------------------------------------------------------------
  Fraud_Indication: 4.95% (train: 5.00%, diff: -0.05%)
  Navigation_Issue: 9.44% (train: 10.02%, diff: -0.58%)
  Perfect_Trip: 57.17% (train: 54.97%, diff: +2.20%)
  Safety_Violation: 19.76% (train: 20.02%, diff: -0.26%)
  Service_Complaint: 8.68% (train: 9.98%, diff: -1.31%)

Submission saved to: submission-ensemble.csv
Submission shape: (4000000, 2)

Prediction distribution:
Trip_Label
Perfect_Trip         2286874
Safety_Violation      790257
Navigation_Issue      377726
Service_Complaint     347018
Fraud_Indication      198125
Name: count, dtype: int64

✓ Submission saved: submission-ensemble.csv


In [41]:
# ============================================================================
# 5D. ENSEMBLE + DISTRIBUTION MATCHING
# ============================================================================
print("\n" + "="*80)
print("5D. ENSEMBLE + DISTRIBUTION MATCHING")
print("="*80)

# Apply distribution matching to ensemble
ensemble_matched = match_distribution_probabilistic(
    ensemble_test_final,  # Raw ensemble probabilities
    train_distribution,
    le_target
)

ensemble_matched_labels = le_target.inverse_transform(ensemble_matched)

# Create submission
submission_ensemble_matched = pd.DataFrame({
    config.ID_COL: test_ids,
    config.TARGET_COL: ensemble_matched_labels
})

submission_ensemble_matched.to_csv('submission-e13.csv', index=False)

print(f"\n✓ Submission saved: submission-ensemble-matched.csv")

# Final summary
print("\n" + "="*80)
print("STRATEGY 5 SUMMARY - ALL SUBMISSIONS")
print("="*80)
print(f"""
Model CV Scores:
  XGBoost (clean):    {xgboost_cv_score_clean:.6f}
  CatBoost:           {cb_cv_score:.6f}
  Ensemble:           {best_ensemble_score:.6f}
  Ensemble+Thresh:    {ensemble_score_thresh:.6f}

Submissions created:
  1. submission-ensemble.csv
     → XGBoost + CatBoost Ensemble + DE thresholds
     
  2. submission-ensemble-matched.csv
     → XGBoost + CatBoost Ensemble + Distribution matching

Best weights: XGB={best_weights[0]:.2f}, CB={best_weights[1]:.2f}
""")


5D. ENSEMBLE + DISTRIBUTION MATCHING

Expected counts per class:
  Fraud_Indication (class 0): 200,156
  Navigation_Issue (class 1): 400,895
  Perfect_Trip (class 2): 2,198,803
  Safety_Violation (class 3): 800,797
  Service_Complaint (class 4): 399,347
  Assigned 200,156 samples to Fraud_Indication
  Assigned 800,797 samples to Safety_Violation
  Assigned 400,895 samples to Navigation_Issue
  Assigned 399,347 samples to Service_Complaint
  Assigned 2,198,803 samples to Perfect_Trip
  Assigned 2 remaining samples

✓ Submission saved: submission-ensemble-matched.csv

STRATEGY 5 SUMMARY - ALL SUBMISSIONS

Model CV Scores:
  XGBoost (clean):    0.631891
  CatBoost:           0.625193
  Ensemble:           0.631253
  Ensemble+Thresh:    0.633850

Submissions created:
  1. submission-ensemble.csv
     → XGBoost + CatBoost Ensemble + DE thresholds
     
  2. submission-ensemble-matched.csv
     → XGBoost + CatBoost Ensemble + Distribution matching

Best weights: XGB=0.75, CB=0.25

