## 1. Configuration & Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score, classification_report, confusion_matrix
import gc

# Silence every warning so the notebook output stays readable.
# NOTE(review): this also hides library deprecation warnings raised by later cells.
warnings.filterwarnings('ignore')
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

print("Libraries imported successfully.")

In [None]:
class Config:
    """Static settings for the pipeline: paths, CV setup, feature switches
    and per-library model hyper-parameters.

    All attributes are class-level constants. ``RANDOM_STATE`` is the single
    source of truth for seeding; the three parameter dictionaries below read
    it at class-definition time, so change the seed here rather than inside
    the dictionaries.
    """

    # Directory holding train.csv, test.csv and sample_submission.csv.
    DATA_PATH = '/kaggle/input/ride-hailing-trip-classification-dataset/'

    # Cross-validation setup and the key column names.
    N_FOLDS = 5
    RANDOM_STATE = 42
    TARGET_COL = 'Trip_Label'
    ID_COL = 'Trip_ID'

    # Switches for the feature groups built by engineer_features().
    USE_TEMPORAL = True
    USE_DISTANCE = True
    USE_SENSOR_AGG = True
    USE_ECONOMIC = True
    USE_INTERACTION = True

    # Keyword arguments passed to CatBoostClassifier(**...).
    CATBOOST_PARAMS = {
        'iterations': 1000,
        'learning_rate': 0.05,
        'depth': 6,
        'loss_function': 'MultiClass',
        'eval_metric': 'TotalF1:average=Macro',
        'auto_class_weights': 'Balanced',
        'random_seed': RANDOM_STATE,
        'verbose': 100,
        'early_stopping_rounds': 50,
    }

    # Parameters passed to lgb.train(); 'num_class' is added by the trainer.
    LIGHTGBM_PARAMS = {
        'objective': 'multiclass',
        'metric': 'multi_logloss',
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'random_state': RANDOM_STATE,
        'verbose': -1,
    }

    # Parameters passed to xgb.train(); 'num_class' is added by the trainer.
    XGBOOST_PARAMS = {
        'objective': 'multi:softprob',
        'eval_metric': 'mlogloss',
        'max_depth': 6,
        'learning_rate': 0.05,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'random_state': RANDOM_STATE,
        'verbosity': 1,
    }

# Module-level configuration instance read by the functions defined below.
config = Config()
print("Configuration loaded successfully.")

## 2. Data Loading & Validation

In [None]:
def load_data():
    """Read the competition CSV files from ``config.DATA_PATH``.

    Prints the shape of each frame and, when the target column is present
    in the training data, its class counts.

    Returns:
        tuple: ``(train, test, sample_submission)`` as pandas DataFrames.
    """
    # Local import keeps this cell self-contained.
    import os

    def _read(filename):
        # os.path.join works whether or not DATA_PATH ends with a separator.
        return pd.read_csv(os.path.join(config.DATA_PATH, filename))

    print("Loading data...")
    train = _read('train.csv')
    test = _read('test.csv')
    sample_submission = _read('sample_submission.csv')

    print(f"Train shape: {train.shape}")
    print(f"Test shape: {test.shape}")
    print(f"Sample submission shape: {sample_submission.shape}")

    if config.TARGET_COL in train.columns:
        print("\nTarget distribution:")
        print(train[config.TARGET_COL].value_counts())

    return train, test, sample_submission

# Load the three CSV files once at notebook scope.
train, test, sample_submission = load_data()

In [None]:
def optimize_memory(df):
    """Downcast numeric columns of ``df`` to the smallest dtype that fits.

    Signed and unsigned integer columns are narrowed to the smallest
    8/16/32-bit type of the same signedness whose range contains the
    column's min and max (bounds inclusive). Floating-point columns wider
    than 32 bits are converted to ``float32`` when their range fits, which
    trades precision for memory. Every other dtype (bool, object, datetime,
    category, pandas extension types) is left untouched.

    Args:
        df: DataFrame to optimise. It is modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    signed_targets = (np.int8, np.int16, np.int32)
    unsigned_targets = (np.uint8, np.uint16, np.uint32)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy int/uint/float columns are candidates. Skipping the
        # rest keeps bool columns from being widened to float32 and avoids
        # type errors when comparing datetimes or strings with numeric limits.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            limits = np.finfo(np.float32)
            # All-NaN or infinite columns fail these comparisons and stay as-is.
            if col_type.itemsize > 4 and c_min > limits.min and c_max < limits.max:
                df[col] = df[col].astype(np.float32)
            continue

        candidates = signed_targets if col_type.kind == 'i' else unsigned_targets
        for target in candidates:
            # Candidates are ordered by width; stop once they are no narrower.
            if np.dtype(target).itemsize >= col_type.itemsize:
                break
            limits = np.iinfo(target)
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(target)
                break

    return df

print("Optimizing memory...")
# Downcast both splits; the results are rebound to the same names.
train = optimize_memory(train)
test = optimize_memory(test)
print("Memory optimization completed.")

## 3. Feature Engineering

In [None]:
def haversine_distance(lat1, lon1, lat2, lon2, radius_km=6371.0):
    """Great-circle distance between two points given in decimal degrees.

    Works element-wise on scalars, NumPy arrays or pandas Series.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.
        radius_km: Sphere radius; the default is a mean Earth radius in km.

    Returns:
        Distance in the unit of ``radius_km``, with the same container type
        as the inputs.
    """
    lat1_rad = np.radians(lat1)
    lat2_rad = np.radians(lat2)
    half_dlat = np.radians(lat2 - lat1) / 2
    half_dlon = np.radians(lon2 - lon1) / 2

    a = (np.sin(half_dlat) ** 2
         + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(half_dlon) ** 2)
    # Rounding (notably with float32 inputs) can push `a` marginally outside
    # [0, 1]; clamp it so sqrt(1 - a) never yields NaN for near-antipodal points.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    return radius_km * c

def engineer_features(df, is_train=True):
    """Return a copy of ``df`` with derived feature columns added.

    Each feature group is controlled by a ``config.USE_*`` switch and is
    only built when the columns it needs are present:

    * temporal: Hour, DayOfWeek, Month, IsWeekend, IsRushHour, IsLateNight
    * distance: Haversine_Distance, Distance_Ratio, Distance_Difference,
      Is_Same_Zone
    * sensor: Accel_Magnitude / Max / Min / Std / Range, Gyro_Abs
    * economic: Price_per_KM, Has_Promo, Surge_Category
    * interaction: Surge_Hour_Interaction, Traffic_Numeric, Distance_Traffic

    Args:
        df: Input frame; it is not modified.
        is_train: Only selects the word used in the progress message.

    Returns:
        A new DataFrame containing the original and the derived columns.
    """
    print(f"Engineering features for {'train' if is_train else 'test'} set...")
    df = df.copy()

    if config.USE_TEMPORAL and 'Timestamp' in df.columns:
        print("  - Creating temporal features...")
        # Parse into a local Series rather than adding and dropping a helper column.
        parsed = pd.to_datetime(df['Timestamp'])
        df['Hour'] = parsed.dt.hour.astype(np.int8)
        df['DayOfWeek'] = parsed.dt.dayofweek.astype(np.int8)
        df['Month'] = parsed.dt.month.astype(np.int8)
        df['IsWeekend'] = (df['DayOfWeek'] >= 5).astype(np.int8)
        morning_rush = (df['Hour'] >= 7) & (df['Hour'] <= 9)
        evening_rush = (df['Hour'] >= 17) & (df['Hour'] <= 19)
        df['IsRushHour'] = (morning_rush | evening_rush).astype(np.int8)
        df['IsLateNight'] = ((df['Hour'] >= 22) | (df['Hour'] <= 5)).astype(np.int8)

    if config.USE_DISTANCE:
        print("  - Creating distance features...")
        coord_cols = ['Pickup_Lat', 'Pickup_Long', 'Dropoff_Lat', 'Dropoff_Long']
        if all(col in df.columns for col in coord_cols):
            df['Haversine_Distance'] = haversine_distance(
                df['Pickup_Lat'], df['Pickup_Long'],
                df['Dropoff_Lat'], df['Dropoff_Long']
            )

            if 'Distance_KM' in df.columns:
                # The 1e-6 offset avoids division by zero for identical points.
                df['Distance_Ratio'] = df['Distance_KM'] / (df['Haversine_Distance'] + 1e-6)
                df['Distance_Difference'] = np.abs(df['Distance_KM'] - df['Haversine_Distance'])

        if 'Pickup_Zone' in df.columns and 'Dropoff_Zone' in df.columns:
            df['Is_Same_Zone'] = (df['Pickup_Zone'] == df['Dropoff_Zone']).astype(np.int8)

    if config.USE_SENSOR_AGG:
        print("  - Creating sensor aggregation features...")
        accel_cols = ['Accel_X', 'Accel_Y', 'Accel_Z']
        if all(col in df.columns for col in accel_cols):
            df['Accel_Magnitude'] = np.sqrt(df['Accel_X']**2 + df['Accel_Y']**2 + df['Accel_Z']**2)
            df['Accel_Max'] = df[accel_cols].max(axis=1)
            df['Accel_Min'] = df[accel_cols].min(axis=1)
            df['Accel_Std'] = df[accel_cols].std(axis=1)
            df['Accel_Range'] = df['Accel_Max'] - df['Accel_Min']

        if 'Gyro_Z' in df.columns:
            df['Gyro_Abs'] = np.abs(df['Gyro_Z'])

    if config.USE_ECONOMIC:
        print("  - Creating economic features...")
        if 'Est_Price_IDR' in df.columns and 'Distance_KM' in df.columns:
            df['Price_per_KM'] = df['Est_Price_IDR'] / (df['Distance_KM'] + 1e-6)

        if 'Promo_Code' in df.columns:
            df['Has_Promo'] = (df['Promo_Code'].notna()).astype(np.int8)

        if 'Surge_Multiplier' in df.columns:
            # labels=False yields the bin index directly. include_lowest and
            # the open upper edge keep 0 and values above 10 inside a bin;
            # missing or negative multipliers become -1 instead of raising
            # when cast to an integer dtype.
            surge_bin = pd.cut(df['Surge_Multiplier'],
                               bins=[0, 1, 1.5, 2, np.inf],
                               labels=False,
                               include_lowest=True)
            df['Surge_Category'] = surge_bin.fillna(-1).astype(np.int8)

    if config.USE_INTERACTION:
        print("  - Creating interaction features...")
        if 'Surge_Multiplier' in df.columns and 'Hour' in df.columns:
            df['Surge_Hour_Interaction'] = df['Surge_Multiplier'] * df['Hour']

        if 'Distance_KM' in df.columns and 'Traffic' in df.columns:
            traffic_map = {'Light': 1, 'Moderate': 2, 'Heavy': 3}
            # Traffic values outside the map (including missing) are encoded as 0.
            df['Traffic_Numeric'] = df['Traffic'].map(traffic_map).fillna(0).astype(np.int8)
            df['Distance_Traffic'] = df['Distance_KM'] * df['Traffic_Numeric']

    print(f"Feature engineering completed. Shape: {df.shape}")
    return df

# Run the same feature-engineering function on both splits.
train = engineer_features(train, is_train=True)
test = engineer_features(test, is_train=False)
gc.collect()

## 4. Preprocessing

In [None]:
def preprocess_data(train, test):
    """Turn the engineered frames into model-ready matrices.

    Steps, with all statistics fitted on the training split only:

    1. Drop the ID, raw timestamp and target columns.
    2. Impute numeric columns with the training median and object columns
       with the literal ``'Unknown'``.
    3. Clip numeric columns to the training 1st-99th percentile range.
    4. Label-encode object columns; test categories unseen in training
       are encoded as ``-1``.
    5. Label-encode the target, when present.

    Args:
        train: Training frame, optionally containing ``config.TARGET_COL``.
        test: Test frame with the same feature columns.

    Returns:
        tuple: ``(X_train, X_test, y_encoded, le_target, label_encoders)``.
        ``y_encoded`` and ``le_target`` are ``None`` when the training frame
        has no target column; ``label_encoders`` maps column name to its
        fitted ``LabelEncoder``.
    """
    print("Preprocessing data...")

    cols_to_drop = [config.ID_COL, 'Timestamp']
    if config.TARGET_COL in train.columns:
        y = train[config.TARGET_COL].copy()
        cols_to_drop.append(config.TARGET_COL)
    else:
        y = None

    cols_to_drop = [col for col in cols_to_drop if col in train.columns]
    X_train = train.drop(cols_to_drop, axis=1)
    X_test = test.drop([col for col in cols_to_drop if col in test.columns], axis=1)

    print("\nMissing values before imputation:")
    train_missing = X_train.isnull().sum()
    if train_missing.sum() > 0:
        print(train_missing[train_missing > 0])
    else:
        print("No missing values.")

    numeric_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

    # Assign the filled column back. `X[col].fillna(..., inplace=True)` acts on
    # a temporary Series and leaves the frame unchanged under Copy-on-Write.
    for col in numeric_cols:
        median_val = X_train[col].median()
        X_train[col] = X_train[col].fillna(median_val)
        X_test[col] = X_test[col].fillna(median_val)

    for col in categorical_cols:
        X_train[col] = X_train[col].fillna('Unknown')
        X_test[col] = X_test[col].fillna('Unknown')

    # NOTE(review): clipping applies to every numeric column, so a binary flag
    # that is set in fewer than 1% of training rows becomes constant.
    for col in numeric_cols:
        q99 = X_train[col].quantile(0.99)
        q01 = X_train[col].quantile(0.01)
        X_train[col] = X_train[col].clip(q01, q99)
        X_test[col] = X_test[col].clip(q01, q99)

    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        X_train[col] = le.fit_transform(X_train[col].astype(str))
        # One dictionary lookup per row instead of one le.transform() call per row.
        class_to_code = {label: code for code, label in enumerate(le.classes_)}
        X_test[col] = (
            X_test[col].astype(str).map(class_to_code).fillna(-1).astype(np.int64)
        )
        label_encoders[col] = le

    if y is not None:
        le_target = LabelEncoder()
        y_encoded = le_target.fit_transform(y)
        print("\nTarget encoding:")
        for i, label in enumerate(le_target.classes_):
            print(f"  {label}: {i}")
    else:
        y_encoded = None
        le_target = None

    print("\nPreprocessing completed.")
    print(f"X_train shape: {X_train.shape}")
    print(f"X_test shape: {X_test.shape}")

    return X_train, X_test, y_encoded, le_target, label_encoders

# Build the model matrices; y_train holds the label-encoded target.
X_train, X_test, y_train, le_target, label_encoders = preprocess_data(train, test)
gc.collect()

## 5. Model Training - CatBoost

In [None]:
# CatBoost is optional: record whether the import succeeded so later code can skip it.
try:
    from catboost import CatBoostClassifier, Pool
    CATBOOST_AVAILABLE = True
except ImportError:
    print("CatBoost not available. Install with: pip install catboost")
    CATBOOST_AVAILABLE = False

def train_catboost(X_train, y_train, X_test, n_folds=5):
    """Train one CatBoost classifier per stratified fold.

    Args:
        X_train: Training feature frame (indexed positionally with ``iloc``).
        y_train: Integer-encoded target array aligned with ``X_train``.
        X_test: Test feature frame to predict.
        n_folds: Number of stratified folds.

    Returns:
        tuple: ``(test_predictions, models)`` where ``test_predictions`` is
        the fold-averaged class-probability matrix for ``X_test`` and
        ``models`` is the list of fitted classifiers. Returns
        ``(None, None)`` when CatBoost is not installed.
    """
    if not CATBOOST_AVAILABLE:
        return None, None

    banner = "=" * 80
    print("\n" + banner)
    print("Training CatBoost Models")
    print(banner)

    n_classes = len(np.unique(y_train))
    oof_predictions = np.zeros((len(X_train), n_classes))
    test_predictions = np.zeros((len(X_test), n_classes))

    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=config.RANDOM_STATE)
    fold_scores, models = [], []

    for fold, (train_idx, val_idx) in enumerate(splitter.split(X_train, y_train), 1):
        print(f"\nFold {fold}/{n_folds}")
        print("-" * 40)

        X_tr, y_tr = X_train.iloc[train_idx], y_train[train_idx]
        X_val, y_val = X_train.iloc[val_idx], y_train[val_idx]

        model = CatBoostClassifier(**config.CATBOOST_PARAMS)
        model.fit(
            Pool(X_tr, y_tr),
            eval_set=Pool(X_val, y_val),
            use_best_model=True,
            plot=False,
        )

        # Out-of-fold probabilities for this fold's validation rows.
        oof_predictions[val_idx] = model.predict_proba(X_val)
        # Each fold contributes an equal share to the test average.
        test_predictions += model.predict_proba(X_test) / n_folds

        fold_labels = np.argmax(oof_predictions[val_idx], axis=1)
        fold_score = f1_score(y_val, fold_labels, average='macro')
        fold_scores.append(fold_score)

        print(f"Fold {fold} Macro F1-Score: {fold_score:.6f}")

        models.append(model)
        gc.collect()

    overall_score = f1_score(y_train, np.argmax(oof_predictions, axis=1), average='macro')

    print("\n" + banner)
    print(f"CatBoost Overall CV Score: {overall_score:.6f} (+/- {np.std(fold_scores):.6f})")
    print(banner)

    return test_predictions, models

# Train only when CatBoost imported; otherwise keep None placeholders,
# which the ensemble step checks for before using a model's predictions.
if CATBOOST_AVAILABLE:
    catboost_test_pred, catboost_models = train_catboost(X_train, y_train, X_test, config.N_FOLDS)
else:
    catboost_test_pred, catboost_models = None, None

## 6. Model Training - LightGBM

In [None]:
# LightGBM is optional: record whether the import succeeded so later code can skip it.
try:
    import lightgbm as lgb
    LIGHTGBM_AVAILABLE = True
except ImportError:
    print("LightGBM not available. Install with: pip install lightgbm")
    LIGHTGBM_AVAILABLE = False

def train_lightgbm(X_train, y_train, X_test, n_folds=5):
    """Train one LightGBM booster per stratified fold.

    Args:
        X_train: Training feature frame (indexed positionally with ``iloc``).
        y_train: Integer-encoded target array aligned with ``X_train``.
        X_test: Test feature frame to predict.
        n_folds: Number of stratified folds.

    Returns:
        tuple: ``(test_predictions, models)`` where ``test_predictions`` is
        the fold-averaged class-probability matrix for ``X_test`` and
        ``models`` is the list of trained boosters. Returns
        ``(None, None)`` when LightGBM is not installed.
    """
    if not LIGHTGBM_AVAILABLE:
        return None, None

    banner = "=" * 80
    print("\n" + banner)
    print("Training LightGBM Models")
    print(banner)

    n_classes = len(np.unique(y_train))
    oof_predictions = np.zeros((len(X_train), n_classes))
    test_predictions = np.zeros((len(X_test), n_classes))

    # Work on a copy so the shared config dictionary is not modified.
    params = config.LIGHTGBM_PARAMS.copy()
    params['num_class'] = n_classes

    splitter = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=config.RANDOM_STATE)
    fold_scores, models = [], []

    for fold, (train_idx, val_idx) in enumerate(splitter.split(X_train, y_train), 1):
        print(f"\nFold {fold}/{n_folds}")
        print("-" * 40)

        X_tr, y_tr = X_train.iloc[train_idx], y_train[train_idx]
        X_val, y_val = X_train.iloc[val_idx], y_train[val_idx]

        train_data = lgb.Dataset(X_tr, label=y_tr)
        val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)

        model = lgb.train(
            params,
            train_data,
            num_boost_round=1000,
            valid_sets=[train_data, val_data],
            valid_names=['train', 'valid'],
            callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)],
        )

        # Predict with the iteration chosen by early stopping.
        best_iter = model.best_iteration
        oof_predictions[val_idx] = model.predict(X_val, num_iteration=best_iter)
        test_predictions += model.predict(X_test, num_iteration=best_iter) / n_folds

        fold_labels = np.argmax(oof_predictions[val_idx], axis=1)
        fold_score = f1_score(y_val, fold_labels, average='macro')
        fold_scores.append(fold_score)

        print(f"Fold {fold} Macro F1-Score: {fold_score:.6f}")

        models.append(model)
        gc.collect()

    overall_score = f1_score(y_train, np.argmax(oof_predictions, axis=1), average='macro')

    print("\n" + banner)
    print(f"LightGBM Overall CV Score: {overall_score:.6f} (+/- {np.std(fold_scores):.6f})")
    print(banner)

    return test_predictions, models

# Train only when LightGBM imported; otherwise keep None placeholders,
# which the ensemble step checks for before using a model's predictions.
if LIGHTGBM_AVAILABLE:
    lightgbm_test_pred, lightgbm_models = train_lightgbm(X_train, y_train, X_test, config.N_FOLDS)
else:
    lightgbm_test_pred, lightgbm_models = None, None

## 7. Model Training - XGBoost

In [None]:
# XGBoost is optional: record whether the import succeeded so later code can skip it.
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    print("XGBoost not available. Install with: pip install xgboost")
    XGBOOST_AVAILABLE = False

def train_xgboost(X_train, y_train, X_test, n_folds=5):
    """Train one XGBoost booster per stratified fold.

    Predictions are made at the iteration selected by early stopping, the
    same way the CatBoost and LightGBM trainers in this file predict.

    Args:
        X_train: Training feature frame (indexed positionally with ``iloc``).
        y_train: Integer-encoded target array aligned with ``X_train``.
        X_test: Test feature frame to predict.
        n_folds: Number of stratified folds.

    Returns:
        tuple: ``(test_predictions, models)`` where ``test_predictions`` is
        the fold-averaged class-probability matrix for ``X_test`` and
        ``models`` is the list of trained boosters. Returns
        ``(None, None)`` when XGBoost is not installed.
    """
    if not XGBOOST_AVAILABLE:
        return None, None

    print("\n" + "="*80)
    print("Training XGBoost Models")
    print("="*80)

    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=config.RANDOM_STATE)

    n_classes = len(np.unique(y_train))
    oof_predictions = np.zeros((len(X_train), n_classes))
    test_predictions = np.zeros((len(X_test), n_classes))

    fold_scores = []
    models = []

    # Work on a copy so the shared config dictionary is not modified.
    params = config.XGBOOST_PARAMS.copy()
    params['num_class'] = n_classes

    # The test matrix is identical for every fold, so build it once.
    dtest = xgb.DMatrix(X_test)

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), 1):
        print(f"\nFold {fold}/{n_folds}")
        print("-" * 40)

        X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train[train_idx], y_train[val_idx]

        dtrain = xgb.DMatrix(X_tr, label=y_tr)
        dval = xgb.DMatrix(X_val, label=y_val)

        model = xgb.train(
            params,
            dtrain,
            num_boost_round=1000,
            evals=[(dtrain, 'train'), (dval, 'valid')],
            early_stopping_rounds=50,
            verbose_eval=100
        )

        # The native Booster.predict() uses every trained tree by default,
        # including rounds added after the best validation score. Restrict it
        # to the best iteration when early stopping recorded one.
        predict_kwargs = {}
        best_iteration = getattr(model, 'best_iteration', None)
        if best_iteration is not None:
            predict_kwargs['iteration_range'] = (0, best_iteration + 1)

        oof_predictions[val_idx] = model.predict(dval, **predict_kwargs)
        test_predictions += model.predict(dtest, **predict_kwargs) / n_folds

        oof_pred_labels = np.argmax(oof_predictions[val_idx], axis=1)
        fold_score = f1_score(y_val, oof_pred_labels, average='macro')
        fold_scores.append(fold_score)

        print(f"Fold {fold} Macro F1-Score: {fold_score:.6f}")

        models.append(model)
        gc.collect()

    oof_pred_labels = np.argmax(oof_predictions, axis=1)
    overall_score = f1_score(y_train, oof_pred_labels, average='macro')

    print("\n" + "="*80)
    print(f"XGBoost Overall CV Score: {overall_score:.6f} (+/- {np.std(fold_scores):.6f})")
    print("="*80)

    return test_predictions, models

# Train only when XGBoost imported; otherwise keep None placeholders,
# which the ensemble step checks for before using a model's predictions.
if XGBOOST_AVAILABLE:
    xgboost_test_pred, xgboost_models = train_xgboost(X_train, y_train, X_test, config.N_FOLDS)
else:
    xgboost_test_pred, xgboost_models = None, None

## 8. Ensemble Predictions

In [None]:
def ensemble_predictions(predictions_list, weights=None):
    """Blend several probability matrices into one weighted average.

    ``None`` entries in ``predictions_list`` (models that were not trained)
    are ignored.

    Args:
        predictions_list: Sequence of equally shaped arrays, or ``None``.
        weights: Optional sequence of non-negative numbers. It may have one
            weight per entry of ``predictions_list`` (weights of ``None``
            entries are dropped) or one weight per non-``None`` entry.
            Weights are rescaled to sum to 1. Defaults to equal weights.

    Returns:
        The single remaining array unchanged when only one prediction is
        available, otherwise a new float64 array with the weighted average.

    Raises:
        ValueError: If no prediction is available, if the number of weights
            matches neither accepted length, or if the weights do not sum to
            a positive value.
    """
    all_predictions = list(predictions_list)
    kept_positions = [i for i, pred in enumerate(all_predictions) if pred is not None]
    valid_predictions = [all_predictions[i] for i in kept_positions]

    if not valid_predictions:
        raise ValueError("No valid predictions available for ensemble")

    if len(valid_predictions) == 1:
        return valid_predictions[0]

    if weights is None:
        weights = [1.0 / len(valid_predictions)] * len(valid_predictions)
    else:
        weights = list(weights)
        if len(weights) == len(all_predictions):
            # Keep each weight paired with its own prediction after filtering.
            weights = [weights[i] for i in kept_positions]
        elif len(weights) != len(valid_predictions):
            raise ValueError(
                f"Got {len(weights)} weights for {len(valid_predictions)} "
                f"available predictions"
            )
        total = float(sum(weights))
        if total <= 0:
            raise ValueError("Ensemble weights must sum to a positive value")
        # Rescale so the blend of probability rows still sums to 1.
        weights = [w / total for w in weights]

    # Accumulate in float64 so integer inputs do not break the in-place add.
    ensemble_pred = np.zeros(np.shape(valid_predictions[0]), dtype=np.float64)
    for pred, weight in zip(valid_predictions, weights):
        ensemble_pred += np.asarray(pred, dtype=np.float64) * weight

    return ensemble_pred

print("\n" + "="*80)
print("Creating Ensemble Predictions")
print("="*80)

# Pair each model name with its test predictions; untrained models are None.
candidate_predictions = [
    ("CatBoost", catboost_test_pred),
    ("LightGBM", lightgbm_test_pred),
    ("XGBoost", xgboost_test_pred),
]

available_predictions = []
model_names = []
for candidate_name, candidate_pred in candidate_predictions:
    if candidate_pred is not None:
        available_predictions.append(candidate_pred)
        model_names.append(candidate_name)

print(f"Ensembling {len(available_predictions)} models: {', '.join(model_names)}")

# Fail with an actionable message instead of an IndexError further down.
if not available_predictions:
    raise RuntimeError(
        "No model predictions available: install at least one of "
        "catboost, lightgbm or xgboost and re-run the training cells."
    )

if len(available_predictions) > 1:
    final_predictions = ensemble_predictions(available_predictions)
    print("Using equal weights for ensemble")
else:
    final_predictions = available_predictions[0]
    print(f"Using single model: {model_names[0]}")

# Turn class probabilities into integer class indices.
final_pred_labels = np.argmax(final_predictions, axis=1)
print(f"\nFinal predictions shape: {final_predictions.shape}")
print("="*80)

## 9. Generate Submission

In [None]:
def create_submission(test_ids, predictions, le_target, filename='submission.csv'):
    """Write the submission CSV and return it as a DataFrame.

    Args:
        test_ids: Identifier values, one per test row.
        predictions: Integer class indices, one per test row.
        le_target: Fitted encoder used to map indices back to label names.
        filename: Path of the CSV file to write.

    Returns:
        DataFrame with the ``config.ID_COL`` and ``config.TARGET_COL`` columns.
    """
    # Map encoded class indices back to the original label values.
    decoded_labels = le_target.inverse_transform(predictions)

    columns = {
        config.ID_COL: test_ids,
        config.TARGET_COL: decoded_labels,
    }
    submission = pd.DataFrame(columns)
    submission.to_csv(filename, index=False)

    print(f"\nSubmission saved to: {filename}")
    print(f"Submission shape: {submission.shape}")
    print("\nPrediction distribution:")
    label_counts = submission[config.TARGET_COL].value_counts()
    print(label_counts)

    return submission

# IDs come from the engineered test frame, which still has the ID column.
test_ids = test[config.ID_COL].values
submission = create_submission(test_ids, final_pred_labels, le_target, 'submission.csv')
# Last expression of the cell: displays the first rows in a notebook.
submission.head(10)

## 10. Validation & Analysis

In [None]:
print("\n" + "="*80)
print("Final Validation Checks")
print("="*80)

# Structural checks: one row per test sample, the two expected columns in order, no nulls.
# NOTE(review): assert statements are skipped when Python runs with -O.
assert submission.shape[0] == test.shape[0], "Submission size mismatch!"
assert submission.columns.tolist() == [config.ID_COL, config.TARGET_COL], "Column names mismatch!"
assert submission[config.TARGET_COL].isnull().sum() == 0, "Null predictions found!"

# Every predicted label must be one of the classes the target encoder was fitted on.
expected_labels = set(le_target.classes_)
submission_labels = set(submission[config.TARGET_COL].unique())
assert submission_labels.issubset(expected_labels), "Invalid labels in submission!"

print("All validation checks passed!")
print("="*80)

In [None]:
# Report and plot feature importances from the first CatBoost fold model, if any.
if catboost_models:
    print("\nTop 20 Important Features (CatBoost):")
    feature_importance = catboost_models[0].get_feature_importance()
    feature_names = X_train.columns

    importance_table = {'Feature': feature_names, 'Importance': feature_importance}
    importance_df = pd.DataFrame(importance_table).sort_values('Importance', ascending=False)

    top_features = importance_df.head(20)
    print(top_features.to_string(index=False))

    # Horizontal bars, with the y-axis inverted so the top feature is first.
    bar_positions = range(len(top_features))
    plt.figure(figsize=(10, 8))
    plt.barh(bar_positions, top_features['Importance'])
    plt.yticks(bar_positions, top_features['Feature'])
    plt.xlabel('Importance')
    plt.title('Top 20 Feature Importances', fontweight='bold')
    plt.gca().invert_yaxis()
    plt.tight_layout()
    plt.show()

## 11. Summary

In [None]:
# Closing report: data sizes, which model libraries were available, and the submission size.
print("\n" + "="*80)
print("PIPELINE EXECUTION SUMMARY")
print("="*80)
print(f"\nData:")
print(f"  Training samples: {len(train):,}")
print(f"  Test samples: {len(test):,}")
print(f"  Features after engineering: {X_train.shape[1]}")

# A model is listed when its library imported successfully.
print(f"\nModels Trained:")
if CATBOOST_AVAILABLE:
    print(f"  - CatBoost: {config.N_FOLDS} folds")
if LIGHTGBM_AVAILABLE:
    print(f"  - LightGBM: {config.N_FOLDS} folds")
if XGBOOST_AVAILABLE:
    print(f"  - XGBoost: {config.N_FOLDS} folds")

print(f"\nSubmission:")
print(f"  File: submission.csv")
print(f"  Predictions: {len(submission):,}")

print("\n" + "="*80)
print("Pipeline completed successfully!")
print("="*80)