In [1]:
# Imports
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score

# Load data
train = pd.read_csv('../data/train_clean.csv')
test = pd.read_csv('../data/test_clean.csv')

# Split features and labels
X = train.drop(columns=['Transported'])
y = train['Transported'].astype(int)

# One-hot encode categorical features (object, bool, category)
X_encoded = pd.get_dummies(X, drop_first=True)
test_encoded = pd.get_dummies(test, drop_first=True)

# Align columns in case train/test differ
X_encoded, test_encoded = X_encoded.align(test_encoded, join='left', axis=1, fill_value=0)

print(f"Encoded train shape: {X_encoded.shape}")
print(f"Encoded test shape: {test_encoded.shape}")


Encoded train shape: (8693, 23766)
Encoded test shape: (4277, 23766)


In [2]:
# Set up CV
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
log_losses = []
accuracies = []

# OOF and test predictions
oof_preds = np.zeros(len(train))
test_preds = np.zeros(len(test_encoded))

for fold, (train_idx, val_idx) in enumerate(cv.split(X_encoded, y)):
    X_train, y_train = X_encoded.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X_encoded.iloc[val_idx], y.iloc[val_idx]

    model = XGBClassifier(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss'
    )

    model.fit(X_train, y_train)

    val_probs = model.predict_proba(X_val)[:, 1]
    val_preds = model.predict(X_val)

    oof_preds[val_idx] = val_probs
    test_preds += model.predict_proba(test_encoded)[:, 1] / cv.n_splits

    fold_log_loss = log_loss(y_val, val_probs)
    fold_accuracy = accuracy_score(y_val, val_preds)

    log_losses.append(fold_log_loss)
    accuracies.append(fold_accuracy)

    print(f"Fold {fold+1} Log Loss: {fold_log_loss:.5f} | Accuracy: {fold_accuracy:.4f}")

# Final results
print(f"\nOverall CV Log Loss: {np.mean(log_losses):.5f}")
print(f"Overall CV Accuracy: {np.mean(accuracies):.4f}")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 1 Log Loss: 0.39597 | Accuracy: 0.8079


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 2 Log Loss: 0.40716 | Accuracy: 0.8062


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 3 Log Loss: 0.39986 | Accuracy: 0.7993


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 4 Log Loss: 0.37923 | Accuracy: 0.8136


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Fold 5 Log Loss: 0.41144 | Accuracy: 0.7980

Overall CV Log Loss: 0.39873
Overall CV Accuracy: 0.8050
