In [1]:
import pandas as pd

# Load the cleaned baseline train/test splits from disk
train, test = (
    pd.read_csv('../data/train_clean.csv'),
    pd.read_csv('../data/test_clean.csv'),
)

# Report (rows, columns) of each split as a quick sanity check
print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")

Train shape: (8693, 34)
Test shape: (4277, 33)


**Random Forest Classifier (Baseline)**

>Let’s train a Random Forest model on the same data setup used for LightGBM.
This will give us a baseline to compare model types before tuning.

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score
import numpy as np

# Target: the 'Transported' column cast to integers
y = train['Transported'].astype(int)

# Features: every remaining training column, one-hot encoded
# (non-numeric columns are expanded into indicator columns by get_dummies)
X = pd.get_dummies(train.drop(columns=['Transported']))

# Encode the test frame on its own, then force it onto the training layout:
# columns absent from test are created and filled with 0, columns that only
# exist in test are discarded, and the column order follows X
test_encoded = pd.get_dummies(test).reindex(columns=X.columns, fill_value=0)

# Stratified, shuffled 5-fold split with a fixed seed
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Accumulators: out-of-fold probabilities for every training row, and the
# fold-averaged probabilities for every test row
oof_preds = np.zeros(len(X))
test_preds = np.zeros(len(test_encoded))

for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    # Positional slices for this fold
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # A fresh forest per fold; fit() returns the fitted estimator itself
    model = RandomForestClassifier(
        n_estimators=100,
        random_state=42,
        n_jobs=-1,
    ).fit(X_train, y_train)

    # Column 1 of predict_proba is the second entry of model.classes_
    # (presumably label 1, assuming y only holds 0/1 — verify)
    val_proba = model.predict_proba(X_val)[:, 1]
    oof_preds[val_idx] = val_proba

    # Each fold contributes an equal share to the averaged test prediction
    test_preds += model.predict_proba(test_encoded)[:, 1] / cv.n_splits

    # Per-fold metrics on the held-out rows (accuracy thresholds at 0.5)
    logloss = log_loss(y_val, val_proba)
    acc = accuracy_score(y_val, val_proba > 0.5)
    print(f"Fold {fold+1} Log Loss: {logloss:.5f} | Accuracy: {acc:.4f}")

# Metrics over the full set of out-of-fold predictions
print(f"\nOverall CV Log Loss: {log_loss(y, oof_preds):.5f}")
print(f"Overall CV Accuracy: {accuracy_score(y, oof_preds > 0.5):.4f}")


Fold 1 Log Loss: 0.43776 | Accuracy: 0.7798
Fold 2 Log Loss: 0.46023 | Accuracy: 0.7660
Fold 3 Log Loss: 0.44062 | Accuracy: 0.7798
Fold 4 Log Loss: 0.43780 | Accuracy: 0.7768
Fold 5 Log Loss: 0.45675 | Accuracy: 0.7727

Overall CV Log Loss: 0.44663
Overall CV Accuracy: 0.7750
