In [1]:
import pandas as pd
from catboost import CatBoostClassifier


# Read the train_clean / test_clean CSVs from the sibling data directory
train = pd.read_csv('../data/train_clean.csv')
test = pd.read_csv('../data/test_clean.csv')

# Target column cast to integers; every remaining column is a feature
y = train['Transported'].astype(int)
X = train.drop('Transported', axis=1)

# Names of the object-dtype feature columns
cat_features = list(X.select_dtypes(include='object').columns)


In [2]:
# Show every feature's dtype so the categorical selection below can be checked by eye
print(X.dtypes)

# Object, bool and category columns are all treated as categorical features
cat_cols = X.select_dtypes(include=["object", "bool", "category"]).columns.tolist()

# CatBoost is given positional column indices, so map each name to its position in X
cat_feature_indices = [X.columns.get_loc(col) for col in cat_cols]

print("Categorical feature indices:", cat_feature_indices)

# Peek at the first row's categorical values to confirm what the model will receive
print(X.iloc[0][cat_cols])


PassengerId              object
HomePlanet               object
CryoSleep                  bool
Cabin                    object
Destination              object
Age                     float64
VIP                        bool
RoomService             float64
FoodCourt               float64
ShoppingMall            float64
Spa                     float64
VRDeck                  float64
Name                     object
GroupID                   int64
MissingCount              int64
NoSpend                    bool
CryoSleep_missing          bool
RoomService_missing        bool
FoodCourt_missing          bool
ShoppingMall_missing       bool
Spa_missing                bool
VRDeck_missing             bool
VIP_missing                bool
Age_missing                bool
CabinDeck                object
CabinNum                float64
CabinSide                object
CabinDeck_missing          bool
CabinSide_missing          bool
NoActivity                 bool
GroupSize                 int64
IsAlone 

In [3]:
# Ten feature columns with the most missing values, largest count first
missing_per_column = X.isna().sum()
missing_per_column.sort_values(ascending=False).iloc[:10]


HomePlanet     201
Name           200
Cabin          199
Destination    182
PassengerId      0
Age              0
VIP              0
RoomService      0
CryoSleep        0
FoodCourt        0
dtype: int64

In [4]:
# Cast every categorical column to string in both frames (NaN becomes 'nan').
# A single dict-based astype replaces the per-column loop; other columns are untouched.
string_dtypes = {col: str for col in cat_cols}
X = X.astype(string_dtypes)
test = test.astype(string_dtypes)

In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score
import numpy as np

# 5-fold stratified splitter (shuffled, fixed seed) plus the accumulators:
# out-of-fold probabilities, fold-averaged test probabilities, per-fold metrics.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
oof_preds, test_preds = np.zeros(len(train)), np.zeros(len(test))
log_losses, accuracies = [], []

# Fit a fresh CatBoost model on each fold and score it on the held-out rows
for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model = CatBoostClassifier(
        cat_features=cat_feature_indices,
        random_state=42,
        verbose=0,
    )
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is taken as the probability of class 1
    val_probs = model.predict_proba(X_val)[:, 1]
    val_preds = model.predict(X_val)
    oof_preds[val_idx] = val_probs

    # Every fold adds an equal share to the averaged test-set probability
    test_preds += model.predict_proba(test)[:, 1] / cv.n_splits

    fold_log_loss = log_loss(y_val, val_probs)
    fold_accuracy = accuracy_score(y_val, val_preds)
    log_losses.append(fold_log_loss)
    accuracies.append(fold_accuracy)

    print(f"Fold {fold+1} Log Loss: {fold_log_loss:.5f} | Accuracy: {fold_accuracy:.4f}")

# Final CV results: unweighted mean of the per-fold metrics
print(f"\nOverall CV Log Loss: {np.mean(log_losses):.5f}")
print(f"Overall CV Accuracy: {np.mean(accuracies):.4f}")


Fold 1 Log Loss: 0.37539 | Accuracy: 0.8143
Fold 2 Log Loss: 0.39295 | Accuracy: 0.8039
Fold 3 Log Loss: 0.38612 | Accuracy: 0.8137
Fold 4 Log Loss: 0.36704 | Accuracy: 0.8222
Fold 5 Log Loss: 0.39827 | Accuracy: 0.8038

Overall CV Log Loss: 0.38395
Overall CV Accuracy: 0.8116
