In [16]:
"""
Titanic Kaggle Challenge - Random Forest


For detailed explanations about metrics (ROC-AUC, precision, recall etc...), 
please refer to the Typst documentation file in the GitHub repository.

This script implements:
- Feature preprocessing and encoding
- Random Forest classification
- Hyperparameter tuning with RandomizedSearchCV
- Cross-validation
"""

import pandas as pd
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.metrics import roc_auc_score

#### DATA LOADING

train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

train_y = train_data['Survived']
train_x = train_data.drop(columns=['Survived'])

#### FEATURE ENGINEERING

# Drop columns that are not useful
drop = ['Name', 'Ticket', 'Cabin']
train_x = train_x.drop(columns=drop)
test_x = test_data.drop(columns=drop)

#### MISSING VALUES IMPUTATION

"""
General rule for imputation:
- Numerical columns → replace with median (less sensitive to outliers than mean)
- Categorical columns → replace with mode (most frequent value) or "Unknown"
"""

# Numerical columns
train_x['Age'].fillna(train_x['Age'].median(), inplace=True)
train_x['Fare'].fillna(train_x['Fare'].median(), inplace=True)
test_x['Age'].fillna(test_x['Age'].median(), inplace=True)
test_x['Fare'].fillna(test_x['Fare'].median(), inplace=True)

# Others 
# .mode()[0] returns the most frequent value as a scalar (not a Series)
train_x['Embarked'].fillna(train_x['Embarked'].mode()[0], inplace=True)
test_x['Embarked'].fillna(test_x['Embarked'].mode()[0], inplace=True)

#### ENCODING

"""
Encoding strategy:
- Sex: OneHotEncoding → male (0/1), female (1/0)
  Avoids implicit ordering (male=1, female=0 would suggest male > female)
  
- Embarked: OrdinalEncoding → Q=0, C=1, S=2
  I assume an order from least to most affluent port

"""

# OneHot encoding for Sex
ohe = OneHotEncoder(sparse_output=False)
train_sex = pd.DataFrame(
    ohe.fit_transform(train_x[['Sex']]), 
    columns=ohe.get_feature_names_out(['Sex'])
)
test_sex = pd.DataFrame(
    ohe.transform(test_x[['Sex']]), 
    columns=ohe.get_feature_names_out(['Sex'])
)

# Drop original Sex column and reset index to avoid misalignment
train_x = train_x.drop(columns=['Sex']).reset_index(drop=True)
test_x = test_x.drop(columns=['Sex']).reset_index(drop=True)

# Concatenate encoded Sex columns
train_x = pd.concat([train_x, train_sex], axis=1)
test_x = pd.concat([test_x, test_sex], axis=1)

# Ordinal encoding for Embarked (Q < C < S in terms of affluence)
embarked_enc = OrdinalEncoder(categories=[['Q', 'C', 'S']])
train_x['Embarked'] = embarked_enc.fit_transform(train_x[['Embarked']])
test_x['Embarked'] = embarked_enc.transform(test_x[['Embarked']])

#### HYPERPARAMETERS 

"""
I use RandomizedSearchCV instead of GridSearchCV for efficiency.

Why no train_test_split ? 
I don't split train.csv into train/validation because:
1. Cross-validation already provides a reliable performance estimate
2. We can always come back and add a local test set if CV score is suspicious

An other possibility is :
X_train, X_test_local, y_train, y_test_local = train_test_split(
    train_x, train_y, test_size=0.2, random_state=42
)
Then use X_train for RandomizedSearchCV and X_test_local for final validation.
"""

# Define the model
rf = RandomForestClassifier(random_state=1) 

# Define hyperparameter 
param_dist = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 5, 10, 20],
    'min_samples_split': [2, 4, 8],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

#### RANDOMIZED SEARCH WITH CROSS-VALIDATION

"""
RandomizedSearchCV automatically:
1. Keeps the best combination based on ROC-AUC score
2. Keeps a final model with best parameters on all of train_x 

The final model is stored in: random_search.  (random_search.best_estimator_)
The best parameters are in: random_search.best_params_
The best CV score is in: random_search.best_score_
"""

random_search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=20,              # Number of random combinations to test
    cv=5,                   # 5-fold cross-validation
    scoring='roc_auc',      # Optimization metric
    n_jobs=-1,              # Use all CPU cores
    verbose=2,              # Display progress
    random_state=1          # For reproducibility
)

# Fit the model 
random_search.fit(train_x, train_y)

print("Best hyperparameters:", random_search.best_params_)
print("Best CV ROC-AUC score:", random_search.best_score_)

#### PREDICTIONS 

"""
If the CV ROC-AUC score is satisfactory (typically > 0.80), proceed to prediction.
If not, consider more (Feature engineering,Different encoding strategies,Try other models...)
"""

# Make predictions using the best model (automatically retrained on full train_x)
predictions = random_search.predict(test_x)

submission = pd.DataFrame({
    'PassengerId': test_data['PassengerId'],
    'Survived': predictions
})
submission.to_csv('submission_rf.csv', index=False)

''' 
OUTPUT :  Best hyperparameters: {'n_estimators': 100, 'min_samples_split': 8, 'min_samples_leaf': 4, 'max_features': 'log2', 'max_depth': 10}
          Best CV ROC-AUC score: 0.8709513860299894
'''



Fitting 5 folds for each of 20 candidates, totalling 100 fits


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_x['Age'].fillna(train_x['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_x['Fare'].fillna(train_x['Fare'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object 

[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   2.0s
[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   2.0s
[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   2.0s
[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   1.9s
[CV] END max_depth=5, max_features=None, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   1.7s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=8, n_estimators=200; total time=   1.9s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=8, n_estimators=200; total time=   1.9s
[CV] END max_depth=10, max_features=None, min_samples_leaf=2, min_samples_split=8, n_estimators=200; total time=   1.9s
[CV] END max_depth=10, max_features=log2, min