In [21]:
import pandas as pd 
import numpy as np 
from pathlib import Path
import pickle

# Input and output locations, relative to the notebook's working directory.
data_dir = Path("../data/titanic")
output_dir = Path("../output/titanic")

# Make sure the output folder exists before any later cell writes into it.
output_dir.mkdir(parents=True, exist_ok=True)

# Read the train and test splits in one pass (train first, then test).
train_df, test_df = (
    pd.read_csv(data_dir.joinpath(file_name))
    for file_name in ("train.csv", "test.csv")
)

# Preview the first rows of the training frame.
train_df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [22]:
# Column roles used by the cells below.

# Label column. Kept as a one-element list, so selecting it yields a
# single-column DataFrame rather than a Series.
target_col = ["Survived"]

# Identifier-style columns (presumably excluded from modelling; they are not
# listed among the feature columns below).
id_cols = ["PassengerId", "Name"]

# Features treated as categorical.
cat_feature_cols = [
    "Pclass",
    "Sex",
    "Embarked",
    "Cabin",
]

# Features treated as numeric.
num_feature_cols = [
    "Age",
    "Fare",
    "Parch",
    "SibSp",
]

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Target as a flat 1-D array: target_col is a one-element list, so the
# selection is a single-column frame that has to be ravelled.
y = train_df[target_col].values.ravel()

# Raw feature slices for the train and test frames.
X_cat = train_df[cat_feature_cols]
X_test_cat = test_df[cat_feature_cols]
X_num = train_df[num_feature_cols]
X_test_num = test_df[num_feature_cols]

# Numeric features: fill missing values with the column mean. The imputer is
# fitted on the training rows only and merely applied to the test rows.
# The transformers return bare ndarrays, so column labels and the row index
# are restored explicitly; otherwise every frame gets labels 0..k and the
# concat below would produce duplicate column labels.
num_imputer = SimpleImputer(strategy="mean")
X_num_imputed = pd.DataFrame(
    num_imputer.fit_transform(X_num),
    columns=num_feature_cols,
    index=X_num.index,
)
X_test_num_imputed = pd.DataFrame(
    num_imputer.transform(X_test_num),
    columns=num_feature_cols,
    index=X_test_num.index,
)

# Categorical features: dense one-hot encoding. handle_unknown="ignore" makes
# categories that only occur in the test rows encode as all-zero instead of
# raising.
ohe = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
X_cat_encoded = pd.DataFrame(ohe.fit_transform(X_cat), index=X_cat.index)
X_test_cat_encoded = pd.DataFrame(ohe.transform(X_test_cat), index=X_test_cat.index)
# Name the indicator columns after the fitted categories (e.g. "<column>_<value>").
ohe_feature_names = ohe.get_feature_names_out()
X_cat_encoded.columns = ohe_feature_names
X_test_cat_encoded.columns = ohe_feature_names

# Final design matrices: numeric block first, then the one-hot block.
X_encoded = pd.concat([X_num_imputed, X_cat_encoded], axis=1)
X_test_encoded = pd.concat([X_test_num_imputed, X_test_cat_encoded], axis=1)

# Guard against silent feature misalignment between fit and predict.
assert X_encoded.columns.is_unique, "duplicate feature labels in X_encoded"
assert list(X_encoded.columns) == list(X_test_encoded.columns), "train/test feature mismatch"
assert len(X_encoded) == len(y), "feature rows and target length differ"

# Baseline random forest with a fixed seed, fitted on the encoded training set.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_encoded, y)
predictions = model.predict(X_test_encoded)

In [24]:
# grid search
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters: 3 x 3 x 3 = 27 combinations, each evaluated
# with 5-fold cross-validation on accuracy.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

# The current `model` serves as the template estimator for the search.
grid_search = GridSearchCV(model, param_grid, scoring="accuracy", cv=5)
grid_search.fit(X_encoded, y)

best_params = grid_search.best_params_
# From here on `model` refers to the best estimator found by the search,
# not to the estimator that was passed in above.
model = grid_search.best_estimator_

print("Best Parameters:", best_params)


Best Parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 200}


In [25]:
from sklearn.model_selection import cross_validate 

# 5-fold cross-validation of the current `model`, collecting four metrics per
# fold (plus the fit/score timings that cross_validate reports).
# NOTE(review): X_encoded was imputed and one-hot encoded using all training
# rows, and the hyper-parameters were selected on the same rows, so these
# fold scores are likely optimistic; confirm before quoting them.
cv_scores = cross_validate(
    model,
    X_encoded,
    y,
    scoring=['accuracy', 'precision', 'recall', 'f1'],
    cv=5,
)

# One row per fold, one column per timing/metric.
cv_scores_df = pd.DataFrame.from_dict(cv_scores)

# Persist the per-fold scores so they can be reloaded later.
cv_scores_df.to_csv(output_dir.joinpath("cv_scores_tuned.csv"), index=False)
cv_scores_df

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1
0,0.260843,0.01375,0.810056,0.777778,0.710145,0.742424
1,0.245385,0.016744,0.808989,0.793103,0.676471,0.730159
2,0.258862,0.013184,0.859551,0.84127,0.779412,0.80916
3,0.227875,0.014181,0.820225,0.909091,0.588235,0.714286
4,0.254786,0.013595,0.859551,0.854839,0.768116,0.80916


In [26]:
# Reload the per-fold CV scores of both variants from disk.
# NOTE(review): cv_scores_pipe.csv is not written by any cell visible here;
# presumably it comes from another notebook and must exist beforehand.
pipe_df = pd.read_csv(output_dir.joinpath("cv_scores_pipe.csv"))
tuned_df = pd.read_csv(output_dir.joinpath("cv_scores_tuned.csv"))

# Side-by-side table: for each metric, the pipeline column followed by the
# tuned-model column, one row per fold.
comparison_df = pd.DataFrame(
    {
        f"{variant} {metric}": scores[f"test_{metric.lower()}"]
        for metric in ("Accuracy", "Precision", "Recall", "F1")
        for variant, scores in (("Pipeline", pipe_df), ("Tuned Model", tuned_df))
    }
)

# Average each column over the folds.
comparison_df.mean()


Pipeline Accuracy        0.812592
Tuned Model Accuracy     0.831674
Pipeline Precision       0.782343
Tuned Model Precision    0.835216
Pipeline Recall          0.707332
Tuned Model Recall       0.704476
Pipeline F1              0.741609
Tuned Model F1           0.761038
dtype: float64

In [28]:
# Build the submission file: predict on the encoded test features with the
# current `model` and pair each prediction with its PassengerId.
predictions = model.predict(X_test_encoded)

# Start from the id column (as a one-column frame) and attach the predicted
# labels positionally as "Survived".
submission_df = test_df[['PassengerId']].assign(Survived=predictions)

# Write without the index so the file contains only the two columns.
submission_df.to_csv(output_dir.joinpath("titanic_submission.csv"), index=False)