In [16]:
import pandas as pd, numpy as np, os, json
import matplotlib.pyplot as plt, joblib
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
Data_dir="../data"
Out_Models="../models"
Out_Reports="../reports"
Out_Output="../output"

for d in [Out_Models, Out_Reports, Out_Output]:
    os.makedirs(d, exist_ok=True)

In [3]:
train_path= os.path.join(Data_dir, "train.csv")
test_path= os.path.join(Data_dir, "test.csv")

assert os.path.exists(train_path), f"Missing {train_path}"
assert os.path.exists(test_path), f"Missing {test_path}"

In [4]:
train=pd.read_csv(train_path)
test=pd.read_csv(test_path)

In [5]:
for df in (train,test):
    df["FamilySize"]= df["SibSp"].fillna(0) + df["Parch"].fillna(0) +1
    df["IsAlone"]=(df["FamilySize"]==1).astype(int)
    df["Pclass"]=df.Pclass.astype(str)

In [6]:
feature_cols=["Pclass","Sex","Age", "SibSp", "Parch", "Fare", "Embarked", "FamilySize", "IsAlone"]
TARGET="Survived"

In [7]:
X= train[feature_cols].copy()
y= train[TARGET].copy()
X_test= test[feature_cols].copy()

for df in (X,X_test):
    for col in ["Pclass", "Sex", "Embarked"]:
        df[col]= df[col].astype(str)

In [8]:
X_train, X_val, y_train, y_val= train_test_split(X,y, test_size=0.2, random_state=42, stratify=y)

In [9]:
num_cols=["Age", "SibSp", "Parch", "Fare", "FamilySize", "IsAlone"]
cat_cols=["Pclass", "Sex", "Embarked"]
ohe=OneHotEncoder(handle_unknown="ignore", sparse_output=False)

In [10]:
numeric_tf= Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler())
])

categorical_tf= Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ohe",ohe)
])

preprocessor= ColumnTransformer(transformers=[
    ("num", numeric_tf, num_cols),
    ("cat", categorical_tf, cat_cols)
])

In [12]:
base_rf= RandomForestClassifier(random_state=42, n_jobs=-1)

pipe= Pipeline([
    ("pre", preprocessor),
    ("clf", base_rf)
])

In [19]:
param_grid={
    "clf__n_estimators": [200,300,400],
    "clf__max_depth": [None,6,8,10],
    "clf__min_samples_split": [2,5,6],
    "clf__min_samples_leaf": [1,2,4],
    "clf__max_features": ["sqrt", "log2", None]
}

In [20]:
cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
gs= GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="accuracy",
    cv=cv,
    n_jobs=-1,
    verbose=2
)

gs.fit(X_train, y_train)

print("\nBest CV Accuracy: ", gs.best_score_)
print("Best Params:\n", gs.best_params_)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits

Best CV Accuracy:  0.8328572835615089
Best Params:
 {'clf__max_depth': 10, 'clf__max_features': None, 'clf__min_samples_leaf': 1, 'clf__min_samples_split': 5, 'clf__n_estimators': 200}


In [21]:
val_preds=gs.best_estimator_.predict(X_val)
val_acc=accuracy_score(y_val, val_preds)
cm=confusion_matrix(y_val, val_preds)
cr= classification_report(y_val, val_preds, digits=3)

print("\nValidation Accuracy:", val_acc)
print("\nClassification Report:\n", cr)


Validation Accuracy: 0.8100558659217877

Classification Report:
               precision    recall  f1-score   support

           0      0.817     0.891     0.852       110
           1      0.797     0.681     0.734        69

    accuracy                          0.810       179
   macro avg      0.807     0.786     0.793       179
weighted avg      0.809     0.810     0.807       179



In [23]:
os.makedirs(Out_Reports, exist_ok=True)
with open(os.path.join(Out_Reports, "titanic_rf_tuning_results.json"), "w") as f:
    json.dump({
        "best_cv_accuracy": float(gs.best_score_),
        "val_accuracy": float(val_acc),
        "best_params": gs.best_params_
    },f, indent=2)

In [24]:
plt.figure(figsize=(4,4))
plt.imshow(cm, interpolation="nearest")
plt.title("Confusion Matrix(Validation)")
plt.xticks([0,1], ["Pred 0", "Pred 1"])
plt.yticks([0,1], ["True 0", "True 1"])
for (i,j), v in np.ndenumerate(cm):
    plt.text(j,i,str(v), ha="center", va="center")
plt.tight_layout()
cm_path=os.path.join(Out_Reports, "titanic_rf_tuning_confusion.png")
plt.savefig(cm_path,dpi=150)
plt.close()

In [25]:
best_model_path= os.path.join(Out_Models, "titanic_rf_best.joblib")
joblib.dump(gs.best_estimator_,best_model_path)

['../models\\titanic_rf_best.joblib']

In [28]:
gs.best_estimator_.fit(X,y)
test_preds= gs.best_estimator_.predict(X_test)
assert "PassengerId" in test.columns, "PassengerId missing in test.csv"
sub=pd.DataFrame({"PassengerId": test.PassengerId, "Survived": test_preds})
sub_path= os.path.join(Out_Output,"titanic_rf_tuned_submission.csv")
sub.to_csv(sub_path, index=False)

print("\nSaved:")
print(" -", best_model_path)
print(" -", cm_path)
print(" -", os.path.join(Out_Reports, "titanic_rf_tuning_results.json"))
print(" -", sub_path)


Saved:
 - ../models\titanic_rf_best.joblib
 - ../reports\titanic_rf_tuning_confusion.png
 - ../reports\titanic_rf_tuning_results.json
 - ../output\titanic_rf_tuned_submission.csv
