In [1]:
import json
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Input data directory and the three output directories, all relative to the
# notebook's working directory.
Data_dir = "../data"
Out_Models = "../models"
Out_Reports = "../reports"
Out_Output = "../output"

# Create every output directory up front; exist_ok makes this safe to re-run.
for out_dir in (Out_Models, Out_Reports, Out_Output):
    os.makedirs(out_dir, exist_ok=True)

In [11]:
# Model artifacts: the two previously saved candidates and the final refit model.
rf_path = os.path.join(Out_Models, "titanic_rf_best.joblib")
lr_path = os.path.join(Out_Models, "titanic_logreg.joblib")
final_model_path = os.path.join(Out_Models, "titanic_final_best.joblib")

# Report artifacts: comparison summary, accuracy bar chart, confusion matrices.
summary_json = os.path.join(Out_Reports, "titanic_model_comparison.json")
bar_png = os.path.join(Out_Reports, "titanic_model_accuracy_bar.png")
cm_rf_png = os.path.join(Out_Reports, "titanic_cm_rf.png")
cm_lr_png = os.path.join(Out_Reports, "titanic_cm_lr.png")

# NOTE(review): the submission CSV is written under Out_Reports, while Out_Output
# is created but not used by any path here -- confirm this is the intended location.
submission_path = os.path.join(Out_Reports, "titanic_final_submission.csv")

In [5]:
# Read the train and test CSVs from the data directory.
train_path = os.path.join(Data_dir, "train.csv")
test_path = os.path.join(Data_dir, "test.csv")
train, test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [6]:
# Add the derived features to both frames in place.
for df in (train, test):
    siblings_spouses = df["SibSp"].fillna(0)
    parents_children = df["Parch"].fillna(0)
    # Family size counts the passenger themselves, hence the +1.
    df["FamilySize"] = siblings_spouses + parents_children + 1
    # 1 when the passenger has no relatives aboard, else 0.
    df["IsAlone"] = df["FamilySize"].eq(1).astype(int)
    # Store the class as a string so it is handled as a categorical column.
    df["Pclass"] = df["Pclass"].astype(str)

In [7]:
# Feature groups by type, followed by the full ordered list of model input columns.
num_cols = ["Age", "SibSp", "Parch", "Fare", "FamilySize", "IsAlone"]
cat_cols = ["Pclass", "Sex", "Embarked"]
feature_cols = [
    "Pclass",
    "Sex",
    "Age",
    "SibSp",
    "Parch",
    "Fare",
    "Embarked",
    "FamilySize",
    "IsAlone",
]

In [8]:
# Feature matrix and target from the training frame; same feature columns from the test frame.
# Copies are taken so the casts below do not touch `train` / `test`.
X = train[feature_cols].copy()
y = train["Survived"].copy()
X_test = test[feature_cols].copy()

# Cast every categorical column to str on both frames.
# NOTE(review): astype(str) turns missing values into the literal string "nan", so a
# downstream imputer will not see them as missing. Left as is because the loaded
# pipelines were presumably fitted on this same representation -- confirm.
for cat_col in cat_cols:
    for frame in (X, X_test):
        frame[cat_col] = frame[cat_col].astype(str)

In [9]:
# Hold out 20% of the labelled rows for validation, stratified on the target;
# the fixed seed keeps the split reproducible.
# NOTE(review): the loaded models were presumably trained with this same split;
# otherwise X_val may overlap their training rows -- confirm.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [12]:
# Fail early with a clear message if either saved model is missing.
for model_file in (rf_path, lr_path):
    assert os.path.exists(model_file), f"Missing model: {model_file}"

# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files that come from a trusted source.
rf_pipe, lr_pipe = (joblib.load(model_file) for model_file in (rf_path, lr_path))

In [13]:
def eval_model(name, pipe, Xv, yv):
    """Score a fitted model on a held-out set.

    Parameters
    ----------
    name : str
        Model label. Accepted for call-site readability; not used in the body.
    pipe : estimator
        Fitted object exposing ``predict``.
    Xv, yv :
        Validation features and the corresponding true labels.

    Returns
    -------
    tuple
        ``(accuracy, confusion_matrix, classification_report_text, predictions)``;
        the report text is formatted with three digits.
    """
    y_pred = pipe.predict(Xv)
    return (
        accuracy_score(yv, y_pred),
        confusion_matrix(yv, y_pred),
        classification_report(yv, y_pred, digits=3),
        y_pred,
    )

In [14]:
# Evaluate both loaded models on the validation split; the raw predictions are discarded.
rf_acc, rf_cm, rf_cr, _ = eval_model(
    "RandomForest", rf_pipe, X_val, y_val
)
lr_acc, lr_cm, lr_cr, _ = eval_model(
    "LogisticRegression", lr_pipe, X_val, y_val
)

In [15]:
# Accuracy summary: one line per model, followed by a blank line.
print(
    "Validation Accuracies:",
    f" - RandomForest:       {rf_acc:.4f}",
    f" - LogisticRegression: {lr_acc:.4f}\n",
    sep="\n",
)

# Per-class precision / recall / F1 report for each model.
for heading, report_text in (
    ("RF Classification Report:\n", rf_cr),
    ("LR Classification report:\n", lr_cr),
):
    print(heading, report_text)

Validation Accuracies:
 - RandomForest:       0.8101
 - LogisticRegression: 0.8156

RF Classification Report:
               precision    recall  f1-score   support

           0      0.817     0.891     0.852       110
           1      0.797     0.681     0.734        69

    accuracy                          0.810       179
   macro avg      0.807     0.786     0.793       179
weighted avg      0.809     0.810     0.807       179

LR Classification report:
               precision    recall  f1-score   support

           0      0.818     0.900     0.857       110
           1      0.810     0.681     0.740        69

    accuracy                          0.816       179
   macro avg      0.814     0.791     0.799       179
weighted avg      0.815     0.816     0.812       179



In [16]:
def save_cm(cm, title, out_path):
    """Render a confusion matrix as an annotated heatmap and save it to disk.

    Parameters
    ----------
    cm : array-like, 2-D
        Confusion matrix. Rows are drawn on the y-axis (labelled "True k") and
        columns on the x-axis (labelled "Pred k"), where k is the positional index.
    title : str
        Figure title.
    out_path : str
        Destination image path; the figure is written at 150 dpi.

    The figure is always closed, even when drawing or saving raises, so a failed
    call does not leave an open figure behind.
    """
    cm = np.asarray(cm)
    n_rows, n_cols = cm.shape

    fig, ax = plt.subplots(figsize=(4, 4))
    try:
        ax.imshow(cm, interpolation="nearest")
        ax.set_title(title)
        # Derive tick positions from the matrix shape rather than assuming 2x2.
        ax.set_xticks(list(range(n_cols)))
        ax.set_xticklabels([f"Pred {k}" for k in range(n_cols)])
        ax.set_yticks(list(range(n_rows)))
        ax.set_yticklabels([f"True {k}" for k in range(n_rows)])
        # Write each count in the centre of its cell (x = column, y = row).
        for (row, col), count in np.ndenumerate(cm):
            ax.text(col, row, str(count), ha="center", va="center")
        fig.tight_layout()
        fig.savefig(out_path, dpi=150)
    finally:
        plt.close(fig)

In [17]:
# Save one confusion-matrix image per model.
for matrix, cm_title, cm_png in (
    (rf_cm, "Confusion Matrix - RandomForest (Val)", cm_rf_png),
    (lr_cm, "Confusion Matrix - LogisticRegression (Val)", cm_lr_png),
):
    save_cm(matrix, cm_title, cm_png)

In [18]:
# Bar chart comparing the validation accuracy of the two models, saved to bar_png.
models = ["RandomForest", "LogReg"]
accs = [rf_acc, lr_acc]

fig, ax = plt.subplots(figsize=(5, 4))
ax.bar(models, accs)
ax.set_ylim(0, 1)
ax.set_title("Validation Accuracy by Model")
# Label each bar with its accuracy, placed slightly above the bar top.
for bar_idx, acc in enumerate(accs):
    ax.text(bar_idx, acc + 0.01, f"{acc:.3f}", ha="center")
fig.tight_layout()
fig.savefig(bar_png, dpi=150)
plt.close(fig)

In [20]:
# Pick the model with the higher validation accuracy; on a tie max() keeps the
# first key, i.e. RandomForest.
model_scores = {"RandomForest": rf_acc, "LogisticRegression": lr_acc}
candidate_pipes = {"RandomForest": rf_pipe, "LogisticRegression": lr_pipe}

best_name = max(model_scores, key=model_scores.get)
best_pipe_loaded = candidate_pipes[best_name]

print(f"\nBest model by validation accuracy: {best_name}")


Best model by validation accuracy: LogisticRegression


In [21]:
# Refit the winning configuration on all labelled rows (train + validation) and
# persist it as the final model.
# A fresh clone is fitted so the originally loaded rf_pipe / lr_pipe objects stay
# untouched: fitting them in place would expose them to the validation rows and
# inflate the scores if the evaluation cells were re-run afterwards.
best_pipe_loaded = clone(best_pipe_loaded).fit(X, y)
joblib.dump(best_pipe_loaded, final_model_path)

['../models\\titanic_final_best.joblib']

In [24]:
# The submission needs the passenger ids from the test CSV.
assert "PassengerId" in test.columns, "Test CSV must contain PassengerId"

# Predict on the test features and write the two-column submission file.
test_preds = best_pipe_loaded.predict(X_test)
submission = pd.DataFrame(
    {
        "PassengerId": test["PassengerId"],
        "Survived": test_preds,
    }
)
submission.to_csv(submission_path, index=False)

In [25]:
# Paths of every artifact produced by this notebook, recorded in the summary.
artifact_paths = {
    "final_model": final_model_path,
    "submission_csv": submission_path,
    "accuracy_bar_png": bar_png,
    "rf_cm_png": cm_rf_png,
    "lr_cm_png": cm_lr_png,
}

# Validation scores (cast to plain floats for JSON), the winner, and the artifact paths.
summary = {
    "rf_val_accuracy": float(rf_acc),
    "lr_val_accuracy": float(lr_acc),
    "best_model": best_name,
    "artifacts": artifact_paths,
}

with open(summary_json, "w") as fh:
    fh.write(json.dumps(summary, indent=2))

# Final recap: list every file written by this notebook.
print("\nComparison complete.")
print("Saved:")
for artifact_path in (
    summary_json,
    bar_png,
    cm_rf_png,
    cm_lr_png,
    final_model_path,
    submission_path,
):
    print(" -", artifact_path)


Comparison complete.
Saved:
 - ../reports\titanic_model_comparison.json
 - ../reports\titanic_model_accuracy_bar.png
 - ../reports\titanic_cm_rf.png
 - ../reports\titanic_cm_lr.png
 - ../models\titanic_final_best.joblib
 - ../reports\titanic_final_submission.csv
