In [4]:
import pandas as pd, numpy as np, os, json
import matplotlib.pyplot as plt, joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [13]:
Data_dir="../data"
Out_Models="../models"
Out_Reports="../reports"
Out_Output="../output"

for d in [Out_Models, Out_Reports, Out_Output]:
    os.makedirs(d, exist_ok=True)


In [14]:
train_path= os.path.join(Data_dir, "train.csv")
test_path= os.path.join(Data_dir, "test.csv")

assert os.path.exists(train_path), f"Missing {train_path}"
assert os.path.exists(test_path), f"Missing {test_path}"

In [15]:
train=pd.read_csv(train_path)
test=pd.read_csv(test_path)

print("Train shape: ", train.shape, "| Test shape: ", test.shape)

Train shape:  (891, 12) | Test shape:  (418, 11)


In [17]:
for df in (train,test):
    df["FamilySize"]= df["SibSp"].fillna(0) + df["Parch"].fillna(0) +1
    df["IsAlone"]=(df["FamilySize"]==1).astype(int)

for df in (train,test):
    df["Pclass"]=df.Pclass.astype(str)

In [18]:
feature_cols=["Pclass","Sex","Age", "SibSp", "Parch", "Fare", "Embarked", "FamilySize", "IsAlone"]
TARGET="Survived"

assert TARGET in train.columns, "Train file must have 'Survived' column."
assert "PassengerId" in test.columns, "Test file must have 'PassengerId' column."

In [19]:
X= train[feature_cols].copy()
y= train[TARGET].copy()
X_test= test[feature_cols].copy()

for df in (X,X_test):
    for col in ["Pclass", "Sex", "Embarked"]:
        df[col]= df[col].astype(str)

In [20]:
X_train, X_val, y_train, y_val= train_test_split(X,y, test_size=0.2, random_state=42, stratify=y)

In [21]:
num_cols=["Age", "SibSp", "Parch", "Fare", "FamilySize", "IsAlone"]
cat_cols=["Pclass", "Sex", "Embarked"]
ohe=OneHotEncoder(handle_unknown="ignore", sparse_output=False)

In [23]:
numeric_tf= Pipeline(steps=[
    ("impute", SimpleImputer(strategy="median")),
    ("scale", StandardScaler())
])

categorical_tf= Pipeline(steps=[
    ("impute", SimpleImputer(strategy="most_frequent")),
    ("ohe",ohe)
])

preprocessor= ColumnTransformer(transformers=[
    ("num", numeric_tf, num_cols),
    ("cat", categorical_tf, cat_cols)
])

In [24]:
clf= RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    random_state=42,
    n_jobs=-1
)

model= Pipeline(steps=[
    ("pre", preprocessor),
    ("clf", clf)
])

In [26]:
model.fit(X_train,y_train)
val_preds=model.predict(X_val)

acc= accuracy_score(y_val, val_preds)
cm= confusion_matrix(y_val, val_preds)
cr= classification_report(y_val, val_preds, digits=3)

print(f"\nValidation Accuracy: {acc:.4f}\n")
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", cr)


Validation Accuracy: 0.8045

Confusion Matrix:
 [[96 14]
 [21 48]]

Classification Report:
               precision    recall  f1-score   support

           0      0.821     0.873     0.846       110
           1      0.774     0.696     0.733        69

    accuracy                          0.804       179
   macro avg      0.797     0.784     0.789       179
weighted avg      0.803     0.804     0.802       179



In [28]:
metrics_txt=os.path.join(Out_Reports, "titanic_baseline_matrix.txt")
with open(metrics_txt, "w") as f:
    f.write(f"Validation Accuracy: {acc:.4f}\n\n")
    f.write("Confusion Matrix:\n")
    f.write(np.array2string(cm))
    f.write("\n\nClassification Report:\n")
    f.write(cr)

In [30]:
plt.figure(figsize=(4,4))
plt.imshow(cm, interpolation="nearest")
plt.title("Confusion Matrix(Validation)")
plt.xticks([0,1], ["Pred 0", "Pred 1"])
plt.yticks([0,1], ["True 0", "True 1"])
for (i,j), v in np.ndenumerate(cm):
    plt.text(j,i,str(v), ha="center", va="center")
plt.tight_layout()
cm_path=os.path.join(Out_Reports, "titanic_confusion_matrix.png")
plt.savefig(cm_path,dpi=150)
plt.close()

In [31]:
model.fit(X,y)
test_preds=model.predict(X_test)
submission = pd.DataFrame({
    "PassengerId": test.PassengerId,
    "Survived": test_preds
})

sub_path=os.path.join(Out_Output, "titanic_baseline_submission.csv")
submission.to_csv(sub_path, index=False)

In [34]:
model_path= os.path.join(Out_Models, "titanic_baseline_rf.joblib")
joblib.dump(model, model_path)

config={
    "feature_cols": feature_cols,
    "numeric_cols": num_cols,
    "categorical_features": cat_cols,
    "model": "RandomForestClassifier",
    "params": {"n_estimator":300, "random_state":42, "n_jobs":-1},
    "val_accuracy":float(acc)
}
with open (os.path.join(Out_Reports, "titanic_baseline_config.json"),"w") as f:
    json.dump(config, f, indent=2)

In [35]:
print("\n DONE — Baseline model trained & artifacts saved.")
print("Artifacts:")
print(" - Model:", model_path)
print(" - Metrics:", metrics_txt)
print(" - Confusion matrix plot:", cm_path)
print(" - Submission CSV:", sub_path)


 DONE — Baseline model trained & artifacts saved.
Artifacts:
 - Model: ../models\titanic_baseline_rf.joblib
 - Metrics: ../reports\titanic_baseline_matrix.txt
 - Confusion matrix plot: ../reports\titanic_confusion_matrix.png
 - Submission CSV: ../output\titanic_baseline_submission.csv
