In [7]:
# TITANIC SURVIVAL PREDICTION SYSTEM
# MODEL DEVELOPMENT NOTEBOOK
# Author: Gideon Belaboh
# Matric Number: 23CG034049
# Environment: universal_env

# Step 1 — Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

print("Libraries imported successfully.")


Libraries imported successfully.


In [8]:
# Step 2 — Load Dataset

# Ensure train.csv is in /model/
df = pd.read_csv('train.csv')
print("Dataset loaded successfully.")
df.head()


Dataset loaded successfully.


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [9]:
# Step 3 — Data Preprocessing

# 3.1 Handle Missing Values
# Fill missing Age with median
df['Age'].fillna(df['Age'].median(), inplace=True)

# Fill missing Embarked with mode
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

print("Missing values handled.")


Missing values handled.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


In [10]:
# 3.2 Feature Selection
# Selected input features: Pclass, Sex, Age, SibSp, Fare
features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Fare']
target = 'Survived'

X = df[features]
y = df[target]

print("Feature selection completed.")
X.head()


Feature selection completed.


Unnamed: 0,Pclass,Sex,Age,SibSp,Fare
0,3,male,22.0,1,7.25
1,1,female,38.0,1,71.2833
2,3,female,26.0,0,7.925
3,1,female,35.0,1,53.1
4,3,male,35.0,0,8.05


In [11]:
# 3.3 Encode Categorical Variables
# Sex is categorical → encode using 0/1
X = pd.get_dummies(X, columns=['Sex'], drop_first=True)

print("Categorical encoding completed.")
X.head()


Categorical encoding completed.


Unnamed: 0,Pclass,Age,SibSp,Fare,Sex_male
0,3,22.0,1,7.25,True
1,1,38.0,1,71.2833,False
2,3,26.0,0,7.925,False
3,1,35.0,1,53.1,False
4,3,35.0,0,8.05,True


In [12]:
# 3.4 Feature Scaling
# StandardScaler is applied for models like SVM/KNN (optional for Random Forest)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("Feature scaling completed.")


Feature scaling completed.


In [13]:
# Step 4 — Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

print(f"Training set: {X_train.shape}, Test set: {X_test.shape}")


Training set: (712, 5), Test set: (179, 5)


In [14]:
# Step 5 — Train Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 6 — Evaluate Model
y_pred = model.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.86      0.84       105
           1       0.79      0.74      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



In [15]:
# Step 7 — Save the Trained Model
joblib.dump(model, 'titanic_survival_model.pkl')
print("Model saved successfully as 'titanic_survival_model.pkl'.")


Model saved successfully as 'titanic_survival_model.pkl'.


In [16]:
# Step 8 — Test Reloaded Model
loaded_model = joblib.load('titanic_survival_model.pkl')

# Predict first 5 examples from test set
sample_preds = loaded_model.predict(X_test[:5])
print("Sample predictions from reloaded model:", sample_preds)


Sample predictions from reloaded model: [0 0 0 1 0]
