# An Exploratory Analysis of the Titanic Dataset

This project builds a function to predict a Titanic passenger's survival probability by following these steps:

1. **Data Preparation:** Clean the data and encode categorical features.
2. **Exploratory Analysis:** Examine relationships between features and survival.
3. **Model Training:** Train a classification model using the prepared data.
4. **Prediction Function:** Apply the trained model to new passenger inputs to estimate survival probability.

In [1]:
# Import necessary modules
import pandas as pd                # For handling data
import matplotlib.pyplot as plt    # For visualisations

In [2]:

# Read the raw passenger table from disk.
titanic = pd.read_csv("dataset.csv")

# Drop the Ticket, Fare, Cabin, Embarked and Name columns, discard every row
# that still contains a missing value, and renumber the remaining rows from 0.
df = titanic.drop(columns=["Ticket", "Fare", "Cabin", "Embarked", "Name"])
df = df.dropna()
df = df.reset_index(drop=True)

# Encode Sex numerically: male -> 1, female -> 0.
df["Sex"] = df["Sex"].map({"male": 1, "female": 0})

# Show the first five rows as the cell output.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch
0,1,0,3,1,22.0,1,0
1,2,1,1,0,38.0,1,0
2,3,1,3,0,26.0,0,0
3,4,1,1,0,35.0,1,0
4,5,0,3,1,35.0,0,0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, roc_auc_score
import pickle
import os


def train_model(df: pd.DataFrame = df, random_state=42, model_path: str = "model.pkl"):
    """Train a logistic-regression survival classifier and save it to disk.

    The data is split 80/20 into stratified train and test sets, a
    median-impute -> standardise -> logistic-regression pipeline is fitted on
    the training part, and the fitted pipeline is pickled to ``model_path``.

    Args:
        df: Frame containing the feature columns ``Pclass``, ``Sex``, ``Age``,
            ``SibSp``, ``Parch`` and the target column ``Survived``. The
            default is the module-level ``df`` as it existed when this
            function was defined (default arguments are evaluated once).
        random_state: Seed for the train/test split so that the reported
            metrics are reproducible between runs. Pass ``None`` to get a
            different random split on every call.
        model_path: File the fitted pipeline is pickled to.

    Returns:
        A ``(pipeline, metrics)`` tuple where ``pipeline`` is the fitted
        scikit-learn ``Pipeline`` and ``metrics`` is a dict with the test-set
        ``"accuracy"`` and ``"roc_auc"``.
    """
    feature_columns = ["Pclass", "Sex", "Age", "SibSp", "Parch"]
    X = df[feature_columns]
    y = df["Survived"].astype(int)

    # 80% train / 20% test. Stratifying on y keeps the survived/not-survived
    # ratio the same in both parts; seeding makes the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=random_state
    )

    # Numeric preprocessing: fill any missing value with the column median,
    # then standardise every feature to zero mean and unit variance.
    numeric_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])

    # class_weight="balanced" weights samples inversely to the frequency of
    # their *target* class (Survived), so the rarer outcome is not ignored.
    # It is unrelated to how feature values such as Sex are distributed.
    pipeline = Pipeline([
        ("prep", numeric_pipeline),
        ("clf", LogisticRegression(solver="liblinear", class_weight="balanced")),
    ])

    # Fit preprocessing and classifier on the training part only, so the test
    # part stays unseen for the evaluation below.
    pipeline.fit(X_train, y_train)

    # Evaluate on the held-out test part.
    y_pred = pipeline.predict(X_test)
    y_proba = pipeline.predict_proba(X_test)[:, 1]  # probability of class 1
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_proba)

    # Persist the fitted pipeline so inference does not need to retrain.
    with open(model_path, "wb") as f:
        pickle.dump(pipeline, f)

    return pipeline, {"accuracy": acc, "roc_auc": auc}



# Train the model and unpack the returned (fitted pipeline, metrics dict) tuple.
model, metrics = train_model()

metrics # show the test-set accuracy and ROC AUC as the cell output

{'accuracy': 0.7692307692307693, 'roc_auc': 0.8623732251521298}

In [61]:

def estimate_survival(record: pd.DataFrame, model_path: str = "model.pkl"):
    """Return the estimated survival probability for each passenger row.

    The pickled model is loaded from ``model_path`` on every call and applied
    to the feature columns of ``record``. Columns are selected by name, so
    their order in ``record`` does not matter and extra columns are ignored.

    Args:
        record: Frame with one row per passenger containing at least the
            columns ``Pclass``, ``Sex``, ``Age``, ``SibSp`` and ``Parch``.
        model_path: Path of the pickled model file.

    Returns:
        A 1-D array with the predicted probability of class 1 (survived) for
        each row of ``record``.

    Raises:
        ValueError: If the model file does not exist or ``record`` lacks one
            of the required feature columns.
    """
    # Must list the same features, in the same order, as used for training.
    feature_columns = ["Pclass", "Sex", "Age", "SibSp", "Parch"]

    try:
        # NOTE: unpickling can execute arbitrary code; only load model files
        # that were produced locally by a trusted training run.
        with open(model_path, "rb") as f:
            model = pickle.load(f)
    except FileNotFoundError as err:
        raise ValueError(f"Model File: ({model_path}) not found!") from err

    missing = [col for col in feature_columns if col not in record.columns]
    if missing:
        raise ValueError(f"record is missing required column(s): {missing}")

    # Pass a DataFrame (not .values) so the features reach the model by name
    # and in training order, regardless of how the caller arranged them.
    return model.predict_proba(record[feature_columns])[:, 1]



# Score every row of the cleaned dataset. If model.pkl came from
# train_model() on this same df, most of these rows were seen during
# training, so the estimates are not an out-of-sample evaluation.
records = df[["Pclass", "Sex", "Age", "SibSp", "Parch"]]

# Place the predicted probabilities in a new "estimates" column at position 1.
records.insert(
    loc=1,
    column="estimates",
    value=estimate_survival(records),
)

# Preview the first five scored rows as the cell output.
records.head()



Unnamed: 0,Pclass,estimates,Sex,Age,SibSp,Parch
0,3,0.143793,1,22.0,1,0
1,1,0.925977,0,38.0,1,0
2,3,0.713277,0,26.0,0,0
3,1,0.934955,0,35.0,1,0
4,3,0.11655,1,35.0,0,0
