In [3]:

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score

# Read the Titanic passenger table from the project data folder and
# preview its first three rows as plain text.
titanic_csv_path = "Project_1/data/titanic.csv"
df = pd.read_csv(titanic_csv_path)
print(df.head(n=3))



   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  


In [4]:
# Count missing entries per column, then show the five columns with the most gaps.
missing_per_column = df.isna().sum()
print(missing_per_column.sort_values(ascending=False).head())


Cabin          687
Age            177
Embarked         2
PassengerId      0
Survived         0
dtype: int64


In [5]:
# Restrict to the target plus the two predictors and drop rows with any gap;
# the copy keeps later column assignments from touching `df`.
model_columns = ["Survived", "Sex", "Pclass"]
data = df.loc[:, model_columns].dropna().copy()

# Binary indicator: 1 where Sex is exactly "female", 0 for every other value.
data["is_female"] = data["Sex"].eq("female").astype(int)

# Target vector as a plain integer NumPy array.
y = data["Survived"].astype(int).to_numpy()

In [6]:
# Single-feature design matrices (2-D, one column each) for the two models.
X_sex = data[["is_female"]].values
X_cls = data[["Pclass"]].values

# Split both feature matrices and the target in ONE call so that the train/test
# rows of X_sex, X_cls and y are aligned by construction. The previous version
# made two separate calls and relied on identical random_state/stratify
# arguments to keep them in sync, discarding the second call's y outputs
# without checking them. The partition is determined by the number of rows,
# `stratify` and `random_state`, so the resulting split is unchanged.
(
    X_sex_tr, X_sex_te,
    X_cls_tr, X_cls_te,
    y_tr, y_te,
) = train_test_split(
    X_sex, X_cls, y,
    test_size=0.2,      # hold out 20% of the rows for evaluation
    stratify=y,         # preserve the survived/not-survived ratio in both parts
    random_state=7,     # fixed seed for a reproducible split
)


In [8]:
# Model A: ordinary least squares on the gender indicator only.
mA = LinearRegression().fit(X_sex_tr, y_tr)

# Regression outputs are not bounded, so clamp them into [0, 1] before
# treating them as survival probabilities.
rawA = mA.predict(X_sex_te)
probaA = np.clip(rawA, 0.0, 1.0)

# Predict "survived" when the clamped score reaches 0.5, then score on the test split.
predA = np.where(probaA >= 0.5, 1, 0)
accA = accuracy_score(y_te, predA)

In [9]:
# Model B: ordinary least squares on passenger class only.
mB = LinearRegression().fit(X_cls_tr, y_tr)

# Clamp the unbounded regression outputs into [0, 1] so they read as probabilities.
rawB = mB.predict(X_cls_te)
probaB = np.clip(rawB, 0.0, 1.0)

# Predict "survived" when the clamped score reaches 0.5, then score on the test split.
predB = np.where(probaB >= 0.5, 1, 0)
accB = accuracy_score(y_te, predB)

In [10]:
# Report test accuracy (3 decimals) and the first five clamped scores per model.
model_summaries = (
    ("Gender only model:", accA, probaA),
    ("Class only model: ", accB, probaB),
)
for summary_label, summary_acc, summary_proba in model_summaries:
    print(summary_label, round(summary_acc, 3), " sample probs:", np.round(summary_proba[:5], 3))

Gender only model: 0.765  sample probs: [0.186 0.751 0.751 0.751 0.186]
Class only model:  0.67  sample probs: [0.245 0.444 0.444 0.643 0.245]


In [11]:
def best_threshold(y_true, probs, thresholds=None):
    """Find the decision threshold that maximizes accuracy.

    Each candidate threshold ``t`` turns the scores into labels via
    ``probs >= t`` and is scored by the fraction of labels matching ``y_true``.

    Parameters
    ----------
    y_true : array-like
        True 0/1 labels.
    probs : array-like
        Predicted scores, one per label.
    thresholds : iterable of float, optional
        Candidate thresholds to evaluate, in order. Defaults to the grid
        0.20, 0.25, ..., 0.80.

    Returns
    -------
    (best_t, best_acc) : tuple of float
        The winning threshold and its accuracy. On ties the first candidate
        reaching the maximum wins (the lowest one for the default grid). If
        ``thresholds`` is empty, ``(0.5, -1.0)`` is returned.

    Raises
    ------
    ValueError
        If the inputs are empty or have different lengths.
    """
    y_true = np.asarray(y_true).ravel()
    probs = np.asarray(probs).ravel()
    if y_true.shape != probs.shape:
        raise ValueError(
            f"y_true and probs must have the same length, got {y_true.size} and {probs.size}"
        )
    if y_true.size == 0:
        raise ValueError("y_true and probs must not be empty")

    if thresholds is None:
        # Build the grid from integers and divide once. A float-step
        # np.arange(0.2, 0.81, 0.05) carries rounding error, so its entries
        # (and the returned threshold) can differ from the nominal values
        # such as 0.45, and it needs a fudged stop to include 0.80.
        thresholds = np.arange(20, 81, 5) / 100.0

    best_t = 0.5
    best_acc = -1.0
    for t in thresholds:
        pred = (probs >= t).astype(int)
        # Accuracy for 1-D labels is simply the share of matching entries.
        acc = float(np.mean(pred == y_true))
        # Strict '>' keeps the earliest threshold when several tie.
        if acc > best_acc:
            best_acc = acc
            best_t = float(t)
    return best_t, best_acc

# Search the threshold grid for each model's held-out scores.
# NOTE(review): the threshold is selected on the same test split that is then
# used to report accuracy, so the "best" accuracy is an optimistic estimate.
tA, accA_best = best_threshold(y_true=y_te, probs=probaA)
tB, accB_best = best_threshold(y_true=y_te, probs=probaB)

In [12]:
# Show the selected threshold (2 decimals) and its accuracy (3 decimals) per model.
threshold_results = (("Gender", tA, accA_best), ("Class", tB, accB_best))
for result_name, result_t, result_acc in threshold_results:
    print(f"{result_name} model, best_t={result_t:.2f}, accuracy={result_acc:.3f}")

Gender model, best_t=0.20, accuracy=0.765
Class model, best_t=0.45, accuracy=0.670


In [13]:
# Refit both single-feature regressions on every available row (no hold-out);
# these are the models used by the predict_with_* helpers.
mA_full = LinearRegression()
mA_full.fit(X_sex, y)

mB_full = LinearRegression()
mB_full.fit(X_cls, y)

def predict_with_sex(sex_str, threshold=0.5):
    """Predict survival from gender using the full-data gender model.

    Parameters
    ----------
    sex_str : str
        "female" or "male"; matching ignores case and surrounding whitespace.
    threshold : float, default 0.5
        The label is 1 when the clamped score is at least this value.

    Returns
    -------
    (prob, label) : tuple of (float, int)
        The model score clamped to [0, 1] and the resulting 0/1 label.

    Raises
    ------
    ValueError
        If ``sex_str`` is neither "female" nor "male". Previously every
        unrecognized value (typos, None, padded strings) was silently scored
        as male.
    """
    sex = str(sex_str).strip().lower()
    if sex not in ("female", "male"):
        raise ValueError(f"sex_str must be 'female' or 'male', got {sex_str!r}")

    # Same encoding as the training feature: 1 for female, 0 otherwise.
    x = np.array([[1 if sex == "female" else 0]])
    # Linear regression output is unbounded, so clamp it into [0, 1].
    prob = float(np.clip(mA_full.predict(x)[0], 0.0, 1.0))
    label = int(prob >= threshold)
    return prob, label

def predict_with_pclass(pclass, threshold=0.5):
    """Predict survival from passenger class using the full-data class model.

    ``pclass`` is converted with ``int()`` and passed to ``mB_full`` as a
    1x1 feature matrix. Returns ``(prob, label)``: the model score clamped to
    [0, 1] as a float, and 1 if that score is at least ``threshold``, else 0.
    """
    features = np.array([[int(pclass)]])
    raw_score = mB_full.predict(features)[0]
    # Linear regression output is unbounded, so clamp it into [0, 1].
    prob = float(np.clip(raw_score, 0.0, 1.0))
    return prob, int(prob >= threshold)

In [14]:
print("Sex model, 'female':", predict_with_sex("female"))
print("Sex model, 'male':  ", predict_with_sex("male"))
print("Pclass model, 1:", predict_with_pclass(1))
print("Pclass model, 3:", predict_with_pclass(3))

Sex model, 'female': (0.74203821656051, 1)
Sex model, 'male':   (0.18890814558058897, 0)
Pclass model, 1: (0.6416350358642786, 1)
Pclass model, 3: (0.24764392616432607, 0)


In [16]:
# In-sample check: score each full-data model on the same rows it was fit on.
# Scores are clamped to [0, 1] and labelled 1 when they reach 0.5.
rawA_full = mA_full.predict(X_sex)
probA_full = np.clip(rawA_full, 0.0, 1.0)
predA_full = np.where(probA_full >= 0.5, 1, 0)
accA_full = accuracy_score(y, predA_full)

rawB_full = mB_full.predict(X_cls)
probB_full = np.clip(rawB_full, 0.0, 1.0)
predB_full = np.where(probB_full >= 0.5, 1, 0)
accB_full = accuracy_score(y, predB_full)

full_data_report = (
    ("FULL-data train accuracy — Sex model:   ", accA_full),
    ("FULL-data train accuracy — Pclass model:", accB_full),
)
for report_caption, report_acc in full_data_report:
    print(report_caption, round(report_acc, 3))

FULL-data train accuracy — Sex model:    0.787
FULL-data train accuracy — Pclass model: 0.679
