In [14]:
from sklearn.datasets import fetch_openml
import pandas as pd

# Loading titanic data
titanic = fetch_openml('titanic', version=1, as_frame=True)
df = titanic.frame

print(df.shape)
df.head()

(1309, 14)


Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


## <u>Preprocessing data</u>

In [15]:
df = df[ ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked', 'survived'] ]
aaa = df['age'].isna()

print(df.isnull().sum())
print(df['age'].mean())
print(df.loc[aaa].index)

pclass        0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
survived      0
dtype: int64
29.8811345124283
Index([  15,   37,   40,   46,   59,   69,   70,   74,   80,  106,
       ...
       1282, 1283, 1284, 1291, 1292, 1293, 1297, 1302, 1303, 1305],
      dtype='int64', length=263)


In [16]:
# Getting mean values of each pclass
mean_ages_by_pclass = df.groupby('pclass')['age'].transform('mean').round()

# Replacing missing ages with the class-specific mean age.
df.loc[:, 'age'] = df['age'].fillna(mean_ages_by_pclass)

# Checing if there are still any missing values of age
no_of_missing_age = df.loc[:,'age'].isna().sum()
if no_of_missing_age == 0:
    print("No missing age exists.")
elif no_of_missing_age == 1:
    print(f"There is {no_of_missing_age} missing age.")
else:
    print(f"There are {no_of_missing_age} missing ages.")

No missing age exists.


### -> all missing ages are filled
<br>
<br>

## Learning model to predict survivers by age using Decision Tree. 
### 1. using only 'age'

In [17]:
# I expect a single input feature would make this not accuarate
# Also, the data might be too imbalanced. 
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

feature_set = df[['age']]
label_set = df['survived']

X_train, X_test, y_train, y_test = train_test_split(feature_set, label_set, test_size=0.2, random_state=11)

df_dtc = DecisionTreeClassifier(random_state=11)
df_dtc.fit(X_train, y_train)
pred = df_dtc.predict(X_test)

accuracy = np.round(accuracy_score(y_test, pred), 3)
print(f"Accuracy with only one factor (age): {accuracy}")

Accuracy with only one factor (age): 0.592


In [18]:
# The accuracy is 59.2%. Checking the dataset of 'age' if this dataset is imbalanced
def find_age_range(age):
    boundaries = [80, 70, 60, 50, 40, 30, 20, 10]

    for limit in boundaries:
        if age >= limit:
            return limit


ages = df['age'].apply(find_age_range)

age_range_cnt = ages.value_counts().reset_index().sort_values(by='age').astype('int')
age_range_cnt.T

Unnamed: 0,2,0,1,3,4,5,6,7
age,10,20,30,40,50,60,70,80
count,143,552,287,135,70,32,7,1


#### -> Age dataset is very imbalanced, so basic accuracy method shouldn't be used. Also, the model must be highly susceptible to underfitting because it uses only one feature, age, out of many that influence survival, meaning it won't reflect the true situation.
<br>
<br>

### 2. adding two more factors: 'pclass' and 'sex'. 
#### * sex is converted to numeric values. 0 is male, 1 is female.

In [20]:
# Leaning model to predict survivers by age, pclass, sex using Decision Tree. 
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

df.loc[:, 'numed_sex'] = df.loc[:,'sex'].map(lambda x: 0 if x=='male' else 1).astype('int')
    
feature_set = df[['age', 'pclass', 'numed_sex']]
label_set = df['survived']

X_train, X_test, y_train, y_test = train_test_split(feature_set, label_set, test_size=0.2, random_state=11)

df_dtc = DecisionTreeClassifier(random_state=11)
df_dtc.fit(X_train, y_train)
pred = df_dtc.predict(X_test)

accuracy = np.round(accuracy_score(y_test, pred), 3)
print(f"Accuracy with 'age', 'pclass', 'sex': \t{accuracy}")

Accuracy with 'age', 'pclass', 'sex': 	0.779


### 3. Using KFold with 'age', 'pclass', and 'sex'

In [21]:
# with KFold (n_split=10)
from sklearn.model_selection import KFold
import numpy as np

df.loc[:, 'numed_sex'] = df['sex'].map(lambda x: 0 if x=='male' else 1).astype('int')
    
feature_set = df[['age', 'pclass', 'numed_sex']]
label_set = df['survived']

kfold = KFold(n_splits=10)

acc_cv = []
for tr_index, te_index in kfold.split(feature_set):
    X_train, X_test = feature_set.loc[tr_index], feature_set.loc[te_index]
    y_train, y_test = label_set.loc[tr_index], label_set.loc[te_index]

    df_dtc = DecisionTreeClassifier(random_state=2)
    df_dtc.fit(X_train, y_train)
    pred = df_dtc.predict(X_test)

    acc_cv.append(accuracy_score(y_test, pred))

print("Accuracy with KFold(n_split=10):", np.round(np.mean(acc_cv), 3))

Accuracy with KFold(n_split=10): 0.775


### 4. Using StratifiedKFold with 'age', 'pclass', and 'sex'

In [22]:
# with StratifiedKFold (n_split=10)
from sklearn.model_selection import StratifiedKFold
import numpy as np

df.loc[:, 'numed_sex'] = df.loc[:,'sex'].map(lambda x: 0 if x=='male' else 1).astype('int')
    
feature_set = df[['age', 'pclass', 'numed_sex']]
label_set = df['survived']

stkfold = StratifiedKFold(n_splits=10)

acc_cv = []
for tr_index, te_index in stkfold.split(feature_set, label_set):
    X_train, X_test = feature_set.loc[tr_index], feature_set.loc[te_index]
    y_train, y_test = label_set.loc[tr_index], label_set.loc[te_index]

    df_dtc = DecisionTreeClassifier(random_state=2)
    df_dtc.fit(X_train, y_train)
    pred = df_dtc.predict(X_test)

    acc_cv.append(np.round(accuracy_score(y_test, pred), 3))

print("Accuracy with StratifiedKFold(n_split=10):", np.round(np.mean(acc_cv), 3))

Accuracy with StratifiedKFold(n_split=10): 0.733


### 5. Using Confusion matrix, Precision and Recall

In [47]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score

df.loc[:, 'numed_sex'] = df.loc[:,'sex'].map(lambda x: 0 if x=='male' else 1).astype('int')
    
feature_set = df[['age', 'pclass', 'numed_sex']]
label_set = df['survived'].astype('int')

X_train, X_test, y_train, y_test = train_test_split(feature_set, label_set, test_size=0.2, random_state=11)

df_dtc = DecisionTreeClassifier(random_state=11)
df_dtc.fit(X_train, y_train)
pred = df_dtc.predict(X_test)

def get_evals( y_test, pred ):
    conf_matrix = confusion_matrix( y_test, pred )
    precision = precision_score( y_test, pred )
    recall = recall_score( y_test, pred )

    print(f"Confusion Matrix:\n{conf_matrix}")
    print(f"Precision: {precision:0.4f}")
    print(f"Recall: {recall:0.4f}")

get_evals( y_test, pred )

Confusion Matrix:
[[142  14]
 [ 44  62]]
Precision: 0.8158
Recall: 0.5849
