## Exercise 1.1.
Import the data from the 'titanic.csv' file and check its structure.

- passenger_id – Unique passenger id

- pclass – Ticket class (1 = 1st, 2 = 2nd, 3 = 3rd)

- survived – Survival (0 = No, 1 = Yes)

- name – Name and surname

- sex – Sex (0 = Male, 1 = Female)

- age – Age in years

- sibsp – # of siblings / spouses aboard the Titanic

- parch – # of parents / children aboard the Titanic

- ticket – Ticket number

- fare – Passenger fare

- cabin – Cabin number

- embarked – Port of Embarkation (C = Cherbourg, Q = Queenstown, S = Southampton)

- boat – Lifeboat (if survived)

- body – Body number (if did not survive and body was recovered)

- home.dest – Home/Destination

# Exercise 1.2
- In the titanic dataset, convert the columns holding qualitative variables to factors (sex, embarked, survived) or to an ordered factor (pclass), defining the appropriate levels and, where needed, the order of the values. Please code the levels as:

- embarked respectively: Cherbourg, Queenstown, Southampton
- sex - Female, Male
- survived - Yes, No
- pclass - 1st < 2nd < 3rd

In [None]:
import pandas as pd
%matplotlib inline

In [None]:
import numpy as np
import pickle
import statsmodels.api as sm
import matplotlib.pyplot as plt
import gc
from sklearn.metrics import roc_auc_score

In [None]:
# Load the raw passenger table from the CSV file in the working directory.
titanic_csv_path = "titanic.csv"
titanic = pd.read_csv(titanic_csv_path)


In [None]:
# Preview the first five rows to check the column layout.
titanic.head(n=5)

Unnamed: 0,passenger_id,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,1,"Allen, Miss. Elisabeth Walton",1,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,2,1,1,"Allison, Master. Hudson Trevor",0,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,3,1,0,"Allison, Miss. Helen Loraine",1,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,4,1,0,"Allison, Mr. Hudson Joshua Creighton",0,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,5,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",1,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [None]:
# `Series.unique` without call parentheses only displays the bound method
# object. `is_unique` evaluates to True when no passenger_id value is repeated,
# which is the property the column description ("Unique passenger id") claims.
titanic['passenger_id'].is_unique

In [None]:
# Expand the single-letter port codes into full port names and store them as
# an unordered categorical; values missing from the mapping become NaN.
# NOTE: re-running this cell maps the already-translated names to NaN.
port_names = {'C': 'Cherbourg', 'Q': 'Queenstown', 'S': 'Southampton'}
port_dtype = pd.CategoricalDtype(categories=list(port_names.values()))
titanic['embarked'] = titanic['embarked'].map(port_names).astype(port_dtype)

In [None]:
# Translate the 0/1 sex codes into labels and store them as an unordered
# categorical whose levels are listed as Female, Male.
sex_labels = {0: 'Male', 1: 'Female'}
sex_dtype = pd.CategoricalDtype(categories=['Female', 'Male'])
titanic['sex'] = titanic['sex'].map(sex_labels).astype(sex_dtype)

In [None]:
# Translate the 0/1 survival flag into Yes/No labels, stored as an unordered
# categorical whose levels are listed as Yes, No.
# NOTE(review): this column is the response of the GLM formulas further down;
# the 'Yes'-first level order presumably determines which outcome the model
# treats as the event -- confirm before reordering the categories.
survival_labels = {0: 'No', 1: 'Yes'}
survival_dtype = pd.CategoricalDtype(categories=['Yes', 'No'])
titanic['survived'] = titanic['survived'].map(survival_labels).astype(survival_dtype)

In [None]:
# Translate the numeric ticket class into labels and store them as an
# ordered categorical with 1st < 2nd < 3rd.
class_labels = {1: '1st', 2: '2nd', 3: '3rd'}
ordered_class_dtype = pd.CategoricalDtype(categories=['1st', '2nd', '3rd'], ordered=True)
titanic['pclass'] = titanic['pclass'].map(class_labels).astype(ordered_class_dtype)

In [None]:
# Confirm the dtype conversions and non-null counts, then preview the
# recoded values in the first five rows.
titanic.info()
titanic.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   passenger_id  1309 non-null   int64   
 1   pclass        1309 non-null   category
 2   survived      1309 non-null   category
 3   name          1309 non-null   object  
 4   sex           1309 non-null   category
 5   age           1046 non-null   float64 
 6   sibsp         1309 non-null   int64   
 7   parch         1309 non-null   int64   
 8   ticket        1309 non-null   object  
 9   fare          1308 non-null   float64 
 10  cabin         295 non-null    object  
 11  embarked      1307 non-null   category
 12  boat          486 non-null    object  
 13  body          121 non-null    float64 
 14  home.dest     745 non-null    object  
dtypes: category(4), float64(3), int64(3), object(5)
memory usage: 118.2+ KB


Unnamed: 0,passenger_id,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1st,Yes,"Allen, Miss. Elisabeth Walton",Female,29.0,0,0,24160,211.3375,B5,Southampton,2.0,,"St Louis, MO"
1,2,1st,Yes,"Allison, Master. Hudson Trevor",Male,0.9167,1,2,113781,151.55,C22 C26,Southampton,11.0,,"Montreal, PQ / Chesterville, ON"
2,3,1st,No,"Allison, Miss. Helen Loraine",Female,2.0,1,2,113781,151.55,C22 C26,Southampton,,,"Montreal, PQ / Chesterville, ON"
3,4,1st,No,"Allison, Mr. Hudson Joshua Creighton",Male,30.0,1,2,113781,151.55,C22 C26,Southampton,,135.0,"Montreal, PQ / Chesterville, ON"
4,5,1st,No,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",Female,25.0,1,2,113781,151.55,C22 C26,Southampton,,,"Montreal, PQ / Chesterville, ON"


In [None]:
# Drop the identifier / free-text columns that the model formulas do not use.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it no longer raises KeyError for columns that
# were already removed.
titanic = titanic.drop(columns=['passenger_id', 'name', 'home.dest'], errors='ignore')

In [None]:
# Fill the gaps in the model inputs: numeric columns get their median,
# the port of embarkation gets its most frequent level.
# NOTE(review): these statistics are computed on the full table, i.e. before
# the train/test split performed later, so they also see held-out rows.
age_median = titanic['age'].median()
fare_median = titanic['fare'].median()
most_common_port = titanic['embarked'].mode().iloc[0]

titanic['age'] = titanic['age'].fillna(age_median)
titanic['fare'] = titanic['fare'].fillna(fare_median)
titanic['embarked'] = titanic['embarked'].fillna(most_common_port)

In [None]:
from sklearn.model_selection import train_test_split
import random  # kept: the cross-validation cells further down still use it

# Fixed seed so the split -- and therefore the reported AUC -- is reproducible
# across kernel restarts (previously a fresh random.randint seed on every run).
SPLIT_SEED = 42

# The full frame is split (not only the predictors) because the formula
# interface needs the `survived` column inside the training data.
X_train, X_test, y_train, y_test = train_test_split(
    titanic,
    titanic.survived,
    test_size=0.3,
    random_state=SPLIT_SEED,
)
print(X_train.shape, X_test.shape)

# Binomial-family GLM of survival on the passenger attributes.
mod = sm.GLM.from_formula(
    formula='survived ~ pclass + sex + age + sibsp + parch + fare + embarked',
    data=X_train,
    family=sm.families.Binomial(),
)
res = mod.fit()
# A bare `res.summary()` in the middle of a cell is never displayed, so the
# summary is printed explicitly.
print(res.summary())

# Score the hold-out predictions with ROC AUC (last expression = cell output).
preds = res.predict(X_test)

roc_auc_score(y_test, preds)

(916, 12) (393, 12)


np.float64(0.8017098731384447)

In [None]:
from sklearn.model_selection import KFold

# Fixed seed so the fold assignment and the printed AUCs are reproducible
# (previously seeded from an unseeded random.randint call).
CV_SEED = 42

score = []  # validation AUC of every fold
kf = KFold(n_splits=5, shuffle=True, random_state=CV_SEED)

for train, test in kf.split(titanic.index.values):
    # `train` / `test` are positional row indices for this fold.
    train_df = titanic.iloc[train]
    test_df = titanic.iloc[test]

    mod = sm.GLM.from_formula(
        formula='survived ~ pclass + sex + age + sibsp + parch + fare + embarked',
        data=train_df,
        family=sm.families.Binomial(),
    )
    res = mod.fit()
    predsTrain = res.predict(train_df)
    preds = res.predict(test_df)

    # Compute each AUC once and reuse it for both the list and the printout.
    train_auc = roc_auc_score(train_df.survived, predsTrain)
    valid_auc = roc_auc_score(test_df.survived, preds)
    score.append(valid_auc)
    print("Train AUC:", round(train_auc, 4), "Valid AUC:", round(valid_auc, 4))

# The per-fold scores were collected but never reported; show their average.
print("Mean valid AUC:", round(np.mean(score), 4))

Train AUC: 0.8467 Valid AUC: 0.8299
Train AUC: 0.8541 Valid AUC: 0.8031
Train AUC: 0.8407 Valid AUC: 0.8589
Train AUC: 0.8452 Valid AUC: 0.8384
Train AUC: 0.8365 Valid AUC: 0.8719


In [None]:
# Repeat 10-fold cross-validation ten times with different shuffles and print
# the mean train / validation AUC of each repetition.
# The per-repetition seeds come from a locally seeded generator, so every
# repetition uses a different split while the whole cell stays reproducible
# (previously the seeds came from the unseeded global `random` module).
seed_source = random.Random(42)

for repetition in range(10):

    trainRes = []
    valRes = []
    kf = KFold(n_splits=10, shuffle=True, random_state=seed_source.randint(0, 10000))

    for train, test in kf.split(titanic.index.values):
        # `train` / `test` are positional row indices for this fold.
        train_df = titanic.iloc[train]
        test_df = titanic.iloc[test]

        mod = sm.GLM.from_formula(
            formula='survived ~ pclass + sex + age + sibsp + parch + fare + embarked',
            data=train_df,
            family=sm.families.Binomial(),
        )
        res = mod.fit()
        predsTrain = res.predict(train_df)
        preds = res.predict(test_df)
        trainRes.append(roc_auc_score(train_df.survived, predsTrain))
        valRes.append(roc_auc_score(test_df.survived, preds))

    print("Train AUC:", round(np.mean(trainRes), 4), "Valid AUC:", round(np.mean(valRes), 4))

Train AUC: 0.8444 Valid AUC: 0.8366
Train AUC: 0.8445 Valid AUC: 0.8409
Train AUC: 0.8444 Valid AUC: 0.8422
Train AUC: 0.8445 Valid AUC: 0.8397
Train AUC: 0.8443 Valid AUC: 0.8416
Train AUC: 0.8445 Valid AUC: 0.8377
Train AUC: 0.8445 Valid AUC: 0.8385
Train AUC: 0.8444 Valid AUC: 0.8395
Train AUC: 0.8442 Valid AUC: 0.8393
Train AUC: 0.8443 Valid AUC: 0.8389
