In [233]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.impute import KNNImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import joblib

In [2]:
titanik = sns.load_dataset('titanic')
titanik

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


In [3]:
titanik['embarked'].value_counts(),titanik['embark_town'].value_counts()

(embarked
 S    644
 C    168
 Q     77
 Name: count, dtype: int64,
 embark_town
 Southampton    644
 Cherbourg      168
 Queenstown      77
 Name: count, dtype: int64)

In [4]:
titanik['survived'].value_counts(),titanik['alive'].value_counts()

(survived
 0    549
 1    342
 Name: count, dtype: int64,
 alive
 no     549
 yes    342
 Name: count, dtype: int64)

In [5]:
titanik['sex'].value_counts(),titanik['who'].value_counts()

(sex
 male      577
 female    314
 Name: count, dtype: int64,
 who
 man      537
 woman    271
 child     83
 Name: count, dtype: int64)

In [6]:
titanik['pclass'].value_counts(),titanik['class'].value_counts()

(pclass
 3    491
 1    216
 2    184
 Name: count, dtype: int64,
 class
 Third     491
 First     216
 Second    184
 Name: count, dtype: int64)

In [7]:
titanik.drop(['embarked','alive','sex','pclass'], axis = 1, inplace=True)

In [8]:
titanik.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   survived     891 non-null    int64   
 1   age          714 non-null    float64 
 2   sibsp        891 non-null    int64   
 3   parch        891 non-null    int64   
 4   fare         891 non-null    float64 
 5   class        891 non-null    category
 6   who          891 non-null    object  
 7   adult_male   891 non-null    bool    
 8   deck         203 non-null    category
 9   embark_town  889 non-null    object  
 10  alone        891 non-null    bool    
dtypes: bool(2), category(2), float64(2), int64(3), object(2)
memory usage: 52.8+ KB


In [9]:
titanik.describe()

Unnamed: 0,survived,age,sibsp,parch,fare
count,891.0,714.0,891.0,891.0,891.0
mean,0.383838,29.699118,0.523008,0.381594,32.204208
std,0.486592,14.526497,1.102743,0.806057,49.693429
min,0.0,0.42,0.0,0.0,0.0
25%,0.0,20.125,0.0,0.0,7.9104
50%,0.0,28.0,0.0,0.0,14.4542
75%,1.0,38.0,1.0,0.0,31.0
max,1.0,80.0,8.0,6.0,512.3292


In [10]:
titanik.head(10)

Unnamed: 0,survived,age,sibsp,parch,fare,class,who,adult_male,deck,embark_town,alone
0,0,22.0,1,0,7.25,Third,man,True,,Southampton,False
1,1,38.0,1,0,71.2833,First,woman,False,C,Cherbourg,False
2,1,26.0,0,0,7.925,Third,woman,False,,Southampton,True
3,1,35.0,1,0,53.1,First,woman,False,C,Southampton,False
4,0,35.0,0,0,8.05,Third,man,True,,Southampton,True
5,0,,0,0,8.4583,Third,man,True,,Queenstown,True
6,0,54.0,0,0,51.8625,First,man,True,E,Southampton,True
7,0,2.0,3,1,21.075,Third,child,False,,Southampton,False
8,1,27.0,0,2,11.1333,Third,woman,False,,Southampton,False
9,1,14.0,1,0,30.0708,Second,child,False,,Cherbourg,False


In [11]:
titanik.columns

Index(['survived', 'age', 'sibsp', 'parch', 'fare', 'class', 'who',
       'adult_male', 'deck', 'embark_town', 'alone'],
      dtype='object')

In [12]:
titanik['class'].value_counts()

class
Third     491
First     216
Second    184
Name: count, dtype: int64

In [13]:
titanik['deck'].value_counts()

deck
C    59
B    47
D    33
E    32
A    15
F    13
G     4
Name: count, dtype: int64

In [14]:
titanik.groupby('deck', observed=True)['age'].mean()

deck
A    44.833333
B    34.955556
C    36.086667
D    39.032258
E    38.116667
F    19.954545
G    14.750000
Name: age, dtype: float64

In [15]:
pd.crosstab(titanik['deck'], titanik['class'])

class,First,Second,Third
deck,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,15,0,0
B,47,0,0
C,59,0,0
D,29,4,0
E,25,4,3
F,0,8,5
G,0,0,4


In [16]:
pd.crosstab(titanik['deck'], titanik['who'])

who,child,man,woman
deck,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,1,13,1
B,3,19,25
C,2,31,26
D,0,15,18
E,1,16,15
F,4,5,4
G,2,0,2


In [17]:
x = titanik[['deck','who','class','age','fare']]
x

Unnamed: 0,deck,who,class,age,fare
0,,man,Third,22.0,7.2500
1,C,woman,First,38.0,71.2833
2,,woman,Third,26.0,7.9250
3,C,woman,First,35.0,53.1000
4,,man,Third,35.0,8.0500
...,...,...,...,...,...
886,,man,Second,27.0,13.0000
887,B,woman,First,19.0,30.0000
888,,woman,Third,,23.4500
889,C,man,First,26.0,30.0000


In [18]:
x = titanik.drop('survived', axis = 1)
x

Unnamed: 0,age,sibsp,parch,fare,class,who,adult_male,deck,embark_town,alone
0,22.0,1,0,7.2500,Third,man,True,,Southampton,False
1,38.0,1,0,71.2833,First,woman,False,C,Cherbourg,False
2,26.0,0,0,7.9250,Third,woman,False,,Southampton,True
3,35.0,1,0,53.1000,First,woman,False,C,Southampton,False
4,35.0,0,0,8.0500,Third,man,True,,Southampton,True
...,...,...,...,...,...,...,...,...,...,...
886,27.0,0,0,13.0000,Second,man,True,,Southampton,True
887,19.0,0,0,30.0000,First,woman,False,B,Southampton,True
888,,1,2,23.4500,Third,woman,False,,Southampton,False
889,26.0,0,0,30.0000,First,man,True,C,Cherbourg,True


In [19]:
x.columns

Index(['age', 'sibsp', 'parch', 'fare', 'class', 'who', 'adult_male', 'deck',
       'embark_town', 'alone'],
      dtype='object')

In [20]:
ordinalTrans = OrdinalEncoder()
dummyTrans = OneHotEncoder( handle_unknown= 'ignore')
numericTrans = StandardScaler()

In [21]:
preprocessor = ColumnTransformer([('ordinal', ordinalTrans,  ['deck', 'class']),
                                 ('dummy', dummyTrans, ['sibsp','parch','who','adult_male','embark_town','alone']),
                                 ('numeric', numericTrans, ['age','fare'])])
imputer = KNNImputer()
pipe = Pipeline([('preproc', preprocessor),
                 ('impute', imputer)])

In [22]:
pipe.fit_transform(x)

array([[ 4.2       ,  2.        ,  0.        , ...,  0.        ,
        -0.53037664, -0.50244517],
       [ 2.        ,  0.        ,  0.        , ...,  0.        ,
         0.57183099,  0.78684529],
       [ 4.6       ,  2.        ,  1.        , ...,  1.        ,
        -0.25482473, -0.48885426],
       ...,
       [ 5.4       ,  2.        ,  0.        , ...,  0.        ,
        -0.17215916, -0.17626324],
       [ 2.        ,  0.        ,  1.        , ...,  1.        ,
        -0.25482473, -0.04438104],
       [ 4.8       ,  2.        ,  1.        , ...,  1.        ,
         0.15850313, -0.49237783]])

In [23]:
pipe.named_steps['preproc'].named_transformers_['dummy'].get_feature_names_out()

array(['sibsp_0', 'sibsp_1', 'sibsp_2', 'sibsp_3', 'sibsp_4', 'sibsp_5',
       'sibsp_8', 'parch_0', 'parch_1', 'parch_2', 'parch_3', 'parch_4',
       'parch_5', 'parch_6', 'who_child', 'who_man', 'who_woman',
       'adult_male_False', 'adult_male_True', 'embark_town_Cherbourg',
       'embark_town_Queenstown', 'embark_town_Southampton',
       'embark_town_nan', 'alone_False', 'alone_True'], dtype=object)

In [24]:
pipe.named_steps

{'preproc': ColumnTransformer(transformers=[('ordinal', OrdinalEncoder(),
                                  ['deck', 'class']),
                                 ('dummy',
                                  OneHotEncoder(handle_unknown='ignore'),
                                  ['sibsp', 'parch', 'who', 'adult_male',
                                   'embark_town', 'alone']),
                                 ('numeric', StandardScaler(), ['age', 'fare'])]),
 'impute': KNNImputer()}

In [25]:
xtrans = pipe.fit_transform(x)
xtrans[0]

array([ 4.2       ,  2.        ,  0.        ,  1.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  1.        ,  0.        ,  0.        ,
        1.        ,  0.        ,  0.        ,  1.        ,  0.        ,
        1.        ,  0.        , -0.53037664, -0.50244517])

In [26]:
ordinal = pipe.named_steps['preproc'].named_transformers_['ordinal'].inverse_transform(xtrans[:, :2])
dummy = pipe.named_steps['preproc'].named_transformers_['dummy'].inverse_transform(xtrans[:, 2:-2])
numeric = pipe.named_steps['preproc'].named_transformers_['numeric'].inverse_transform(xtrans[:, -2:])

In [27]:
print(ordinal[:5], dummy[:5], numeric[:5], sep ='\n')

[['E' 'Third']
 ['C' 'First']
 ['E' 'Third']
 ['C' 'First']
 ['E' 'Third']]
[[1 0 'man' True 'Southampton' False]
 [1 0 'woman' False 'Cherbourg' False]
 [0 0 'woman' False 'Southampton' True]
 [1 0 'woman' False 'Southampton' False]
 [0 0 'man' True 'Southampton' True]]
[[22.      7.25  ]
 [38.     71.2833]
 [26.      7.925 ]
 [35.     53.1   ]
 [35.      8.05  ]]


In [28]:
x['deck'].value_counts().sort_index()

deck
A    15
B    47
C    59
D    33
E    32
F    13
G     4
Name: count, dtype: int64

In [29]:
x = pd.concat([
        pd.DataFrame(ordinal, columns=['deck', 'class']),
        pd.DataFrame(dummy,columns=['sibsp', 'parch', 'who', 'adult_male', 'embark_town', 'alone']),
        pd.DataFrame(numeric, columns=['age', 'fare'])
    ], axis=1)
x

Unnamed: 0,deck,class,sibsp,parch,who,adult_male,embark_town,alone,age,fare
0,E,Third,1,0,man,True,Southampton,False,22.0,7.2500
1,C,First,1,0,woman,False,Cherbourg,False,38.0,71.2833
2,E,Third,0,0,woman,False,Southampton,True,26.0,7.9250
3,C,First,1,0,woman,False,Southampton,False,35.0,53.1000
4,E,Third,0,0,man,True,Southampton,True,35.0,8.0500
...,...,...,...,...,...,...,...,...,...,...
886,C,Second,0,0,man,True,Southampton,True,27.0,13.0000
887,B,First,0,0,woman,False,Southampton,True,19.0,30.0000
888,F,Third,1,2,woman,False,Southampton,False,27.2,23.4500
889,C,First,0,0,man,True,Cherbourg,True,26.0,30.0000


In [30]:
pd.crosstab(x['deck'], x['class']) - pd.crosstab(titanik['deck'], titanik['class'])

class,First,Second,Third
deck,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,5,1,0
B,18,22,0
C,15,70,1
D,3,37,20
E,0,29,380
F,0,9,78
G,0,0,0


In [31]:
pd.crosstab(titanik['deck'], titanik['class'])

class,First,Second,Third
deck,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,15,0,0
B,47,0,0
C,59,0,0
D,29,4,0
E,25,4,3
F,0,8,5
G,0,0,4


In [32]:
titanik[x.columns] = x

In [33]:
titanik

Unnamed: 0,survived,age,sibsp,parch,fare,class,who,adult_male,deck,embark_town,alone
0,0,22.0,1,0,7.2500,Third,man,True,E,Southampton,False
1,1,38.0,1,0,71.2833,First,woman,False,C,Cherbourg,False
2,1,26.0,0,0,7.9250,Third,woman,False,E,Southampton,True
3,1,35.0,1,0,53.1000,First,woman,False,C,Southampton,False
4,0,35.0,0,0,8.0500,Third,man,True,E,Southampton,True
...,...,...,...,...,...,...,...,...,...,...,...
886,0,27.0,0,0,13.0000,Second,man,True,C,Southampton,True
887,1,19.0,0,0,30.0000,First,woman,False,B,Southampton,True
888,0,27.2,1,2,23.4500,Third,woman,False,F,Southampton,False
889,1,26.0,0,0,30.0000,First,man,True,C,Cherbourg,True


In [34]:
titanik.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   survived     891 non-null    int64  
 1   age          891 non-null    float64
 2   sibsp        891 non-null    object 
 3   parch        891 non-null    object 
 4   fare         891 non-null    float64
 5   class        891 non-null    object 
 6   who          891 non-null    object 
 7   adult_male   891 non-null    object 
 8   deck         891 non-null    object 
 9   embark_town  889 non-null    object 
 10  alone        891 non-null    object 
dtypes: float64(2), int64(1), object(8)
memory usage: 76.7+ KB


In [35]:
titanik['familySize'] = titanik['sibsp'] + titanik['parch'] + 1

In [36]:
titanik[(titanik['deck'] == 'E') & (titanik['embark_town'] == 'Southampton')].head(35).sort_values(by = [ 'class','familySize'])

Unnamed: 0,survived,age,sibsp,parch,fare,class,who,adult_male,deck,embark_town,alone,familySize
6,0,54.0,0,0,51.8625,First,man,True,E,Southampton,True,1
92,0,46.0,1,0,61.175,First,man,True,E,Southampton,False,2
17,1,33.2,0,0,13.0,Second,man,True,E,Southampton,True,1
56,1,21.0,0,0,10.5,Second,woman,False,E,Southampton,True,1
98,1,34.0,0,1,23.0,Second,woman,False,E,Southampton,False,2
78,1,0.83,0,2,29.0,Second,child,False,E,Southampton,False,3
2,1,26.0,0,0,7.925,Third,woman,False,E,Southampton,True,1
4,0,35.0,0,0,8.05,Third,man,True,E,Southampton,True,1
12,0,20.0,0,0,8.05,Third,man,True,E,Southampton,True,1
14,0,14.0,0,0,7.8542,Third,child,False,E,Southampton,True,1


In [37]:
titanik['farePerPerson'] = titanik['fare'] / titanik['familySize']

In [38]:
titanik['classDeck'] = titanik['class'] + '_' + titanik['deck']

In [39]:
titanik

Unnamed: 0,survived,age,sibsp,parch,fare,class,who,adult_male,deck,embark_town,alone,familySize,farePerPerson,classDeck
0,0,22.0,1,0,7.2500,Third,man,True,E,Southampton,False,2,3.625,Third_E
1,1,38.0,1,0,71.2833,First,woman,False,C,Cherbourg,False,2,35.64165,First_C
2,1,26.0,0,0,7.9250,Third,woman,False,E,Southampton,True,1,7.925,Third_E
3,1,35.0,1,0,53.1000,First,woman,False,C,Southampton,False,2,26.55,First_C
4,0,35.0,0,0,8.0500,Third,man,True,E,Southampton,True,1,8.05,Third_E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,27.0,0,0,13.0000,Second,man,True,C,Southampton,True,1,13.0,Second_C
887,1,19.0,0,0,30.0000,First,woman,False,B,Southampton,True,1,30.0,First_B
888,0,27.2,1,2,23.4500,Third,woman,False,F,Southampton,False,4,5.8625,Third_F
889,1,26.0,0,0,30.0000,First,man,True,C,Cherbourg,True,1,30.0,First_C


In [40]:
x = titanik.iloc[:,1:]
y = titanik.iloc[:,0]

In [41]:
x

Unnamed: 0,age,sibsp,parch,fare,class,who,adult_male,deck,embark_town,alone,familySize,farePerPerson,classDeck
0,22.0,1,0,7.2500,Third,man,True,E,Southampton,False,2,3.625,Third_E
1,38.0,1,0,71.2833,First,woman,False,C,Cherbourg,False,2,35.64165,First_C
2,26.0,0,0,7.9250,Third,woman,False,E,Southampton,True,1,7.925,Third_E
3,35.0,1,0,53.1000,First,woman,False,C,Southampton,False,2,26.55,First_C
4,35.0,0,0,8.0500,Third,man,True,E,Southampton,True,1,8.05,Third_E
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,27.0,0,0,13.0000,Second,man,True,C,Southampton,True,1,13.0,Second_C
887,19.0,0,0,30.0000,First,woman,False,B,Southampton,True,1,30.0,First_B
888,27.2,1,2,23.4500,Third,woman,False,F,Southampton,False,4,5.8625,Third_F
889,26.0,0,0,30.0000,First,man,True,C,Cherbourg,True,1,30.0,First_C


In [42]:
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: survived, Length: 891, dtype: int64

In [43]:
y.mean(), len(y) * .8

(0.3838383838383838, 712.8000000000001)

In [44]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size= .2, stratify= y, random_state= 42)

In [45]:
x.columns

Index(['age', 'sibsp', 'parch', 'fare', 'class', 'who', 'adult_male', 'deck',
       'embark_town', 'alone', 'familySize', 'farePerPerson', 'classDeck'],
      dtype='object')

In [46]:
ordinalTrans = OrdinalEncoder()
dummyTrans = OneHotEncoder( handle_unknown= 'ignore', drop = 'first')
numericTrans = StandardScaler()

In [47]:
ordinal = ['deck', 'class', 'familySize']
dummy = ['sibsp','parch','who','adult_male','embark_town','alone','classDeck']
numeric = ['age','fare','farePerPerson']

In [155]:
def provera(model):
    model = model.best_estimator_
    train, test = model.score(xtrain,ytrain), model.score(xtest,ytest)
    pred = model.predict(xtest)
    roc, f1 = roc_auc_score(ytest, pred), f1_score(ytest,pred)
    return train, test, roc, f1

In [329]:
rezultati.iloc[2:].mean().sort_values(ascending = False)

SVC       0.799951
SVC2      0.792694
ridge     0.781604
ridge2    0.781604
rf2       0.770203
rf3       0.768066
knn       0.766930
knn2      0.766930
logit2    0.760205
logit3    0.760205
logit     0.755085
rf        0.753614
dtype: float64

In [331]:
joblib.dump(rezultati, 'titanikRezultati.pkl')

['titanikRezultati.pkl']