In [61]:
import pandas as pd
import numpy as np
import seaborn as sns

In [62]:
df = sns.load_dataset("titanic")
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True


In [63]:
df.drop(columns=["class","who","adult_male","deck","alive","alone","embark_town"],inplace=True)

In [64]:
df.head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S


In [65]:
df.sex.value_counts()

sex
male      577
female    314
Name: count, dtype: int64

In [66]:
df.isnull().sum()

survived      0
pclass        0
sex           0
age         177
sibsp         0
parch         0
fare          0
embarked      2
dtype: int64

In [67]:
from sklearn.model_selection import train_test_split

x = df.drop(columns=["survived"])
y = df["survived"]
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size=0.20, random_state=0)
x_train.shape, x_test.shape

((712, 7), (179, 7))

In [68]:
x_train

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
140,3,female,,0,2,15.2458,C
439,2,male,31.0,0,0,10.5000,S
817,2,male,31.0,1,1,37.0042,C
378,3,male,20.0,0,0,4.0125,C
491,3,male,21.0,0,0,7.2500,S
...,...,...,...,...,...,...,...
835,1,female,39.0,1,1,83.1583,C
192,3,female,19.0,1,0,7.8542,S
629,3,male,,0,0,7.7333,Q
559,3,female,36.0,1,0,17.4000,S


In [69]:
y_train

140    0
439    0
817    0
378    0
491    0
      ..
835    1
192    1
629    0
559    1
684    0
Name: survived, Length: 712, dtype: int64

In [70]:
x_train.head(2)

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
140,3,female,,0,2,15.2458,C
439,2,male,31.0,0,0,10.5,S


In [71]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer

# first column transformer
tnf1 = ColumnTransformer(transformers=[
    ("age_imputation", SimpleImputer(), [2]),
    ("embarked_imputation", SimpleImputer(strategy="most_frequent"), [6])
], remainder="passthrough")

In [117]:
tnf2 = ColumnTransformer(transformers=[
    ("ohe_sex_embarked", OneHotEncoder(handle_unknown='ignore',sparse_output=False), [1,6])
], remainder="passthrough")

In [118]:
tnf3 = ColumnTransformer(transformers=[
    ('scale', MinMaxScaler(), slice(0,8)) # there are 8 columns that's why slice 0-8 i choose
])

In [119]:
from sklearn.feature_selection import SelectKBest,chi2

tnf4 = SelectKBest(score_func=chi2, k=6)

In [120]:
from sklearn.linear_model import LogisticRegression

tnf5 = LogisticRegression()

In [143]:
from sklearn.pipeline import Pipeline,make_pipeline

pipe = Pipeline([
    ('tnf1', tnf1),  # age and embarked imputer
    ('tnf2', tnf2),  # sex and embarked one-hot encoder
    ('tnf3', tnf3),  # scaling minmax
    ('tnf4', tnf4),  # SelectKBest feature selection
    ('tnf5', tnf5)   # model which is logisticregression
])

# Pipeline vs make_pipeline

Pipeline requires naming of steps, but make_pipelin does not
(Also columntransformer and make_columntransfer has same concept)

In [144]:
pipe_no_name = make_pipeline(tnf1, tnf2, tnf3, tnf4, tnf5)

In [145]:
pipe.fit(x_train, y_train)
# pipe.transform(x_test)

In [146]:
pipe.named_steps

{'tnf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('age_imputation', SimpleImputer(), [2]),
                                 ('embarked_imputation',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'tnf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse_output=False),
                                  [1, 6])]),
 'tnf3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 8, None))]),
 'tnf4': SelectKBest(k=6, score_func=<function chi2 at 0x00000227B65A1DA0>),
 'tnf5': LogisticRegression()}

In [147]:
pipe.named_steps['tnf1'].transformers_[0][1].statistics_

array([29.74518389])

In [148]:
pipe.named_steps['tnf1'].transformers_[1][1].statistics_

array(['S'], dtype=object)

In [149]:
from sklearn import set_config
set_config(display='diagram')

In [150]:
y_pred = pipe.predict(x_test)
y_pred

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0], dtype=int64)

In [151]:
from sklearn.metrics import accuracy_score

accuracy_score(y_pred, y_test)

0.6759776536312849

In [None]:
import pickle

pickle.dump(pipe,open("pipe.pkl","wb"))