In [291]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, OrdinalEncoder
from sklearn import set_config 
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression, LogisticRegression

In [292]:
set_config(display='diagram')

In [293]:
df=pd.read_csv("./datasets/train.csv")

In [294]:
df.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [295]:
df.isnull().mean()*100

PassengerId     0.000000
Survived        0.000000
Pclass          0.000000
Name            0.000000
Sex             0.000000
Age            19.865320
SibSp           0.000000
Parch           0.000000
Ticket          0.000000
Fare            0.000000
Cabin          77.104377
Embarked        0.224467
dtype: float64

In [296]:
df=df[["Age","Fare","Embarked","Survived"]]

In [297]:
df.head(4)

Unnamed: 0,Age,Fare,Embarked,Survived
0,22.0,7.25,S,0
1,38.0,71.2833,C,1
2,26.0,7.925,S,1
3,35.0,53.1,S,1


In [298]:
df["Embarked"].isnull().sum()

2

# Train_test_split

In [299]:
X_train,X_test,y_train,y_test=train_test_split(df.iloc[:,:3],df.iloc[:,3:4])

In [300]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((668, 3), (223, 3), (668, 1), (223, 1))

In [301]:
X_train.columns

Index(['Age', 'Fare', 'Embarked'], dtype='object')

In [302]:
X_train["Embarked"].value_counts()

S    486
C    121
Q     59
Name: Embarked, dtype: int64

# Pipeline with Missing Indicator

In [303]:
trf1=ColumnTransformer(transformers=[
    ('Age',SimpleImputer(strategy="mean",add_indicator=True),[0]),
    ("Embarked",SimpleImputer(strategy="most_frequent",add_indicator=True),[2])
],
remainder="passthrough")

In [304]:
trf2=ColumnTransformer(transformers=[
    ("Embarked1",OneHotEncoder(sparse=False,handle_unknown="ignore"),[2])
],
remainder="passthrough")

In [305]:
trf3=ColumnTransformer(transformers=[
    ('scale',MinMaxScaler(),slice(0,5))
])

In [306]:
trf4=SelectKBest(score_func=chi2,k=5)

In [307]:
trf5=DecisionTreeClassifier()
#trf5=LogisticRegression()

In [308]:
pipe=Pipeline([('trf1',trf1),('trf2',trf2),('trf3',trf3),('trf4',trf4),('trf5',trf5)])

In [309]:
pipe.fit(X_train,y_train)

In [310]:
cross_val_score(pipe,X_train,y_train,cv=5,scoring="accuracy").mean()

0.6348109078666816

In [311]:
y_pred=pipe.predict(X_test)

In [312]:
accuracy_score(y_pred,y_test)

0.57847533632287

# Pipeline without Missing Indicator

In [313]:
X_train,X_test,y_train,y_test=train_test_split(df.iloc[:,:3],df.iloc[:,3:4])

In [314]:
trf1=ColumnTransformer(transformers=[
    ('Age',SimpleImputer(strategy="mean",add_indicator=False),[0]),
    ("Embarked",SimpleImputer(strategy="most_frequent",add_indicator=False),[2])
],
remainder="passthrough")

In [315]:
trf2=ColumnTransformer(transformers=[
    ("Embarked1",OneHotEncoder(sparse=False,handle_unknown="ignore"),[2])
],
remainder="passthrough")

In [316]:
trf3=ColumnTransformer(transformers=[
    ('scale',MinMaxScaler(),slice(0,5))
])

In [317]:
trf4=SelectKBest(score_func=chi2,k=5)

In [318]:
trf5=DecisionTreeClassifier()

In [319]:
pipe=Pipeline([('trf1',trf1),('trf2',trf2),('trf3',trf3),('trf4',trf4),('trf5',trf5)])

In [320]:
pipe.fit(X_train,y_train)

In [321]:
cross_val_score(pipe,X_train,y_train,cv=10,scoring="accuracy").mean()

0.6077792853912257

In [322]:
y_pred=pipe.predict(X_test)

In [323]:
accuracy_score(y_pred,y_test)

0.6412556053811659