In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
from sklearn.tree import DecisionTreeClassifier

In [118]:
# Load the training data and discard the identifier / free-text columns
# that are not used as model features.
unused_columns = ['PassengerId', 'Name', 'Cabin', 'Ticket']
df = pd.read_csv('train.csv').drop(columns=unused_columns)
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [113]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs. 'Survived' is the target, everything else is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='Survived'),
    df['Survived'],
    test_size=0.2,
    random_state=42,
)
X_train

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
331,1,male,45.5,0,0,28.5000,S
733,2,male,23.0,0,0,13.0000,S
382,3,male,32.0,0,0,7.9250,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.2750,S
...,...,...,...,...,...,...,...
106,3,female,21.0,0,0,7.6500,S
270,1,male,,0,0,31.0000,S
860,3,male,41.0,2,0,14.1083,S
435,1,female,14.0,1,2,120.0000,S


In [4]:
# Count missing values per training column to decide which ones need imputing.
X_train.isna().sum()

Pclass        0
Sex           0
Age         140
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

Create column Transformers

# Imputation Transformer

In [27]:
# Step 1 - fill missing values.
# Columns are addressed by position in X_train: 2 -> Age, 6 -> Embarked.
# The imputed columns come first in the output, followed by the untouched
# (passthrough) columns in their original order.
trf1 = ColumnTransformer(
    transformers=[
        # Numeric column: replace NaN with the column mean (SimpleImputer's default).
        ('imputer_age', SimpleImputer(strategy='mean'), [2]),
        # Categorical column: replace NaN with the most common category.
        ('imputer_embarked', SimpleImputer(strategy='most_frequent'), [6]),
    ],
    remainder='passthrough',
)

# One Hot Encoder

In [94]:
# Step 2 - one-hot encode the categorical columns.
# Indices refer to trf1's OUTPUT layout (Age, Embarked, Pclass, Sex, ...),
# so 1 -> Embarked and 3 -> Sex.
trf2 = ColumnTransformer(
    transformers=[
        (
            'ohe_sex_embarked',
            # Dense output; categories unseen during fit are encoded as all zeros.
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            [1, 3],
        ),
    ],
    remainder='passthrough',
)

# Feature Scaling

In [95]:
# Step 3 - rescale every feature to the [0, 1] range.
# After one-hot encoding the matrix has 10 columns, all covered by slice(0, 10).
trf3 = ColumnTransformer(
    transformers=[
        ('scale', MinMaxScaler(), slice(0, 10)),
    ],
    remainder='passthrough',
)

# Feature Selection

In [8]:
# Step 4 - keep the 8 features with the highest chi-squared score against the target.
trf4 = SelectKBest(chi2, k=8)

# Algorithm Selection

In [9]:
# Step 5 - the estimator.
# A fixed random_state makes the fitted tree (and therefore the reported
# accuracy) reproducible: scikit-learn permutes features at every split, so
# ties between equally good splits are otherwise broken differently per run.
# 42 matches the seed used for train_test_split.
trf5 = DecisionTreeClassifier(random_state=42)

# Create Pipeline

In [106]:
# Chain the preprocessing steps and the classifier into a single estimator.
# Order matters: each step consumes the column layout produced by the previous one.
pipe = Pipeline(
    steps=[
        ('trf1', trf1),  # imputation
        ('trf2', trf2),  # one-hot encoding
        ('trf3', trf3),  # scaling
        ('trf4', trf4),  # feature selection
        ('trf5', trf5),  # decision tree
    ]
)

# train

In [107]:
# Fit every preprocessing step and the final classifier on the training split only.
pipe.fit(X_train,y_train)

In [108]:
# Which output column range each sub-transformer of step 1 produced.
pipe.named_steps['trf1'].output_indices_

{'imputer_age': slice(0, 1, None),
 'imputer_embarked': slice(1, 2, None),
 'remainder': slice(2, 7, None)}

In [109]:
# Which output column range each sub-transformer of step 2 produced.
pipe.named_steps['trf2'].output_indices_

{'ohe_sex_embarked': slice(0, 5, None), 'remainder': slice(5, 10, None)}

In [110]:
# Which output column range each sub-transformer of step 3 produced.
pipe.named_steps['trf3'].output_indices_

{'scale': slice(0, 10, None), 'remainder': slice(0, 0, None)}

# **************************************************************
# Problem: 
How does the order of the columns change when a ColumnTransformer is applied?

# Solution:
The order of the columns in the transformed feature matrix follows the order of how the columns are specified in the transformers list.

Columns of the original feature matrix that are not listed in any transformer are dropped from the transformed feature matrix by default (`remainder='drop'`), unless `remainder='passthrough'` is set.

Those columns specified with passthrough are added at the right to the output of the transformers.
# **************************************************************

In [111]:
# Run the held-out rows through the fitted preprocessing steps and the classifier.
y_pred = pipe.predict(X_test)

In [112]:
# Fraction of held-out passengers whose survival label was predicted correctly.
accuracy_score(y_test, y_pred)

0.7877094972067039

# Export the pipeline

In [114]:
# Persist the fitted pipeline so it can be reloaded later for inference.
# The context manager guarantees the file is flushed and closed even if
# pickling fails part-way through (a bare open() would leak the handle).
# NOTE: only unpickle this file from a trusted location - pickle.load can
# execute arbitrary code.
with open('pipe.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)