# Scikit Learn Pipeline 

![](https://miro.medium.com/v2/resize:fit:1400/1*3CUgNaoTUG3eg_BOFkN-YA.jpeg)

Pipeline chains together multiple steps so that output of each step is used as input to the next step 

Pipeline makes it easy to apply the same preprocessing to train and test 

In [109]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [110]:
df=pd.read_csv('train.csv')

In [111]:
df.value_counts().sum()

183

In [112]:
df.drop(columns=['PassengerId','Name','Ticket','Cabin'],inplace=True )

In [113]:
# step to train /test/ split

X_train,X_test,y_train,y_test=train_test_split(df.drop(columns=['Survived']),
                                              df['Survived'],
                                              test_size=0.2,
                                              random_state=1)

In [114]:
X_train.head(4)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
301,3,male,,2,0,23.25,Q
309,1,female,30.0,0,0,56.9292,C
516,2,female,34.0,0,0,10.5,S
120,2,male,21.0,2,0,73.5,S


In [115]:
# finding Missing values 
df.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
dtype: int64

In [116]:
#applying imputation 
si_age=SimpleImputer()
si_embarked=SimpleImputer(strategy='most_frequent')

X_train_age=si_age.fit_transform(X_train[['Age']])
X_train_embarked=si_embarked.fit_transform(X_train[['Embarked']])

X_test_age=si_age.fit_transform(X_test[['Age']])
X_test_embarked=si_embarked.fit_transform(X_test[['Embarked']])

In [117]:
X_train_age.shape

(712, 1)

In [118]:
X_test_age.shape

(179, 1)

In [119]:
#One hot encode time 
#sex and Embarked 
ohe_sex=OneHotEncoder(sparse=False,handle_unknown='ignore')
ohe_embarked=OneHotEncoder(sparse=False,handle_unknown='ignore')

X_train_sex=ohe_sex.fit_transform(X_train[['Sex']])
X_train_embarked=ohe_embarked.fit_transform(X_train_embarked)

X_test_sex=ohe_sex.transform(X_test[['Sex']])
X_test_embarked=ohe_embarked.transform(X_test_embarked)

In [120]:
X_train_embarked

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [121]:
X_train_rem=X_train.drop(columns=['Sex','Age','Embarked'])

In [122]:
X_test_rem=X_test.drop(columns=['Sex','Age','Embarked'])

In [123]:
X_train_transformed=np.concatenate((X_train_rem,X_train_age,X_train_sex,X_train_embarked),axis=1)
X_test_transformed=np.concatenate((X_test_rem,X_test_age,X_test_sex,X_test_embarked),axis=1)

In [124]:
X_train_transformed.shape

(712, 10)

In [125]:
X_test_transformed.shape

(179, 10)

In [126]:
clf=DecisionTreeClassifier()
clf.fit(X_train_transformed,y_train)

DecisionTreeClassifier()

In [127]:
y_pred=clf.predict(X_test_transformed)
y_pred

array([1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 1], dtype=int64)

In [128]:
accuracy_score(y_test,y_pred)

0.7486033519553073

In [129]:
import pickle

In [132]:
pickle.dump(ohe_sex,open('Model_ak/ohe_sex.pkl','wb'))
pickle.dump(ohe_embarked,open('Model_ak/ohe_embarked.pkl','wb'))
pickle.dump(clf,open('model_ak/clf.pkl','wb'))