# <strong>Data Science:</strong> Transformers & Pipelines

**Name:** Arsalan Ali<br>
**Email:** arslanchaos@gmail.com

---

### **Table of Contents**
* **Column Transformer**
    * Importing Libraries
    * Importing Dataset
    * Importing Transformations
    * Adding Column Transformer
    * Applying Transformer
* **Pipeline**
    * Importing Libraries
    * Display Pipeline Function
    * Loading Dataset
    * Dropping Columns
    * Splitting Data
    * Imputation Transformer
    * One-Hot Encoding Transfomer
    * Scaling Transformer
    * Feature Selection
    * Model Selection
    * Create Pipeline
    * Model Training using Pipeline
    * Explore Pipeline
    * Model Prediction
    * Model Evaluation using Accuracy Score
    * Cross Validation using Pipeline
    * GridSearch using Pipeline
    * GridSearch Score
    * Best Parameters of Model using GridSearch
    * Export Pipeline
    * Load Pipeline | Production
    * User Input | Production
    * Predict | Production
---

## **Column Transformer**

In [1]:
# Importing Libraries

import numpy as np
import pandas as pd
import seaborn as sns

In [124]:
# Importing Dataset

df = sns.load_dataset("titanic")
df = df[["sex", "age", "class"]]
df_columns = df.columns
df.head()

Unnamed: 0,sex,age,class
0,male,22.0,Third
1,female,38.0,First
2,female,26.0,Third
3,female,35.0,First
4,male,35.0,Third


In [117]:
# Importing Transformations

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer

In [122]:
# Adding Transformatiosn to Column Transformer

transformer = ColumnTransformer(transformers=[
("One-Hot_sex", OneHotEncoder(sparse=False, drop='first'), [0]),
("Impute_age", SimpleImputer(), [1]),
("Ordinal_class", OrdinalEncoder(categories=[["Third","Second","First"]]), [2])
], remainder='passthrough')

In [123]:
# Applying Transformer to Dataset

df = transformer.fit_transform(df)
df = pd.DataFrame(df, columns=df_columns)
df.head()

Unnamed: 0,sex,age,class
0,1.0,22.0,0.0
1,0.0,38.0,2.0
2,0.0,26.0,0.0
3,0.0,35.0,2.0
4,1.0,35.0,0.0


## **Pipeline**

In [3]:
# Importing Libraries 

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest,chi2
from sklearn.model_selection import train_test_split

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline,make_pipeline

In [None]:
# To Display Pipeline (If it automatically doesn't show up)

from sklearn import set_config
set_config(display='diagram')

In [4]:
# Loading Dataset

df = sns.load_dataset("titanic")
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [5]:
# Dropping unwated columns

df = df.drop(columns=["class","who","adult_male", "deck", "embark_town", "alive", "alone"], axis=1)
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [6]:
# Splitting Data
X_train,X_test,y_train,y_test = train_test_split(df.drop(columns=['survived']),
                                                 df['survived'],
                                                 test_size=0.2,
                                                random_state=42)

In [7]:
X_train.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
331,1,male,45.5,0,0,28.5,S
733,2,male,23.0,0,0,13.0,S
382,3,male,32.0,0,0,7.925,S
704,3,male,26.0,1,0,7.8542,S
813,3,female,6.0,4,2,31.275,S


In [8]:
y_train.sample(5)

686    0
726    1
511    0
328    1
295    0
Name: survived, dtype: int64

### Imputation Transformer

In [9]:
trf1 = ColumnTransformer([
    ('impute_age',SimpleImputer(),[2]),
    ('impute_embarked',SimpleImputer(strategy='most_frequent'),[6])
],remainder='passthrough')

### One-Hot Encoding Transformer

In [10]:
trf2 = ColumnTransformer([
    ('ohe_sex_embarked',OneHotEncoder(sparse=False,handle_unknown='ignore'),[1,6])
],remainder='passthrough')

### Scaling Transformer

In [11]:
trf3 = ColumnTransformer([
    ('scale',MinMaxScaler(),slice(0,10))
])

### Feature Selection

In [12]:
trf4 = SelectKBest(score_func=chi2,k=8)

### Model Selection

In [13]:
trf5 = DecisionTreeClassifier()

### Create Pipeline

In [49]:
pipe = Pipeline([
    ('trf1',trf1),
    ('trf2',trf2),
    ('trf3',trf3),
    ('trf4',trf4),
    ('trf5',trf5)
])

### Model Training using Pipeline

In [50]:
pipe.fit(X_train.values,y_train.values)

### Explore Pipeline

In [51]:
pipe.named_steps

{'trf1': ColumnTransformer(remainder='passthrough',
                   transformers=[('impute_age', SimpleImputer(), [2]),
                                 ('impute_embarked',
                                  SimpleImputer(strategy='most_frequent'),
                                  [6])]),
 'trf2': ColumnTransformer(remainder='passthrough',
                   transformers=[('ohe_sex_embarked',
                                  OneHotEncoder(handle_unknown='ignore',
                                                sparse=False),
                                  [1, 6])]),
 'trf3': ColumnTransformer(transformers=[('scale', MinMaxScaler(), slice(0, 10, None))]),
 'trf4': SelectKBest(k=8, score_func=<function chi2 at 0x000001F0A36C2CB0>),
 'trf5': DecisionTreeClassifier()}

### Model Prediction

In [53]:
y_pred = pipe.predict(X_test.values)
y_pred

array([1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 0], dtype=int64)

### Model Evaluation using Accuracy Score

In [54]:
from sklearn.metrics import accuracy_score
f"{accuracy_score(y_test,y_pred):.4f}"

'0.6257'

### Cross Validation using Pipeline

In [55]:
from sklearn.model_selection import cross_val_score

f"{cross_val_score(pipe, X_train, y_train, cv=5, scoring='accuracy').mean():.4f}"

'0.6391'

### GridSearch using Pipeline

In [57]:
from sklearn.model_selection import GridSearchCV

params = {'trf5__max_depth':[1,2,3,4,5,20,None]}
grid = GridSearchCV(pipe, params, cv=10, scoring='accuracy')
grid.fit(X_train, y_train)

### GridSearch Score

In [58]:
f"{grid.best_score_:.4f}"

'0.6391'

### Best Parameters of Model using GridSearch

In [59]:
grid.best_params_

{'trf5__max_depth': 1}

### Export Pipeline

In [60]:
import pickle
pickle.dump(pipe,open('titanic_pipe.pkl','wb'))

### Load Pipeline | Production

In [61]:
pipe = pickle.load(open('titanic_pipe.pkl','rb'))

### User Input | Production

In [62]:
user_input = np.array([2, 'male', 31.0, 0, 0, 10.5, 'S'],dtype=object).reshape(1,7)

### Predict | Production

In [63]:
pipe.predict(user_input)

array([0], dtype=int64)