## <font color='clue'> Pipeline </font>

In [1]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

In [2]:
##Example 1

In [3]:
steps=[("standard_scaler",StandardScaler()),
       ("classifier",LogisticRegression())]

In [4]:
steps

[('standard_scaler', StandardScaler()), ('classifier', LogisticRegression())]

**`Now to convert it into pipeline`**

In [5]:
pipe=Pipeline(steps)

In [6]:
pipe

In [7]:
from sklearn.datasets import make_classification
X,y=make_classification(n_samples=1000,n_features=13)

In [8]:
X.shape

(1000, 13)

In [9]:
y.shape

(1000,)

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
xtrain,xtest,ytrain,ytest=train_test_split(X,y,test_size=.2,random_state=17)

In [12]:
pipe.fit(xtrain,ytrain)

In [13]:
ypred=pipe.predict(xtest)

In [14]:
pipe.score(xtest,ytest)

0.925

In [15]:
lr=LogisticRegression()
lr.fit(xtrain,ytrain)

In [16]:
lr.score(xtest,ytest)

0.925

In [17]:
##Example2

In [18]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC

In [19]:
steps=[("scaling",StandardScaler()),
       ("PCA",PCA(n_components=3)),
       ("SVC",SVC())
       ]

In [20]:
pipe2=Pipeline(steps)
pipe2

In [21]:
##If we want to just check scaling 
pipe2['scaling'].fit_transform(xtrain)

array([[-2.20051265, -1.02513551,  1.34076013, ...,  0.01905006,
        -0.58136335,  0.00886191],
       [-0.78053327,  0.43078255,  2.08119513, ...,  0.16334348,
         0.92151069,  0.38848127],
       [ 0.45725832,  0.53219863,  0.55144849, ...,  1.15525925,
        -1.04378012,  1.93268   ],
       ...,
       [-1.21337309, -1.48655383, -0.08598621, ...,  1.51401412,
         1.46776097, -0.74088854],
       [ 1.27127213,  0.61477416,  0.06174875, ...,  0.45734596,
         0.82174914, -0.7925943 ],
       [ 0.38052481,  1.12180146,  0.65075636, ...,  0.34383882,
         0.69724367,  0.02124335]])

In [22]:
pipe2.fit(xtrain,ytrain)

In [23]:
pipe2.predict(xtest)

array([1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0])

In [24]:
pipe2.score(xtest,ytest)

0.915

### <div class="alert alert-info"> Complex examples of columns transformer </div>

In [25]:
from sklearn.impute import SimpleImputer
import numpy as np

In [26]:
#For numerical data
steps=[("impute_mean",SimpleImputer(missing_values=np.nan,strategy="mean")),
       ("standard_scaler",StandardScaler())]

In [27]:
numeircal_processor=Pipeline(steps)
numeircal_processor

In [28]:
#for categorical data
from sklearn.preprocessing import OneHotEncoder

In [29]:
steps=[("impute_mode",SimpleImputer(missing_values=np.nan,strategy="mode")),
       ("onehoy",OneHotEncoder(handle_unknown='ignore'))]

In [30]:
categorical_processor=Pipeline(steps)
categorical_processor

In [31]:
## combine processing techniques
from sklearn.compose import ColumnTransformer

In [32]:
processor=ColumnTransformer(
    [("categorical",categorical_processor,['Gender','City']),
     ('numerical',numeircal_processor,['age','height'])]
)
processor

In [33]:
##Now to add an estimator
from sklearn.pipeline import make_pipeline

In [34]:
pipe=make_pipeline(processor,LogisticRegression())
pipe

### `Final Example`

In [35]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

In [36]:
import seaborn as sns

In [37]:
df=sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [38]:
X=df.iloc[:,1:]
y=df['total_bill']

In [39]:
xtrain,xtest,ytrain,ytest=train_test_split(X,y,test_size=.2,random_state=77)

In [40]:
#Pipeline
numerical_processor=Pipeline(
    steps=[("impute_mean",SimpleImputer(missing_values=np.nan,strategy="mean")),
       ("standard_scaler",StandardScaler())]
)
numeircal_processor

In [41]:
categorical_processor=Pipeline(
         steps=[("impute_mode",SimpleImputer(fill_value='missing',strategy="constant")),
       ("onehoy",OneHotEncoder(handle_unknown='ignore'))]
)
categorical_processor

In [42]:
df.head(2)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3


In [43]:
processor=ColumnTransformer(
    [("categorical",categorical_processor,['sex','smoker','day','time']),
     ('numerical',numeircal_processor,['tip','size'])]
)
processor

In [66]:
pipe=Pipeline(
       steps=[('processor',processor),('regressor',RandomForestRegressor())]
)

In [67]:
pipe.fit(xtrain,ytrain)

In [68]:
rf=RandomForestRegressor()

In [69]:
pipe.score(xtest,ytest)

0.0697049964211035

### `Adding Hyperparameter Tuning`

In [70]:
import warnings
warnings.filterwarnings('ignore')

In [77]:
param_grid={
    'regressor__n_estimators':[200,500],
    'regressor__max_features':['auto','sqrt','log2'],
    'regressor__max_depth':[4,5,6,7,8]
}


#Wrong way

In [82]:
'''param_grid = {
    'randomforestregressor__n_estimators': [200, 500],
    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],
    'randomforestregressor__max_depth': [4, 5, 6, 7, 8]
}
gs = GridSearchCV(pipe, param_grid=param_grid, n_jobs=1)'''

"param_grid = {\n    'randomforestregressor__n_estimators': [200, 500],\n    'randomforestregressor__max_features': ['auto', 'sqrt', 'log2'],\n    'randomforestregressor__max_depth': [4, 5, 6, 7, 8]\n}\ngs = GridSearchCV(pipe, param_grid=param_grid, n_jobs=1)"

In [78]:
param_grid

{'regressor__n_estimators': [200, 500],
 'regressor__max_features': ['auto', 'sqrt', 'log2'],
 'regressor__max_depth': [4, 5, 6, 7, 8]}

In [79]:
gs=GridSearchCV(pipe,param_grid=param_grid,n_jobs=1)

In [80]:
gs.fit(xtrain,ytrain)

In [81]:
gs.best_params_

{'regressor__max_depth': 7,
 'regressor__max_features': 'log2',
 'regressor__n_estimators': 500}

In [83]:
pipe=Pipeline(
       steps=[('processor',processor),('regressor',RandomForestRegressor(max_depth=7,max_features='log2',n_estimators=500))]
)

In [84]:
pipe.fit(xtrain,ytrain)

In [85]:
pipe.score(xtest,ytest)

0.2274140473398013