# ML Pipelines

In [12]:
# If you can package your model into the tightest possible little capsule, it makes your life easier.

In [13]:
# We can combine several data preprocessing techniques using pipelines.

In [14]:
# Column transformer:
# This lets you do additional things, e.g. specify that certain operations should be applied only to certain columns.

In [15]:
!mkdir ~/.kaggle

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [16]:
# Copy the Kaggle API token (kaggle.json in the working directory) into ~/.kaggle.
!cp kaggle.json ~/.kaggle/kaggle.json

In [17]:
# Make the token file readable and writable by its owner only (mode 600).
!chmod 600 ~/.kaggle/kaggle.json

In [18]:
# Download the competition data archive with the Kaggle CLI.
!kaggle competitions download -c house-prices-advanced-regression-techniques

Downloading house-prices-advanced-regression-techniques.zip to /content
  0% 0.00/199k [00:00<?, ?B/s]
100% 199k/199k [00:00<00:00, 86.0MB/s]


In [19]:
!unzip /content/house-prices-advanced-regression-techniques.zip

Archive:  /content/house-prices-advanced-regression-techniques.zip
  inflating: data_description.txt    
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


# Pipeline Practice

In [34]:
import pandas as pd
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

In [21]:
# Load the training split that was extracted from the competition archive.
df = pd.read_csv('train.csv')

In [22]:
# Preview the first rows to check that the columns loaded as expected.
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [23]:
# Keep a small subset of columns plus the target, then drop every row that has a missing
# value in any of them so later steps can assume complete data.
select_df = df.loc[:, [
    'MSSubClass',
    'MSZoning',
    'LotFrontage',
    'LotArea',
    'Street',
    'LotShape',
    'LandContour',
    'Utilities',
    'MiscVal',
    'MoSold',
    'YrSold',
    'SaleType',
    'SaleCondition',
    'SalePrice',
]].dropna()

In [24]:
# Only the selected, complete-case columns are used, so NaN values need no handling here.
# Features: everything except the target, with the object columns one-hot encoded.
X = pd.get_dummies(select_df.drop(columns=['SalePrice']))
# Target: the sale price.
y = select_df['SalePrice']

In [25]:
# Preview the encoded feature matrix (the object columns are now indicator columns).
X.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,MiscVal,MoSold,YrSold,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,0,2,2008,0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,0,5,2007,0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,0,9,2008,0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
3,70,60.0,9550,0,2,2006,0,0,0,1,...,0,0,0,1,1,0,0,0,0,0
4,60,84.0,14260,0,12,2008,0,0,0,1,...,0,0,0,1,0,0,0,0,1,0


In [26]:
# Chain feature scaling and the regressor so both are fit and applied as a single estimator.
# random_state is fixed so that repeated runs build the same forest and give the same predictions.
pipeline = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=42))

In [27]:
# Fit every pipeline step (scaler, then forest) on the feature matrix and target.
pipeline.fit(X,y)

In [28]:
# NOTE(review): these are predictions on the same rows the pipeline was fit on, so they show
# that the pipeline runs end to end but say nothing about how well it generalizes.
pipeline.predict(X)

array([205451.  , 167326.  , 221230.22, ..., 231728.  , 147748.  ,
       155205.  ])

# Save the Pipeline

In [29]:
import pickle

In [30]:
# Serialize the pipeline to disk so it can be reused without refitting.
with open('pipelinemodel.pkl', 'wb') as model_file:
    pickle.dump(pipeline, model_file)

In [32]:
# Read the pickled pipeline back into memory.
# pickle.load can execute code embedded in the file, so only load files you created or trust.
with open('pipelinemodel.pkl', 'rb') as model_file:
    reloaded_model = pickle.load(model_file)

In [33]:
# Display the reloaded object to confirm the pipeline came back from the pickle file.
reloaded_model

# Using the Pipeline class

In [35]:
# Using the Pipeline class: every step is given an explicit name.
custom_pipeline = Pipeline(
    steps=[
        ('scaling', StandardScaler()),
        ('rfmodel', RandomForestRegressor()),
    ]
)

In [36]:
# Using the make_pipeline helper function: step names are generated automatically.
make_pipeline_model = make_pipeline(
    StandardScaler(),
    RandomForestRegressor(),
)

In [37]:
# Display the pipeline built by make_pipeline.
make_pipeline_model

# Column Transformers

In [92]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor

In [93]:
# List the columns stored with object dtype; these are treated as the categorical features.
select_df.select_dtypes('object').columns

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [94]:
# Numeric features: every non-object column except the target; these get standardized.
numeric_features = (
    select_df
    .drop(columns='SalePrice')
    .select_dtypes(exclude='object')
    .columns
)
numeric_pipeline = Pipeline(steps=[('scaler', StandardScaler())])

In [95]:
# Categorical features: every object-dtype column, one-hot encoded.
# handle_unknown='ignore' encodes a category that was not seen during fit as all zeros
# instead of raising, so the saved pipeline can still predict on new data (e.g. test.csv).
categorical_features = select_df.select_dtypes('object').columns
categorical_pipeline = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])

In [96]:
# Route each group of columns to its own preprocessing pipeline.
# Columns listed in neither group are dropped (ColumnTransformer's default remainder).
transformer = ColumnTransformer(
    transformers=[
        ('numeric_preprocessing', numeric_pipeline, numeric_features),
        ('categorical_preprocessing', categorical_pipeline, categorical_features),
    ]
)

In [97]:
# Display the ColumnTransformer to inspect its configured steps.
transformer

In [109]:
# End-to-end model: column-wise preprocessing followed by the random forest.
# random_state is fixed so that repeated runs build the same forest and give the same predictions.
ml_pipeline = Pipeline([
    ('all_column_preprocessing', transformer),
    ('randomforestregressor', RandomForestRegressor(random_state=42))
])

In [110]:
# Display the full pipeline (preprocessing + model) to inspect its structure.
ml_pipeline

In [111]:
# Raw (un-encoded) features this time: the pipeline's ColumnTransformer does the encoding.
# Note that this rebinds X and y, which earlier held the get_dummies-encoded data.
X = select_df.drop(columns=['SalePrice'])
y = select_df['SalePrice']

In [112]:
# Fit the preprocessing steps and the forest in one call on the raw feature frame.
ml_pipeline.fit(X, y)

In [113]:
# NOTE(review): these are predictions on the same rows the pipeline was fit on, so they show
# that the pipeline runs end to end but say nothing about how well it generalizes.
ml_pipeline.predict(X)

array([204126.5 , 161685.  , 224237.5 , ..., 230364.53, 145598.  ,
       150162.  ])

In [114]:
# Serialize the complete pipeline (preprocessing + model) so raw rows can be scored after reloading.
with open('columntransformer.pkl', 'wb') as model_file:
    pickle.dump(ml_pipeline, model_file)