Authenticate to Kaggle

In [1]:
!mkdir ~/.kaggle

In [3]:
!cp kaggle.json ~/.kaggle/kaggle.json

In [4]:
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
!kaggle competitions download -c house-prices-advanced-regression-techniques

In [None]:
!unzip house-prices-advanced-regression-techniques.zip

Pipeline

In [8]:
import pandas as pd
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

In [9]:
df = pd.read_csv('train.csv')

In [10]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [11]:
# Eliminate Nans by Selecting Columns
select_df = df[['MSSubClass','MSZoning','LotFrontage','LotArea', 'Street', 'LotShape',
        'LandContour',	'Utilities', 'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'SalePrice']].dropna()

In [12]:
X = pd.get_dummies(select_df.drop('SalePrice', axis=1))
y = select_df.SalePrice

In [13]:
pipeline = make_pipeline(StandardScaler(), RandomForestRegressor())

In [14]:
pipeline.fit(X,y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestregressor', RandomForestRegressor())])

In [15]:
pipeline.predict(X)

array([201922.  , 166307.82, 215542.  , ..., 221995.79, 141935.  ,
       155371.75])

Save Pipeline

In [16]:
import pickle

In [17]:
with open('pipelinemodel.pkl', 'wb') as f:
  pickle.dump(pipeline, f)

In [18]:
with open('pipelinemodel.pkl', 'rb') as f: 
  reloaded_model = pickle.load(f)

In [19]:
reloaded_model

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestregressor', RandomForestRegressor())])

In [20]:
reloaded_model.steps[1][1].predict(X)

  f"X has feature names, but {self.__class__.__name__} was fitted without"


array([330021.25, 330021.25, 377489.  , ..., 330021.25, 330021.25,
       330021.25])

Using Pipeline Class

In [21]:
custom_pipeline = Pipeline([('scaling', StandardScaler()),('rfmodel', RandomForestRegressor())])

In [22]:
custom_pipeline

Pipeline(steps=[('scaling', StandardScaler()),
                ('rfmodel', RandomForestRegressor())])

In [23]:
make_pipeline_model = make_pipeline(StandardScaler(), RandomForestRegressor())

In [24]:
make_pipeline_model

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestregressor', RandomForestRegressor())])

Column Transformers

In [25]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [26]:
select_df.select_dtypes('object').columns

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'SaleType'],
      dtype='object')

In [27]:
# Numeric Features
numeric_features = select_df.drop('SalePrice', axis=1).select_dtypes(exclude='object').columns
numeric_pipeline = Pipeline([('scaler', StandardScaler())])

In [28]:
# Categorical Features
categorical_features = select_df.select_dtypes('object').columns
categorical_pipeline = Pipeline([('onehot', OneHotEncoder())])

In [29]:
transformer = ColumnTransformer([('numeric_preprocessing', numeric_pipeline, numeric_features), 
                                 ('categorical_preprocessing', categorical_pipeline, categorical_features)])

In [30]:
ml_pipeline = Pipeline([('all_column_preprocessing', transformer), ('randforestclassifier', RandomForestRegressor())])

In [31]:
X = select_df.drop('SalePrice', axis=1)
y = select_df['SalePrice']

In [32]:
ml_pipeline.fit(X, y)

Pipeline(steps=[('all_column_preprocessing',
                 ColumnTransformer(transformers=[('numeric_preprocessing',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['MSSubClass', 'LotFrontage', 'LotArea', 'MiscVal', 'MoSold', 'YrSold'], dtype='object')),
                                                 ('categorical_preprocessing',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder())]),
                                                  Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'SaleType'],
      dtype='object'))])),
                ('randforestclassifier', RandomForestRegressor())])

In [33]:
ml_pipeline.predict(X)

array([203579.5 , 169928.  , 215177.  , ..., 223573.  , 142349.25,
       153680.  ])

In [34]:
with open('columntransformermodel.pkl', 'wb') as f: 
  pickle.dump(ml_pipeline, f)

In [35]:
with open('columntransformermodel.pkl', 'rb') as f: 
  reloaded_ml_pipeline = pickle.load(f)

In [36]:
reloaded_ml_pipeline

Pipeline(steps=[('all_column_preprocessing',
                 ColumnTransformer(transformers=[('numeric_preprocessing',
                                                  Pipeline(steps=[('scaler',
                                                                   StandardScaler())]),
                                                  Index(['MSSubClass', 'LotFrontage', 'LotArea', 'MiscVal', 'MoSold', 'YrSold'], dtype='object')),
                                                 ('categorical_preprocessing',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder())]),
                                                  Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'SaleType'],
      dtype='object'))])),
                ('randforestclassifier', RandomForestRegressor())])