In [None]:
!mkdir ~/.kaggle

In [None]:
# Copy the API token file from the working directory into the Kaggle config directory.
!cp kaggle.json ~/.kaggle/kaggle.json

In [None]:
!chmod 600 ~/.kaggle/kaggle.json # mode 600: only the owner may read or write the token file

In [None]:
# List the installed packages -- presumably to check that the kaggle client is available (TODO confirm, or remove this cell).
!pip list

In [None]:
!kaggle competitions download -c house-prices-advanced-regression-techniques # download the competition data archive with the kaggle CLI

Downloading house-prices-advanced-regression-techniques.zip to /content
  0% 0.00/199k [00:00<?, ?B/s]
100% 199k/199k [00:00<00:00, 70.0MB/s]


In [None]:
!unzip house-prices-advanced-regression-techniques.zip

Archive:  house-prices-advanced-regression-techniques.zip
  inflating: data_description.txt    
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


## Pipeline practice

In [None]:
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Load the training split into a DataFrame and preview its first rows.
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [None]:
# Show every column name in the training frame.
df.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [None]:
# Keep a small subset of columns (plus the SalePrice target) and drop every
# row that has a missing value in that subset, then summarise the numeric columns.
selected_columns = [
    "MSSubClass", "MSZoning", "LotFrontage", "LotArea", "Street",
    "LotShape", "LandContour", "Utilities", "MiscVal", "MoSold",
    "YrSold", "SaleType", "SalePrice",
]
select_df = df[selected_columns].dropna()
select_df.describe()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,MiscVal,MoSold,YrSold,SalePrice
count,1201.0,1201.0,1201.0,1201.0,1201.0,1201.0,1201.0
mean,57.198168,70.049958,9951.698585,25.490425,6.342215,2007.802664,180770.480433
std,43.106427,24.284752,7924.353975,189.782665,2.703755,1.330486,83389.519866
min,20.0,21.0,1300.0,0.0,1.0,2006.0,34900.0
25%,20.0,59.0,7420.0,0.0,5.0,2007.0,127500.0
50%,50.0,69.0,9262.0,0.0,6.0,2008.0,159500.0
75%,70.0,80.0,11249.0,0.0,8.0,2009.0,213500.0
max,190.0,313.0,215245.0,3500.0,12.0,2010.0,755000.0


In [None]:
# Only certain columns so I dont have to deal with Nan for now
X=pd.get_dummies(select_df.drop(columns="SalePrice"))
y=select_df.SalePrice

In [None]:
# Two-step pipeline: standardise the features, then fit a random forest.
# A fixed random_state makes the forest (and therefore every prediction shown
# below) reproducible across Restart & Run All.
pipeline = make_pipeline(StandardScaler(), RandomForestRegressor(random_state=42))

In [None]:
# Fit every step of the pipeline on the features X and target y.
pipeline.fit(X,y)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestregressor', RandomForestRegressor())])

In [None]:
# In-sample predictions: X is the same data the pipeline was fitted on,
# so these values say nothing about performance on unseen houses.
pipeline.predict(X)

array([199277.5 , 166856.64, 223489.79, ..., 239215.  , 144131.5 ,
       153851.5 ])

## Save the Pipeline

In [None]:
import pickle
#pickle.dump (wb)
#pickle.load (rb)

In [None]:
# Serialise the fitted pipeline to disk (binary write mode).
with open("pipelinemodel.pkl", mode="wb") as model_file:
  pickle.dump(pipeline, model_file)

In [None]:
# Read the pipeline back (binary read mode). Unpickling can execute arbitrary
# code, so only load pickle files that you created yourself.
with open("pipelinemodel.pkl", mode="rb") as model_file:
  reloaded_model = pickle.load(model_file)

In [None]:
# Display the reloaded pipeline to inspect its steps.
reloaded_model

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestregressor', RandomForestRegressor())])

In [None]:
# can extract certain step
# get feature importance value
# get coefficient (for linear model)

In [None]:
# .steps is the list of (name, estimator) tuples that make up the pipeline.
reloaded_model.steps

[('standardscaler', StandardScaler()),
 ('randomforestregressor', RandomForestRegressor())]

In [None]:
#extract only random forest regressor model
print(reloaded_model.steps[1])
print("..................................................")
print(reloaded_model.steps[1][1])
print("......................")
print(reloaded_model.steps[1][1].predict(X))

('randomforestregressor', RandomForestRegressor())
..................................................
RandomForestRegressor()
......................
[319696.25 319696.25 359700.   ... 319696.25 319696.25 319696.25]


  f"X has feature names, but {self.__class__.__name__} was fitted without"


## Difference between pipeline and make_pipeline

In [None]:
# make_pipeline automatically names each step; Pipeline lets you choose the step names yourself

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# make_pipeline generates the step names itself.
# NOTE(review): this rebinds `pipeline` to a new, unfitted object, replacing the
# fitted pipeline created earlier in the notebook.
pipeline = make_pipeline(StandardScaler(),RandomForestRegressor())

In [None]:
# Pipeline takes explicit (name, estimator) pairs, so the step names are chosen here.
custom_steps = [
    ("sc", StandardScaler()),
    ("rfmodel", RandomForestRegressor()),
]
custom_pipeline = Pipeline(steps=custom_steps)

## Column Transformer

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [None]:
# identify the numerical features and the categorical features
# make a pipeline for each of the two groups
# numerical features ---> scaling
# categorical features ---> one-hot encoding (OneHotEncoder, or pd.get_dummies)
# then combine both preprocessing pipelines
# finally, create the full machine learning pipeline

In [None]:
# Inspect the dtypes to tell the numeric columns from the object (string) columns.
select_df.dtypes

MSSubClass       int64
MSZoning        object
LotFrontage    float64
LotArea          int64
Street          object
LotShape        object
LandContour     object
Utilities       object
MiscVal          int64
MoSold           int64
YrSold           int64
SaleType        object
SalePrice        int64
dtype: object

In [None]:
select_df.select_dtypes("object").columns # names of the object-dtype columns, treated as the categorical features

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'SaleType'],
      dtype='object')

In [None]:
# Numeric features
numerical_features=select_df.drop(columns="SalePrice").select_dtypes(exclude="object").columns
print(numerical_features)
numerical_pipeline=Pipeline([("scaling",StandardScaler())])

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'MiscVal', 'MoSold', 'YrSold'], dtype='object')


In [None]:
# Categorical features
categorical_features=select_df.select_dtypes("object").columns
print(categorical_features)
categorical_pipeline=Pipeline([("onehot",OneHotEncoder())])

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'SaleType'],
      dtype='object')


In [None]:
##ColumnTransformer??

In [None]:
# Send each column group through its own preprocessing pipeline.
numeric_step = ("numeric_preprocessing", numerical_pipeline, numerical_features)
categorical_step = ("categorical_preprocessing", categorical_pipeline, categorical_features)
transformer = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [None]:
# Display the column transformer to check which columns go to which pipeline.
transformer

ColumnTransformer(transformers=[('numeric_preprocessing',
                                 Pipeline(steps=[('scaling',
                                                  StandardScaler())]),
                                 Index(['MSSubClass', 'LotFrontage', 'LotArea', 'MiscVal', 'MoSold', 'YrSold'], dtype='object')),
                                ('categorical_preprocessing',
                                 Pipeline(steps=[('onehot', OneHotEncoder())]),
                                 Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'SaleType'],
      dtype='object'))])

In [None]:
# Full pipeline: column-wise preprocessing followed by the random forest.
# A fixed random_state makes the forest reproducible across Restart & Run All.
ml_pipeline = Pipeline([
    ("all_column_preprocessing", transformer),
    ("randomforestregressor", RandomForestRegressor(random_state=42)),
])

## Make Predictions

In [None]:
# Rebuild X from the raw (not one-hot encoded) columns: the encoding now
# happens inside ml_pipeline. y is the SalePrice target.
target_column = "SalePrice"
X = select_df.drop(columns=target_column)
y = select_df[target_column]

In [None]:
# Fit the preprocessing and the forest in one call on the raw feature columns.
ml_pipeline.fit(X,y)

Pipeline(steps=[('all_column_preprocessing',
                 ColumnTransformer(transformers=[('numeric_preprocessing',
                                                  Pipeline(steps=[('scaling',
                                                                   StandardScaler())]),
                                                  Index(['MSSubClass', 'LotFrontage', 'LotArea', 'MiscVal', 'MoSold', 'YrSold'], dtype='object')),
                                                 ('categorical_preprocessing',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder())]),
                                                  Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'SaleType'],
      dtype='object'))])),
                ('randomforestregressor', RandomForestRegressor())])

In [None]:
# In-sample predictions: X is the same data ml_pipeline was fitted on.
ml_pipeline.predict(X)

array([204844.  , 170672.32, 212384.  , ..., 225723.75, 143585.5 ,
       151517.5 ])

In [None]:
# You can preprocess the data, drop columns, impute missing values, and transform/scale the data
# inside the pipeline. Fitting those steps only on the training data prevents data leakage.

## Save the Pipeline

In [None]:
# Serialise the fitted column-transformer pipeline to disk (binary write mode).
with open("columntransformermodel.pkl", mode="wb") as model_file:
  pickle.dump(ml_pipeline, model_file)

In [None]:
# Read the pipeline back (binary read mode). Unpickling can execute arbitrary
# code, so only load pickle files that you created yourself.
with open("columntransformermodel.pkl", mode="rb") as model_file:
  reloaded_ml_pipeline = pickle.load(model_file)

# Display the reloaded pipeline to inspect its steps.
reloaded_ml_pipeline

Pipeline(steps=[('all_column_preprocessing',
                 ColumnTransformer(transformers=[('numeric_preprocessing',
                                                  Pipeline(steps=[('scaling',
                                                                   StandardScaler())]),
                                                  Index(['MSSubClass', 'LotFrontage', 'LotArea', 'MiscVal', 'MoSold', 'YrSold'], dtype='object')),
                                                 ('categorical_preprocessing',
                                                  Pipeline(steps=[('onehot',
                                                                   OneHotEncoder())]),
                                                  Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'SaleType'],
      dtype='object'))])),
                ('randomforestregressor', RandomForestRegressor())])