# ML Pipeline Scoring new Data

What we will do here is repeat every step, from feature engineering to prediction, on the test data for this project.

In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt


import scipy.stats as stats

import joblib


In [4]:
# read the unseen houses that the pipeline has to score
data = pd.read_csv('test.csv')

# (rows, columns) of the raw file, followed by a preview of the first 4 records
print(data.shape)

data.head(n=4)

(1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal


In [5]:
# drop the Id column
#
# The frame is reassigned instead of mutated with inplace=True, and
# errors='ignore' keeps the cell idempotent: executing it a second time no
# longer raises KeyError once 'Id' has already been removed.

data = data.drop(columns='Id', errors='ignore')

data.shape

(1459, 79)

## Feature Engineering

1. Missing values
2. Temporal variables
3. Non-Gaussian distributed variables
4. Categorical variables: remove rare labels
5. Categorical variables: convert string to numbers
6. Put the variables in a similar scale

### Missing values

In [6]:
# MSSubClass was treated as a categorical variable during feature engineering,
# so it is cast to the object dtype here as well

data['MSSubClass'] = data['MSSubClass'].astype(object)

In [7]:
# categorical variables fall into two groups, each filled differently

# group 1: missing values are replaced by the explicit label 'Missing'
with_string_missing = [
    'Alley',
    'FireplaceQu',
    'PoolQC',
    'Fence',
    'MiscFeature',
]

# group 2: missing values are replaced by the category stored for each column
with_mode_missing = dict(
    MasVnrType='None',
    BsmtQual='TA',
    BsmtCond='TA',
    BsmtExposure='No',
    BsmtFinType1='Unf',
    BsmtFinType2='Unf',
    Electrical='SBrkr',
    GarageType='Attchd',
    GarageFinish='Unf',
    GarageQual='TA',
    GarageCond='TA',
)

In [13]:
# sanity check: look up the fill value stored for GarageCond
with_mode_missing['GarageCond']

'TA'

In [8]:
# categorical columns listed in with_string_missing get the new label 'Missing'
# wherever a value is absent

data[with_string_missing] = data[with_string_missing].fillna(value='Missing')



In [9]:
# list the columns that have a replacement category stored in with_mode_missing
with_mode_missing.keys()

dict_keys(['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'])

In [14]:
# replace missing values with the frequent category stored for each column
#
# The filled Series is assigned back to the frame explicitly. Calling
# data[var].fillna(..., inplace=True) is chained assignment: pandas warns
# about it and, with Copy-on-Write enabled, the fill is applied to a
# temporary object and never reaches `data`.

for var, frequent_category in with_mode_missing.items():
    data[var] = data[var].fillna(frequent_category)

### Numerical variables

To engineer missing values in numerical variables, we will

1. add a binary missing value indicator column
2. replace the missing values in the original variable with the mean obtained in the feature engineering notebook

In [15]:
# numerical variables that contain nulls, mapped to the mean that was
# already computed for each of them in the feature engineering notebook

vars_with_na = dict(
    LotFrontage=69.87974098057354,
    MasVnrArea=103.7974006116208,
    GarageYrBlt=1978.2959677419356,
)

In [16]:
# add a binary indicator for each numerical variable with missing values,
# then replace the missing values with the stored mean
#
# NOTE: the indicator is derived from the nulls still present in the column,
# so re-running this cell after the fill would reset every indicator to 0.

for var, stored_mean in vars_with_na.items():

    # 1 where the value was missing, 0 otherwise (computed before the fill)
    data[var + '_na'] = np.where(data[var].isnull(), 1, 0)

    # assign the filled Series back instead of a chained inplace=True call,
    # which pandas may apply to a temporary copy and leave `data` untouched
    data[var] = data[var].fillna(stored_mean)


# no nulls should remain; the dict keys are turned into a list because
# recent pandas versions reject a dict as a column indexer
data[list(vars_with_na)].isnull().sum()

LotFrontage    0
MasVnrArea     0
GarageYrBlt    0
dtype: int64

In [18]:
# inspect the last rows of the three missing-value indicator columns

data.loc[:, ['LotFrontage_na', 'MasVnrArea_na', 'GarageYrBlt_na']].tail()

Unnamed: 0,LotFrontage_na,MasVnrArea_na,GarageYrBlt_na
1454,0,0,1
1455,0,0,0
1456,0,0,0
1457,0,0,1
1458,0,0,0


### Temporal Variable

As in the feature engineering notebook, we will calculate the elapsed time between the year a house was sold and each of the temporal variables.

In [19]:
def elapsed_time(df, var):
    """Replace a year column with the number of years before the sale.

    Parameters
    ----------
    df : DataFrame that contains a 'YrSold' column and the column `var`.
    var : name of the column to convert.

    Returns
    -------
    The same DataFrame object that was passed in; column `var` is
    overwritten with ``df['YrSold'] - df[var]``.
    """
    years_before_sale = df['YrSold'].sub(df[var])
    df[var] = years_before_sale
    return df

In [20]:
# convert each year column into the time elapsed up to the year of sale
for var in ('YearBuilt', 'YearRemodAdd', 'GarageYrBlt'):
    data = elapsed_time(df=data, var=var)