<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Project 2 
## Part 2: Preprocessing and Feature Engineering


In [1]:
# imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Lasso, LassoCV, RidgeCV,LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel

In [2]:
# Restore `df` from IPython's %store database
# (presumably saved by the Part 1 notebook — confirm).
%store -r df

In [3]:
# Split into train/test sets *before* imputing the 'Lot Frontage' and
# 'Garage Yr Blt' columns, so imputation happens after the rows are separated.
y = df['SalePrice']
X = df.drop(columns='SalePrice')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,   # hold out 20% of the rows as the test set
    random_state=0,  # fixed seed so the split is reproducible
)

In [4]:
# Persist the training target with %store so another notebook can restore it via `%store -r`.
%store y_train

Stored 'y_train' (Series)


In [5]:
# Persist the test target with %store so another notebook can restore it via `%store -r`.
%store y_test

Stored 'y_test' (Series)


In [6]:
# Sanity check: feature frames and target series should have matching row counts.
print(X_train.shape, X_test.shape, y_train.shape[0], y_test.shape[0])

(1620, 80) (406, 80) 1620 406


In [7]:
# Imputer that replaces missing values (NaN) in each column
# with that column's most frequent value.
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [8]:
# Learn the per-column fill values from the training set, then apply them to it.
imp_X_train = imp.fit(X_train).transform(X_train)

In [9]:
# Display the shape of the imputed training array (compare with X_train's shape printed above).
imp_X_train.shape

(1620, 80)

In [10]:
# The imputer returns a bare ndarray, so the column names, the row index and
# (when the input mixes numeric and string columns) the per-column dtypes are
# lost. Rebuild the frame with X_train's labels and cast every column back to
# its original dtype, so numeric columns stay numeric and the rows still line
# up with y_train's index.
df_X_train = pd.DataFrame(
    imp_X_train, columns=X_train.columns, index=X_train.index
).astype(X_train.dtypes.to_dict())

In [11]:
# List the columns of the imputed training frame that still contain NaN (an empty Index means none).
df_X_train.columns[df_X_train.isnull().any(axis=0)]

Index([], dtype='object')

In [12]:
# Per-column flag: True where the (not yet imputed) test set has at least one missing value.
X_test.isnull().any(axis=0)

Id              False
PID             False
MS SubClass     False
MS Zoning       False
Lot Frontage     True
                ...  
Misc Feature    False
Misc Val        False
Mo Sold         False
Yr Sold         False
Sale Type       False
Length: 80, dtype: bool

In [13]:
# Impute the test set with the fill values learned from the TRAINING set.
# Calling fit_transform() here would re-fit the imputer on the test rows: the
# test data would then drive its own preprocessing and the fitted training
# statistics would be overwritten. transform() reuses the training fit instead.
imp_X_test = imp.transform(X_test)
# Rebuild a labelled frame: keep X_test's column names, row index and dtypes
# (the imputer returns a bare ndarray).
df_X_test = pd.DataFrame(
    imp_X_test, columns=X_test.columns, index=X_test.index
).astype(X_test.dtypes.to_dict())

In [14]:
# Check if there are no nan values in test dataset
# List the columns of the imputed test frame that still contain NaN (an empty Index means none).
df_X_test.columns[df_X_test.isnull().any(axis=0)]

Index([], dtype='object')

In [15]:
# Display the shape of the imputed training frame.
df_X_train.shape

(1620, 80)

In [16]:
# Display the shape of the imputed test frame.
df_X_test.shape

(406, 80)

#### Dummify the categorical columns

In [17]:
# One-hot encode the categorical columns of the training set; drop_first=True
# drops the first level of each categorical column.
# NOTE(review): this encodes X_train (the un-imputed split), not the imputed
# df_X_train, so the imputation above is not used here and any NaN left in
# X_train is only handled by the fillna(0) applied after column alignment.
# Confirm which is intended; if switching to the imputed frame, switch the
# training and test cells together so both get the same treatment.
X_train_dummified = pd.get_dummies(data=X_train, drop_first=True)

In [18]:
# Show how many columns the training set has after one-hot encoding.
print(X_train_dummified.shape)

(1620, 259)


In [19]:
# One-hot encode the test set separately, using the same drop_first=True setting
# as for the training set.
# NOTE(review): this encodes X_test (the un-imputed split), not the imputed
# df_X_test, so any NaN left in X_test is only handled by the fillna(0) applied
# after column alignment. Confirm which is intended; if switching to the imputed
# frame, switch the training and test cells together.
X_test_dummified = pd.get_dummies(data=X_test, drop_first=True)

In [20]:
# Show how many columns the test set has after one-hot encoding.
print(X_test_dummified.shape)

(406, 229)


We can see that there are 259 columns in the training dataset, whereas there are only 229 columns in the test dataset. For consistency, both datasets must end up with the same set of columns, so we need to add to the test dataset the columns that appear only in the training dataset.

In [21]:
# Outer-align the test columns against the training columns: the result holds
# the UNION of both column sets (training-only columns are added to the test
# frame; test-only columns are kept as well). fillna(0) then zero-fills the
# newly added columns — and any NaN that was already present in the data.
X2, _ = X_test_dummified.align(X_train_dummified, join='outer', axis=1)
X2 = X2.fillna(0)

In [22]:
# Display the shape of the aligned test frame.
X2.shape

(406, 263)

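Because the outer-joined test dataset now has 263 columns, which is more than the 259 columns of the training dataset, the test dataset must contain dummy columns that are absent from the training dataset. We therefore need to apply the same alignment to the training dataset.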

In [23]:
# Mirror of the test-side alignment: outer-align the training columns against
# the test columns so both frames share the same union of columns, then
# zero-fill the added columns (and any NaN already present in the data).
# NOTE(review): this gives the training frame columns that exist only in the
# test set; consider aligning both frames to the training columns instead.
X1, _ = X_train_dummified.align(X_test_dummified, join='outer', axis=1)
X1 = X1.fillna(0)

In [24]:
# Display the shape of the aligned training frame (column count should match X2).
X1.shape

(1620, 263)

In [25]:
# Persist X1 (the aligned, not-yet-scaled training features) with %store so the
# second Jupyter notebook can restore it via `%store -r X1`.
%store X1

Stored 'X1' (DataFrame)


In [26]:
# Population standard deviation (ddof=0) of '1st Flr SF' before scaling.
X1['1st Flr SF'].std(ddof=0)

395.53233951844976

#### Scale the training and test datasets (scaler fitted on the training data only)

In [27]:
# Standardise the training features: fit the scaler on X1, then scale X1 with it.
ss = StandardScaler()
X_scaled = ss.fit(X1).transform(X1)

In [28]:
# Persist the scaled training features with %store so another notebook can restore them.
%store X_scaled

Stored 'X_scaled' (ndarray)


In [29]:
# Persist the fitted scaler with %store so another notebook can reuse the same scaling.
%store ss

Stored 'ss' (StandardScaler)


In [30]:
# Display the shape of the scaled training array.
X_scaled.shape

(1620, 263)

In [31]:
# Scale the aligned test features with the scaler that was fitted on the training data.
# NOTE: this rebinds X_test from the raw test DataFrame to a scaled ndarray, so
# earlier cells that read X_test as a DataFrame must be re-run from the
# train/test split onwards before they are executed again.
X_test = ss.transform(X2)

In [32]:
# Display the shape of the scaled test array (column count should match X_scaled).
X_test.shape

(406, 263)

In [33]:
# Persist the scaled test features with %store so another notebook can restore them.
%store X_test

Stored 'X_test' (ndarray)
