# BASIC ANALYSIS OF THE DATA

In [6]:
#Importing libraries
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [9]:
#Loading the data
train = pd.read_csv(r'C:\\Users\\bhavi\\Downloads\\house-prices-advanced-regression-techniques\\train.csv')
test = pd.read_csv(r'C:\\Users\\bhavi\\Downloads\\house-prices-advanced-regression-techniques\\test.csv')

In [10]:
#View the shape of the data
print('Shape of train data = ',train.shape)
print('Shape of test data = ',test.shape)

Shape of train data =  (1460, 81)
Shape of test data =  (1459, 80)


In [12]:
#View the top five rows
display(train.head())
display(test.head())

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


# DATA SPLITTING AS X & Y

In [14]:
#Splitting train data as x for independent variables and y for dependent variable
X_train = train.drop(columns='SalePrice', axis = 1)
y_train = train['SalePrice']
#Creating copy of test data
X_test = test.copy()
#View the shape of the data
print('Shape of X train = ',X_train.shape)
print('Shape of y train = ',y_train.shape)
print('Shape of X test = ',X_test.shape)

Shape of X train =  (1460, 80)
Shape of y train =  (1460,)
Shape of X test =  (1459, 80)


# ANALYSIS OF NUMERICAL DATA

In [18]:
#Checking missing values in X train data
X_train_isnull = X_train.isnull().sum()
X_train_isnull[X_train_isnull>0]

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

# ANALYSIS OF MISSING VALUES IN NUMERICAL DATA

In [20]:
#Checking how many numeric variables have in X_train
num_vars = X_train.select_dtypes(include=['int64','float64']).columns
num_vars

Index(['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
       'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'MiscVal', 'MoSold', 'YrSold'],
      dtype='object')

In [25]:
#Checking how many numeric variables have missing values
num_vars_miss = [var for var in num_vars if X_train_isnull[var]>0]
num_vars_miss

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

# ANALYSIS OF MISSING VALUES IN CATEGORICAL DATA

In [26]:
#Checking how many categorical variables have in X_train
cat_vars = X_train.select_dtypes(include=['object']).columns
cat_vars

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [27]:
#Checking how many numeric variables have missing values
cat_vars_miss = [var for var in cat_vars if X_train_isnull[var]>0]
cat_vars_miss

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

# CREATING & IMPLEMENTING PIPELINE

In [28]:
#Creating list variables for missing value imputation

#Imputing mean
num_var_mean = ['LotFrontage']

#Imputing median
num_var_median = ['MasVnrArea', 'GarageYrBlt']

#Imputing mode
cat_vars_mode = ['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu']

#Imputing constant
cat_vars_missing = ['GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond', 
 'PoolQC',
 'Fence',
 'MiscFeature']

In [29]:
#Creating Pipeline for missing value imputation
num_var_mean_imputer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean'))])
num_var_median_imputer = Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))])
cat_vars_mode_imputer = Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))])
cat_vars_missing_imputer = Pipeline(steps=[('imputer', SimpleImputer(strategy='constant', fill_value='missing'))])

In [31]:
#Transforming the imputers
preprocessor = ColumnTransformer(transformers=[('mean_imputer', num_var_mean_imputer, num_var_mean),
                                              ('median_imputer', num_var_median_imputer, num_var_median),
                                              ('mode_imputer', cat_vars_mode_imputer, cat_vars_mode),
                                              ('missing_imputer', cat_vars_missing_imputer, cat_vars_missing)])

In [32]:
#Applying transformer
preprocessor.fit(X_train)

ColumnTransformer(transformers=[('mean_imputer',
                                 Pipeline(steps=[('imputer', SimpleImputer())]),
                                 ['LotFrontage']),
                                ('median_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['MasVnrArea', 'GarageYrBlt']),
                                ('mode_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond',
                                  'BsmtExposure', 'BsmtFinType1',
                                  'BsmtFinType2', 'Electrical',
                                  'FireplaceQu']),
                                ('missing_imputer',
                                 Pipe

In [33]:
#Applying transformer
Applying transformer 
preprocessor.transform

<bound method ColumnTransformer.transform of ColumnTransformer(transformers=[('mean_imputer',
                                 Pipeline(steps=[('imputer', SimpleImputer())]),
                                 ['LotFrontage']),
                                ('median_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='median'))]),
                                 ['MasVnrArea', 'GarageYrBlt']),
                                ('mode_imputer',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(strategy='most_frequent'))]),
                                 ['Alley', 'MasVnrType', 'BsmtQual', 'BsmtCond',
                                  'BsmtExposure', 'BsmtFinType1',
                                  'BsmtFinType2', 'Electrical',
                                  'FireplaceQu']),
                                ('missing_im

In [34]:
#Getting mean through transformer
preprocessor.named_transformers_['mean_imputer'].named_steps['imputer'].statistics_

array([70.04995837])

In [36]:
#Checking mean
train['LotFrontage'].mean()

70.04995836802665

In [37]:
#Getting median through transformer
preprocessor.named_transformers_['median_imputer'].named_steps['imputer'].statistics_

array([   0., 1980.])

In [39]:
#Checking median
train[['MasVnrArea', 'GarageYrBlt']].median()

MasVnrArea        0.0
GarageYrBlt    1980.0
dtype: float64

In [40]:
#Getting mode through transformer
preprocessor.named_transformers_['mode_imputer'].named_steps['imputer'].statistics_

array(['Grvl', 'None', 'TA', 'TA', 'No', 'Unf', 'Unf', 'SBrkr', 'Gd'],
      dtype=object)

In [41]:
#Getting mode
train[['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu']].mode()

Unnamed: 0,Alley,MasVnrType,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Electrical,FireplaceQu
0,Grvl,,TA,TA,No,Unf,Unf,SBrkr,Gd


In [42]:
#Getting constant through transformer
preprocessor.named_transformers_['missing_imputer'].named_steps['imputer'].statistics_

array(['missing', 'missing', 'missing', 'missing', 'missing', 'missing',
       'missing'], dtype=object)

In [43]:
#Implementing mean, median and mode through pipeline on train and test data
X_train_clean = preprocessor.transform(X_train)
X_test_clean = preprocessor.transform(X_test)

In [44]:
#Checking data
X_train_clean

array([[65.0, 196.0, 2003.0, ..., 'missing', 'missing', 'missing'],
       [80.0, 0.0, 1976.0, ..., 'missing', 'missing', 'missing'],
       [68.0, 162.0, 2001.0, ..., 'missing', 'missing', 'missing'],
       ...,
       [66.0, 0.0, 1941.0, ..., 'missing', 'GdPrv', 'Shed'],
       [68.0, 0.0, 1950.0, ..., 'missing', 'missing', 'missing'],
       [75.0, 0.0, 1965.0, ..., 'missing', 'missing', 'missing']],
      dtype=object)

In [45]:
#Checking data
X_test_clean

array([[80.0, 0.0, 1961.0, ..., 'missing', 'MnPrv', 'missing'],
       [81.0, 108.0, 1958.0, ..., 'missing', 'missing', 'Gar2'],
       [74.0, 0.0, 1997.0, ..., 'missing', 'MnPrv', 'missing'],
       ...,
       [160.0, 0.0, 1960.0, ..., 'missing', 'missing', 'missing'],
       [62.0, 0.0, 1980.0, ..., 'missing', 'MnPrv', 'Shed'],
       [74.0, 94.0, 1993.0, ..., 'missing', 'missing', 'missing']],
      dtype=object)

In [46]:
preprocessor.transformers_

[('mean_imputer',
  Pipeline(steps=[('imputer', SimpleImputer())]),
  ['LotFrontage']),
 ('median_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='median'))]),
  ['MasVnrArea', 'GarageYrBlt']),
 ('mode_imputer',
  Pipeline(steps=[('imputer', SimpleImputer(strategy='most_frequent'))]),
  ['Alley',
   'MasVnrType',
   'BsmtQual',
   'BsmtCond',
   'BsmtExposure',
   'BsmtFinType1',
   'BsmtFinType2',
   'Electrical',
   'FireplaceQu']),
 ('missing_imputer',
  Pipeline(steps=[('imputer',
                   SimpleImputer(fill_value='missing', strategy='constant'))]),
  ['GarageType',
   'GarageFinish',
   'GarageQual',
   'GarageCond',
   'PoolQC',
   'Fence',
   'MiscFeature']),
 ('remainder',
  'drop',
  [0,
   1,
   2,
   4,
   5,
   7,
   8,
   9,
   10,
   11,
   12,
   13,
   14,
   15,
   16,
   17,
   18,
   19,
   20,
   21,
   22,
   23,
   24,
   27,
   28,
   29,
   34,
   36,
   37,
   38,
   39,
   40,
   41,
   43,
   44,
   45,
   46,
   47,
   48,
   49,
   

In [47]:
#Creating data frame
X_train_clean_miss_var = pd.DataFrame(X_train_clean,
            columns=num_var_mean + num_var_median + cat_vars_mode + cat_vars_missing)

In [48]:
#Checking data
X_train_clean_miss_var.head()

Unnamed: 0,LotFrontage,MasVnrArea,GarageYrBlt,Alley,MasVnrType,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,Electrical,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PoolQC,Fence,MiscFeature
0,65,196,2003,Grvl,BrkFace,Gd,TA,No,GLQ,Unf,SBrkr,Gd,Attchd,RFn,TA,TA,missing,missing,missing
1,80,0,1976,Grvl,,Gd,TA,Gd,ALQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,missing,missing,missing
2,68,162,2001,Grvl,BrkFace,Gd,TA,Mn,GLQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,missing,missing,missing
3,60,0,1998,Grvl,,TA,Gd,No,ALQ,Unf,SBrkr,Gd,Detchd,Unf,TA,TA,missing,missing,missing
4,84,350,2000,Grvl,BrkFace,Gd,TA,Av,GLQ,Unf,SBrkr,TA,Attchd,RFn,TA,TA,missing,missing,missing


In [49]:
#Checking missing values
X_train_clean_miss_var.isnull().sum()

LotFrontage     0
MasVnrArea      0
GarageYrBlt     0
Alley           0
MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
Electrical      0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
PoolQC          0
Fence           0
MiscFeature     0
dtype: int64

In [50]:
#Checking count of sub class of variable Alley before imputation 
train['Alley'].value_counts()

Grvl    50
Pave    41
Name: Alley, dtype: int64

In [51]:
#Checking count of sub class of variable Alley after imputation 
X_train_clean_miss_var['Alley'].value_counts()

Grvl    1419
Pave      41
Name: Alley, dtype: int64