In [1]:
from IPython.display import HTML

# Markup injected into the rendered notebook: a script that defines
# `code_toggle()` (hide/show every `div.input` element) and runs it once when
# the document is ready, plus a form button that calls it again on click.
# `code_show` starts true, so the first call hides the inputs.
# The script uses `$`, so it assumes the hosting page provides jQuery.
toggle_markup = '''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>'''

# Last expression of the cell, so the HTML object is rendered as cell output.
HTML(toggle_markup)

# DATA PREPROCESSING

**LOADING DATASET AND INITIALIZING**

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the training table; every later cell works from `df`.
df = pd.read_csv('./house-prices-advanced-regression-techniques/train.csv')

# Unpack the shape once instead of indexing it twice.
n_houses, n_columns = df.shape
print('Number of Houses in dataset: ', n_houses)
# One column is excluded from the count -- presumably the leading identifier
# column, which the feature listing below also skips (TODO confirm).
print('Number of variables in dataset (including Sale Price): ', n_columns - 1)

Number of Houses in dataset:  1460
Number of variables in dataset (including Sale Price):  80


**FEATURES**

In [3]:
# Print every column name except the first as a grid that is filled
# column-major: the first ROWS_PER_COLUMN names go down the first printed
# column, the next ROWS_PER_COLUMN down the second, and so on.
coln = list(df.columns)[1:]

# The previous version zipped eight hand-written, overlapping 11-wide slices
# (coln[0:11], coln[10:21], ...) and only printed the right grid because zip()
# truncated to the shortest slice. Striding by a single constant gives the same
# output for the current 80 names and neither duplicates nor drops names if the
# column count changes.
ROWS_PER_COLUMN = 10

for row_idx in range(min(ROWS_PER_COLUMN, len(coln))):
    # Names at row_idx, row_idx + ROWS_PER_COLUMN, ... form one printed row.
    row_names = coln[row_idx::ROWS_PER_COLUMN]
    # Pad every cell to 14 characters except the last, so lines carry no
    # trailing whitespace (names longer than 14 are not truncated).
    padded = ''.join('{:14}'.format(name) for name in row_names[:-1])
    print(padded + '{}'.format(row_names[-1]))

MSSubClass    LandSlope     RoofStyle     BsmtCond      CentralAir    BedroomAbvGr  GarageCars    PoolArea
MSZoning      Neighborhood  RoofMatl      BsmtExposure  Electrical    KitchenAbvGr  GarageArea    PoolQC
LotFrontage   Condition1    Exterior1st   BsmtFinType1  1stFlrSF      KitchenQual   GarageQual    Fence
LotArea       Condition2    Exterior2nd   BsmtFinSF1    2ndFlrSF      TotRmsAbvGrd  GarageCond    MiscFeature
Street        BldgType      MasVnrType    BsmtFinType2  LowQualFinSF  Functional    PavedDrive    MiscVal
Alley         HouseStyle    MasVnrArea    BsmtFinSF2    GrLivArea     Fireplaces    WoodDeckSF    MoSold
LotShape      OverallQual   ExterQual     BsmtUnfSF     BsmtFullBath  FireplaceQu   OpenPorchSF   YrSold
LandContour   OverallCond   ExterCond     TotalBsmtSF   BsmtHalfBath  GarageType    EnclosedPorch SaleType
Utilities     YearBuilt     Foundation    Heating       FullBath      GarageYrBlt   3SsnPorch     SaleCondition
LotConfig     YearRemodAdd  BsmtQual   

**MISSINGNESS**

In [4]:
# Boolean mask of missing cells, computed once and reused below.
nullMask = df.isnull()

# Per-row / per-column flags: True where at least one value is missing.
missingRows = nullMask.any(axis=1)
missingCols = nullMask.any(axis=0)

# Count missing cells per column and turn the result into a two-column frame
# (reset_index moves the column names out of the index; the counts land in
# the column labelled 0).
missingValues = nullMask.sum().to_frame().reset_index()

# Keep only the columns that actually have missing values. The original row
# positions are kept as the index (no re-numbering).
tempMask1 = missingValues[0] > 0
missingValues = missingValues.loc[tempMask1]
missingValues.columns = ['Feature', 'Missing Values']

In [5]:
# Load the per-feature missingness / imputation table and tidy it in one chain:
#   * keep only the first 19 rows (presumably the rest of the sheet is blank or
#     notes -- TODO confirm against the CSV),
#   * drop the column pandas labelled 'Unnamed: 5', i.e. a sixth column with
#     no header.
missing_impute = (
    pd.read_csv('./house-prices-advanced-regression-techniques/missing_impute.csv')
    .iloc[:19]
    .drop(columns=['Unnamed: 5'])
)

# Store the missing-value counts as integers.
missing_impute = missing_impute.assign(
    **{'No. Missing': missing_impute['No. Missing'].astype('int')}
)

missing_impute

Unnamed: 0,Feature,No. Missing,Missing Type,Possible Reason,Imputation
0,LotFrontage,259,At random,Unknown,Mode
1,Alley,1369,Not at random,No alley access,
2,MasVnrType,8,At random,Unknown,Mode
3,MasVnrArea,8,Not at random,MasVnrType,Zero
4,BsmtQual,37,Not at random,No basement,Mode
5,BsmtCond,37,Not at random,No basement,Mode
6,BsmtExposure,38,Not at random,No basement,Mode
7,BsmtFinType1,37,Not at random,No basement,Mode
8,BsmtFinType2,38,Not at random,No basement,Mode
9,Electrical,1,At random,Unknown,Mode


- - -

**VARIABLE TYPES**

In [6]:
# Table describing each variable: its name, its type label and a description.
variables = pd.read_csv('./house-prices-advanced-regression-techniques/variabledata.csv')
variables.columns = ['Variable', 'Type', 'Description']


def _variables_of_type(var_table, type_label):
    """Return the names of the variables whose Type equals ``type_label``.

    Parameters
    ----------
    var_table : pandas.DataFrame
        Frame with at least the columns 'Variable' and 'Type'.
    type_label : str
        Value of the 'Type' column to select.

    Returns
    -------
    pandas.DataFrame
        A single-column frame ('Variable') re-indexed from 0, so that frames
        for different types line up row by row when concatenated side by side.
    """
    selected = var_table.loc[var_table.Type == type_label, ['Variable']]
    return selected.reset_index(drop=True)


# One frame per type label; the helper replaces three copy-pasted
# filter / reset_index / drop sequences.
quantitative = _variables_of_type(variables, 'Quantitative')
ordinal = _variables_of_type(variables, 'Ordinal')
nominal = _variables_of_type(variables, 'Categorical')

# Side-by-side overview. The lists have different lengths, so the shorter
# columns are padded with empty strings instead of NaN.
varType = pd.concat([quantitative, nominal, ordinal], axis=1).fillna('')
varType.columns = ['Quantitative', 'Nominal Categorical', 'Ordinal Categorical']
varType

Unnamed: 0,Quantitative,Nominal Categorical,Ordinal Categorical
0,MSZoning,MSSubClass,LotConfig
1,LotFrontage,LotArea,HouseStyle
2,MasVnrType,Street,OverallQual
3,BsmtFinSF1,Alley,OverallCond
4,BsmtFinSF2,LotShape,YearBuilt
5,BsmtUnfSF,LandContour,MasVnrArea
6,TotalBsmtSF,Utilities,ExterQual
7,1stFlrSF,LandSlope,ExterCond
8,2ndFlrSF,Neighborhood,BsmtQual
9,LowQualFinSF,Condition1,BsmtCond


- - -