In [1]:
from IPython.display import HTML

HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

# DATA PREPROCESSING: 

**LOADING DATASET AND INITIALIZING**

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('./house-prices-advanced-regression-techniques/train.csv')
print('Number of Houses in dataset: ', df.shape[0])
print('Number of variables in dataset (including Sale Price): ', df.shape[1])

Number of Houses in dataset:  1460
Number of variables in dataset (including Sale Price):  81


**FEATURES**

In [3]:
coln = list(df.columns)[1::]
for a,b,c,d,e,f,g,h in zip(coln[0:10],coln[10:20],coln[20:30],coln[30:40],coln[40:50],coln[50:60],coln[60:70],coln[70:79]):
    print ('{:14}{:14}{:14}{:14}{:14}{:14}{:14}{}'.format(a,b,c,d,e,f,g,h))

MSSubClass    LandSlope     RoofStyle     BsmtCond      CentralAir    BedroomAbvGr  GarageCars    PoolArea
MSZoning      Neighborhood  RoofMatl      BsmtExposure  Electrical    KitchenAbvGr  GarageArea    PoolQC
LotFrontage   Condition1    Exterior1st   BsmtFinType1  1stFlrSF      KitchenQual   GarageQual    Fence
LotArea       Condition2    Exterior2nd   BsmtFinSF1    2ndFlrSF      TotRmsAbvGrd  GarageCond    MiscFeature
Street        BldgType      MasVnrType    BsmtFinType2  LowQualFinSF  Functional    PavedDrive    MiscVal
Alley         HouseStyle    MasVnrArea    BsmtFinSF2    GrLivArea     Fireplaces    WoodDeckSF    MoSold
LotShape      OverallQual   ExterQual     BsmtUnfSF     BsmtFullBath  FireplaceQu   OpenPorchSF   YrSold
LandContour   OverallCond   ExterCond     TotalBsmtSF   BsmtHalfBath  GarageType    EnclosedPorch SaleType
Utilities     YearBuilt     Foundation    Heating       FullBath      GarageYrBlt   3SsnPorch     SaleCondition


In [4]:
list(zip(coln[0:10]))

[('MSSubClass',),
 ('MSZoning',),
 ('LotFrontage',),
 ('LotArea',),
 ('Street',),
 ('Alley',),
 ('LotShape',),
 ('LandContour',),
 ('Utilities',),
 ('LotConfig',)]

**MISSINGNESS**

In [5]:
missingRows   = df.isnull().any(axis=1)
missingCols   = df.isnull().any(axis=0)
missingValues = pd.DataFrame(np.sum(df.isnull())).reset_index()

tempMask1       = missingValues[0]>0

missingValues  = missingValues[tempMask1]
missingValues.columns = ['Feature', 'Missing Values']

In [6]:
# Creating Strings to use as values:
mode = 'Mode'
zero = 'Zero'
mar  = 'At random'
mcr  = 'Completely at random'
mnr  = 'Not at random'

# Manually inputting values in dict format:
missingType    = pd.DataFrame({'LotFrontage':mar, 'Alley':mar, 'MasVnrType':mar, 'MasVnrArea':mnr},index = [0])
imputation     = pd.DataFrame({'LotFrontage':mode+' by Neighborhood', 'Alley':zero, 'MasVnrType':mode, 'MasVnrArea':mode},index = [1])

# Manupilating dataframes and merging them together into desireable output:
addtnl         = pd.merge(missingType.transpose().reset_index(),imputation.transpose().reset_index(), on='index', how='left')
addtnl.columns = ['Feature','Missing Type', 'Imputation Method']
finalMissing   = pd.merge(missingValues, addtnl, on='Feature', how='left')

# Printing final table/ dataframe:
finalMissing

Unnamed: 0,Feature,Missing Values,Missing Type,Imputation Method
0,LotFrontage,259,At random,Mode by Neighborhood
1,Alley,1369,At random,Zero
2,MasVnrType,8,At random,Mode
3,MasVnrArea,8,Not at random,Mode
4,BsmtQual,37,,
5,BsmtCond,37,,
6,BsmtExposure,38,,
7,BsmtFinType1,37,,
8,BsmtFinType2,38,,
9,Electrical,1,,


In [7]:
print('Number of Variables with Missing Values: ', len(missingValues))

Number of Variables with Missing Values:  19


- - -

**VARIABLE CATEGORIES**

In [8]:
variables = pd.read_csv('./house-prices-advanced-regression-techniques/variabledata.csv')
variables = variables[:-2].drop(['Unnamed: 3','Unnamed: 4','Unnamed: 5'], axis=1)
variables.columns = ['Variable', 'Type', 'Description']
# variables

In [9]:
quantitative = variables.loc[variables.Type=='Quantitative']
quantitative = quantitative.drop(['Type', 'Description'], axis=1).reset_index()
quantitative

Unnamed: 0,index,Variable
0,33,BsmtFinSF1
1,35,BsmtFinSF2
2,36,BsmtUnfSF
3,37,TotalBsmtSF
4,42,1stFlrSF
5,43,2ndFlrSF
6,44,LowQualFinSF
7,45,GrLivArea
8,46,BsmtFullBath
9,47,BsmtHalfBath


In [10]:
ordinal = variables.loc[variables.Type=='Ordinal']
ordinal = ordinal.drop(['Type', 'Description'], axis=1).reset_index()
ordinal

Unnamed: 0,index,Variable
0,9,LotConfig
1,15,HouseStyle
2,16,OverallQual
3,17,OverallCond
4,18,YearBuilt
5,25,MasVnrArea
6,26,ExterQual
7,27,ExterCond
8,29,BsmtQual
9,30,BsmtCond


In [11]:
nominal = variables.loc[variables.Type=='Categorical']
nominal = nominal.drop(['Type', 'Description'], axis=1).reset_index()
nominal

Unnamed: 0,index,Variable
0,0,MSSubClass
1,3,LotArea
2,4,Street
3,5,Alley
4,6,LotShape
5,7,LandContour
6,8,Utilities
7,10,LandSlope
8,11,Neighborhood
9,12,Condition1


- - -