# Project 2 Notebook: Clean Training & Test dataset 


Load Libraries:

In [1]:
import numpy as np
import pandas as pd
import statistics as st
import matplotlib.pyplot as plt
import math as math
import seaborn as sns
from sklearn import metrics 
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

%matplotlib inline

pd.options.display.max_rows=2000

Load Data:

In [2]:
train = pd.read_csv('../datasets/train.csv')

### Conduct exploratory data analysis on TRAIN data set. 


In [3]:
#train.dtypes

In [4]:
#train.shape

In [5]:
def eda(df):
    # Edit column headers; replace embedded sapce to underscore symbol
    df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')\
                .str.replace('(', '').str.replace(')', '').str.replace('/', '_')  
   
    #Replace missing alley values with 'NA'
    df['alley'].fillna(value='NA',inplace=True)
    #Replace missing lot_frontage values with median value
    df.fillna(value={'lot_frontage':68},inplace=True)
    #Replace missing mas_vnr_type values with 'None'
    df['mas_vnr_type'].fillna(value='None',inplace=True)
    #Replace missing mas_vnr_area values with 'zero
    df['mas_vnr_area'] = df['mas_vnr_area'].fillna(0)   
    #Replace missing bsmt_qual values with 'NA'
    df['bsmt_qual'].fillna(value='NA',inplace=True)
    #Replace missing bsmt_cond values with 'NA'
    df['bsmt_cond'].fillna(value='NA',inplace=True)
    #Replace missing bsmt_exposure values with 'NA'
    df['bsmt_exposure'].fillna(value='NA',inplace=True)
    #Replace missing bsmtfin_type_1 values with 'NA'
    df['bsmtfin_type_1'].fillna(value='NA',inplace=True)
    #Replace missing bsmtfin_sf_1 values with 'zero
    df['bsmtfin_sf_1'] = df['bsmtfin_sf_1'].fillna(0)  
    #Replace missing bsmtfin_type_1 values with 'NA'
    df['bsmtfin_type_2'].fillna(value='NA',inplace=True)
    #Replace missing bsmtfin_sf_2 values with 'zero
    df['bsmtfin_sf_2'] = df['bsmtfin_sf_2'].fillna(0) 
    #Replace missing bsmt_unf_sf values with 'zero
    df['bsmt_unf_sf'] = df['bsmt_unf_sf'].fillna(0)  
    #Replace missing total_bsmt_sf values with 'zero
    df['total_bsmt_sf'] = df['total_bsmt_sf'].fillna(0) 
    #Replace missing bsmt_full_bath values with 'zero
    df['bsmt_full_bath'] = df['bsmt_full_bath'].fillna(0) 
    #Replace missing bsmt_half_bath values with 'zero
    df['bsmt_half_bath'] = df['bsmt_half_bath'].fillna(0) 
    #Replace missing fireplace_qu values with 'NA'
    df['fireplace_qu'].fillna(value='NA',inplace=True)
    #Replace missing garage_type values with 'NA'
    df['garage_type'].fillna(value='NA',inplace=True)
    #Replace missing bsmt_half_bath values with with median value
    df['garage_yr_blt'] = df['garage_yr_blt'].fillna(1979) 
    #Replace missing garage_finish values with 'NA'
    df['garage_finish'].fillna(value='NA',inplace=True)
    df['garage_cars'] = df['garage_cars'].fillna(0) 
    #Replace missing garage_area values with 'zero
    df['garage_area'] = df['garage_area'].fillna(0) 
    #Replace missing garage_qual values with 'NA'
    df['garage_qual'].fillna(value='NA',inplace=True)
    #Replace missing garage_cond values with 'NA'
    df['garage_cond'].fillna(value='NA',inplace=True)
    #Replace missing pool_qc values with 'NA'
    df['pool_qc'].fillna(value='NA',inplace=True)
    #Replace missing fence values with 'NA'
    df['fence'].fillna(value='NA',inplace=True)
    #Replace missing misc_feature values with 'NA'
    df['misc_feature'].fillna(value='NA',inplace=True)
    return  None 

eda(train)


In [6]:
#train.isnull().sum()    #All columns = zero null  

In [7]:
def dummys(df):
    df = pd.get_dummies(df, columns=['ms_zoning', 'street',  'land_contour', 'lot_config' 
                                     ,'condition_1', 'condition_2', 'bldg_type', 'house_style' 
                                     ,'roof_style'
                                     ,'roof_matl', 'exterior_1st', 'exterior_2nd', 'mas_vnr_type'
                                     ,'foundation' 
                                     ,'heating' 
                                     ,'central_air' 
                                     ,'garage_type'
                                     ,'misc_feature'
                                    ], drop_first=True)
    return df

train_new = dummys(train)

In [8]:
#train_new

In [9]:
def str_to_ord(df):
    replace={'lot_shape':{'Reg':3,'IR1':2,'IR2':1,'IR3':0},\
             'utilities':{'AllPub':3,'NoSewr':2,'NoSeWa':1,'ELO':0},\
             'land_slope':{'Gtl':2,'Mod':1,'Sev':0},\
             'exter_qual':{'Ex':4,'Gd':3,'TA':2,'Fa':1,'Po':0},\
             'exter_cond':{'Ex':4,'Gd':3,'TA':2,'Fa':1,'Po':0},\
             'bsmt_qual':{'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'NA':0},\
             'bsmt_cond':{'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'NA':0},\
             'bsmt_exposure':{'Gd':4,'Av':3,'Mn':2,'No':1,'NA':0},\
             'bsmtfin_type_1':{'GLQ':6,'ALQ':5,'BLQ':4,'Rec':3,'LwQ':2,'Unf':1,'NA':0},\
             'bsmtfin_type_2':{'GLQ':6,'ALQ':5,'BLQ':4,'Rec':3,'LwQ':2,'Unf':1,'NA':0},\
             'heating_qc':{'Ex':4,'Gd':3,'TA':2,'Fa':1,'Po':0},\
             'electrical':{'SBrkr':4,'FuseA':3,'FuseF':2,'FuseP':1,'Mix':0},\
             'kitchen_qual':{'Ex':4,'Gd':3,'TA':2,'Fa':1,'Po':0},\
             'functional':{'Typ':7,'Min1':6,'Min2':5,'Mod':4,'Maj1':3,'Maj2':2,'Sev':1,'Sal':0},\
             'fireplace_qu':{'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'NA':0},\
             'garage_finish':{'Fin':3,'RFn':2,'Unf':1,'NA':0},\
             'garage_qual':{'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'NA':0},\
             'garage_cond':{'Ex':5,'Gd':4,'TA':3,'Fa':2,'Po':1,'NA':0},\
             'paved_drive':{'Y':2,'P':1,'N':0},\
             'pool_qc':{'Ex':4,'Gd':3,'TA':2,'Fa':1,'NA':0},\
             'fence':{'GdPrv':4,'MnPrv':3,'GdWo':2,'MnWw':1,'NA':0}}
    df.replace(replace,inplace=True,method=None)
    return None

str_to_ord(train_new)  


In [10]:
train_new.dtypes

id                        int64
pid                       int64
ms_subclass               int64
lot_frontage            float64
lot_area                  int64
alley                    object
lot_shape                 int64
utilities                object
land_slope                int64
neighborhood             object
overall_qual              int64
overall_cond              int64
year_built                int64
year_remod_add            int64
mas_vnr_area            float64
exter_qual               object
exter_cond                int64
bsmt_qual                 int64
bsmt_cond                 int64
bsmt_exposure             int64
bsmtfin_type_1            int64
bsmtfin_sf_1            float64
bsmtfin_type_2            int64
bsmtfin_sf_2            float64
bsmt_unf_sf             float64
total_bsmt_sf           float64
heating_qc                int64
electrical                int64
1st_flr_sf                int64
2nd_flr_sf                int64
low_qual_fin_sf           int64
gr_liv_a

Datatype Correction:

In [11]:
def dtype_corr(df):
    df[['lot_frontage']]=df[['lot_frontage']].astype(int)
    df[['utilities']]=df[['utilities']].astype(int)
    df[['mas_vnr_area']]=df[['mas_vnr_area']].astype(int)
    df[['exter_qual']]=df[['exter_qual']].astype(int)
    df[['bsmtfin_sf_1']]=df[['bsmtfin_sf_1']].astype(int)
    df[['bsmt_unf_sf']]=df[['bsmt_unf_sf']].astype(int)
    df[['total_bsmt_sf']]=df[['total_bsmt_sf']].astype(int)
    df[['bsmt_full_bath']]=df[['bsmt_full_bath']].astype(int)
    df[['bsmt_half_bath']]=df[['bsmt_half_bath']].astype(int)
    df[['kitchen_qual']]=df[['kitchen_qual']].astype(int)
    df[['garage_yr_blt']]=df[['garage_yr_blt']].astype(int)
    df[['garage_cars']]=df[['garage_cars']].astype(int)
    return None 

dtype_corr(train_new)

In [12]:
max(train_new.isnull().sum())

0

In [13]:
train_new.to_csv('../datasets/train_new.csv', index = False)