# Project 2 Notebook: Clean Test dataset 


Load Libraries:

In [24]:
import numpy as np
import pandas as pd
import statistics as st
import matplotlib.pyplot as plt
import math as math
import seaborn as sns
from sklearn import metrics 
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import r2_score


%matplotlib inline

pd.options.display.max_rows=2000

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

Load Data:

In [25]:
test = pd.read_csv('../datasets/test.csv')

### Conduct exploratory data analysis on Test data set. 


In [26]:
test.dtypes

Id                   int64
PID                  int64
MS SubClass          int64
MS Zoning           object
Lot Frontage       float64
Lot Area             int64
Street              object
Alley               object
Lot Shape           object
Land Contour        object
Utilities           object
Lot Config          object
Land Slope          object
Neighborhood        object
Condition 1         object
Condition 2         object
Bldg Type           object
House Style         object
Overall Qual         int64
Overall Cond         int64
Year Built           int64
Year Remod/Add       int64
Roof Style          object
Roof Matl           object
Exterior 1st        object
Exterior 2nd        object
Mas Vnr Type        object
Mas Vnr Area       float64
Exter Qual          object
Exter Cond          object
Foundation          object
Bsmt Qual           object
Bsmt Cond           object
Bsmt Exposure       object
BsmtFin Type 1      object
BsmtFin SF 1         int64
BsmtFin Type 2      object
B

In [27]:
test.shape

(879, 80)

In [28]:
test.isnull().sum()  

Id                   0
PID                  0
MS SubClass          0
MS Zoning            0
Lot Frontage       160
Lot Area             0
Street               0
Alley              821
Lot Shape            0
Land Contour         0
Utilities            0
Lot Config           0
Land Slope           0
Neighborhood         0
Condition 1          0
Condition 2          0
Bldg Type            0
House Style          0
Overall Qual         0
Overall Cond         0
Year Built           0
Year Remod/Add       0
Roof Style           0
Roof Matl            0
Exterior 1st         0
Exterior 2nd         0
Mas Vnr Type         1
Mas Vnr Area         1
Exter Qual           0
Exter Cond           0
Foundation           0
Bsmt Qual           25
Bsmt Cond           25
Bsmt Exposure       25
BsmtFin Type 1      25
BsmtFin SF 1         0
BsmtFin Type 2      25
BsmtFin SF 2         0
Bsmt Unf SF          0
Total Bsmt SF        0
Heating              0
Heating QC           0
Central Air          0
Electrical 

In [29]:
test.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Id,879.0,1445.588,850.7171,2.0,692.5,1435.0,2197.0,2928.0
PID,879.0,716505000.0,188913500.0,526302110.0,528486085.0,535454160.0,907192140.0,1007100000.0
MS SubClass,879.0,58.27076,42.21139,20.0,20.0,50.0,70.0,190.0
Lot Frontage,719.0,69.63004,23.62537,21.0,59.0,68.0,80.0,182.0
Lot Area,879.0,10340.92,10047.34,1477.0,7298.5,9453.0,11606.5,215245.0
Overall Qual,879.0,6.054608,1.374756,2.0,5.0,6.0,7.0,10.0
Overall Cond,879.0,5.565415,1.128422,1.0,5.0,5.0,6.0,9.0
Year Built,879.0,1970.534,30.40353,1880.0,1954.0,1972.0,2000.0,2010.0
Year Remod/Add,879.0,1984.445,20.45455,1950.0,1967.0,1992.0,2003.0,2010.0
Mas Vnr Area,878.0,106.9829,188.3568,0.0,0.0,0.0,173.5,1378.0


In [30]:
def eda(df, lot_frontage_fill=68, garage_yr_blt_fill=1979):
    """Normalise the column headers of ``df`` and fill its known gaps, in place.

    Headers are stripped, lower-cased, spaces and '/' become '_', and
    parentheses are removed (e.g. 'Year Remod/Add' -> 'year_remod_add').

    Missing values are then filled as follows:

    * categorical "feature absent" columns (alley, basement, fireplace,
      garage, pool, fence, misc feature, electrical) -> the string 'NA'
    * mas_vnr_type -> 'None', mas_vnr_area -> 0, garage_cars -> 0
    * lot_frontage -> ``lot_frontage_fill``
    * garage_yr_blt -> ``garage_yr_blt_fill``

    Other columns (e.g. the basement square-footage and bath counts) are not
    touched by this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.
    lot_frontage_fill : number, default 68
        Replacement for missing lot_frontage values (the previously
        hard-coded median).
    garage_yr_blt_fill : number, default 1979
        Replacement for missing garage_yr_blt values (the previously
        hard-coded median).

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If any filled column other than lot_frontage is absent after the
        headers have been renamed.
    """
    # regex=False makes the literal intent explicit: '(' and ')' are not
    # valid regular expressions on their own.
    df.columns = (
        df.columns.str.strip()
        .str.lower()
        .str.replace(' ', '_', regex=False)
        .str.replace('(', '', regex=False)
        .str.replace(')', '', regex=False)
        .str.replace('/', '_', regex=False)
    )

    # lot_frontage is only filled when present (a missing column is not an error).
    if 'lot_frontage' in df.columns:
        df['lot_frontage'] = df['lot_frontage'].fillna(lot_frontage_fill)

    fill_values = {
        'alley': 'NA',
        'mas_vnr_type': 'None',
        'mas_vnr_area': 0,
        'bsmt_qual': 'NA',
        'bsmt_cond': 'NA',
        'bsmt_exposure': 'NA',
        'bsmtfin_type_1': 'NA',
        'bsmtfin_type_2': 'NA',
        'electrical': 'NA',
        'fireplace_qu': 'NA',
        'garage_type': 'NA',
        'garage_yr_blt': garage_yr_blt_fill,
        'garage_finish': 'NA',
        'garage_cars': 0,
        'garage_qual': 'NA',
        'garage_cond': 'NA',
        'pool_qc': 'NA',
        'fence': 'NA',
        'misc_feature': 'NA',
    }

    # Assign the filled column back instead of calling fillna(inplace=True) on
    # a column selection: the chained in-place form may act on a temporary
    # copy and leave the frame unchanged.
    for column, value in fill_values.items():
        df[column] = df[column].fillna(value)

    return None

eda(test)


In [31]:
test.isnull().sum()    #All columns = zero null  

id                 0
pid                0
ms_subclass        0
ms_zoning          0
lot_frontage       0
lot_area           0
street             0
alley              0
lot_shape          0
land_contour       0
utilities          0
lot_config         0
land_slope         0
neighborhood       0
condition_1        0
condition_2        0
bldg_type          0
house_style        0
overall_qual       0
overall_cond       0
year_built         0
year_remod_add     0
roof_style         0
roof_matl          0
exterior_1st       0
exterior_2nd       0
mas_vnr_type       0
mas_vnr_area       0
exter_qual         0
exter_cond         0
foundation         0
bsmt_qual          0
bsmt_cond          0
bsmt_exposure      0
bsmtfin_type_1     0
bsmtfin_sf_1       0
bsmtfin_type_2     0
bsmtfin_sf_2       0
bsmt_unf_sf        0
total_bsmt_sf      0
heating            0
heating_qc         0
central_air        0
electrical         0
1st_flr_sf         0
2nd_flr_sf         0
low_qual_fin_sf    0
gr_liv_area  

In [32]:
def dummys(df):
    """Return a copy of ``df`` with its nominal columns one-hot encoded.

    Each listed column is replaced by indicator columns named
    '<column>_<level>'; the first level of every column is dropped
    (``drop_first=True``). Columns not listed are passed through unchanged.
    The input frame itself is not modified.
    """
    nominal_columns = [
        'ms_zoning',
        'street',
        'land_contour',
        'lot_config',
        'condition_1',
        'condition_2',
        'bldg_type',
        'house_style',
        'roof_style',
        'roof_matl',
        'exterior_1st',
        'exterior_2nd',
        'mas_vnr_type',
        'foundation',
        'heating',
        'central_air',
        'garage_type',
        'misc_feature',
    ]
    encoded = pd.get_dummies(df, columns=nominal_columns, drop_first=True)
    return encoded

test_new = dummys(test)

In [33]:
#test_new

In [34]:
def str_to_ord(df):
    """Encode the ordered categorical columns of ``df`` as integers, in place.

    Each column named below has its string levels replaced by the integer
    rank given in its mapping (higher = better / more). Columns that are not
    present in ``df`` are skipped. A value with no entry in its column's
    mapping is left unchanged -- for example the mapping for 'electrical'
    has no 'NA' level, so an 'NA' placeholder in that column stays a string.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    None
    """
    quality_5 = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'Po': 0}
    quality_5_na = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}
    bsmt_finish = {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'LwQ': 2, 'Unf': 1, 'NA': 0}

    ordinal_maps = {
        'lot_shape': {'Reg': 3, 'IR1': 2, 'IR2': 1, 'IR3': 0},
        'utilities': {'AllPub': 3, 'NoSewr': 2, 'NoSeWa': 1, 'ELO': 0},
        'land_slope': {'Gtl': 2, 'Mod': 1, 'Sev': 0},
        'exter_qual': quality_5,
        'exter_cond': quality_5,
        'bsmt_qual': quality_5_na,
        'bsmt_cond': quality_5_na,
        'bsmt_exposure': {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'NA': 0},
        'bsmtfin_type_1': bsmt_finish,
        'bsmtfin_type_2': bsmt_finish,
        'heating_qc': quality_5,
        'electrical': {'SBrkr': 4, 'FuseA': 3, 'FuseF': 2, 'FuseP': 1, 'Mix': 0},
        'kitchen_qual': quality_5,
        'functional': {'Typ': 7, 'Min1': 6, 'Min2': 5, 'Mod': 4,
                       'Maj1': 3, 'Maj2': 2, 'Sev': 1, 'Sal': 0},
        'fireplace_qu': quality_5_na,
        'garage_finish': {'Fin': 3, 'RFn': 2, 'Unf': 1, 'NA': 0},
        'garage_qual': quality_5_na,
        'garage_cond': quality_5_na,
        'paved_drive': {'Y': 2, 'P': 1, 'N': 0},
        'pool_qc': {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'NA': 0},
        'fence': {'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1, 'NA': 0},
    }

    # Encode column by column rather than with one nested-dict
    # DataFrame.replace(..., method=None): the `method` keyword is deprecated
    # and no longer accepted by recent pandas. `mapping.get(v, v)` keeps any
    # value that has no entry in the mapping exactly as it was.
    for column, mapping in ordinal_maps.items():
        if column not in df.columns:
            continue
        df[column] = df[column].map(lambda v, m=mapping: m.get(v, v))

    return None

str_to_ord(test_new)  


In [35]:
test_new.dtypes

id                        int64
pid                       int64
ms_subclass               int64
lot_frontage            float64
lot_area                  int64
alley                    object
lot_shape                 int64
utilities                object
land_slope                int64
neighborhood             object
overall_qual              int64
overall_cond              int64
year_built                int64
year_remod_add            int64
mas_vnr_area            float64
exter_qual               object
exter_cond                int64
bsmt_qual                 int64
bsmt_cond                 int64
bsmt_exposure             int64
bsmtfin_type_1            int64
bsmtfin_sf_1              int64
bsmtfin_type_2            int64
bsmtfin_sf_2              int64
bsmt_unf_sf               int64
total_bsmt_sf             int64
heating_qc               object
electrical               object
1st_flr_sf                int64
2nd_flr_sf                int64
low_qual_fin_sf           int64
gr_liv_a

Datatype Correction:

In [37]:
def dtype_corr(df):
    """Cast the encoded / numeric feature columns of ``df`` to int, in place.

    Every column is cast from its own values. An earlier version assigned
    the cast of exter_qual to bsmt_qual and of exter_cond to bsmt_cond,
    which overwrote the basement ratings with the exterior ones; that is
    fixed here.

    'electrical' is deliberately not cast, as in the earlier version of
    this function.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are converted. It is modified in place.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If one of the listed columns is absent.
    ValueError
        If a listed column holds values that cannot be converted to int
        (for example unencoded strings or NaN).
    """
    int_columns = [
        'lot_frontage',
        'lot_shape',
        'utilities',
        'land_slope',
        'mas_vnr_area',
        'exter_qual',
        'exter_cond',
        'bsmt_qual',
        'bsmt_cond',
        'bsmt_exposure',
        'bsmtfin_type_1',
        'bsmtfin_type_2',
        'heating_qc',
        'kitchen_qual',
        'functional',
        'fireplace_qu',
        'garage_yr_blt',
        'garage_finish',
        'garage_qual',
        'garage_cond',
        'paved_drive',
        'pool_qc',
        'fence',
    ]

    # Source and destination are always the same column, so no column can be
    # clobbered by another one's values.
    for column in int_columns:
        df[column] = df[column].astype(int)

    return None

dtype_corr(test_new)

In [40]:
# Largest per-column null count in test_new; 0 means no column has any nulls left.
max(test_new.isnull().sum())

0

In [39]:
# Write the cleaned frame to ../datasets/test_new.csv; index=False leaves the row index out of the file.
test_new.to_csv('../datasets/test_new.csv', index = False)