In [125]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Load the Ames housing price data; the first CSV column becomes the row index.
prices = pd.read_csv('../../data/Ames_Housing_Price_Data.csv',index_col = 0)

In [126]:
def check_empty_vals(df=None):
    """Print each column that contains missing values, with its null count.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame to inspect. When omitted, the notebook-level ``prices`` frame
        is used, so existing ``check_empty_vals()`` calls keep working.

    Returns
    -------
    None
        One ``<column> <count>`` line is printed per column that has at
        least one null, in column order; nothing is returned.
    """
    frame = prices if df is None else df
    # One vectorized pass over the whole frame instead of a per-column loop.
    null_counts = frame.isnull().sum()
    for col, null_count in null_counts[null_counts > 0].items():
        print(col, null_count)
        
        
# Report which columns of the freshly loaded data contain missing values.
check_empty_vals()

LotFrontage 462
Alley 2412
MasVnrType 14
MasVnrArea 14
BsmtQual 69
BsmtCond 69
BsmtExposure 71
BsmtFinType1 69
BsmtFinSF1 1
BsmtFinType2 70
BsmtFinSF2 1
BsmtUnfSF 1
TotalBsmtSF 1
Electrical 1
BsmtFullBath 2
BsmtHalfBath 2
FireplaceQu 1241
GarageType 127
GarageYrBlt 129
GarageFinish 129
GarageCars 1
GarageArea 1
GarageQual 129
GarageCond 129
PoolQC 2571
Fence 2055
MiscFeature 2483


In [127]:
#Imputation analysis notes
# LotFrontage 462 - mean of neighborhood -  Linear feet of street connected to property (Doug)
# Alley 2412 - Type of alley access to property: Grvl = Gravel, Pave = Paved, NA = No alley access - REMOVE?
# MasVnrType 14 - impute None Masonry veneer type  
       # BrkCmn = Brick Common, BrkFace = Brick Face, CBlock = Cinder Block, None = None, Stone = Stone
# MasVnrArea 14 - impute 0
# BASEMENT 68 have no values; 1 has NA for BsmtFinSF1, BsmtFinSF2, BsmtUnfSF,TotalBsmtSF
# BASEMENT 2 rows have basement data 814 and 1202
#   BsmtExposure value is missing - impute with mode
#   BsmtFinSF1 and BsmtFinSF2 should be imputed with mean percentage for Unf BsmtFinType2 and BsmtFinType1
# BsmtQual 69 - Evaluates the height of the basement - impute DNE
       #Ex	Excellent (100+ inches)	
       #Gd	Good (90-99 inches)
       #TA	Typical (80-89 inches)
       #Fa	Fair (70-79 inches)
       #Po	Poor (<70 inches)
       #NA	No Basement
# BsmtCond 69 - Evaluates the general condition of the basement - impute DNE
       #Ex	Excellent
       #Gd	Good
       #TA	Typical - slight dampness allowed
       #Fa	Fair - dampness or some cracking or settling
       #Po	Poor - Severe cracking, settling, or wetness
       # NA	No Basement
# BsmtExposure 71 -  Refers to walkout or garden level walls - 814 and 1202 impute Mode
       #Gd	Good Exposure
       #Av	Average Exposure (split levels or foyers typically score average or above)	
       #Mn	Minimum Exposure
       #No	No Exposure
       #NA	No Basement
# BsmtFinType1 69 - Rating of basement finished area - impute DNE
       #GLQ	Good Living Quarters
       #ALQ	Average Living Quarters
       #BLQ	Below Average Living Quarters	
       #Rec	Average Rec Room
       #LwQ	Low Quality
       #Unf	Unfinished
       #NA	No Basement
# BsmtFinSF1 1 - 913 Type 1 finished square feet - impute 0
# BsmtFinType2 70  Rating of basement finished area (if multiple types) should be DNE
       #GLQ	Good Living Quarters
       #ALQ	Average Living Quarters
       #BLQ	Below Average Living Quarters	
       #Rec	Average Rec Room
       #LwQ	Low Quality
       #Unf	Unfinished
       #NA	No Basement
# BsmtFinSF2 1  - Type 2 finished square feet - impute 0
       # (the area measured here is rated by BsmtFinType2, whose categories are:)
       #GLQ	Good Living Quarters
       #ALQ	Average Living Quarters
       #BLQ	Below Average Living Quarters	
       #Rec	Average Rec Room
       #LwQ	Low Quality
       #Unf	Unfinished
       #NA	No Basement
# BsmtUnfSF 1 913 - Unfinished square feet of basement area - impute 0
# TotalBsmtSF 1 913- Total square feet of basement area - impute 0
# Electrical 1 - Electrical system - impute mode
       #SBrkr	Standard Circuit Breakers & Romex
       #FuseA	Fuse Box over 60 AMP and all Romex wiring (Average)	
       #FuseF	60 AMP Fuse Box and mostly Romex wiring (Fair)
       #FuseP	60 AMP Fuse Box and mostly knob & tube wiring (poor)
       #Mix	Mixed
# BsmtFullBath 2 Basement full bathrooms 913/2309  should be 0
# BsmtHalfBath 2 Basement half bathrooms 913/2309 should be 0 
# FireplaceQu 1241 Fireplace quality (may be thrown out)
       #Ex	Excellent - Exceptional Masonry Fireplace
       #Gd	Good - Masonry Fireplace in main level
       #TA	Average - Prefabricated Fireplace in main living area or Masonry Fireplace in basement
       #Fa	Fair - Prefabricated Fireplace in basement
       #Po	Poor - Ben Franklin Stove
       #NA	No Fireplace
# GarageType 127  Garage location
       #2Types	More than one type of garage
       #Attchd	Attached to home
       #Basment	Basement Garage
       #BuiltIn	Built-In (Garage part of house - typically has room above garage)
       #CarPort	Car Port
       #Detchd	Detached from home
       #NA	No Garage
# GarageYrBlt 129 - Year garage was built
# GarageFinish 129 Interior finish of the garage - not well correlated
       #Fin	Finished
       #RFn	Rough Finished	
       #Unf	Unfinished
       #NA	No Garage
# GarageCars 1 - Size of garage in car capacity correlated
# GarageArea 1 - Size of garage in square feet correlated
# GarageQual 129 - Garage quality
       #Ex	Excellent
       #Gd	Good
       #TA	Typical/Average
       #Fa	Fair
       #Po	Poor
       #NA	No Garage
# GarageCond 129 - Garage condition
       #Ex	Excellent
       #Gd	Good
       #TA	Typical/Average
       #Fa	Fair
       #Po	Poor
       #NA	No Garage
# PoolQC 2571 -  Pool quality - throw out
       #Ex	Excellent
       #Gd	Good
       #TA	Average/Typical
       #Fa	Fair
       #NA	No Pool
# Fence 2055 - Fence quality - throw out
       #GdPrv	Good Privacy
       #MnPrv	Minimum Privacy
       #GdWo	Good Wood
       #MnWw	Minimum Wood/Wire
       #NA	No Fence
# MiscFeature 2483 Miscellaneous feature not covered in other categories - throw out - not enough data points
       #Elev	Elevator
       #Gar2	2nd Garage (if not described in garage section)
       #Othr	Other
       #Shed	Shed (over 100 SF)
       #TenC	Tennis Court
       #NA	None

In [128]:
# Drop the five columns with the most missing values. Rebinding `prices`
# (instead of inplace=True) together with errors='ignore' keeps this cell safe
# to re-run: a second run is a no-op rather than a KeyError on missing labels.
prices = prices.drop(
    columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'],
    errors='ignore',
)

In [129]:
# MasVnrType & MasVnrArea: a missing veneer entry is treated as "no veneer",
# so the type is filled with the "None" category and the area with 0.
masonry_fill_values = {'MasVnrType': "None", 'MasVnrArea': 0}
for masonry_col, fill_value in masonry_fill_values.items():
    prices[masonry_col] = prices[masonry_col].fillna(fill_value)

In [130]:
# Electrical: fill the missing entry with the most frequent category.
print(prices['Electrical'].value_counts())  # before: majority is SBrkr - go with that
most_common_electrical = prices["Electrical"].mode()[0]
prices['Electrical'] = prices['Electrical'].fillna(most_common_electrical)
print(prices['Electrical'].value_counts())  # after: confirm the fill landed in the mode

SBrkr    2365
FuseA     168
FuseF      39
FuseP       7
Name: Electrical, dtype: int64
SBrkr    2366
FuseA     168
FuseF      39
FuseP       7
Name: Electrical, dtype: int64


In [131]:
# Re-check which columns still contain missing values at this point.
check_empty_vals()

LotFrontage 462
BsmtQual 69
BsmtCond 69
BsmtExposure 71
BsmtFinType1 69
BsmtFinSF1 1
BsmtFinType2 70
BsmtFinSF2 1
BsmtUnfSF 1
TotalBsmtSF 1
BsmtFullBath 2
BsmtHalfBath 2
GarageType 127
GarageYrBlt 129
GarageFinish 129
GarageCars 1
GarageArea 1
GarageQual 129
GarageCond 129
