In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('Ames_Housing_Data.csv')
df.head()

Unnamed: 0,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,AllPub,...,0,,,,0,5,2010,WD,Normal,215000
1,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,244000
4,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [3]:
df.isna().sum()

PID                 0
MS SubClass         0
MS Zoning           0
Lot Frontage      490
Lot Area            0
                 ... 
Mo Sold             0
Yr Sold             0
Sale Type           0
Sale Condition      0
SalePrice           0
Length: 81, dtype: int64

In [4]:
df.isna().sum().sum()

13997

In [5]:
df.shape

(2930, 81)

In [6]:
df.columns

Index(['PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area', 'Street',
       'Alley', 'Lot Shape', 'Land Contour', 'Utilities', 'Lot Config',
       'Land Slope', 'Neighborhood', 'Condition 1', 'Condition 2', 'Bldg Type',
       'House Style', 'Overall Qual', 'Overall Cond', 'Year Built',
       'Year Remod/Add', 'Roof Style', 'Roof Matl', 'Exterior 1st',
       'Exterior 2nd', 'Mas Vnr Type', 'Mas Vnr Area', 'Exter Qual',
       'Exter Cond', 'Foundation', 'Bsmt Qual', 'Bsmt Cond', 'Bsmt Exposure',
       'BsmtFin Type 1', 'BsmtFin SF 1', 'BsmtFin Type 2', 'BsmtFin SF 2',
       'Bsmt Unf SF', 'Total Bsmt SF', 'Heating', 'Heating QC', 'Central Air',
       'Electrical', '1st Flr SF', '2nd Flr SF', 'Low Qual Fin SF',
       'Gr Liv Area', 'Bsmt Full Bath', 'Bsmt Half Bath', 'Full Bath',
       'Half Bath', 'Bedroom AbvGr', 'Kitchen AbvGr', 'Kitchen Qual',
       'TotRms AbvGrd', 'Functional', 'Fireplaces', 'Fireplace Qu',
       'Garage Type', 'Garage Yr Blt', 'Garage Finish'

In [7]:
def nanDeleter(df,percentage) :
    for col in df.columns : 
        if df[col].isna().sum() > df[col].count() * percentage//100 :
            print(f'{col} column has deleted because {df[col].isna().sum()} nan values')
            df.drop(col , axis = 1 , inplace = True)

In [8]:
nanDeleter(df,80)

Alley column has deleted because 2732 nan values
Fireplace Qu column has deleted because 1422 nan values
Pool QC column has deleted because 2917 nan values
Fence column has deleted because 2358 nan values
Misc Feature column has deleted because 2824 nan values


In [9]:
df.shape

(2930, 76)

In [10]:
def nanShower(df,percentage) :
    for col in df.columns : 
        if df[col].isna().sum() > df[col].count() * percentage//100 :
            print(f'{col} column has {df[col].isna().sum()} nan values')

In [11]:
nanShower(df , 1)

Lot Frontage column has 490 nan values
Bsmt Qual column has 80 nan values
Bsmt Cond column has 80 nan values
Bsmt Exposure column has 83 nan values
BsmtFin Type 1 column has 80 nan values
BsmtFin Type 2 column has 81 nan values
Garage Type column has 157 nan values
Garage Yr Blt column has 159 nan values
Garage Finish column has 159 nan values
Garage Qual column has 159 nan values
Garage Cond column has 159 nan values


In [12]:
df['Garage Finish'].value_counts() # Garage Finish is suspicious column.

Unf    1231
RFn     812
Fin     728
Name: Garage Finish, dtype: int64

In [13]:
def n_class(df):
    for col in df.columns:
        if len(df[col].value_counts()) == 1 :
            print(f"{col} column has only 1 class")
            print(df[col].value_counts(), end = '\n\n\n')
        elif len(df[col].value_counts()) == 2 :
            print(f"{col} column has only 2 class")
            print(df[col].value_counts(), end = '\n\n\n')
        elif len(df[col].value_counts()) == 3 :
            print(f"{col} column has only 3 class")
            print(df[col].value_counts(), end = '\n\n\n')

In [14]:
n_class(df)

Street column has only 2 class
Pave    2918
Grvl      12
Name: Street, dtype: int64


Utilities column has only 3 class
AllPub    2927
NoSewr       2
NoSeWa       1
Name: Utilities, dtype: int64


Land Slope column has only 3 class
Gtl    2789
Mod     125
Sev      16
Name: Land Slope, dtype: int64


Central Air column has only 2 class
Y    2734
N     196
Name: Central Air, dtype: int64


Bsmt Half Bath column has only 3 class
0.0    2753
1.0     171
2.0       4
Name: Bsmt Half Bath, dtype: int64


Half Bath column has only 3 class
0    1843
1    1062
2      25
Name: Half Bath, dtype: int64


Garage Finish column has only 3 class
Unf    1231
RFn     812
Fin     728
Name: Garage Finish, dtype: int64


Paved Drive column has only 3 class
Y    2652
N     216
P      62
Name: Paved Drive, dtype: int64




In [15]:
dropping = ['Street' , 'Utilities' , 'Land Slope' , 'Central Air' , 'Bsmt Half Bath' , 'Paved Drive']

In [16]:
df.drop(dropping, axis = 1 , inplace=True)

In [17]:
df.shape

(2930, 70)

In [18]:
df.head()

Unnamed: 0,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Lot Shape,Land Contour,Lot Config,Neighborhood,Condition 1,...,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,526301100,20,RL,141.0,31770,IR1,Lvl,Corner,NAmes,Norm,...,0,0,0,0,0,5,2010,WD,Normal,215000
1,526350040,20,RH,80.0,11622,Reg,Lvl,Inside,NAmes,Feedr,...,0,0,120,0,0,6,2010,WD,Normal,105000
2,526351010,20,RL,81.0,14267,IR1,Lvl,Corner,NAmes,Norm,...,0,0,0,0,12500,6,2010,WD,Normal,172000
3,526353030,20,RL,93.0,11160,Reg,Lvl,Corner,NAmes,Norm,...,0,0,0,0,0,4,2010,WD,Normal,244000
4,527105010,60,RL,74.0,13830,IR1,Lvl,Inside,Gilbert,Norm,...,0,0,0,0,0,3,2010,WD,Normal,189900


In [19]:
df.select_dtypes('object')

Unnamed: 0,MS Zoning,Lot Shape,Land Contour,Lot Config,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Roof Style,...,Heating QC,Electrical,Kitchen Qual,Functional,Garage Type,Garage Finish,Garage Qual,Garage Cond,Sale Type,Sale Condition
0,RL,IR1,Lvl,Corner,NAmes,Norm,Norm,1Fam,1Story,Hip,...,Fa,SBrkr,TA,Typ,Attchd,Fin,TA,TA,WD,Normal
1,RH,Reg,Lvl,Inside,NAmes,Feedr,Norm,1Fam,1Story,Gable,...,TA,SBrkr,TA,Typ,Attchd,Unf,TA,TA,WD,Normal
2,RL,IR1,Lvl,Corner,NAmes,Norm,Norm,1Fam,1Story,Hip,...,TA,SBrkr,Gd,Typ,Attchd,Unf,TA,TA,WD,Normal
3,RL,Reg,Lvl,Corner,NAmes,Norm,Norm,1Fam,1Story,Hip,...,Ex,SBrkr,Ex,Typ,Attchd,Fin,TA,TA,WD,Normal
4,RL,IR1,Lvl,Inside,Gilbert,Norm,Norm,1Fam,2Story,Gable,...,Gd,SBrkr,TA,Typ,Attchd,Fin,TA,TA,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2925,RL,IR1,Lvl,CulDSac,Mitchel,Norm,Norm,1Fam,SLvl,Gable,...,TA,SBrkr,TA,Typ,Detchd,Unf,TA,TA,WD,Normal
2926,RL,IR1,Low,Inside,Mitchel,Norm,Norm,1Fam,1Story,Gable,...,TA,SBrkr,TA,Typ,Attchd,Unf,TA,TA,WD,Normal
2927,RL,Reg,Lvl,Inside,Mitchel,Norm,Norm,1Fam,SFoyer,Gable,...,TA,SBrkr,TA,Typ,,,,,WD,Normal
2928,RL,Reg,Lvl,Inside,Mitchel,Norm,Norm,1Fam,1Story,Gable,...,Gd,SBrkr,TA,Typ,Attchd,RFn,TA,TA,WD,Normal


In [20]:
def hardClass(df) :
    for col in df.columns : 
        for n_class in df[col].value_counts():
            if n_class > df[col].count() * 80 // 100:
                print(f'{col} column has a hard class. {n_class} in one class')

In [21]:
hardClass(df)

Land Contour column has a hard class. 2633 in one class
Condition 1 column has a hard class. 2522 in one class
Condition 2 column has a hard class. 2900 in one class
Bldg Type column has a hard class. 2425 in one class
Roof Matl column has a hard class. 2887 in one class
Exter Cond column has a hard class. 2549 in one class
Bsmt Cond column has a hard class. 2616 in one class
BsmtFin Type 2 column has a hard class. 2499 in one class
BsmtFin SF 2 column has a hard class. 2578 in one class
Heating column has a hard class. 2885 in one class
Electrical column has a hard class. 2682 in one class
Low Qual Fin SF column has a hard class. 2890 in one class
Kitchen AbvGr column has a hard class. 2796 in one class
Functional column has a hard class. 2728 in one class
Garage Qual column has a hard class. 2615 in one class
Garage Cond column has a hard class. 2665 in one class
Enclosed Porch column has a hard class. 2471 in one class
3Ssn Porch column has a hard class. 2893 in one class
Screen Por

In [22]:
dropping2 = ['Land Contour' , 'Condition 1' , 'Condition 2' , 'Bldg Type' , 'Roof Matl' , 'Exter Cond' , 'Bsmt Cond' , 'BsmtFin Type 2' , 'BsmtFin SF 2' , 'Heating' ,
             'Electrical' , 'Low Qual Fin SF' , 'Kitchen AbvGr' , 'Functional' , 'Garage Qual' ,'Garage Cond' ,'Enclosed Porch' ,'3Ssn Porch' , 'Screen Porch',
             'Pool Area' ,'Misc Val' , 'Sale Type' ,'Sale Condition']

In [23]:
df.drop(dropping2, axis = 1 , inplace=True)

In [24]:
df.shape

(2930, 47)

In [25]:
df.head()

Unnamed: 0,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Lot Shape,Lot Config,Neighborhood,House Style,Overall Qual,...,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Wood Deck SF,Open Porch SF,Mo Sold,Yr Sold,SalePrice
0,526301100,20,RL,141.0,31770,IR1,Corner,NAmes,1Story,6,...,Attchd,1960.0,Fin,2.0,528.0,210,62,5,2010,215000
1,526350040,20,RH,80.0,11622,Reg,Inside,NAmes,1Story,5,...,Attchd,1961.0,Unf,1.0,730.0,140,0,6,2010,105000
2,526351010,20,RL,81.0,14267,IR1,Corner,NAmes,1Story,6,...,Attchd,1958.0,Unf,1.0,312.0,393,36,6,2010,172000
3,526353030,20,RL,93.0,11160,Reg,Corner,NAmes,1Story,7,...,Attchd,1968.0,Fin,2.0,522.0,0,0,4,2010,244000
4,527105010,60,RL,74.0,13830,IR1,Inside,Gilbert,2Story,5,...,Attchd,1997.0,Fin,2.0,482.0,212,34,3,2010,189900


In [26]:
def continousFeatures(df,num):
    for col in df.columns : 
        if len(df[col].value_counts()) > num : 
            print(f'{col} column continous feature. Because has {len(df[col].value_counts())} different value.')

In [27]:
continousFeatures(df,15)

PID column continous feature. Because has 2930 different value.
MS SubClass column continous feature. Because has 16 different value.
Lot Frontage column continous feature. Because has 128 different value.
Lot Area column continous feature. Because has 1960 different value.
Neighborhood column continous feature. Because has 28 different value.
Year Built column continous feature. Because has 118 different value.
Year Remod/Add column continous feature. Because has 61 different value.
Exterior 1st column continous feature. Because has 16 different value.
Exterior 2nd column continous feature. Because has 17 different value.
Mas Vnr Area column continous feature. Because has 445 different value.
BsmtFin SF 1 column continous feature. Because has 995 different value.
Bsmt Unf SF column continous feature. Because has 1137 different value.
Total Bsmt SF column continous feature. Because has 1058 different value.
1st Flr SF column continous feature. Because has 1083 different value.
2nd Flr 

In [28]:
df.columns

Index(['PID', 'MS SubClass', 'MS Zoning', 'Lot Frontage', 'Lot Area',
       'Lot Shape', 'Lot Config', 'Neighborhood', 'House Style',
       'Overall Qual', 'Overall Cond', 'Year Built', 'Year Remod/Add',
       'Roof Style', 'Exterior 1st', 'Exterior 2nd', 'Mas Vnr Type',
       'Mas Vnr Area', 'Exter Qual', 'Foundation', 'Bsmt Qual',
       'Bsmt Exposure', 'BsmtFin Type 1', 'BsmtFin SF 1', 'Bsmt Unf SF',
       'Total Bsmt SF', 'Heating QC', '1st Flr SF', '2nd Flr SF',
       'Gr Liv Area', 'Bsmt Full Bath', 'Full Bath', 'Half Bath',
       'Bedroom AbvGr', 'Kitchen Qual', 'TotRms AbvGrd', 'Fireplaces',
       'Garage Type', 'Garage Yr Blt', 'Garage Finish', 'Garage Cars',
       'Garage Area', 'Wood Deck SF', 'Open Porch SF', 'Mo Sold', 'Yr Sold',
       'SalePrice'],
      dtype='object')

In [29]:
df.isna().sum()

PID                 0
MS SubClass         0
MS Zoning           0
Lot Frontage      490
Lot Area            0
Lot Shape           0
Lot Config          0
Neighborhood        0
House Style         0
Overall Qual        0
Overall Cond        0
Year Built          0
Year Remod/Add      0
Roof Style          0
Exterior 1st        0
Exterior 2nd        0
Mas Vnr Type       23
Mas Vnr Area       23
Exter Qual          0
Foundation          0
Bsmt Qual          80
Bsmt Exposure      83
BsmtFin Type 1     80
BsmtFin SF 1        1
Bsmt Unf SF         1
Total Bsmt SF       1
Heating QC          0
1st Flr SF          0
2nd Flr SF          0
Gr Liv Area         0
Bsmt Full Bath      2
Full Bath           0
Half Bath           0
Bedroom AbvGr       0
Kitchen Qual        0
TotRms AbvGrd       0
Fireplaces          0
Garage Type       157
Garage Yr Blt     159
Garage Finish     159
Garage Cars         1
Garage Area         1
Wood Deck SF        0
Open Porch SF       0
Mo Sold             0
Yr Sold   

In [30]:
df['Total Bsmt SF'].value_counts()

0.0       79
864.0     74
672.0     29
912.0     26
1040.0    25
          ..
839.0      1
1146.0     1
1415.0     1
1684.0     1
1003.0     1
Name: Total Bsmt SF, Length: 1058, dtype: int64

In [None]:
# Have some problem I'll look this later.

In [None]:
df['Lot Frontage'].fillna(df['Lot Frontage'].mean(),inplace = True)
df['Mas Vnr Type'].fillna(df['Mas Vnr Type'].mode(),inplace = True)
df['Mas Vnr Area'].fillna(df['Mas Vnr Area'].mode(),inplace = True)
df.drop('BsmtFin SF 1' , axis = 1 , inplace = True)
df.drop('Bsmt Unf SF' , axis = 1 , inplace = True)
df['Total Bsmt SF'].fillna(df['Total Bsmt SF'].mode(),inplace = True)
df['Bsmt Full Bath'].fillna(df['Bsmt Full Bath'].mode(),inplace = True)
df['Garage Type'].fillna(df['Garage Type'].mode(),inplace = True)
df.drop('Garage Yr Blt' , axis = 1 , inplace = True) 
df.drop('Garage Finish' , axis = 1 , inplace = True) 
df.drop('Garage Cars' , axis = 1 , inplace = True) 
df['Garage Area'].fillna(df['Garage Area'].mode(),inplace = True)

In [None]:
df.shape

In [None]:
df.isna().sum()

In [None]:
df['Mas Vnr Type'] = df['Mas Vnr Type'].fillna(df['Mas Vnr Type'].mode() , inplace = True)

In [34]:
df.isna().sum()

PID                 0
MS SubClass         0
MS Zoning           0
Lot Frontage      490
Lot Area            0
Lot Shape           0
Lot Config          0
Neighborhood        0
House Style         0
Overall Qual        0
Overall Cond        0
Year Built          0
Year Remod/Add      0
Roof Style          0
Exterior 1st        0
Exterior 2nd        0
Mas Vnr Type       23
Mas Vnr Area       23
Exter Qual          0
Foundation          0
Bsmt Qual          80
Bsmt Exposure      83
BsmtFin Type 1     80
BsmtFin SF 1        1
Bsmt Unf SF         1
Total Bsmt SF       1
Heating QC          0
1st Flr SF          0
2nd Flr SF          0
Gr Liv Area         0
Bsmt Full Bath      2
Full Bath           0
Half Bath           0
Bedroom AbvGr       0
Kitchen Qual        0
TotRms AbvGrd       0
Fireplaces          0
Garage Type       157
Garage Yr Blt     159
Garage Finish     159
Garage Cars         1
Garage Area         1
Wood Deck SF        0
Open Porch SF       0
Mo Sold             0
Yr Sold   

In [33]:
df['Mas Vnr Area'] = df['Mas Vnr Area'].fillna(df['Mas Vnr Area'].mode())