## House Price data

We load the data and encode some of the features.

In [223]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

import warnings
# Suppress FutureWarning messages.
# This filter is process-wide: it hides FutureWarning from every library used
# in the notebook, so deprecation notices will not show up in cell outputs.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [224]:
## Read the training set into the frame that every later cell works on.
train_csv_path = 'house-prices-data/train.csv'
home_data = pd.read_csv(train_csv_path)

In [225]:
## (rows, columns) of the raw frame, shown before any cleaning is applied
home_data.shape

(1460, 81)

In [226]:
## Add the natural log of SalePrice as 'log_price' so end prices are more
## comparable. np.log is applied to the whole column at once.
home_data['log_price'] = np.log(home_data['SalePrice'])

In [227]:
## Per-column non-null counts and dtypes; used to spot columns with many
## missing values before deciding what to drop.
home_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 82 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [228]:
## Categorical variables we will have to one-hot encode:
## Utilities, LotConfig, Neighborhood, Condition1, Condition2, BldgType, HouseStyle, Heating, CentralAir, GarageType, SaleCondition, SaleType
## NOTE(review): Utilities and Condition2 are in the list above but are also
## dropped just below -- confirm which is intended.
columns_to_drop = [
    'Alley', 'MasVnrType', 'FireplaceQu', 'PoolQC', 'Fence',
    'MiscFeature', 'Utilities', 'Street', 'LandSlope', 'Condition2',
]

## Drop the listed columns first, then discard every row that still has a
## missing value in any remaining column (the recorded run goes from 1460 rows
## to 1094 rows).
home_data = home_data.drop(columns=columns_to_drop).dropna()
home_data.shape


(1094, 72)

In [229]:
## Show the data dictionary that ships with the data set.
with open('house-prices-data/data_description.txt') as description_file:
    description_text = description_file.read()
print(description_text)

MSSubClass: Identifies the type of dwelling involved in the sale.	

        20	1-STORY 1946 & NEWER ALL STYLES
        30	1-STORY 1945 & OLDER
        40	1-STORY W/FINISHED ATTIC ALL AGES
        45	1-1/2 STORY - UNFINISHED ALL AGES
        50	1-1/2 STORY FINISHED ALL AGES
        60	2-STORY 1946 & NEWER
        70	2-STORY 1945 & OLDER
        75	2-1/2 STORY ALL AGES
        80	SPLIT OR MULTI-LEVEL
        85	SPLIT FOYER
        90	DUPLEX - ALL STYLES AND AGES
       120	1-STORY PUD (Planned Unit Development) - 1946 & NEWER
       150	1-1/2 STORY PUD - ALL AGES
       160	2-STORY PUD - 1946 & NEWER
       180	PUD - MULTILEVEL - INCL SPLIT LEV/FOYER
       190	2 FAMILY CONVERSION - ALL STYLES AND AGES

MSZoning: Identifies the general zoning classification of the sale.
		
       A	Agriculture
       C	Commercial
       FV	Floating Village Residential
       I	Industrial
       RH	Residential High Density
       RL	Residential Low Density
       RP	Residential Low Density Park 
       RM

In [230]:
## For exterior quality and condition (and the other graded columns below),
## we change the letter grades to a 10 points scale.
Quality = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
           'HeatingQC', 'KitchenQual', 'GarageQual', 'GarageCond']

## Letter grade -> score.
quality_scale = {
    'Ex': 10,  ## Excellent to 10
    'Gd': 8,   ## Good to 8
    'TA': 6,   ## Average/Typical to 6
    'Fa': 4,   ## Fair to 4
    'Po': 2,   ## Poor to 2
}

for ext in Quality:
    ## Series.map leaves NaN wherever a value has no entry in the scale.
    scores = home_data[ext].map(quality_scale)

    ## Fail with an explicit message instead of a length-mismatch error when a
    ## column holds a missing or unexpected grade. This also triggers if the
    ## cell is re-run on columns that were already converted to numbers.
    if scores.isna().any():
        unexpected = home_data.loc[scores.isna(), ext].unique().tolist()
        raise ValueError(
            f"Column {ext!r} contains values outside the quality scale: {unexpected}"
        )

    ## Plain column assignment replaces the column, so the scores are stored as
    ## integers.
    ## NOTE(review): assigning a list through `.loc[:, ext]` may keep the old
    ## object dtype on recent pandas versions -- confirm against the pandas
    ## version in use.
    home_data[ext] = scores.astype('int64')

In [231]:
## Basement exposure codes -> score.
## NOTE(review): 'Mn' / 'No' presumably mean minimum / no exposure (the old
## comments called them "Fair" / "Poor") -- confirm in data_description.txt.
exposure_scale = {
    'Gd': 8,  ## Good to 8
    'Av': 6,  ## Average to 6
    'Mn': 4,
    'No': 2,
}

## Basement finish type codes -> score, with Unf (unfinished) at 0.
## NOTE(review): the ranking GLQ > ALQ > BLQ > Rec > LwQ > Unf is taken from the
## original mapping; check the code meanings in data_description.txt.
Types = ['BsmtFinType1', 'BsmtFinType2']
finish_scale = {
    'GLQ': 10,
    'ALQ': 8,
    'BLQ': 6,
    'Rec': 4,
    'LwQ': 2,
    'Unf': 0,  ## Unfinished to 0
}

## Which scale applies to which column.
column_scales = {'BsmtExposure': exposure_scale}
column_scales.update({ext: finish_scale for ext in Types})

for ext, scale in column_scales.items():
    ## Series.map leaves NaN wherever a value has no entry in the scale.
    scores = home_data[ext].map(scale)

    ## Fail with an explicit message instead of a length-mismatch error when a
    ## column holds a missing or unexpected code. This also triggers if the
    ## cell is re-run on columns that were already converted to numbers.
    if scores.isna().any():
        unexpected = home_data.loc[scores.isna(), ext].unique().tolist()
        raise ValueError(
            f"Column {ext!r} contains values outside its scale: {unexpected}"
        )

    ## Plain column assignment replaces the column, so the scores are stored as
    ## integers.
    home_data[ext] = scores.astype('int64')


In [232]:
## NOTE(review): leftover scratch cell -- it only echoes this string literal
## and does not touch home_data.
'LotShape'

'LotShape'

In [233]:
## Names of the engineered columns, collected as they are created.
new_feature_list = []

## creating the new basement feature
Bsmt = ['BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2', 'BsmtFullBath', 'BsmtHalfBath']

## Bsmt_magnitude is the row-wise Euclidean norm of the basement columns:
## sqrt(sum over the columns of value**2).
## The columns are cast to float first so the arithmetic also works when the
## encoded columns are stored with object dtype.
bsmt_squares = home_data[Bsmt].astype(float) ** 2

## The result is a Series carrying home_data's own index, so the assignment
## aligns row by row even though the index has gaps after dropna().
home_data['Bsmt_magnitude'] = np.sqrt(bsmt_squares.sum(axis=1))

new_feature_list.append('Bsmt_magnitude')


## new remodel feature: YrSold minus YearRemodAdd.
## This has to run before the target-encoding step further down, which drops
## YrSold from the frame.
home_data['Remod_diff'] = home_data['YrSold'].sub(home_data['YearRemodAdd'])
new_feature_list.append('Remod_diff')
    
## Target (mean) encoding: each listed categorical column gets a companion
## '<name>_encoded' column holding the mean log_price of the row's category.
## NOTE(review): the means are computed over every row of home_data, including
## the row being encoded. If log_price is the modelling target this leaks the
## target into the features -- confirm whether out-of-fold means are needed.
feature_to_target = ['MSSubClass','Neighborhood','MSZoning', 'HouseStyle', 'LotConfig', 'Condition1', 'BldgType','MoSold','YrSold', 'SaleType','SaleCondition']

for source_col in feature_to_target:
    encoded_col = source_col + '_encoded'
    category_mean = home_data.groupby(source_col)['log_price'].transform('mean')
    home_data[encoded_col] = category_mean
    new_feature_list.append(encoded_col)

## drop those columns we have target encoded
home_data = home_data.drop(columns=feature_to_target)


def add_dummy_columns(frame, column, keep, lumped=None, lumped_name=None):
    """Add 0/1 indicator columns for selected levels of a categorical column.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to extend; it is modified in place.
    column : str
        Name of the categorical column in ``frame`` to encode.
    keep : list of str
        Levels that each get their own indicator column, named after the level.
    lumped : list of str, optional
        Levels that are merged into one shared indicator column.
    lumped_name : str, optional
        Name of the shared column; must be given whenever ``lumped`` is.

    Notes
    -----
    The source column itself is left in ``frame``. A listed level that does not
    occur in the data yields an all-zero column instead of a ``KeyError``.
    """
    ## One get_dummies call per feature; astype(int) turns the indicators into 0/1.
    indicators = pd.get_dummies(frame[column]).astype(int)

    ## reindex supplies zero-filled columns for levels missing from the data.
    frame[keep] = indicators.reindex(columns=keep, fill_value=0)

    if lumped:
        ## A row has at most one level, so the row-wise sum is itself a 0/1 flag.
        frame[lumped_name] = indicators.reindex(columns=lumped, fill_value=0).sum(axis=1)


## dummy encode garage type
garage_type = ['Attchd','Detchd']
other_garage = ['BuiltIn','CarPort','Basment','2Types']
add_dummy_columns(home_data, 'GarageType', garage_type, other_garage, 'other_garage')

new_feature_list.extend(garage_type)
new_feature_list.append('other_garage')

## dummy encode Garage finish
spots = ['RFn','Unf','Fin']
add_dummy_columns(home_data, 'GarageFinish', spots)

new_feature_list.extend(spots)

## dummy encode lot shape
## NOTE(review): unlike the garage columns, the lot-shape and roof-style
## columns created below are not added to new_feature_list -- confirm whether
## that is intentional.
lots = ['Reg','IR1']
other_lots = ['IR2','IR3']
add_dummy_columns(home_data, 'LotShape', lots, other_lots, 'other_lots')

## dummy encode roof style
roofs = ['Gable','Hip']
other_roofs = ['Gambrel','Mansard','Flat']
add_dummy_columns(home_data, 'RoofStyle', roofs, other_roofs, 'other_roofs')

## Write the cleaned frame to CSV (without the index column), then report
## which engineered features were registered.
cleaned_csv_path = 'cleaned_data.csv'
home_data.to_csv(cleaned_csv_path, index=False)

print(new_feature_list)
print('Saved new data set as csv')

['Bsmt_magnitude', 'Remod_diff', 'MSSubClass_encoded', 'Neighborhood_encoded', 'MSZoning_encoded', 'HouseStyle_encoded', 'LotConfig_encoded', 'Condition1_encoded', 'BldgType_encoded', 'MoSold_encoded', 'YrSold_encoded', 'SaleType_encoded', 'SaleCondition_encoded', 'Attchd', 'Detchd', 'other_garage', 'RFn', 'Unf', 'Fin']
Saved new data set as csv
