In [30]:
import numpy as np
import os
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import gridspec
import seaborn as sns
from sklearn.preprocessing import scale,LabelEncoder,LabelBinarizer,MinMaxScaler,OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from scipy.stats import skew, norm
from scipy import stats
from datetime import date
from collections import Counter
import category_encoders as ce
import torch
import torch.nn as nn
import locale
from fuzzywuzzy import process
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the processed single-family listings. The path is relative, so it is
# resolved against the kernel's current working directory.
SingleFamilyHome = pd.read_csv('../data/processed/SingleFamilyFinal.csv')

In [3]:
# Dimensions of the loaded frame as (rows, columns).
SingleFamilyHome.shape

(6348, 56)

In [4]:
# Per-column non-null counts and dtypes of the frame as loaded.
SingleFamilyHome.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6348 entries, 0 to 6347
Data columns (total 56 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   image_link           6348 non-null   object 
 1   ListingPrice         6348 non-null   int64  
 2   Address              6348 non-null   object 
 3   City                 6348 non-null   object 
 4   ZipCode              6348 non-null   int64  
 5   County               6348 non-null   object 
 6   Legal                6348 non-null   object 
 7   NoBed                6348 non-null   int64  
 8   Stories              6348 non-null   float64
 9   Style                6348 non-null   object 
 10  BuildSqft            6348 non-null   int64  
 11  LotSize              6348 non-null   float64
 12  Fireplace            6348 non-null   int64  
 13  Heating              6348 non-null   object 
 14  Cooling              6348 non-null   object 
 15  IceMaker             6348 non-null   i

In [5]:
# ZIP codes are read as int64; store them as object so they are picked up with
# the other categorical (object-dtype) columns rather than treated as numbers.
SingleFamilyHome['ZipCode'] = SingleFamilyHome['ZipCode'].astype('object', copy=False)

In [20]:
# Preview the first five rows of the object-dtype columns only.
SingleFamilyHome.select_dtypes(include='object').head(5)

Unnamed: 0,image_link,Address,City,ZipCode,County,Legal,Style,Heating,Cooling,Roof,Foundation,ExteriorType,LotDes,ControlAccess,WaterSewer,DwellingType,CarportDescription,SubName
0,['https://photos.harstatic.com/189115929/hr/im...,1735 Eado Point Lane,Houston,77003,Harris County,LT 17 BLK 2 EADO POINT,Contemporary/Modern,"Central Gas, Zoned","Central Electric, Zoned",Composition,Slab,Cement Board,Other,"Automatic Gate, Driveway Gate",Public,Free Standing,not applicable,EADO POINT
1,['https://photos.harstatic.com/188179029/hr/im...,619 Live Oak Street,Houston,77003,Harris County,LT 7 BLK 1 CAPITOL OAKS SEC 3 RP NO 1,Traditional,Central Gas,Central Electric,Aluminum,Slab,Stucco,Subdivision Lot,no controlled access,Public,Free Standing,not applicable,Capitol Oaks
2,['https://photos.harstatic.com/189527871/hr/im...,1417 Hussion Street,Houston,77003,Harris County,LT 7 BLK 1 W L EDMUNDSON SEC 3,Traditional,Central Gas,Central Electric,Composition,Pier & Beam,Cement Board,Subdivision Lot,no controlled access,Public,Free Standing,not applicable,Edmundson (77023)
3,['https://photos.harstatic.com/189915458/hr/im...,1737 Aden Drive,Houston,77003,Harris County,"Lot 59, Block 1",Other,Central Gas,Central Electric,Composition,Slab,Brick,Corner,Automatic Gate,Public,Free Standing,not applicable,Midtown Village
4,['https://photos.harstatic.com/190529396/hr/im...,1773 Aden Drive,Houston,77003,Harris County,LT 90 BLK 1 MIDTOWN VILLAGE,Traditional,Central Gas,Central Electric,Composition,Slab,Brick,Subdivision Lot,Automatic Gate,Public,Free Standing,not applicable,Midtown Village


In [23]:
# Remove these four columns from the frame in place.
# NOTE: not idempotent - re-running this cell raises KeyError once they are gone.
SingleFamilyHome.drop(
    columns=['image_link', 'Address', 'Legal', 'Heating'],
    inplace=True,
)

In [22]:
# Frequency of each raw Cooling description.
SingleFamilyHome['Cooling'].value_counts()

Central Electric                                         5109
Central Electric, Zoned                                   793
Central Gas                                               171
Central Electric, Central Gas                              98
Window Units                                               41
Central Electric, Window Units                             25
No Cooling/Vent                                            23
Central Gas, Zoned                                         16
Other Cooling                                              14
Central Electric, Other Cooling                             9
Zoned                                                       9
Central Electric, Other Cooling, Zoned                      7
Central Electric, Heat Pump                                 7
Central Electric, Central Gas, Zoned                        6
No Cooling/Vent, Other Cooling                              5
Central Electric, Solar Assisted                            3
Heat Pum

In [25]:
# Canonical cooling labels. They are applied in list order and the column is
# re-read on every pass, so rows relabelled by an earlier entry are scored
# under their new label by the later ones.
standard_colling=['Central','No Cooling','Other','Heat Pump']
min_score = 90  # minimum fuzzywuzzy similarity for a raw value to count as a match

for cool in standard_colling:
    # Score each distinct raw value once instead of once per row.
    candidates = SingleFamilyHome.Cooling.unique().tolist()
    matches = process.extract(cool, candidates, limit=len(candidates))

    # Keep only the raw values that cleared the threshold.
    matched_values = [match[0] for match in matches if match[1] >= min_score]

    # Relabel exactly the matched rows. When nothing matched the mask is all
    # False and the column is left untouched (no stale mask from an earlier
    # label can be reused).
    SingleFamilyHome.loc[SingleFamilyHome.Cooling.isin(matched_values), 'Cooling'] = cool

SingleFamilyHome.Cooling.value_counts()

Central         6251
Window Units      41
No Cooling        28
Other             15
Zoned              9
Heat Pump          4
Name: Cooling, dtype: int64

In [6]:
# Frequency of each raw LotDes description.
SingleFamilyHome['LotDes'].value_counts()

Subdivision Lot                                             3408
Cul-De-Sac, Subdivision Lot                                  365
Corner, Subdivision Lot                                      313
Corner                                                       285
Cul-De-Sac                                                   265
                                                            ... 
Cleared, Corner, Cul-De-Sac, Subdivision Lot, Water View       1
Cleared, Waterfront                                            1
Corner, Cul-De-Sac, Patio Lot, Wooded                          1
On Golf Course, Subdivision Lot                                1
Cul-De-Sac, Other                                              1
Name: LotDes, Length: 210, dtype: int64

In [13]:
# Frequency of each UnitLoc value.
SingleFamilyHome['UnitLoc'].value_counts()

Subdivision Lot                                             3408
Cul-De-Sac, Subdivision Lot                                  365
Corner, Subdivision Lot                                      313
Corner                                                       285
Cul-De-Sac                                                   265
                                                            ... 
Cleared, Corner, Cul-De-Sac, Subdivision Lot, Water View       1
Cleared, Waterfront                                            1
Corner, Cul-De-Sac, Patio Lot, Wooded                          1
On Golf Course, Subdivision Lot                                1
Cul-De-Sac, Other                                              1
Name: UnitLoc, Length: 210, dtype: int64

In [15]:
# Remove the UnitLoc column in place.
# NOTE: not idempotent - a second run raises KeyError.
SingleFamilyHome.drop(columns='UnitLoc', inplace=True)

In [7]:
# Canonical lot-description labels. They are applied in list order and the
# column is re-read on every pass, so rows relabelled by an earlier entry are
# scored under their new label by the later ones.
standard_LotDes=['Subdivision Lot','Cul-De-Sac','Corner','Patio Lot','Water View','Cleared','Wooded','Other','Golf Course']
min_score = 90  # minimum fuzzywuzzy similarity for a raw value to count as a match

for des in standard_LotDes:
    # Score each distinct raw value once instead of once per row.
    candidates = SingleFamilyHome.LotDes.unique().tolist()
    matches = process.extract(des, candidates, limit=len(candidates))

    # Keep only the raw values that cleared the threshold.
    matched_values = [match[0] for match in matches if match[1] >= min_score]

    # Relabel exactly the matched rows. When nothing matched the mask is all
    # False and the column is left untouched (no stale mask from an earlier
    # label can be reused).
    SingleFamilyHome.loc[SingleFamilyHome.LotDes.isin(matched_values), 'LotDes'] = des

# Fold the remaining exact 'Waterfront' entries into 'Water View'.
SingleFamilyHome.LotDes = SingleFamilyHome.LotDes.replace(['Waterfront'],'Water View')
SingleFamilyHome.LotDes.value_counts()

Subdivision Lot    4963
Cul-De-Sac          408
Corner              339
Other               204
Cleared             166
Patio Lot            93
Water View           64
Golf Course          56
Wooded               43
Greenbelt            10
Ravine                2
Name: LotDes, dtype: int64

In [8]:
# Frequency of each raw ExteriorType description.
SingleFamilyHome['ExteriorType'].value_counts()

Brick                                        1377
Brick, Wood                                  1028
Brick, Cement Board                           801
Brick & Wood                                  564
Stucco                                        321
                                             ... 
Brick & Wood, Cement Board, Stone, Stucco       1
Aluminum, Stone                                 1
Brick, Cement Board, Stone, Vinyl, Wood         1
Brick & Wood, Cement Board, Vinyl               1
Aluminum, Brick, Stone                          1
Name: ExteriorType, Length: 140, dtype: int64

In [9]:
# Canonical exterior labels. They are applied in list order and the column is
# re-read on every pass, so rows relabelled by an earlier entry are scored
# under their new label by the later ones.
standard_exterior=['Brick','Wood','Stucco','Cement Board','Stone','Aluminum','Other']
min_score = 90  # minimum fuzzywuzzy similarity for a raw value to count as a match

for exterior in standard_exterior:
    # Score each distinct raw value once instead of once per row.
    candidates = SingleFamilyHome.ExteriorType.unique().tolist()
    matches = process.extract(exterior, candidates, limit=len(candidates))

    # Keep only the raw values that cleared the threshold.
    matched_values = [match[0] for match in matches if match[1] >= min_score]

    # Relabel exactly the matched rows. When nothing matched the mask is all
    # False and the column is left untouched (no stale mask from an earlier
    # label can be reused).
    SingleFamilyHome.loc[SingleFamilyHome.ExteriorType.isin(matched_values), 'ExteriorType'] = exterior

# Fold the exact 'Unknown' entries into 'Other'.
SingleFamilyHome.ExteriorType = SingleFamilyHome.ExteriorType.replace(['Unknown'],'Other')
SingleFamilyHome.ExteriorType.value_counts()

Brick           4887
Stucco           601
Wood             356
Cement Board     310
Other            100
Vinyl             39
Stone             29
Aluminum          20
Asbestos           6
Name: ExteriorType, dtype: int64

In [10]:
# Frequency of each raw WaterSewer description.
SingleFamilyHome['WaterSewer'].value_counts()

Public Sewer, Public Water                        3600
Water District                                    1570
Public Sewer, Public Water, Water District         456
Public Sewer, Water District                       197
Public Water                                       188
Public Sewer                                        98
Public Water, Water District                        32
Septic Tank                                         29
Septic Tank, Well                                   29
Public Water, Septic Tank                           19
Other Water/Sewer                                   18
Public Sewer, Public Water, Well                    15
Aerobic, Public Water                               14
Aerobic                                             13
Aerobic, Septic Tank                                 7
Aerobic, Septic Tank, Well                           7
Aerobic, Other Water/Sewer, Septic Tank              6
Aerobic, Well                                        6
Septic Tan

In [11]:
# Canonical water/sewer labels. They are applied in list order and the column
# is re-read on every pass, so rows relabelled by an earlier entry are scored
# under their new label by the later ones.
standard_watersewer=['Public','Water District','Septic Tank','Aerobic','No Sewer','Other']
min_score = 90  # minimum fuzzywuzzy similarity for a raw value to count as a match

for water in standard_watersewer:
    # Score each distinct raw value once instead of once per row.
    candidates = SingleFamilyHome.WaterSewer.unique().tolist()
    matches = process.extract(water, candidates, limit=len(candidates))

    # Keep only the raw values that cleared the threshold.
    matched_values = [match[0] for match in matches if match[1] >= min_score]

    # Relabel exactly the matched rows. When nothing matched the mask is all
    # False and the column is left untouched (no stale mask from an earlier
    # label can be reused).
    SingleFamilyHome.loc[SingleFamilyHome.WaterSewer.isin(matched_values), 'WaterSewer'] = water

SingleFamilyHome.WaterSewer.value_counts()

Public            4643
Water District    1580
Septic Tank         82
Aerobic             22
Other               18
No Sewer             2
Well                 1
Name: WaterSewer, dtype: int64

In [16]:
# Frequency of each ListType value.
SingleFamilyHome['ListType'].value_counts()

Exclusive Right to Sell/Lease                    6208
Exclusive Agency to Sell/Lease                    111
Exclusive Right to Sell/Lse w/ Named Prospect      29
Name: ListType, dtype: int64

In [17]:
# Remove the ListType column in place.
# NOTE: not idempotent - a second run raises KeyError.
SingleFamilyHome.drop(columns='ListType', inplace=True)

In [18]:
# Frequency of each DwellingType value.
SingleFamilyHome['DwellingType'].value_counts()

Free Standing    6272
Patio Home         40
Duplex             24
Historic            6
Manufactured        6
Name: DwellingType, dtype: int64

In [41]:
# Names of every column still stored with object dtype; these are the
# columns that get encoded below.
categorical = SingleFamilyHome.select_dtypes(include=['object']).columns

In [42]:
# Show which columns were selected for encoding.
categorical

Index(['City', 'ZipCode', 'County', 'Style', 'Cooling', 'Roof', 'Foundation',
       'ExteriorType', 'LotDes', 'ControlAccess', 'WaterSewer', 'DwellingType',
       'CarportDescription', 'SubName'],
      dtype='object')

There are various ways to encode categorical data: one-hot encoding, label encoding, or ordinal encoding. I will convert the categorical features with scikit-learn's ordinal encoder, simply because it keeps one column per variable (the feature count does not grow) and saves a lot of time and space in processing.

In [43]:
# Encoder created with default settings; it is fitted in a later cell.
ordinal = OrdinalEncoder()

In [46]:
# Fit the encoder on the categorical columns and overwrite them with the
# resulting codes. The original string values are no longer in the frame
# afterwards.
# NOTE(review): the encoder is fitted on the whole dataset before any
# train/test split - confirm this is intended.
SingleFamilyHome[categorical] = ordinal.fit_transform(SingleFamilyHome[categorical])

In [47]:
# Preview the first rows after the categorical columns were encoded.
SingleFamilyHome.head()

Unnamed: 0,ListingPrice,City,ZipCode,County,NoBed,Stories,Style,BuildSqft,LotSize,Fireplace,...,AvgNeighborValRange,MedianPrice/Sqft,PaidTax,TaxRate,TotalBedSqft,FullBath,HalfBath,MaintenanceFee,Age,MedianAge
0,364990,2.0,0.0,1.0,3,3.0,1.0,1736,1428.0,0,...,217000.0,192.51,2169.0,2.6554,430.0,3,1,1195.0,1,2
1,419000,2.0,0.0,1.0,3,3.0,14.0,2671,2006.0,1,...,443500.0,200.05,10366.0,2.5466,320.0,3,1,2244.0,10,8
2,298800,2.0,0.0,1.0,3,1.0,14.0,1972,5000.0,0,...,126500.0,147.06,5854.0,2.5716,616.0,1,1,0.0,14,91
3,289900,2.0,0.0,1.0,2,2.0,10.0,1688,1918.0,0,...,259000.0,181.62,6685.4,2.5465,330.0,2,0,1500.0,14,15
4,284900,2.0,0.0,1.0,2,2.0,14.0,1410,1845.0,0,...,259000.0,181.62,6299.0,2.5466,282.0,2,1,1400.0,15,15


In [48]:
# Re-check column dtypes and non-null counts after encoding.
SingleFamilyHome.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6348 entries, 0 to 6347
Data columns (total 50 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ListingPrice         6348 non-null   int64  
 1   City                 6348 non-null   float64
 2   ZipCode              6348 non-null   float64
 3   County               6348 non-null   float64
 4   NoBed                6348 non-null   int64  
 5   Stories              6348 non-null   float64
 6   Style                6348 non-null   float64
 7   BuildSqft            6348 non-null   int64  
 8   LotSize              6348 non-null   float64
 9   Fireplace            6348 non-null   int64  
 10  Cooling              6348 non-null   float64
 11  IceMaker             6348 non-null   int64  
 12  Microwave            6348 non-null   int64  
 13  Compactor            6348 non-null   int64  
 14  Dishwasher           6348 non-null   int64  
 15  Disposal             6348 non-null   i