### Importing Libraries


In [76]:
import importlib
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns 

from helper import utility as ut
importlib.reload(ut)


<module 'helper.utility' from 'c:\\Users\\Lewkh\\Documents\\GitHub\\CE4041_ML\\helper\\utility.py'>

### Importing the dataset

In [77]:
# Relative directory that holds the raw CSV inputs.
FILE_PATH = 'data/'

# Read the property tables and the training tables for both 2016 and 2017
# through the project's loader helpers.
prop_2016 = ut.load_properties_data(FILE_PATH + 'properties_2016.csv')
prop_2017 = ut.load_properties_data(FILE_PATH + 'properties_2017.csv')
train_2016 = ut.load_data(FILE_PATH + 'train_2016_v2.csv')
train_2017 = ut.load_data(FILE_PATH + 'train_2017.csv')

# Report column and row counts for every loaded table, one line per table.
print(
    *(
        f"{label}|| Number of columns = {len(table.columns)} | Number of Rows : {len(table)}"
        for label, table in (
            ("Prop 2016", prop_2016),
            ("Prop_2017", prop_2017),
            ("Train_2016", train_2016),
            ("Train_2017", train_2017),
        )
    ),
    sep="\n",
)

Prop 2016|| Number of columns = 58 | Number of Rows : 2985217
Prop_2017|| Number of columns = 58 | Number of Rows : 2985217
Train_2016|| Number of columns = 3 | Number of Rows : 90275
Train_2017|| Number of columns = 3 | Number of Rows : 77613


### Feature Engineering on Properties Data Set

In [78]:
# Lookup tables built from both years' property data; they are passed to
# process_columns for each year below.
landuse_code = ut.get_landuse_code_df(prop_2016, prop_2017)
zone_code = ut.get_zoning_desc_code_df(prop_2016, prop_2017)

# First pass: column processing for 2016, then 2017 (same order as before).
prop_2016_clean, prop_2017_clean = (
    ut.process_columns(raw_props, landuse_code, zone_code)
    for raw_props in (prop_2016, prop_2017)
)

# Columns we have flagged as categorical.
categorical_list = [
    'airconditioningtypeid',
    'architecturalstyletypeid',
    'buildingclasstypeid',
    'heatingorsystemtypeid',
    'regionidcounty',
    'typeconstructiontypeid',
    'fips',
    'propertylandusetypeid',
    'propertycountylandusecode_id',
    'propertyzoningdesc_id',
]

# Second pass, per year: retype the columns, then run feature engineering.
# Per the original author's note, retype_columns switches the listed columns
# to a categorical type and reduces float64 to float32 for optimisation.
prop_2016_clean, prop_2017_clean = (
    ut.feature_engineering(ut.retype_columns(processed, categorical_list))
    for processed in (prop_2016_clean, prop_2017_clean)
)

# Report the resulting column and row counts for both years.
print(
    *(
        f"{label} after processing|| Number of columns = {len(cleaned.columns)} | Number of Rows : {len(cleaned)}"
        for label, cleaned in (
            ("Prop_2016", prop_2016_clean),
            ("Prop_2017", prop_2017_clean),
        )
    ),
    sep="\n",
)


Prop_2016 after processing|| Number of columns = 67 | Number of Rows : 2985217
Prop_2017 after processing|| Number of columns = 67 | Number of Rows : 2985217


In [79]:
# Save a copy of the cleaned property tables as CSV (index column omitted).
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first; otherwise this cell fails on a fresh checkout.
os.makedirs('clean_data', exist_ok=True)
prop_2016_clean.to_csv('clean_data/prop_2016_clean.csv', index=False)
prop_2017_clean.to_csv('clean_data/prop_2017_clean.csv', index=False)

### Preparing the Training Data


In [80]:
# Attach each year's cleaned property features to its training rows.
# A left join keeps every training row. validate='many_to_one' makes pandas
# raise a MergeError if 'parcelid' is duplicated in the property table,
# instead of silently duplicating training rows.
# NOTE: train_2016 / train_2017 are rebound here, so re-running this cell
# without re-running the load cell would merge already-merged frames.
train_2016 = train_2016.merge(how='left', right=prop_2016_clean, on='parcelid', validate='many_to_one')
train_2017 = train_2017.merge(how='left', right=prop_2017_clean, on='parcelid', validate='many_to_one')

# Stack both years into one frame with a fresh, continuous index.
train_combined = pd.concat([train_2016, train_2017], axis=0, ignore_index=True)

print(f"Num of Rows : {len(train_combined)}")

# Add features based on transactiondate from the provided train dataset
# (presumably year / month / quarter, going by the helper's name).
train_combined = ut.add_ymq_features(train_combined)

# Last expression of the cell: preview the first rows.
train_combined.head()

Num of Rows : 167888


Unnamed: 0,parcelid,logerror,airconditioningtypeid,architecturalstyletypeid,basementsqft,bathroomcnt,bedroomcnt,buildingclasstypeid,buildingqualitytypeid,decktypeid,finishedfloor1squarefeet,calculatedfinishedsquarefeet,finishedsquarefeet13,finishedsquarefeet50,finishedsquarefeet6,fips,fireplacecnt,fullbathcnt,garagecarcnt,garagetotalsqft,hashottuborspa,heatingorsystemtypeid,latitude,longitude,lotsizesquarefeet,poolcnt,poolsizesum,pooltypeid10,pooltypeid2,pooltypeid7,propertylandusetypeid,rawcensustractandblock,regionidcity,regionidcounty,regionidneighborhood,regionidzip,roomcnt,storytypeid,threequarterbathnbr,typeconstructiontypeid,unitcnt,yardbuildingsqft17,yardbuildingsqft26,yearbuilt,numberofstories,fireplaceflag,structuretaxvaluedollarcnt,taxvaluedollarcnt,assessmentyear,landtaxvaluedollarcnt,taxamount,taxdelinquencyflag,taxdelinquencyyear,censustractandblock,propertycountylandusecode_id,propertyzoningdesc_id,avg_garage_size,property_tax_per_sqft,coord_1,coord_2,coord_3,coord_4,missing_finished_area,missing_total_area,missing_bathroom_cnt_calc,total_room_cnt,avg_area_per_room,derived_avg_area_per_room,year,month,quarter
0,11016594,0.0276,0,-1,,2.0,3.0,-1,4.0,,,1684.0,,,,0,,2.0,,,,1,34280992.0,-118488536.0,7528.0,,,,,,230,60371068.0,12447.0,0,31817.0,96370.0,0.0,,,-1,1.0,,,1959.0,,,122754.0,360170.0,2015.0,237416.0,6735.879883,,,60371070000000.0,33,21,,3.999929,-84207544.0,152769536.0,4658858.0,63903126.0,0.0,1.0,0.0,5.0,,336.799988,0,1,1
1,14366692,-0.1684,-1,-1,,3.5,4.0,-1,,,,2263.0,,,,22,,3.0,2.0,468.0,,-1,33668120.0,-117677552.0,3643.0,,,,,,230,60590524.0,32380.0,1,,96962.0,0.0,,1.0,-1,,,,2014.0,,,346458.0,585529.0,2015.0,239071.0,10153.019531,,,,14,-1,234.0,4.486531,-84009432.0,151345664.0,4248732.0,63087508.0,0.0,1.0,0.0,7.5,,301.733337,0,1,1
2,12098116,-0.004,0,-1,,3.0,2.0,-1,4.0,,,2217.0,,,,0,,3.0,,,,1,34136312.0,-118175032.0,11423.0,,,,,,230,60374640.0,47019.0,0,275411.0,96293.0,0.0,,,-1,1.0,,,1940.0,,,61994.0,119906.0,2015.0,57912.0,11484.480469,,,60374640000000.0,33,56,,5.18019,-84038720.0,152311344.0,4592554.0,63680070.0,0.0,1.0,0.0,5.0,,443.399994,0,1,1
3,12643413,0.0218,0,-1,,2.0,2.0,-1,4.0,,,839.0,,,,0,,2.0,,,,1,33755800.0,-118309000.0,70859.0,,,,,,235,60372964.0,12447.0,0,54300.0,96222.0,0.0,,,-1,1.0,,,1987.0,,,171518.0,244880.0,2015.0,73362.0,3048.73999,,,60372960000000.0,38,43,,3.633778,-84553200.0,152064800.0,4178550.0,63333050.0,0.0,1.0,0.0,4.0,,209.75,0,1,1
4,14432541,-0.005,-1,-1,,2.5,4.0,-1,,,,2283.0,,,,22,,2.0,2.0,598.0,,-1,33485644.0,-117700232.0,6000.0,1.0,,,,1.0,230,60590424.0,17686.0,1,,96961.0,8.0,,1.0,-1,,,,1981.0,2.0,,169574.0,434551.0,2015.0,264977.0,5488.959961,,,60590420000000.0,21,-1,299.0,2.404275,-84214592.0,151185872.0,4060586.0,62910702.0,0.0,1.0,0.0,6.5,285.375,351.230774,0,1,1


In [81]:
# Save the combined training table as CSV (index column omitted).
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs('clean_data', exist_ok=True)
train_combined.to_csv('clean_data/train_combined.csv', index=False)