### Importing Libraries


In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns 

import importlib
from helper import utility as ut
importlib.reload(ut)


### Importing the dataset

In [None]:
# Folder holding the raw CSV inputs (trailing slash included so file names
# can be appended directly).
FILE_PATH = 'data/'

# Load the properties tables and the training tables for both 2016 and 2017.
prop_2016 = ut.load_properties_data(f"{FILE_PATH}properties_2016.csv")
prop_2017 = ut.load_properties_data(f"{FILE_PATH}properties_2017.csv")
train_2016 = ut.load_data(f"{FILE_PATH}train_2016_v2.csv")
train_2017 = ut.load_data(f"{FILE_PATH}train_2017.csv")

# Report the column/row counts of every table that was just loaded.
for _label, _frame in (
    ("Prop 2016", prop_2016),
    ("Prop_2017", prop_2017),
    ("Train_2016", train_2016),
    ("Train_2017", train_2017),
):
    print(f"{_label}|| Number of columns = {len(_frame.columns)} | Number of Rows : {len(_frame)}")

### Feature Engineering on Properties Data Set

In [None]:
# Build the land-use and zoning code lookups from both years of properties
# data, so the same lookups are passed when processing each year.
landuse_code = ut.get_landuse_code_df(prop_2016, prop_2017)
zone_code = ut.get_zoning_desc_code_df(prop_2016, prop_2017)

prop_2016_clean = ut.process_columns(prop_2016, landuse_code, zone_code)
prop_2017_clean = ut.process_columns(prop_2017, landuse_code, zone_code)

# Columns we have flagged as categorical.
categorical_list = [
    'airconditioningtypeid',
    'architecturalstyletypeid',
    'buildingclasstypeid',
    'heatingorsystemtypeid',
    'regionidcounty',
    'typeconstructiontypeid',
    'fips',
    'propertylandusetypeid',
    'propertycountylandusecode_id',
    'propertyzoningdesc_id',
]

# Retype the columns (categorical dtype for the list above; per the original
# notes, float64 is reduced to float32 for optimisation -- confirm in
# ut.retype_columns), then derive the engineered features.
prop_2016_clean = ut.feature_engineering(
    ut.retype_columns(prop_2016_clean, categorical_list)
)
prop_2017_clean = ut.feature_engineering(
    ut.retype_columns(prop_2017_clean, categorical_list)
)

# Report the column/row counts of the processed tables.
for _label, _frame in (
    ("Prop_2016 after processing", prop_2016_clean),
    ("Prop_2017 after processing", prop_2017_clean),
):
    print(f"{_label}|| Number of columns = {len(_frame.columns)} | Number of Rows : {len(_frame)}")



In [None]:
# Save this copy of data
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
import os

os.makedirs('clean_data', exist_ok=True)
prop_2016_clean.to_csv('clean_data/prop_2016_clean.csv', index=False)
prop_2017_clean.to_csv('clean_data/prop_2017_clean.csv', index=False)

### Preparing the Training Data


In [None]:
# Attach the cleaned property columns to each year's training rows. A left
# join on 'parcelid' keeps every training row even without a property match.
train_2016 = train_2016.merge(prop_2016_clean, how='left', on='parcelid')
train_2017 = train_2017.merge(prop_2017_clean, how='left', on='parcelid')

# Stack both years into one table with a fresh 0..n-1 index.
train_combined = pd.concat([train_2016, train_2017], ignore_index=True)

print(f"Num of Rows : {len(train_combined)}")

# Add features based on transactiondate from the train dataset provided
# (presumably year/month/quarter, going by the helper's name -- confirm in
# ut.add_ymq_features).
train_combined = ut.add_ymq_features(train_combined)

# Last expression of the cell: displays the first rows in the notebook.
train_combined.head()

In [None]:
# Persist the combined training table.
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing (no-op when it is already there).
import os

os.makedirs('clean_data', exist_ok=True)
train_combined.to_csv('clean_data/train_combined.csv', index=False)