# Model Prep

In [1]:
# Import various libraries & packages:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score 

In [2]:
# Load the merged property-price / income data set into a DataFrame.
# skipinitialspace strips blanks that directly follow a delimiter.
df = pd.read_csv(
    'data/final/rppr_income.csv',
    sep=',',
    skipinitialspace=True,
    keep_default_na=True,
)

In [3]:
# Sanity check on the load: report (number of rows, number of columns).
df.shape

(9989, 13)

In [4]:
# Preview the first five records (head() defaults to n=5):
df.head()

Unnamed: 0,Address,Postal_Code,County,Price_euro,Not_Full_Market_Price,Description_of_Property,Property_Size_Description,Day,Month,Year,DoS_yyyymmdd,Income_Indices,Income_Per_Person_euro
0,"1 ANNE ST, CORK",Unknown,Cork,20000.0,No,Second-Hand Dwelling house /Apartment,Unknown,20,12,2017,2017-12-20,98.943012,29293.89683
1,"15 INNISMORE, CRUMLIN VILLAGE, DUBLIN 12",Dublin 12,Dublin,20000.0,Yes,Second-Hand Dwelling house /Apartment,Unknown,1,2,2018,2018-02-01,122.341731,37406.12892
2,"14 Chapel Farm, Lusk",Unknown,Dublin,20000.0,Yes,Second-Hand Dwelling house /Apartment,Unknown,15,3,2010,2010-03-15,114.683825,29224.81553
3,"94B RATOATH AVE, FINGLAS, DUBLIN 11",Dublin 11,Dublin,20000.0,No,Second-Hand Dwelling house /Apartment,Unknown,9,12,2013,2013-12-09,120.059108,30843.61949
4,"Parade Field, Chapel Street, Bantry",Unknown,Cork,20000.0,No,Second-Hand Dwelling house /Apartment,Unknown,16,8,2012,2012-08-16,99.684309,25832.86283


In [5]:
# Inspect the dtype pandas inferred for each column; text columns load as
# 'object' and are converted in the next cell.
df.dtypes

Address                       object
Postal_Code                   object
County                        object
Price_euro                   float64
Not_Full_Market_Price         object
Description_of_Property       object
Property_Size_Description     object
Day                            int64
Month                          int64
Year                           int64
DoS_yyyymmdd                  object
Income_Indices               float64
Income_Per_Person_euro       float64
dtype: object

In [6]:
# Change Data Types:
# All plain casts are applied in one pass via a column -> dtype mapping.
# Day / Month / Year already load as int64; the cast is kept as an explicit
# guard so a change in the CSV is caught here rather than further downstream.
df = df.astype({
    'Address': 'string',
    'Postal_Code': 'category',
    'County': 'category',
    'Not_Full_Market_Price': 'category',
    'Description_of_Property': 'category',
    'Property_Size_Description': 'category',
    'Day': 'int64',
    'Month': 'int64',
    'Year': 'int64',
})

# Parse the date of sale with pd.to_datetime instead of astype('datetime64'):
# casting to the unit-less 'datetime64' dtype raises a TypeError on
# pandas >= 2.0. The explicit format documents the expected yyyy-mm-dd layout
# and makes malformed values fail loudly.
df['DoS_yyyymmdd'] = pd.to_datetime(df['DoS_yyyymmdd'], format='%Y-%m-%d')

df.dtypes

Address                              string
Postal_Code                        category
County                             category
Price_euro                          float64
Not_Full_Market_Price              category
Description_of_Property            category
Property_Size_Description          category
Day                                   int64
Month                                 int64
Year                                  int64
DoS_yyyymmdd                 datetime64[ns]
Income_Indices                      float64
Income_Per_Person_euro              float64
dtype: object

<h3>Categorical Encoding:</h3>

Integer encoding: assigns an integer number for each category<br>
Downside: if applied naively, it introduces an arbitrary ordering of categories. To address this, I may want to deliberately choose the numerical value assigned to each category. For example, County Dublin should get the highest category numerical value (26) since it has the highest median property prices, and County Leitrim should get the lowest category numerical value (1) since it has the lowest median property prices. This ordering only really matters for linear models, though.

In [7]:
# Integer-encode Postal_Code using the codes pandas assigned to its categories
# and append the result as a new column:
postal_code_ids = df['Postal_Code'].cat.codes
df = df.assign(num_Postal_Code=postal_code_ids)
df.head()

Unnamed: 0,Address,Postal_Code,County,Price_euro,Not_Full_Market_Price,Description_of_Property,Property_Size_Description,Day,Month,Year,DoS_yyyymmdd,Income_Indices,Income_Per_Person_euro,num_Postal_Code
0,"1 ANNE ST, CORK",Unknown,Cork,20000.0,No,Second-Hand Dwelling house /Apartment,Unknown,20,12,2017,2017-12-20,98.943012,29293.89683,22
1,"15 INNISMORE, CRUMLIN VILLAGE, DUBLIN 12",Dublin 12,Dublin,20000.0,Yes,Second-Hand Dwelling house /Apartment,Unknown,1,2,2018,2018-02-01,122.341731,37406.12892,3
2,"14 Chapel Farm, Lusk",Unknown,Dublin,20000.0,Yes,Second-Hand Dwelling house /Apartment,Unknown,15,3,2010,2010-03-15,114.683825,29224.81553,22
3,"94B RATOATH AVE, FINGLAS, DUBLIN 11",Dublin 11,Dublin,20000.0,No,Second-Hand Dwelling house /Apartment,Unknown,9,12,2013,2013-12-09,120.059108,30843.61949,2
4,"Parade Field, Chapel Street, Bantry",Unknown,Cork,20000.0,No,Second-Hand Dwelling house /Apartment,Unknown,16,8,2012,2012-08-16,99.684309,25832.86283,22


In [8]:
# Integer-encode County using the codes pandas assigned to its categories
# and append the result as a new column:
county_ids = df['County'].cat.codes
df = df.assign(num_County=county_ids)
df.head()

Unnamed: 0,Address,Postal_Code,County,Price_euro,Not_Full_Market_Price,Description_of_Property,Property_Size_Description,Day,Month,Year,DoS_yyyymmdd,Income_Indices,Income_Per_Person_euro,num_Postal_Code,num_County
0,"1 ANNE ST, CORK",Unknown,Cork,20000.0,No,Second-Hand Dwelling house /Apartment,Unknown,20,12,2017,2017-12-20,98.943012,29293.89683,22,3
1,"15 INNISMORE, CRUMLIN VILLAGE, DUBLIN 12",Dublin 12,Dublin,20000.0,Yes,Second-Hand Dwelling house /Apartment,Unknown,1,2,2018,2018-02-01,122.341731,37406.12892,3,5
2,"14 Chapel Farm, Lusk",Unknown,Dublin,20000.0,Yes,Second-Hand Dwelling house /Apartment,Unknown,15,3,2010,2010-03-15,114.683825,29224.81553,22,5
3,"94B RATOATH AVE, FINGLAS, DUBLIN 11",Dublin 11,Dublin,20000.0,No,Second-Hand Dwelling house /Apartment,Unknown,9,12,2013,2013-12-09,120.059108,30843.61949,2,5
4,"Parade Field, Chapel Street, Bantry",Unknown,Cork,20000.0,No,Second-Hand Dwelling house /Apartment,Unknown,16,8,2012,2012-08-16,99.684309,25832.86283,22,3


## Shuffle the Rows of the Dataset

In [9]:
# Row shuffle inspired from Geeks for Geeks: https://www.geeksforgeeks.org/pandas-how-to-shuffle-a-dataframe-rows/
# frac=1 samples every row exactly once, i.e. a full permutation of the frame.
# random_state pins that permutation so re-running the notebook reproduces the
# same row order (and therefore the same exported files); 42 matches the seed
# passed to train_test_split later in this notebook.
df = df.sample(frac=1, random_state=42)
df.head()

Unnamed: 0,Address,Postal_Code,County,Price_euro,Not_Full_Market_Price,Description_of_Property,Property_Size_Description,Day,Month,Year,DoS_yyyymmdd,Income_Indices,Income_Per_Person_euro,num_Postal_Code,num_County
6530,"74 CLIFTON COURT, ELLIS QUAY, DUBLIN 7",Dublin 7,Dublin,260000.0,No,Second-Hand Dwelling house /Apartment,Unknown,25,9,2017,2017-09-25,120.840492,35777.04822,19,5
8761,"176 OXMANTOWN RD, STONEYBATTER, DUBLIN 7",Dublin 7,Dublin,416000.0,No,Second-Hand Dwelling house /Apartment,Unknown,27,8,2018,2018-08-27,122.341731,37406.12892,19,5
1262,"1 SEAVIEW, CLIFDEN, CO GALWAY",Unknown,Galway,75000.0,No,Second-Hand Dwelling house /Apartment,Unknown,7,9,2018,2018-09-07,91.05342,27839.69069,22,6
9217,"1 OAKWOOD COURT, TINAHASK UPPER, WICKLOW",Unknown,Wicklow,507500.0,No,Second-Hand Dwelling house /Apartment,Unknown,29,11,2019,2019-11-29,101.845226,32398.90117,22,25
9414,"15 WHITE PINES DRIVE, STOCKING AVENUE, RATHFAR...",Unknown,Dublin,570000.0,No,Second-Hand Dwelling house /Apartment,Unknown,16,7,2021,2021-07-16,123.917676,41522.06589,22,5


In [10]:
# Persist the shuffled, encoded frame; the row index is left out of the file.
df.to_csv(
    path_or_buf='data/final/model_data/version_0/shuf_cat.csv',
    index=False,
)

In [11]:
# Re-check dimensions: shuffling should keep every row, and the two num_*
# encoding columns should have been added.
df.shape

(9989, 15)

<h2>Splitting Data into Train (70%) and Test (30%) Data:</h2>

I split the data so that over-fitting can be detected. The data used to evaluate the model (the test data set) should not be the same data used to train the model (the training data set). The error measured on the test data indicates whether the model fitted to the training data generalizes to unseen data.

In [12]:
# Hold out 30% of the rows for testing. train_test_split shuffles by itself,
# so the earlier manual shuffle is not required for this step (but is harmless);
# the fixed random_state makes the partition repeatable.
train, test = train_test_split(
    df,
    shuffle=True,
    random_state=42,
    test_size=0.3,
)

In [13]:
# Preview the training split (pandas abbreviates the display of long frames):
train

Unnamed: 0,Address,Postal_Code,County,Price_euro,Not_Full_Market_Price,Description_of_Property,Property_Size_Description,Day,Month,Year,DoS_yyyymmdd,Income_Indices,Income_Per_Person_euro,num_Postal_Code,num_County
4824,"50 MEADOWVALE, SLIGO, SLIGO",Unknown,Sligo,195000.0,No,Second-Hand Dwelling house /Apartment,Unknown,10,12,2015,2015-12-10,88.016479,23926.61189,22,20
1702,"12 SALLYGARDENS, DUBLIN ST, BALLYJAMESDUFF",Unknown,Cavan,90000.0,No,Second-Hand Dwelling house /Apartment,Unknown,30,10,2020,2020-10-30,80.120358,26052.14604,22,1
9181,"APT 6, 58 UPPER LEESON ST, LEESON ST DUBLIN 4",Dublin 4,Dublin,495000.0,No,Second-Hand Dwelling house /Apartment,Unknown,7,7,2017,2017-07-07,120.840492,35777.04822,15,5
7917,"DRUMACAVOY, CARRICKMACROSS, MONAGHAN",Unknown,Monaghan,332500.0,No,Second-Hand Dwelling house /Apartment,Unknown,16,5,2018,2018-05-16,79.021691,24160.97543,22,17
5501,"TOWER VIEW AVE, THE STEEPLES 30, DULEEK",Unknown,Meath,220000.0,No,Second-Hand Dwelling house /Apartment,Unknown,18,2,2021,2021-02-18,99.698100,33245.70711,22,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2404,"7 STRAND AVE, ROSSLARE STRAND, ROSSLARE",Unknown,Wexford,115500.0,No,Second-Hand Dwelling house /Apartment,Unknown,5,6,2019,2019-06-05,83.518985,26568.97561,22,24
5559,"BALLAGHBOY, DOORA, ENNIS",Unknown,Clare,220000.0,No,Second-Hand Dwelling house /Apartment,Unknown,22,3,2014,2014-03-22,87.582971,22939.23074,22,2
5935,"39 BANGOR RD, CRUMLIN, DUBLIN 12",Dublin 12,Dublin,235000.0,No,Second-Hand Dwelling house /Apartment,Unknown,17,6,2019,2019-06-17,122.290137,38902.81574,3,5
1783,"PASS OF KILBRIDE, MILLTOWNPASS, MULLINGAR",Unknown,Westmeath,93750.0,Yes,Second-Hand Dwelling house /Apartment,Unknown,25,2,2020,2020-02-25,74.697922,24268.74961,22,23


In [14]:
# Row/column count of the training split (expected to be about 70% of the rows):
train.shape

(6992, 15)

In [15]:
# Preview the test split (pandas abbreviates the display of long frames):
test

Unnamed: 0,Address,Postal_Code,County,Price_euro,Not_Full_Market_Price,Description_of_Property,Property_Size_Description,Day,Month,Year,DoS_yyyymmdd,Income_Indices,Income_Per_Person_euro,num_Postal_Code,num_County
5960,"APT 17, BODEN MEATH, BALLYBODEN DUBLIN 16",Dublin 16,Dublin,235100.0,No,Second-Hand Dwelling house /Apartment,Unknown,7,3,2017,2017-03-07,120.840492,35777.04822,7,5
358,"6 NEW SCHEME (BUNNACURRY), BUNNACURRY, MAYO",Unknown,Mayo,38750.0,Yes,Second-Hand Dwelling house /Apartment,Unknown,30,6,2014,2014-06-30,83.239574,21801.63297,22,15
9901,"33 TERENURE RD NORTH, DUBLIN 6W, DUBLIN",Unknown,Dublin,1200000.0,No,Second-Hand Dwelling house /Apartment,Unknown,10,9,2021,2021-09-10,123.917676,41522.06589,22,5
7964,"78, The Dickens, The Gasworks",Dublin 4,Dublin,380225.0,No,New Dwelling house /Apartment,greater than or equal to 38 sq metres and less...,6,2,2012,2012-02-06,119.009318,30840.87562,15,5
3361,"9 GLENVIEW AVE, THOMASTOWN, ARKLOW",Unknown,Wicklow,149000.0,No,Second-Hand Dwelling house /Apartment,Unknown,14,9,2017,2017-09-14,105.057589,31104.22975,22,25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4178,"Apartment 46, The Printworks, Lower Dargle Road",Unknown,Wicklow,198625.0,No,New Dwelling house /Apartment,greater than or equal to 38 sq metres and less...,19,9,2018,2018-09-19,103.236204,31564.59124,22,25
1701,"BALLINGRANE, RATHKEALE, LIMERICK",Unknown,Limerick,90000.0,No,Second-Hand Dwelling house /Apartment,Unknown,28,10,2016,2016-10-28,108.840222,30467.95989,22,12
5517,"6 ALLENTON GREEN, BALLYCRAGH, TALLAGHT",Dublin 24,Dublin,220000.0,No,Second-Hand Dwelling house /Apartment,Unknown,9,6,2021,2021-06-09,123.917676,41522.06589,13,5
5134,"RATHBOURNE, KILKENNY",Unknown,Kilkenny,205000.0,No,Second-Hand Dwelling house /Apartment,Unknown,3,2,2017,2017-02-03,86.974940,25750.52909,22,9


In [16]:
# Row/column count of the test split (expected to be about 30% of the rows):
test.shape

(2997, 15)

In [17]:
# Persist the training split; the row index is left out of the file.
train.to_csv(
    path_or_buf='data/final/model_data/version_0/train.csv',
    index=False,
)

In [18]:
# Persist the test split; the row index is left out of the file.
test.to_csv(
    path_or_buf='data/final/model_data/version_0/test.csv',
    index=False,
)