### Imports

In [1]:
# Data Management
import pandas as pd
import numpy as np
# from pandas_datareader.data import DataReader
# from ta import add_all_ta_features

# Statistics
from statsmodels.tsa.stattools import adfuller

# Data Preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Supervised Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RepeatedKFold

# Reporting
import matplotlib.pyplot as plt

### Data Ingestion

In [2]:
# Load the raw Sydney house-price records from the CSV next to the notebook
csv_path = "SydneyHousePrices.csv"
df = pd.read_csv(csv_path)

# Report the row count, then preview the first rows
print(f"Length of Data: {len(df)}")
df.head()

Length of Data: 199504


Unnamed: 0,Date,Id,suburb,postalCode,sellPrice,bed,bath,car,propType
0,2019-06-19,1,Avalon Beach,2107,1210000,4.0,2,2.0,house
1,2019-06-13,2,Avalon Beach,2107,2250000,4.0,3,4.0,house
2,2019-06-07,3,Whale Beach,2107,2920000,3.0,3,2.0,house
3,2019-05-28,4,Avalon Beach,2107,1530000,3.0,1,2.0,house
4,2019-05-22,5,Whale Beach,2107,8000000,5.0,4,4.0,house


In [3]:
# Interpret data: list each column's dtype and non-null count so that
# text columns (object dtype) and columns with missing entries stand out
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199504 entries, 0 to 199503
Data columns (total 9 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   Date        199504 non-null  object 
 1   Id          199504 non-null  int64  
 2   suburb      199504 non-null  object 
 3   postalCode  199504 non-null  int64  
 4   sellPrice   199504 non-null  int64  
 5   bed         199350 non-null  float64
 6   bath        199504 non-null  int64  
 7   car         181353 non-null  float64
 8   propType    199504 non-null  object 
dtypes: float64(2), int64(4), object(3)
memory usage: 13.7+ MB


### Feature Engineering - Common Tasks

##### Handle Non-Numerical Data

In [4]:
# Measure the cardinality of the suburb column to pick an encoding strategy
suburb_text_unique = df["suburb"].unique()
n_suburbs = len(suburb_text_unique)
print("Unique Suburbs: ", n_suburbs)
# Many distinct values: use one integer-coded column rather than one dummy per suburb
print("Perform LabelEncoding")

Unique Suburbs:  685
Perform LabelEncoding


In [5]:
# Measure the cardinality of the propType column to pick an encoding strategy
prop_type_text_unique = df["propType"].unique()
n_prop_types = len(prop_type_text_unique)
print("Unique Prop Types: ", n_prop_types)
# Few distinct values: one indicator column per category is affordable
print("Perform OneHotEncoding")

Unique Prop Types:  8
Perform OneHotEncoding


In [6]:
# Label Encoding: map every suburb name to an integer code
# The fitted encoder is kept in `labelencoder` so codes can be mapped back to names
labelencoder = LabelEncoder().fit(df["suburb"])
encoded_suburbs = labelencoder.transform(df["suburb"])
df["suburbs_encoded"] = encoded_suburbs
df.head()

Unnamed: 0,Date,Id,suburb,postalCode,sellPrice,bed,bath,car,propType,suburbs_encoded
0,2019-06-19,1,Avalon Beach,2107,1210000,4.0,2,2.0,house,22
1,2019-06-13,2,Avalon Beach,2107,2250000,4.0,3,4.0,house,22
2,2019-06-07,3,Whale Beach,2107,2920000,3.0,3,2.0,house,654
3,2019-05-28,4,Avalon Beach,2107,1530000,3.0,1,2.0,house,22
4,2019-05-22,5,Whale Beach,2107,8000000,5.0,4,4.0,house,654


In [7]:
# One Hot Encoding: one 0/1 indicator column per property type.
# drop_first=True omits the first category so the indicators are not collinear.
# dtype=int pins the indicators to integers: pandas >= 2.0 returns bool columns
# by default, which would show True/False and turn `.values` on the mixed
# numeric frame into an object array further down.
onehot_encoded = pd.get_dummies(df["propType"], prefix="pt", drop_first=True, dtype=int)
# NOTE(review): join() raises on overlapping column names, so this cell is not
# re-runnable without reloading `df` first.
df = df.join(onehot_encoded)
df.head(3)

Unnamed: 0,Date,Id,suburb,postalCode,sellPrice,bed,bath,car,propType,suburbs_encoded,pt_duplex/semi-detached,pt_house,pt_other,pt_terrace,pt_townhouse,pt_villa,pt_warehouse
0,2019-06-19,1,Avalon Beach,2107,1210000,4.0,2,2.0,house,22,0,1,0,0,0,0,0
1,2019-06-13,2,Avalon Beach,2107,2250000,4.0,3,4.0,house,22,0,1,0,0,0,0,0
2,2019-06-07,3,Whale Beach,2107,2920000,3.0,3,2.0,house,654,0,1,0,0,0,0,0


##### Set Target

In [8]:
# Set target: expose the sale price under the generic name TARGET
# (added as the last column, which the later positional X/y split relies on)
df = df.assign(TARGET=df["sellPrice"])
df.head(3)

Unnamed: 0,Date,Id,suburb,postalCode,sellPrice,bed,bath,car,propType,suburbs_encoded,pt_duplex/semi-detached,pt_house,pt_other,pt_terrace,pt_townhouse,pt_villa,pt_warehouse,TARGET
0,2019-06-19,1,Avalon Beach,2107,1210000,4.0,2,2.0,house,22,0,1,0,0,0,0,0,1210000
1,2019-06-13,2,Avalon Beach,2107,2250000,4.0,3,4.0,house,22,0,1,0,0,0,0,0,2250000
2,2019-06-07,3,Whale Beach,2107,2920000,3.0,3,2.0,house,654,0,1,0,0,0,0,0,2920000


##### Remove Redundant Features

In [9]:
# Remove features: identifiers, raw text columns, and sellPrice (duplicated as TARGET).
# drop() returns a new frame, so `df` itself is left untouched.
redundant_columns = ["Date", "Id", "suburb", "propType", "sellPrice"]
df_drop = df.drop(columns=redundant_columns)
df_drop.head()

Unnamed: 0,postalCode,bed,bath,car,suburbs_encoded,pt_duplex/semi-detached,pt_house,pt_other,pt_terrace,pt_townhouse,pt_villa,pt_warehouse,TARGET
0,2107,4.0,2,2.0,22,0,1,0,0,0,0,0,1210000
1,2107,4.0,3,4.0,22,0,1,0,0,0,0,0,2250000
2,2107,3.0,3,2.0,654,0,1,0,0,0,0,0,2920000
3,2107,3.0,1,2.0,22,0,1,0,0,0,0,0,1530000
4,2107,5.0,4,4.0,654,0,1,0,0,0,0,0,8000000


##### Check for NaN or Inf Values

In [12]:
# Check for Null or Inf values anywhere in the modelling frame
# NOTE(review): the saved execution counts indicate this cell was last run after
# the "Fill NA" cell that follows it, so the recorded output describes the
# already-imputed data.
null_mask = df_drop.isna()
inf_mask = df_drop.isin([np.inf, -np.inf])
is_null = null_mask.to_numpy().any()
is_inf = inf_mask.to_numpy().any()
print("Is Null: ", is_null)
print("Is Inf: ", is_inf)

Is Null:  False
Is Inf:  False


In [11]:
# Fill NA: replace each column's missing entries with that column's mean
column_means = df_drop.mean()
df_drop = df_drop.fillna(value=column_means)
# Confirm no missing values remain
df_drop.isnull().values.any()

False

##### Feature Scaling - Min Max Scaling

In [13]:
# Min-max scale every column of df_drop (features and TARGET) into [0, 1].
# A new float DataFrame is built from the scaler output instead of assigning
# floats into the int64 columns of a copy via `.iloc[:] = ...`; that form
# relies on implicit dtype upcasting, which recent pandas releases deprecate.
# NOTE(review): the scaler is fit on all rows, before the train/test split,
# and TARGET is scaled with the features. Revisit before using df_scaling
# for modelling.
mms = MinMaxScaler()
df_scaling = pd.DataFrame(
    mms.fit_transform(df_drop),
    columns=df_drop.columns,
    index=df_drop.index,
)
df_scaling.head()

Unnamed: 0,postalCode,bed,bath,car,suburbs_encoded,pt_duplex/semi-detached,pt_house,pt_other,pt_terrace,pt_townhouse,pt_villa,pt_warehouse,TARGET
0,0.037179,0.030612,0.010204,0.025,0.032164,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.000563
1,0.037179,0.030612,0.020408,0.075,0.032164,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.001048
2,0.037179,0.020408,0.020408,0.025,0.95614,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.00136
3,0.037179,0.020408,0.0,0.025,0.032164,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.000712
4,0.037179,0.040816,0.030612,0.075,0.95614,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.003725


### Train Test Split

In [14]:
# Use Correct Dataframe: the scaled frame when the flag is set, the unscaled one otherwise
is_deep_learning = False
if is_deep_learning:
    df_tts = df_scaling.copy()
else:
    df_tts = df_drop.copy()
df_tts.head(3)

Unnamed: 0,postalCode,bed,bath,car,suburbs_encoded,pt_duplex/semi-detached,pt_house,pt_other,pt_terrace,pt_townhouse,pt_villa,pt_warehouse,TARGET
0,2107,4.0,2,2.0,22,0,1,0,0,0,0,0,1210000
1,2107,4.0,3,4.0,22,0,1,0,0,0,0,0,2250000
2,2107,3.0,3,2.0,654,0,1,0,0,0,0,0,2920000


In [16]:
# Split X and y data: every column except the last is a feature,
# the last column (TARGET) is the label
feature_frame = df_tts.iloc[:, :-1]
target_series = df_tts.iloc[:, -1]
X = feature_frame.to_numpy()
y = target_series.to_numpy()
print("X Values: \n", X[:2])
print("y Values: \n", y[:5])

X Values: 
 [[2.107e+03 4.000e+00 2.000e+00 2.000e+00 2.200e+01 0.000e+00 1.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00]
 [2.107e+03 4.000e+00 3.000e+00 4.000e+00 2.200e+01 0.000e+00 1.000e+00
  0.000e+00 0.000e+00 0.000e+00 0.000e+00 0.000e+00]]
y Values: 
 [1210000 2250000 2920000 1530000 8000000]


In [17]:
# Train Test Split: hold out 10% of the rows, shuffled with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=1,
)
# Show the shape of each partition
for label, part in (
    ("X_train: ", X_train),
    ("X_test: ", X_test),
    ("y_train: ", y_train),
    ("y_test: ", y_test),
):
    print(label, part.shape)

X_train:  (179553, 12)
X_test:  (19951, 12)
y_train:  (179553,)
y_test:  (19951,)


### Machine Learning

In [19]:
# Train Regressor: random forest with fixed hyperparameters and seed
rf_params = {"n_estimators": 100, "max_depth": 10, "random_state": 0}
regressor = RandomForestRegressor(**rf_params)
regressor.fit(X_train, y_train)

RandomForestRegressor(max_depth=10, random_state=0)

In [24]:
# Make Predictions on Test Set, rounded to whole currency units.
# np.round is vectorised and keeps y_pred as an ndarray; rounding element by
# element in a Python loop produced a list of NumPy scalars, which is slower
# and prints as `np.float64(...)` entries under NumPy 2.
y_pred = np.round(regressor.predict(X_test))
print("Test Predictions", y_pred[:5])
print("Test Actuals", y_test[:5])

Test Predictions [590029.0, 2022675.0, 1112679.0, 1045638.0, 869990.0]
Test Actuals [ 730000 1350100  860000 1390000  985000]


In [25]:
# Check Accuracy: repeated 5-fold cross-validation (3 repeats) on the training set.
# The scorer returns negated mean absolute error, so each score is <= 0.
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)
n_scores = cross_val_score(
    estimator=regressor,
    X=X_train,
    y=y_train,
    scoring="neg_mean_absolute_error",
    cv=cv,
    n_jobs=-1,
    error_score="raise",
)

In [26]:
# Report Performance: n_scores holds negated MAE values, so take the absolute
# value of the mean; the standard deviation is unaffected by the sign
mae_mean = abs(n_scores.mean())
mae_std = n_scores.std()
print("MAE Avg: ", mae_mean)
print("MAE Std: ", mae_std)

MAE Avg:  389118.30686148786
MAE Std:  20152.482105475094


### Resources and Useful Reading

Data from: https://www.kaggle.com/datasets/mihirhalai/sydney-house-prices

One Hot Encoding vs Label Encoding: https://www.analyticsvidhya.com/blog/2020/03/one-hot-encoding-vs-label-encoding-using-scikit-learn/

Mean Absolute Error Scoring (also using House Prices): http://www.andrewgurung.com/2018/12/28/regression-model-evaluation-mean-absolute-error/