In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from math import sqrt

# Multi-Task Learning
# Use 4 locations (ocean proximity)
# Location 5 not used due to small number of data points

# Prefix prepended to every dataset file name; adjust to the local file system
# (an empty string means the file name is used as a relative path)
PathToFile = ''

# Fraction of the rows used for training; the remainder is held out for testing
TrainSize = 0.8
TestSize = 1.0 - TrainSize

# Row counters, one per location (NEAR BAY, NEAR OCEAN, INLAND, <1H OCEAN),
# and their total; all start at zero and are filled in once the data is scanned
LA = LB = LC = LD = 0
T = 0

# Removing rows with empty cells
# Lines necessary only if using dataset from source
# MyDataP = pd.read_csv(PathToFile+'HousingMarket1.csv')
# MyDataNewP = MyDataP.dropna(axis=0)
# MyDataNewP.to_csv(PathToFile+'HousingMarket2.csv', sep=',', encoding='utf-8')

# Acquire the dataset from the file system.
# Every cell is read as text so that the location column (ocean proximity)
# is preserved; numeric columns are converted to float further down.
DataFile = PathToFile + 'HousingMarket2.csv'
MyData = np.genfromtxt(DataFile, delimiter=',', dtype='unicode')

# Print dimensions of dataset
print('Number of Rows: ', MyData.shape[0])
print('Number of Columns: ', MyData.shape[1])

# Extract dimensions of sub-datasets based on location (ocean proximity).
# Column [10] holds the location label; each count is the number of rows whose
# label matches exactly, so rows with any other label are left out of T.
Proximity = MyData[:, 10]
LA = int(np.count_nonzero(Proximity == 'NEAR BAY'))
LB = int(np.count_nonzero(Proximity == 'NEAR OCEAN'))
LC = int(np.count_nonzero(Proximity == 'INLAND'))
LD = int(np.count_nonzero(Proximity == '<1H OCEAN'))

# Total number of rows across the four locations
T = LA + LB + LC + LD

print('LA: ', LA)
print('LB: ', LB)
print('LC: ', LC)
print('LD: ', LD)
print('T: ', T)

# Location labels in stacking order; a label's position in this tuple is the
# numeric location code (0 - 3) written to column [0] of the new dataset.
LocationOrder = ('NEAR BAY', 'NEAR OCEAN', 'INLAND', '<1H OCEAN')

# Re-arranged dataset: column [0] = location code, columns [1]-[9] = source
# columns [1]-[9] converted to float (source column [0] is not copied).
# The buffer is pre-filled with NaN rather than left uninitialised so that a
# cell which fails to convert is an explicit missing value instead of
# whatever happened to be in memory.
MyDataNew = np.full(shape=(T, 10), fill_value=np.nan, dtype=float)

# Re-arrange dataset based on location (ocean proximity).
# Rows are written group by group (NEAR BAY first, <1H OCEAN last), keeping
# the source order inside each group; k is the next free row of MyDataNew.
k = 0
for Code, Label in enumerate(LocationOrder):
    for i in np.flatnonzero(MyData[:, 10] == Label):
        MyDataNew[k, 0] = Code
        for j in range(1, 10):
            try:
                MyDataNew[k, j] = float(MyData[i, j])
            except ValueError:
                # Only a failed text-to-number conversion is tolerated; say
                # where it happened and leave the cell as NaN.
                print('Error: row', i, 'column', j,
                      'is not numeric:', repr(str(MyData[i, j])))
        k = k + 1

# np.savetxt(PathToFile+'HousingMarket3.csv', MyDataNew, fmt='%f', delimiter=',')

# Used for stratifying the dataset: one class id per row of the re-arranged
# dataset (1 for the first LA rows, 2 for the next LB, 3 for the next LC,
# 4 for the last LD), so the split keeps the location proportions.
S = np.repeat(np.array([1, 2, 3, 4], dtype=int), [LA, LB, LC, LD])

# Extract Features and Labels from dataset.
# Features are columns [0]-[8] (location code first); the label is column [9].
# Both are independent float copies, not views of MyDataNew.
FeaturesAll = np.array(MyDataNew[:T, 0:9], dtype=float)
LabelsAll = np.array(MyDataNew[:T, 9], dtype=float)

# Split the combined dataset, stratified on S so that every location keeps
# the same share of rows in the train and test parts
TrainFeatures, TestFeatures, TrainLabels, TestLabels = train_test_split(
    FeaturesAll,
    LabelsAll,
    test_size=TestSize,
    stratify=S,
    random_state=5,
)

# Confirm dimensionality
print('Train Features: ', TrainFeatures.shape)
print('Train Labels: ', TrainLabels.shape)
print('Test Features: ', TestFeatures.shape)
print('Test Labels: ', TestLabels.shape)

# Initialise the Random Forest Regression object (50 trees, fixed seed)
# and train it; fit() returns the estimator itself
rf = RandomForestRegressor(n_estimators=50, random_state=10).fit(TrainFeatures, TrainLabels)

# Run the model on the held-out features
Predict = rf.predict(TestFeatures)

# Number of Test Label values
TestNumber = int(TestLabels.shape[0])

# Location code of every test row (feature column [0]).
# The codes are whole numbers stored as float, so each group is selected with
# strict half-way bounds instead of an equality test.
TestLocation = TestFeatures[:, 0]
InGroup1 = TestLocation < 0.5
InGroup2 = (TestLocation > 0.5) & (TestLocation < 1.5)
InGroup3 = (TestLocation > 1.5) & (TestLocation < 2.5)
InGroup4 = (TestLocation > 2.5) & (TestLocation < 3.5)

# Calculate the sizes of each group
SizeGroup1 = int(np.count_nonzero(InGroup1))
SizeGroup2 = int(np.count_nonzero(InGroup2))
SizeGroup3 = int(np.count_nonzero(InGroup3))
SizeGroup4 = int(np.count_nonzero(InGroup4))

print('Size Group 1:', SizeGroup1)
print('Size Group 2:', SizeGroup2)
print('Size Group 3:', SizeGroup3)
print('Size Group 4:', SizeGroup4)

# Predictions as a float NumPy array, and the relative error of each one:
# |prediction - label| / label
PredictionsAll = np.array(Predict, dtype=float)
REAll = np.abs((PredictionsAll - TestLabels) / TestLabels)

# Predictions for each Group (test-set order is kept inside every group)
PredictionsGroup1 = PredictionsAll[InGroup1]
PredictionsGroup2 = PredictionsAll[InGroup2]
PredictionsGroup3 = PredictionsAll[InGroup3]
PredictionsGroup4 = PredictionsAll[InGroup4]

# Test Labels for each Group, aligned element by element with the predictions
TestLabelsGroup1 = np.array(TestLabels[InGroup1], dtype=float)
TestLabelsGroup2 = np.array(TestLabels[InGroup2], dtype=float)
TestLabelsGroup3 = np.array(TestLabels[InGroup3], dtype=float)
TestLabelsGroup4 = np.array(TestLabels[InGroup4], dtype=float)

def _rmse(Actual, Predicted):
    """Return the square root of the mean squared error of the predictions."""
    return sqrt(mean_squared_error(Actual, Predicted))


# Calculate Root Mean Square Error: whole test set first, then one value per
# location group
RMSE = _rmse(TestLabels, PredictionsAll)
RMSE1 = _rmse(TestLabelsGroup1, PredictionsGroup1)
RMSE2 = _rmse(TestLabelsGroup2, PredictionsGroup2)
RMSE3 = _rmse(TestLabelsGroup3, PredictionsGroup3)
RMSE4 = _rmse(TestLabelsGroup4, PredictionsGroup4)

# Calculate Average Relative Error over the whole test set
ARE = np.average(REAll)

# Report: RMSE values rounded to two decimals, ARE at full precision
print('Root Mean Square Error All: ', round(RMSE, 2))
print('Root Mean Square Error 1: ', round(RMSE1, 2))
print('Root Mean Square Error 2: ', round(RMSE2, 2))
print('Root Mean Square Error 3: ', round(RMSE3, 2))
print('Root Mean Square Error 4: ', round(RMSE4, 2))

print('Average Relative Error: ', ARE)

print('End')


Number of Rows:  20434
Number of Columns:  11
LA:  2270
LB:  2628
LC:  6496
LD:  9034
T:  20428
Train Features:  (16342, 9)
Train Labels:  (16342,)
Test Features:  (4086, 9)
Test Labels:  (4086,)
Size Group 1: 454
Size Group 2: 526
Size Group 3: 1299
Size Group 4: 1807
Root Mean Square Error All:  48691.76
Root Mean Square Error 1:  51109.45
Root Mean Square Error 2:  66737.32
Root Mean Square Error 3:  37830.25
Root Mean Square Error 4:  48780.07
Average Relative Error:  0.17320544955676412
End
