In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from math import sqrt

# Single-task learning: one independent model per location (ocean proximity).
# Four locations are modelled; location 5 is left out because it has too
# few data points.

# Directory prefix of the data files; update according to the file system
PathToFile = ''

# Fraction of every group used for training, and the remainder used for testing
TrainSize = 0.9
TestSize = 1.0 - TrainSize

# Row counters for the four location groups (LA..LD) and their total (T).
# They start at zero and are filled in once the dataset has been scanned.
LA = LB = LC = LD = T = 0

# One-off preprocessing: drop the rows that contain empty cells and save the
# cleaned table. Only needs to be uncommented for the first run, to create
# HousingMarket2.csv.
# MyDataP = pd.read_csv(PathToFile+'HousingMarket1.csv')
# MyDataNewP = MyDataP.dropna(axis=0)
# MyDataNewP.to_csv(PathToFile+'HousingMarket2.csv', sep=',', encoding='utf-8')

# Read the cleaned file as an array of strings; the numeric cells are
# converted to float later, when the rows are regrouped by location.
CleanedCsvPath = PathToFile+'HousingMarket2.csv'
MyData = np.genfromtxt(CleanedCsvPath, dtype='unicode', delimiter=',')

# Report the dimensions of the raw dataset
print('Number of Rows: ', MyData.shape[0])
print('Number of Columns: ', MyData.shape[1])

# Tally the rows of every location group. Column 10 holds the ocean
# proximity; rows with any other value in that column are not counted.
ProximityTally = {'NEAR BAY': 0, 'NEAR OCEAN': 0, 'INLAND': 0, '<1H OCEAN': 0}
for Row in MyData:
    Proximity = str(Row[10])
    if Proximity in ProximityTally:
        ProximityTally[Proximity] += 1

# Add the tallies onto the group counters
LA = LA + ProximityTally['NEAR BAY']
LB = LB + ProximityTally['NEAR OCEAN']
LC = LC + ProximityTally['INLAND']
LD = LD + ProximityTally['<1H OCEAN']

# Total number of rows across the four groups
T = int(LA + LB + LC + LD)

print('LA: ', LA)
print('LB: ', LB)
print('LC: ', LC)
print('LD: ', LD)
print('T: ', T)

# Re-arranged dataset: one row per counted sample, 9 numeric columns.
# np.empty leaves the contents uninitialised; every cell is expected to be
# overwritten when the rows are regrouped.
MyDataNew = np.empty(shape=(T, 9), dtype=float)

# Features and labels for non stratified model
Labels = np.empty(shape=MyData.shape[0]-1, dtype=float)
Features = np.empty(shape=(MyData.shape[0]-1, 8), dtype=float)

# Allocate memory for the Features of Groups 1 to 4 (8 columns each).
# Group 5 is not used due to small size.
(FeaturesGroup1, FeaturesGroup2,
 FeaturesGroup3, FeaturesGroup4) = [
    np.empty(shape=(GroupSize, 8), dtype=float)
    for GroupSize in (LA, LB, LC, LD)
]

# Allocate memory for the Labels of Groups 1 to 4 (one value per row)
(LabelsGroup1, LabelsGroup2,
 LabelsGroup3, LabelsGroup4) = [
    np.empty(shape=GroupSize, dtype=float)
    for GroupSize in (LA, LB, LC, LD)
]

# Re-arrange dataset based on location (ocean proximity).
# The groups are written back to back into MyDataNew: NEAR BAY rows first,
# then NEAR OCEAN, INLAND and <1H OCEAN. Within a group the rows keep their
# original order. Source columns 1..9 become destination columns 0..8.
GroupNames = ('NEAR BAY', 'NEAR OCEAN', 'INLAND', '<1H OCEAN')
GroupOffsets = (0, LA, LA+LB, LA+LB+LC)

for GroupName, GroupOffset in zip(GroupNames, GroupOffsets):
    # k is the running row index inside the current group
    k = int(0)
    for i in range(0, MyData.shape[0]):
        if str(MyData[i][10]) == GroupName:
            for j in range(0, 9):
                try:
                    MyDataNew[GroupOffset+k][j] = float(MyData[i][j+1])
                except ValueError:
                    # MyDataNew comes from np.empty, so skipping the cell
                    # would leave uninitialised memory in the training data.
                    # Store NaN so the bad cell is detectable, and say
                    # exactly which cell could not be converted.
                    MyDataNew[GroupOffset+k][j] = np.nan
                    print('Error: cannot convert row', i, 'column', j+1,
                          'value', repr(str(MyData[i][j+1])))
            k = k + 1

# Extract Groups from dataset.
# MyDataNew is read as four consecutive row ranges of LA, LB, LC and LD rows;
# columns 0-7 go to the feature matrices and column 8 to the label vectors.
# The pre-allocated arrays are filled in place with slice assignment.
Group2Start = LA
Group3Start = LA+LB
Group4Start = LA+LB+LC

FeaturesGroup1[:, :] = MyDataNew[0:LA, 0:8]
LabelsGroup1[:] = MyDataNew[0:LA, 8]

FeaturesGroup2[:, :] = MyDataNew[Group2Start:Group2Start+LB, 0:8]
LabelsGroup2[:] = MyDataNew[Group2Start:Group2Start+LB, 8]

FeaturesGroup3[:, :] = MyDataNew[Group3Start:Group3Start+LC, 0:8]
LabelsGroup3[:] = MyDataNew[Group3Start:Group3Start+LC, 8]

FeaturesGroup4[:, :] = MyDataNew[Group4Start:Group4Start+LD, 0:8]
LabelsGroup4[:] = MyDataNew[Group4Start:Group4Start+LD, 8]

# Split every group into train and test parts.
# Each group uses its own fixed random_state, so the split is repeatable.
(TrainFeaturesG1, TestFeaturesG1,
 TrainLabelsG1, TestLabelsG1) = train_test_split(
    FeaturesGroup1, LabelsGroup1, test_size=TestSize, random_state=10)

(TrainFeaturesG2, TestFeaturesG2,
 TrainLabelsG2, TestLabelsG2) = train_test_split(
    FeaturesGroup2, LabelsGroup2, test_size=TestSize, random_state=20)

(TrainFeaturesG3, TestFeaturesG3,
 TrainLabelsG3, TestLabelsG3) = train_test_split(
    FeaturesGroup3, LabelsGroup3, test_size=TestSize, random_state=30)

(TrainFeaturesG4, TestFeaturesG4,
 TrainLabelsG4, TestLabelsG4) = train_test_split(
    FeaturesGroup4, LabelsGroup4, test_size=TestSize, random_state=40)

# Confirm dimensionality: print the shape of every train/test array,
# task by task, in the order train features, train labels, test features,
# test labels.
for ShapeCaption, SplitArray in (
    ('Train Features Task 1: ', TrainFeaturesG1),
    ('Train Labels Task 1: ', TrainLabelsG1),
    ('Test Features Task 1: ', TestFeaturesG1),
    ('Test Labels Task 1: ', TestLabelsG1),
    ('Train Features Task 2: ', TrainFeaturesG2),
    ('Train Labels Task 2: ', TrainLabelsG2),
    ('Test Features Task 2: ', TestFeaturesG2),
    ('Test Labels Task 2: ', TestLabelsG2),
    ('Train Features Task 3: ', TrainFeaturesG3),
    ('Train Labels Task 3: ', TrainLabelsG3),
    ('Test Features Task 3: ', TestFeaturesG3),
    ('Test Labels Task 3: ', TestLabelsG3),
    ('Train Features Task 4: ', TrainFeaturesG4),
    ('Train Labels Task 4: ', TrainLabelsG4),
    ('Test Features Task 4: ', TestFeaturesG4),
    ('Test Labels Task 4: ', TestLabelsG4),
):
    print(ShapeCaption, SplitArray.shape)

# One Random Forest regressor per task: initialise it, train it on the
# task's training split, then run it on the task's test split.
# Every forest has 50 trees and its own fixed random_state.

# Task 1
rf1 = RandomForestRegressor(n_estimators=50, random_state=10)
rf1.fit(TrainFeaturesG1, TrainLabelsG1)
PredictG1 = rf1.predict(TestFeaturesG1)

# Task 2
rf2 = RandomForestRegressor(n_estimators=50, random_state=20)
rf2.fit(TrainFeaturesG2, TrainLabelsG2)
PredictG2 = rf2.predict(TestFeaturesG2)

# Task 3
rf3 = RandomForestRegressor(n_estimators=50, random_state=30)
rf3.fit(TrainFeaturesG3, TrainLabelsG3)
PredictG3 = rf3.predict(TestFeaturesG3)

# Task 4
rf4 = RandomForestRegressor(n_estimators=50, random_state=40)
rf4.fit(TrainFeaturesG4, TrainLabelsG4)
PredictG4 = rf4.predict(TestFeaturesG4)

# Number of Test Label values
TestNumberG1 = int(TestLabelsG1.shape[0])
TestNumberG2 = int(TestLabelsG2.shape[0])
TestNumberG3 = int(TestLabelsG3.shape[0])
TestNumberG4 = int(TestLabelsG4.shape[0])

# Copy the model outputs into separate float NumPy arrays to be used in
# further processing
PredictionsG1 = np.array(PredictG1, dtype=float)
PredictionsG2 = np.array(PredictG2, dtype=float)
PredictionsG3 = np.array(PredictG3, dtype=float)
PredictionsG4 = np.array(PredictG4, dtype=float)

# Relative Error per test sample: |prediction - label| / |label|.
# A label of zero produces inf or nan here (NumPy division semantics).
RE1 = np.abs((PredictionsG1 - TestLabelsG1) / TestLabelsG1)
RE2 = np.abs((PredictionsG2 - TestLabelsG2) / TestLabelsG2)
RE3 = np.abs((PredictionsG3 - TestLabelsG3) / TestLabelsG3)
RE4 = np.abs((PredictionsG4 - TestLabelsG4) / TestLabelsG4)

# Root Mean Square Error (square root of the mean squared error between the
# test labels and the predictions) and Average Relative Error (mean of the
# per-sample relative errors), computed task by task.

# Task 1
RMSE1 = sqrt(mean_squared_error(y_true=TestLabelsG1, y_pred=PredictionsG1))
ARE1 = np.average(RE1)

# Task 2
RMSE2 = sqrt(mean_squared_error(y_true=TestLabelsG2, y_pred=PredictionsG2))
ARE2 = np.average(RE2)

# Task 3
RMSE3 = sqrt(mean_squared_error(y_true=TestLabelsG3, y_pred=PredictionsG3))
ARE3 = np.average(RE3)

# Task 4
RMSE4 = sqrt(mean_squared_error(y_true=TestLabelsG4, y_pred=PredictionsG4))
ARE4 = np.average(RE4)

# Report both metrics of every task, rounded to two decimals
for RmseCaption, RmseValue, AreCaption, AreValue in (
    ('Task 1 Root Mean Square Error: ', RMSE1,
     'Task 1 Average Relative Error: ', ARE1),
    ('Task 2 Root Mean Square Error: ', RMSE2,
     'Task 2 Average Relative Error: ', ARE2),
    ('Task 3 Root Mean Square Error: ', RMSE3,
     'Task 3 Average Relative Error: ', ARE3),
    ('Task 4 Root Mean Square Error: ', RMSE4,
     'Task 4 Average Relative Error: ', ARE4),
):
    print(RmseCaption, round(RmseValue, 2))
    print(AreCaption, round(AreValue, 2))


Number of Rows:  20434
Number of Columns:  11
LA:  2270
LB:  2628
LC:  6496
LD:  9034
T:  20428
Train Features Task 1:  (2043, 8)
Train Labels Task 1:  (2043,)
Test Features Task 1:  (227, 8)
Test Labels Task 1:  (227,)
Train Features Task 2:  (2365, 8)
Train Labels Task 2:  (2365,)
Test Features Task 2:  (263, 8)
Test Labels Task 2:  (263,)
Train Features Task 3:  (5846, 8)
Train Labels Task 3:  (5846,)
Test Features Task 3:  (650, 8)
Test Labels Task 3:  (650,)
Train Features Task 4:  (8130, 8)
Train Labels Task 4:  (8130,)
Test Features Task 4:  (904, 8)
Test Labels Task 4:  (904,)
Task 1 Root Mean Square Error:  50184.1
Task 1 Average Relative Error:  0.18
Task 2 Root Mean Square Error:  56174.5
Task 2 Average Relative Error:  0.18
Task 3 Root Mean Square Error:  34625.73
Task 3 Average Relative Error:  0.2
Task 4 Root Mean Square Error:  50484.5
Task 4 Average Relative Error:  0.15
