In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt
from math import sqrt

# Multi-Task Learning
# With Cross Validation
# Use 4 locations (ocean proximity)
# Location 5 not used due to small number of data points

# Update path according to file system
PathToFile = str('')

# Train to Test ratio
TrainSize = float(0.8)
TestSize = float(1.0 - TrainSize)

# Group size
LA = int(0.0)
LB = int(0.0)
LC = int(0.0)
LD = int(0.0)
T = int(0.0)

# Removing rows with empty cells
# Lines necessary only for the first run
# MyDataP = pd.read_csv(PathToFile+'HousingMarket1.csv')
# MyDataNewP = MyDataP.dropna(axis=0)
# MyDataNewP.to_csv(PathToFile+'HousingMarket2.csv', sep=',', encoding='utf-8')

# Acquire the datasets from the file system
# Full dataset (20428 rows)
# MyData = np.genfromtxt(PathToFile+'HousingMarket2.csv', delimiter=',', dtype='unicode')
# Small dataset (80 rows)
MyData = np.genfromtxt(PathToFile+'HousingMarket2.csv', delimiter=',', dtype='unicode')

# Print dimensions of dataset
print('Number of Rows: ', MyData.shape[0])
print('Number of Columns: ', MyData.shape[1])

# Extract dimensions of sub-datasets based on location (ocean proximity)
for i in range(0, MyData.shape[0]):
    if str(MyData[i, 10]) == 'NEAR BAY':
        LA = LA + 1
    if str(MyData[i][10]) == 'NEAR OCEAN':
        LB = LB + 1
    if str(MyData[i][10]) == 'INLAND':
        LC = LC + 1
    if str(MyData[i][10]) == '<1H OCEAN':
        LD = LD + 1

# Total number of rows
T = int(LA + LB + LC + LD)

print('LA: ', LA)
print('LB: ', LB)
print('LC: ', LC)
print('LD: ', LD)
print('T: ', T)

MyDataNew = np.empty(shape=(T, 10), dtype=float)

# Re-arrange dataset based on location (ocean proximity)
# Add new column [0] to indicate location (0 - 3)
# Ignoring the fifth location due to small number of data point
k = int(0)
for i in range(0, MyData.shape[0]):
    if str(MyData[i][10]) == 'NEAR BAY':
        MyDataNew[k][0] = int(0)
        for j in range(1, 10):
            try:
                MyDataNew[k][j] = float(MyData[i][j])
            except:
                print('Error')
        k = k + 1
k = int(0)
for i in range(0, MyData.shape[0]):
    if str(MyData[i][10]) == 'NEAR OCEAN':
        MyDataNew[LA+k][0] = int(1)
        for j in range(1, 10):
            try:
                MyDataNew[LA+k][j] = float(MyData[i][j])
            except:
                print('Error')
        k = k + 1
k = int(0)
for i in range(0, MyData.shape[0]):
    if str(MyData[i][10]) == 'INLAND':
        MyDataNew[LA+LB+k][0] = int(2)
        for j in range(1, 10):
            try:
                MyDataNew[LA+LB+k][j] = float(MyData[i][j])
            except:
                print('Error')
        k = k + 1
k = int(0)
for i in range(0, MyData.shape[0]):
    if str(MyData[i][10]) == '<1H OCEAN':
        MyDataNew[LA+LB+LC+k][0] = int(3)
        for j in range(1, 10):
            try:
                MyDataNew[LA+LB+LC+k][j] = float(MyData[i][j])
            except:
                print('Error')
        k = k + 1

# Save pre-processed dataset
# np.savetxt(PathToFile+'HousingMarket4.csv', MyDataNew, fmt='%f', delimiter=',')

# Used for stratifying the dataset
S = np.empty(shape=T, dtype=int)
for i in range(0, LA):
    S[i] = int(1)
for i in range(LA, LA+LB):
    S[i] = int(2)
for i in range(LA+LB, LA+LB+LC):
    S[i] = int(3)
for i in range(LA+LB+LC, T):
    S[i] = int(4)

# Allocate space for Features and Labels
FeaturesAll = np.empty(shape=(T, 9), dtype=float)
LabelsAll = np.empty(shape=T, dtype=float)

# Extract Features and Labels from dataset
for i in range(0, T):
    for j in range(0, 9):
        FeaturesAll[i, j] = float(MyDataNew[i, j])
    LabelsAll[i] = float(MyDataNew[i][9])

# Split into 4 folds
Splits = StratifiedKFold(n_splits=4, random_state=None, shuffle=False)

i = int(0)
# Cross Validation
for TrainIndex, TestIndex in Splits.split(FeaturesAll, S):
    i = i + 1
    # print('Train: ', TrainIndex)
    # print('Test: ', TestIndex)
    TrainFeatures = FeaturesAll[TrainIndex]
    TestFeatures = FeaturesAll[TestIndex]
    TrainLabels = LabelsAll[TrainIndex]
    TestLabels = LabelsAll[TestIndex]

    # Initialise the Random Forest Regression object
    rf = RandomForestRegressor(n_estimators=50, random_state=10)
    # Train the model
    rf.fit(TrainFeatures, TrainLabels)
    # Run the model
    if i == int(1):
        PredictFold1 = rf.predict(TestFeatures)
    if i == int(2):
        PredictFold2 = rf.predict(TestFeatures)
    if i == int(3):
        PredictFold3 = rf.predict(TestFeatures)
    if i == int(4):
        PredictFold4 = rf.predict(TestFeatures)

# Split the combined dataset with stratifying
# TrainFeatures, TestFeatures, TrainLabels, TestLabels = train_test_split(FeaturesAll, LabelsAll, test_size=TestSize, stratify=S, random_state=5)

# Number of Test Label values
TestNumber = int(TestLabels.shape[0])

SizeGroup1 = int(0)
SizeGroup2 = int(0)
SizeGroup3 = int(0)
SizeGroup4 = int(0)

# Calculate the sizes of each group
# Comparing float data type
for i in range(0, TestNumber):
    if TestFeatures[i][0] < float(0.5):
        SizeGroup1 = SizeGroup1 + 1
    if float(0.5) < TestFeatures[i][0] < float(1.5):
        SizeGroup2 = SizeGroup2 + 1
    if float(1.5) < TestFeatures[i][0] < float(2.5):
        SizeGroup3 = SizeGroup3 + 1
    if float(2.5) < TestFeatures[i][0] < float(3.5):
        SizeGroup4 = SizeGroup4 + 1

print('Size Group 1:', SizeGroup1)
print('Size Group 2:', SizeGroup2)
print('Size Group 3:', SizeGroup3)
print('Size Group 4:', SizeGroup4)

PredictionsFold1 = np.empty(shape=TestNumber, dtype=float)
PredictionsFold2 = np.empty(shape=TestNumber, dtype=float)
PredictionsFold3 = np.empty(shape=TestNumber, dtype=float)
PredictionsFold4 = np.empty(shape=TestNumber, dtype=float)

# Predictions for each Group
PredictionsGroup1 = np.empty(shape=SizeGroup1, dtype=float)
PredictionsGroup2 = np.empty(shape=SizeGroup2, dtype=float)
PredictionsGroup3 = np.empty(shape=SizeGroup3, dtype=float)
PredictionsGroup4 = np.empty(shape=SizeGroup4, dtype=float)

# Test Labels for each Group
TestLabelsGroup1 = np.empty(shape=SizeGroup1, dtype=float)
TestLabelsGroup2 = np.empty(shape=SizeGroup2, dtype=float)
TestLabelsGroup3 = np.empty(shape=SizeGroup3, dtype=float)
TestLabelsGroup4 = np.empty(shape=SizeGroup4, dtype=float)

# Convert to NumPy array to be used in further processing
for i in range(0, TestNumber):
    PredictionsFold1[i] = float(PredictFold1[i])
    PredictionsFold2[i] = float(PredictFold2[i])
    PredictionsFold3[i] = float(PredictFold3[i])
    PredictionsFold4[i] = float(PredictFold4[i])

# From this point we have 4 tasks x 4 folds = 16 predicted targets

# Using Fold 1
# Change to PredictionsFold[2,3,4] to test other folds
# Save Predictions and Test Labels into Groups
k = int(0)
for i in range(0, TestNumber):
    if TestFeatures[i][0] < float(0.5):
        PredictionsGroup1[k] = PredictionsFold1[i]
        TestLabelsGroup1[k] = TestLabels[i]
        k = k + 1
k = int(0)
for i in range(0, TestNumber):
    if float(0.5) < TestFeatures[i][0] < float(1.5):
        PredictionsGroup2[k] = PredictionsFold1[i]
        TestLabelsGroup2[k] = TestLabels[i]
        k = k + 1
k = int(0)
for i in range(0, TestNumber):
    if float(1.5) < TestFeatures[i][0] < float(2.5):
        PredictionsGroup3[k] = PredictionsFold1[i]
        TestLabelsGroup3[k] = TestLabels[i]
        k = k + 1
k = int(0)
for i in range(0, TestNumber):
    if float(2.5) < TestFeatures[i][0] < float(3.5):
        PredictionsGroup4[k] = PredictionsFold1[i]
        TestLabelsGroup4[k] = TestLabels[i]
        k = k + 1

# Calculate Root Mean Square Error
Task1RMSE = sqrt(mean_squared_error(TestLabelsGroup1, PredictionsGroup1))
Task2RMSE = sqrt(mean_squared_error(TestLabelsGroup2, PredictionsGroup2))
Task3RMSE = sqrt(mean_squared_error(TestLabelsGroup3, PredictionsGroup3))
Task4RMSE = sqrt(mean_squared_error(TestLabelsGroup4, PredictionsGroup4))

RMSE = sqrt(mean_squared_error(TestLabels, PredictionsFold1))

print('Root Mean Square Error All: ', round(RMSE, 2))
print('Root Mean Square Error 1: ', round(Task1RMSE, 2))
print('Root Mean Square Error 2: ', round(Task2RMSE, 2))
print('Root Mean Square Error 3: ', round(Task3RMSE, 2))
print('Root Mean Square Error 4: ', round(Task4RMSE, 2))

print('End')


Number of Rows:  20434
Number of Columns:  11
LA:  2270
LB:  2628
LC:  6496
LD:  9034
T:  20428
Size Group 1: 567
Size Group 2: 657
Size Group 3: 1624
Size Group 4: 2258
Root Mean Square Error All:  123213.93
Root Mean Square Error 1:  151806.45
Root Mean Square Error 2:  158091.86
Root Mean Square Error 3:  80882.44
Root Mean Square Error 4:  128709.63
End
