In [2]:
import numpy as np
import pandas as pd
import sklearn as sk
from sklearn import datasets
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from math import sqrt

# Use median income (5 groups)
# 0.04999 - 3.0
# 3.0 - 6.0
# 6.0 - 9.0
# 9.0 - 12.0
# 12.0 - 15.0001

# Folder prefix prepended to the CSV file name; adjust for the local file system
PathToFile = ''

# Fraction of rows used for training; the remainder is held out for testing
TrainSize = 0.9
TestSize = 1.0 - TrainSize

# Row counters for the five median-income bands (A..E) and their total (T)
LA = LB = LC = LD = LE = 0
T = 0

# Remove rows with empty cells
# Lines necessary only for the first run
# MyDataP = pd.read_csv(PathToFile+'HousingMarket1.csv')
# MyDataNewP = MyDataP.dropna(axis=0)
# MyDataNewP.to_csv(PathToFile+'HousingMarket2.csv', sep=',', encoding='utf-8')

# Read the cleaned CSV as a 2-D array of strings; numeric cells are parsed later
MyData = np.genfromtxt(PathToFile+'HousingMarket2.csv', delimiter=',', dtype='unicode')

# Report the size of the raw table
print('Number of Rows: ', MyData.shape[0])
print('Number of Columns: ', MyData.shape[1])

# Count the rows that fall into each median-income band (column 8).
# Row 0 is skipped (presumably the CSV header row - confirm against the file).
for RowIndex in range(1, MyData.shape[0]):
    Income = float(MyData[RowIndex][8])
    if Income < 3.0:
        LA += 1
    elif Income < 6.0:
        LB += 1
    elif Income < 9.0:
        LC += 1
    elif Income < 12.0:
        LD += 1
    elif Income >= 12.0:
        # Explicit test (not a bare else) so that a NaN income is counted nowhere
        LE += 1

# Total number of rows that landed in some band
T = int(LA + LB + LC + LD + LE)

for Caption, Count in (('LA: ', LA), ('LB: ', LB), ('LC: ', LC),
                       ('LD: ', LD), ('LE: ', LE), ('T: ', T)):
    print(Caption, Count)

# Pre-allocate the band-ordered table (T rows: 8 features + 1 label, filled later)
# and the plain, unstratified feature / label arrays (one row per data row).
# np.empty does not initialise memory, so every cell must be written explicitly.
MyDataNew = np.empty(shape=(T, 9), dtype=float)
Features = np.empty(shape=(MyData.shape[0]-1, 8), dtype=float)
Labels = np.empty(shape=MyData.shape[0]-1, dtype=float)

# Dataset without stratifying
# Columns 1..8 of MyData are the features, column 9 is the label; row 0 is skipped.
# A cell that cannot be parsed as a number is reported with its indices and stored
# as NaN, so the failure stays visible instead of leaving uninitialised memory.
# Only ValueError (non-numeric text) is handled; anything else, such as a file
# with too few columns, is allowed to propagate.
for i in range(0, MyData.shape[0]-1):
    for j in range(0, 8):
        try:
            Features[i, j] = float(MyData[i+1, j+1])
        except ValueError:
            Features[i, j] = np.nan
            print('Feature Error at: ', i, '  ', j)
for i in range(0, MyData.shape[0]-1):
    try:
        Labels[i] = float(MyData[i+1, 9])
    except ValueError:
        Labels[i] = np.nan
        # Labels is 1-D: report the row only (the old message printed a stale
        # column index left over from the feature loop)
        print('Label Error at: ', i)

# Re-arrange dataset based on median income
# Rows are copied into MyDataNew grouped by income band (A, B, C, D, E) and keep
# their original relative order inside each band. One pass over the data replaces
# five near-identical passes: NextSlot[b] is the next free MyDataNew row for band b,
# and the starting offsets follow from the band counts LA..LD.
NextSlot = [0, LA, LA+LB, LA+LB+LC, LA+LB+LC+LD]
for i in range(1, MyData.shape[0]):
    Income = float(MyData[i][8])
    if Income < 3.0:
        Band = 0
    elif Income < 6.0:
        Band = 1
    elif Income < 9.0:
        Band = 2
    elif Income < 12.0:
        Band = 3
    elif Income >= 12.0:
        Band = 4
    else:
        # NaN income matches no band and was not counted, so it gets no slot
        continue
    k = NextSlot[Band]
    NextSlot[Band] = k + 1
    # Columns 1..9 of MyData: eight features followed by the label
    for j in range(0, 9):
        try:
            MyDataNew[k, j] = float(MyData[i, j+1])
        except ValueError:
            # Non-numeric text: store NaN rather than keep whatever uninitialised
            # value the np.empty allocation left in this cell
            MyDataNew[k, j] = np.nan
            print('Error')

# Print new dataset
# for i in range(LA, LB):
#     for j in range(0, 9):
#         print(MyDataNew[i, j], ' ', end='')
#     print(' ')

# Stratum id (1..5) for every row of the band-ordered table, used for stratifying.
# Bounds holds the first row of each band followed by the total row count.
S = np.empty(shape=T, dtype=int)
Bounds = [0, LA, LA+LB, LA+LB+LC, LA+LB+LC+LD, T]
for Stratum in range(1, 6):
    S[Bounds[Stratum-1]:Bounds[Stratum]] = Stratum

# Allocate space for Features and Labels
FeaturesAll = np.empty(shape=(T, 8), dtype=float)
LabelsAll = np.empty(shape=T, dtype=float)

# Features are the first eight columns of the band-ordered table
FeaturesAll[:, :] = MyDataNew[:, 0:8]

# Print Features from the dataset
#for i in range(LB, LC):
#    for j in range(0, 8):
#        print(FeaturesAll[i, j], ' ', end='')
#    print(' ')

# Labels are the ninth column of the band-ordered table
LabelsAll[:] = MyDataNew[:, 8]

# Hold-out split of the band-ordered data, stratified on the income-band ids in S;
# random_state pins the shuffle so runs are repeatable
TrainFeatures, TestFeatures, TrainLabels, TestLabels = train_test_split(
    FeaturesAll, LabelsAll, test_size=TestSize, stratify=S, random_state=10)
# Split without stratifying
# TrainFeatures, TestFeatures, TrainLabels, TestLabels = train_test_split(Features, Labels, test_size=TestSize, random_state=10)

# Confirm dimensionality
for Caption, Part in (('Train Features: ', TrainFeatures),
                      ('Train Labels: ', TrainLabels),
                      ('Test Features: ', TestFeatures),
                      ('Test Labels: ', TestLabels)):
    print(Caption, Part.shape)

# Random forest regressor: 50 trees, fixed seed
rf = RandomForestRegressor(n_estimators=50, random_state=10)

# Fit on the training part, then predict the held-out part
rf.fit(TrainFeatures, TrainLabels)
Predictions = rf.predict(TestFeatures)

# Number of held-out rows (2044 in the recorded 0.9/0.1 run)
TestNumber = int(TestLabels.shape[0])

# Copy of the predictions and the per-row relative error |prediction - truth| / truth
Result = np.empty(shape=TestNumber, dtype=float)
Result[:] = Predictions
RE = np.empty(shape=TestNumber, dtype=float)
RE[:] = abs((Result - TestLabels) / TestLabels)

# One line per test row: 1-based position, true value, prediction, relative error
for Position, (Actual, Predicted, Relative) in enumerate(zip(TestLabels, Result, RE), start=1):
    print(Position, ' ', Actual, ' ', Predicted, ' ', round(Relative, 2))

# Aggregate error measures over the held-out rows
RMSE = sqrt(mean_squared_error(TestLabels, Predictions))
ARE = np.average(RE)

print('Root Mean Square Error: ', round(RMSE, 2))
print('Average Relative Error: ', ARE)

print('End')


Number of Rows:  20434
Number of Columns:  11
LA:  7292
LB:  10793
LC:  1946
LD:  290
LE:  112
T:  20433
Train Features:  (18389, 8)
Train Labels:  (18389,)
Test Features:  (2044, 8)
Test Labels:  (2044,)
1   467600.0   465306.56   0.0
2   500001.0   476214.66   0.05
3   45600.0   51304.0   0.13
4   248700.0   277020.02   0.11
5   230400.0   221630.0   0.04
6   197900.0   205226.0   0.04
7   457200.0   422948.06   0.07
8   101900.0   140082.0   0.37
9   225200.0   279890.06   0.24
10   217500.0   208006.0   0.04
11   101600.0   125820.0   0.24
12   185200.0   172338.0   0.07
13   104200.0   106398.0   0.02
14   350000.0   361054.18   0.03
15   114800.0   120130.0   0.05
16   97300.0   116714.0   0.2
17   224900.0   235038.0   0.05
18   234100.0   254090.0   0.09
19   170800.0   210888.0   0.23
20   96600.0   91302.0   0.05
21   262400.0   262176.02   0.0
22   135500.0   195788.0   0.44
23   150600.0   154982.0   0.03
24   156800.0   148400.0   0.05
25   69100.0   82670.0   0.2
26   444