In [141]:
# Imports, as always...
import pandas as pd
import numpy as np
from pathlib import Path
from datetime import datetime
import seaborn as sns
import ast

import create_data

import warnings
warnings.filterwarnings('ignore')

In [142]:
# Beautification.
sns.set_style('darkgrid')
sns.set_style({'font.family':'serif', 'font.serif':'Times New Roman'})
sns.set_context('paper')
sns.set_palette('Set2')

# Preprocessing

In [143]:
# Create master data for Edinburgh (if not created already).
if not Path('datasets/master_edinburgh.csv').is_file():
    create_data.city_data_generation('edinburgh', 'datasets', datetime(2022, 12, 16), True)
    print('Master data file generated.')

else:
    print('Pre-existing master data file found (no new data created).')

# Note: This takes around 3 minutes on my laptop for ~7500 listings and ~500000 reviews (Edinburgh).

# Get the master data.
master_data = pd.read_csv('datasets/master_edinburgh.csv').iloc[:,2:]

Pre-existing master data file found (no new data created).


In [144]:
# Drop features used in success score definition.
master_data.drop(columns=[
    #'price',
    'minimum_nights_avg_ntm',
    'number_of_reviews_ltm', 
    'review_scores_rating'
], inplace=True, axis=1)

# It would also be in the spirit of things to remove other review scores (e.g. for cleanliness), as that is sort of cheating.
master_data.drop(columns=[
    'review_scores_accuracy', 
    'review_scores_cleanliness',
    'review_scores_checkin', 
    'review_scores_communication',
    'review_scores_location', 
    'review_scores_value'
], inplace=True, axis=1)

# Drop unhelpful features.
master_data.drop(columns=[
    'calculated_host_listings_count_shared_rooms',
    'neighbourhood',
    'neighbourhood_cleansed',
    'property_type',
    'bathrooms_text',
    'amenities',
], inplace=True, axis=1)

In [145]:
master_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7389 entries, 0 to 7388
Data columns (total 67 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   id                                            7389 non-null   int64  
 1   host_id                                       7389 non-null   int64  
 2   host_since                                    7389 non-null   object 
 3   host_response_time                            5861 non-null   object 
 4   host_response_rate                            5861 non-null   object 
 5   host_acceptance_rate                          6663 non-null   object 
 6   host_is_superhost                             7387 non-null   object 
 7   host_neighbourhood                            3007 non-null   object 
 8   host_listings_count                           7389 non-null   int64  
 9   host_total_listings_count                     7389 non-null   i

## Convert to Numerical

In [146]:
# Helper function for converting dates to days since.
def elapsed_days(from_date_as_string, to_date=datetime(2022, 12, 16)):
    from_date = datetime.strptime(from_date_as_string, '%Y-%m-%d')
    return max(0, (to_date - from_date).days)

In [147]:
# Convert pseudonumeric types (e.g. dates) to numeric...

master_data.host_since = master_data.host_since.apply(elapsed_days)

master_data.price = master_data.price.apply(
    lambda x : float(x.replace(',', '').replace('$', '')) if not pd.isna(x) 
    else x
)

master_data.host_response_rate = master_data.host_response_rate.apply(
    lambda x : float(x[:-1]) if not pd.isna(x) 
    else x
)

master_data.host_acceptance_rate = master_data.host_acceptance_rate.apply(
    lambda x : float(x[:-1]) if not pd.isna(x) 
    else x
)

master_data.host_verifications = master_data.host_verifications.apply(
    lambda x : len(x)
)

master_data.first_review = master_data.first_review.apply(
    lambda x : elapsed_days(x) if not pd.isna(x)
    else x
)

master_data.last_review = master_data.last_review.apply(
    lambda x : elapsed_days(x) if not pd.isna(x)
    else x
)

In [148]:
# Convert categorical (e.g. boolean) types to numeric...

master_data.host_response_time = master_data.host_response_time.map(
    lambda x : {'within an hour' : 1, 'within a few hours' : 2, 'within a day' : 3, 'a few days or more' : 4}.get(x, 0)
)

master_data.host_is_superhost = master_data.host_is_superhost.map(
    lambda x : {'t' : 1, 'f' : 0}.get(x, 0)
)

master_data.host_identity_verified = master_data.host_identity_verified.map(
    lambda x : {'t' : 1, 'f' : 0}.get(x, 0)
)

master_data.room_type = master_data.room_type.map(
    lambda x : {'Entire home/apt' : 1, 'Private room' : 2, 'Hotel room' : 3, 'Shared room' : 4}.get(x, 0)
)

master_data.instant_bookable = master_data.instant_bookable.map(
    lambda x : {'t' : 1, 'f' : 0}.get(x, 0)
)

master_data.host_has_profile_pic = master_data.host_has_profile_pic.map(
    lambda x : {'t' : 1, 'f' : 0}.get(x, 0)
)

master_data.has_availability = master_data.has_availability.map(
    lambda x : {'t' : 1, 'f' : 0}.get(x, 0)
)

## Handle Missing Values

In [149]:
# Count the number of missing values in each feature.
missing_values = master_data.isna().sum().sort_values(ascending=False)

# Drop features with majority missing values.
threshold = master_data.shape[0] // 2
master_data.drop(columns=missing_values[missing_values > threshold].index, inplace=True, axis=1)
print('dropped:', missing_values[missing_values > threshold].index)

dropped: Index(['bathrooms', 'neighbourhood_group_cleansed', 'license',
       'host_neighbourhood'],
      dtype='object')


In [150]:
# Drop rows with missing a succuess score.
print('Number of dropped rows (no success score):', len(master_data[master_data.success_score.isna()]))
master_data = master_data[master_data.success_score.notna()]

Number of dropped rows (no success score): 669


In [151]:
# Drop rows missing image label extractions.
print('Number of dropped rows (no image labels):', len(master_data[master_data.images_keywords.isna()]))
master_data = master_data[master_data.images_keywords.notna()]

Number of dropped rows (no image labels): 2091


In [152]:
master_data.columns

Index(['id', 'host_id', 'host_since', 'host_response_time',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_listings_count', 'host_total_listings_count',
       'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
       'latitude', 'longitude', 'room_type', 'accommodates', 'bedrooms',
       'beds', 'price', 'minimum_nights', 'maximum_nights',
       'minimum_minimum_nights', 'maximum_minimum_nights',
       'minimum_maximum_nights', 'maximum_maximum_nights',
       'maximum_nights_avg_ntm', 'has_availability', 'availability_30',
       'availability_60', 'availability_90', 'availability_365',
       'number_of_reviews', 'number_of_reviews_l30d', 'first_review',
       'last_review', 'instant_bookable', 'calculated_host_listings_count',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms', 'reviews_per_month',
       'title_neg', 'title_neu', 'title_pos', 'title_compound',


In [153]:
# Replace missing sentiment analyses with 0 scores.
master_data.title_neg = master_data.title_neg.fillna(0)
master_data.title_neu = master_data.title_neu.fillna(0)
master_data.title_pos = master_data.title_pos.fillna(0)
master_data.title_compound = master_data.title_compound.fillna(0)
master_data.description_neg = master_data.description_neg.fillna(0)
master_data.description_neu = master_data.description_neu.fillna(0)
master_data.description_pos = master_data.description_pos.fillna(0)
master_data.description_compound = master_data.description_compound.fillna(0)
master_data.neighborhood_overview_neg = master_data.neighborhood_overview_neg.fillna(0)
master_data.neighborhood_overview_neu = master_data.neighborhood_overview_neu.fillna(0)
master_data.neighborhood_overview_pos = master_data.neighborhood_overview_pos.fillna(0)
master_data.neighborhood_overview_compound = master_data.neighborhood_overview_compound.fillna(0)
master_data.host_about_neg = master_data.host_about_neg.fillna(0)
master_data.host_about_neu = master_data.host_about_neu.fillna(0)
master_data.host_about_pos = master_data.host_about_pos.fillna(0)
master_data.host_about_compound = master_data.host_about_compound.fillna(0)
master_data.perceived_review_neg = master_data.perceived_review_neg.fillna(0)
master_data.perceived_review_neu = master_data.perceived_review_neu.fillna(0)
master_data.perceived_review_pos = master_data.perceived_review_pos.fillna(0)
master_data.perceived_review_compound = master_data.perceived_review_compound.fillna(0)

# Replace missing host_response_rate, host_acceptance_rate with mean values.
master_data.host_response_rate = master_data.host_response_rate.fillna(master_data.host_response_rate.mean())
master_data.host_acceptance_rate = master_data.host_acceptance_rate.fillna(master_data.host_acceptance_rate.mean()) 

# Replace missing host_response_time with mode values.
master_data.host_response_time = master_data.host_response_time.fillna(master_data.host_response_time.mode().values[0])

# Replace missing bedroom and beds with mean values.
master_data.bedrooms = master_data.bedrooms.fillna(master_data.bedrooms.mean())
master_data.beds = master_data.beds.fillna(master_data.beds.mean())

# Remove all remaining rows with missing values.
master_data = master_data.dropna()

print("Remaining data examples:", len(master_data))

Remaining data examples: 4629


## Create New Features

In [154]:
# Calcualte the success of the image keywords.
keyword_success = create_data.mean_keyword_scores(master_data[['id', 'success_score']], 'datasets/image_keywords_edinburgh.csv')

In [155]:
# For each listing, compute the mean and standard deviation in the perceived success of image keywords.
weighted_image_score_mean = []
weighted_image_score_max = []
weighted_image_score_min = []
weighted_image_score_std = []

for i in range(len(master_data.index)):
    entry = master_data.iloc[i]

    keywords = ast.literal_eval(entry.images_keywords)
    confidences = ast.literal_eval(entry.images_confidences)

    scores = []
    for j in range(len(keywords)):
        scores.append(keyword_success[keywords[j]] * (confidences[j] / 100)) 

    weighted_image_score_mean.append(np.mean(scores))
    weighted_image_score_max.append(max(scores))
    weighted_image_score_min.append(min(scores))
    weighted_image_score_std.append(np.std(scores))

master_data['weighted_image_score_mean'] = weighted_image_score_mean
master_data['weighted_image_score_max'] = weighted_image_score_max
master_data['weighted_image_score_min'] = weighted_image_score_min
master_data['weighted_image_score_std'] = weighted_image_score_std

## Handle Skewness

In [156]:
# Handle skewness (to a maximum point).
max_handling_steps = 5

for i in range(max_handling_steps):
    # Calculate the skewness of the features.
    skewness = master_data.skew().sort_values(ascending=False)

    # Deal with positive skewness by performing a square root transformation.
    for skew_feature in skewness[skewness > 1].index:
        master_data[skew_feature] = np.power(master_data[skew_feature], 1/2)

    # Deal with negative skewness by performing a square transformation.
    for skew_feature in skewness[skewness < -1].index:
        master_data[skew_feature] = np.power(master_data[skew_feature], 2)
    
print("Final skewness...")
skewness = master_data.skew().sort_values(ascending=False)
print(skewness[abs(skewness) > 1])

Final skewness...
title_neg                         12.307575
weighted_image_score_max           6.019608
minimum_minimum_nights             3.039192
minimum_nights                     2.963458
host_about_neg                     2.685665
maximum_minimum_nights             2.218628
neighborhood_overview_neg          2.104362
host_listings_count                1.780214
host_total_listings_count          1.738642
calculated_host_listings_count     1.639883
longitude                          1.588900
host_identity_verified            -2.619935
host_has_profile_pic              -8.690194
has_availability                 -33.985281
dtype: float64


## Normalisation

In [157]:
from sklearn.preprocessing import MinMaxScaler

# Create the scaler object
scaler = MinMaxScaler()

# Normalise the data
data_to_normalise = master_data.drop(columns=['images_keywords', 'images_confidences'], axis=1)
normalised_data = pd.DataFrame(scaler.fit_transform(data_to_normalise), columns=data_to_normalise.columns)

In [158]:
# Remove introduded missing values (?).
normalised_data = normalised_data.dropna()

print("Remaining (normalised) data examples:", len(normalised_data))

Remaining (normalised) data examples: 4629


## PCA Feature Extraction

In [159]:
normalised_data.columns

Index(['id', 'host_id', 'host_since', 'host_response_time',
       'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
       'host_listings_count', 'host_total_listings_count',
       'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
       'latitude', 'longitude', 'room_type', 'accommodates', 'bedrooms',
       'beds', 'price', 'minimum_nights', 'maximum_nights',
       'minimum_minimum_nights', 'maximum_minimum_nights',
       'minimum_maximum_nights', 'maximum_maximum_nights',
       'maximum_nights_avg_ntm', 'has_availability', 'availability_30',
       'availability_60', 'availability_90', 'availability_365',
       'number_of_reviews', 'number_of_reviews_l30d', 'first_review',
       'last_review', 'instant_bookable', 'calculated_host_listings_count',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms', 'reviews_per_month',
       'title_neg', 'title_neu', 'title_pos', 'title_compound',


In [160]:
# Create structured-only, unstructured-only, and hybrid datasets
X_struct = normalised_data[[
    'host_since', 'host_response_time',
    'host_response_rate', 'host_acceptance_rate', 'host_is_superhost',
    'host_listings_count', 'host_total_listings_count',
    'host_verifications', 'host_has_profile_pic', 'host_identity_verified',
    'latitude', 'longitude', 'room_type', 'accommodates', 'bedrooms',
    'beds', 'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
    'maximum_minimum_nights', 'minimum_maximum_nights',
    'maximum_maximum_nights', 'maximum_nights_avg_ntm', 'has_availability',
    'availability_30', 'availability_60', 'availability_90',
    'availability_365', 'number_of_reviews', 'number_of_reviews_l30d',
    'first_review', 'last_review', 'instant_bookable',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'calculated_host_listings_count_private_rooms', 'reviews_per_month'
]]

X_unstruct = normalised_data[[
    'title_neg', 'title_neu', 'title_pos', 'title_compound',
    'description_neg', 'description_neu', 'description_pos',
    'description_compound', 'neighborhood_overview_neg',
    'neighborhood_overview_neu', 'neighborhood_overview_pos',
    'neighborhood_overview_compound', 'host_about_neg', 'host_about_neu',
    'host_about_pos', 'host_about_compound', 'perceived_review_neg',
    'perceived_review_neu', 'perceived_review_pos',
    'perceived_review_compound',
    'weighted_image_score_mean', 'weighted_image_score_max',
    'weighted_image_score_min', 'weighted_image_score_std'
]]

X_hybrid = normalised_data.drop(columns=['id', 'host_id', 'success_score', 'price'], axis=1)

In [161]:
# Seperate succes scores from the rest of the data.
# CHANGE THIS TO BE EITHER SUCCESS SCORE OR PRICE, DEPENDING ON WHAT YOU WANT.
y = normalised_data.success_score

In [162]:
from sklearn.decomposition import PCA

In [163]:
# Conduct PCA on each dataset...

# Reduce the dimensionality as much as possible while retaining some threshold variance explanation.
threshold_explained_variance = 0.9

struct_pca = PCA()
X_struct_pca = struct_pca.fit_transform(X=X_struct, y=y)

for i in range(len(struct_pca.components_)):
    if sum(struct_pca.explained_variance_ratio_[:i]) > threshold_explained_variance:
        break

X_struct_pca = pd.DataFrame(X_struct_pca).iloc[:,:i]

unstruct_pca = PCA()
X_unstruct_pca = unstruct_pca.fit_transform(X=X_unstruct, y=y)


for i in range(len(unstruct_pca.components_)):
    if sum(unstruct_pca.explained_variance_ratio_[:i]) > threshold_explained_variance:
        break

X_unstruct_pca = pd.DataFrame(X_unstruct_pca).iloc[:,:i]

hybrid_pca = PCA()
X_hybrid_pca = hybrid_pca.fit_transform(X=X_hybrid, y=y)

for i in range(len(hybrid_pca.components_)):
    if sum(hybrid_pca.explained_variance_ratio_[:i]) > threshold_explained_variance:
        break

X_hybrid_pca = pd.DataFrame(X_hybrid_pca).iloc[:,:i]

# Model Experimentation

In [164]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [165]:
# Train-test split for each dataset.
X_struct_train, X_struct_test, y_struct_train, y_struct_test = train_test_split(X_struct_pca, y, test_size=0.2, random_state=47)
X_unstruct_train, X_unstruct_test, y_unstruct_train, y_unstruct_test = train_test_split(X_unstruct_pca, y, test_size=0.2, random_state=47)
X_hybrid_train, X_hybrid_test, y_hybrid_train, y_hybrid_test = train_test_split(X_hybrid_pca, y, test_size=0.2, random_state=47)

In [166]:
# Linear Regression...
lin_reg_struct = LinearRegression().fit(X_struct_train, y_struct_train)
y_struct_pred = lin_reg_struct.predict(X_struct_test)
struct_mae = mean_squared_error(y_true=y_struct_test, y_pred=y_struct_pred) 
struct_mse = mean_absolute_error(y_true=y_struct_test, y_pred=y_struct_pred)
struct_r2 = r2_score(y_true=y_struct_test, y_pred=y_struct_pred)

lin_reg_unstruct = LinearRegression().fit(X_unstruct_train, y_unstruct_train)
y_unstruct_pred = lin_reg_unstruct.predict(X_unstruct_test)
unstruct_mae = mean_squared_error(y_true=y_unstruct_test, y_pred=y_unstruct_pred) 
unstruct_mse = mean_absolute_error(y_true=y_unstruct_test, y_pred=y_unstruct_pred)
unstruct_r2 = r2_score(y_true=y_unstruct_test, y_pred=y_unstruct_pred)

lin_reg_hybrid = LinearRegression().fit(X_hybrid_train, y_hybrid_train)
y_hybrid_pred = lin_reg_hybrid.predict(X_hybrid_test)
hybrid_mae = mean_squared_error(y_true=y_hybrid_test, y_pred=y_hybrid_pred) 
hybrid_mse = mean_absolute_error(y_true=y_struct_test, y_pred=y_hybrid_pred)
hybrid_r2 = r2_score(y_true=y_hybrid_test, y_pred=y_hybrid_pred)

print('Linear Regression...')
print('Structured: MAE = {}, MSE = {}, R2 Score = {}'.format(round(struct_mae, 5), round(struct_mse, 5), round(struct_r2, 5)))
print('Unstructured: MAE = {}, MSE = {}, R2 Score = {}'.format(round(unstruct_mae, 5), round(unstruct_mse, 5), round(unstruct_r2, 5)))
print('Hybrid: MAE = {}, MSE = {}, R2 Score = {}'.format(round(hybrid_mae, 5), round(hybrid_mse, 5), round(hybrid_r2, 5)))

Linear Regression...
Structured: MAE = 0.00468, MSE = 0.04907, R2 Score = 0.41896
Unstructured: MAE = 0.00794, MSE = 0.06546, R2 Score = 0.01284
Hybrid: MAE = 0.00421, MSE = 0.04533, R2 Score = 0.47628


In [167]:
# Support Vector Regression (SVR)...
svr_struct = SVR().fit(X_struct_train, y_struct_train)
y_struct_pred = svr_struct.predict(X_struct_test)
struct_mae = mean_squared_error(y_true=y_struct_test, y_pred=y_struct_pred) 
struct_mse = mean_absolute_error(y_true=y_struct_test, y_pred=y_struct_pred)
struct_r2 = r2_score(y_true=y_struct_test, y_pred=y_struct_pred)

svr_unstruct = SVR().fit(X_unstruct_train, y_unstruct_train)
y_unstruct_pred = svr_unstruct.predict(X_unstruct_test)
unstruct_mae = mean_squared_error(y_true=y_unstruct_test, y_pred=y_unstruct_pred) 
unstruct_mse = mean_absolute_error(y_true=y_unstruct_test, y_pred=y_unstruct_pred)
unstruct_r2 = r2_score(y_true=y_unstruct_test, y_pred=y_unstruct_pred)

svr_hybrid = SVR().fit(X_hybrid_train, y_hybrid_train)
y_hybrid_pred = svr_hybrid.predict(X_hybrid_test)
hybrid_mae = mean_squared_error(y_true=y_hybrid_test, y_pred=y_hybrid_pred) 
hybrid_mse = mean_absolute_error(y_true=y_struct_test, y_pred=y_hybrid_pred)
hybrid_r2 = r2_score(y_true=y_hybrid_test, y_pred=y_hybrid_pred)

print('Support Vector Regression (SVR)...')
print('Structured: MAE = {}, MSE = {}, R2 Score = {}'.format(round(struct_mae, 5), round(struct_mse, 5), round(struct_r2, 5)))
print('Unstructured: MAE = {}, MSE = {}, R2 Score = {}'.format(round(unstruct_mae, 5), round(unstruct_mse, 5), round(unstruct_r2, 5)))
print('Hybrid: MAE = {}, MSE = {}, R2 Score = {}'.format(round(hybrid_mae, 5), round(hybrid_mse, 5), round(hybrid_r2, 5)))

Support Vector Regression (SVR)...
Structured: MAE = 0.00496, MSE = 0.05358, R2 Score = 0.38333
Unstructured: MAE = 0.00844, MSE = 0.06811, R2 Score = -0.04837
Hybrid: MAE = 0.00499, MSE = 0.05371, R2 Score = 0.37994


In [172]:
# Multi-Layer Perceptron (MLP)...
mlp_struct = MLPRegressor().fit(X_struct_train, y_struct_train)
y_struct_pred = mlp_struct.predict(X_struct_test)
struct_mae = mean_squared_error(y_true=y_struct_test, y_pred=y_struct_pred) 
struct_mse = mean_absolute_error(y_true=y_struct_test, y_pred=y_struct_pred)
struct_r2 = r2_score(y_true=y_struct_test, y_pred=y_struct_pred)

mlp_unstruct = MLPRegressor().fit(X_unstruct_train, y_unstruct_train)
y_unstruct_pred = mlp_unstruct.predict(X_unstruct_test)
unstruct_mae = mean_squared_error(y_true=y_unstruct_test, y_pred=y_unstruct_pred) 
unstruct_mse = mean_absolute_error(y_true=y_unstruct_test, y_pred=y_unstruct_pred)
unstruct_r2 = r2_score(y_true=y_unstruct_test, y_pred=y_unstruct_pred)

mlp_hybrid = MLPRegressor().fit(X_hybrid_train, y_hybrid_train)
y_hybrid_pred = mlp_hybrid.predict(X_hybrid_test)
hybrid_mae = mean_squared_error(y_true=y_hybrid_test, y_pred=y_hybrid_pred) 
hybrid_mse = mean_absolute_error(y_true=y_struct_test, y_pred=y_hybrid_pred)
hybrid_r2 = r2_score(y_true=y_hybrid_test, y_pred=y_hybrid_pred)

print('Multi-Layer Perceptron (MLP) Regression...')
print('Structured: MAE = {}, MSE = {}, R2 Score = {}'.format(round(struct_mae, 5), round(struct_mse, 5), round(struct_r2, 5)))
print('Unstructured: MAE = {}, MSE = {}, R2 Score = {}'.format(round(unstruct_mae, 5), round(unstruct_mse, 5), round(unstruct_r2, 5)))
print('Hybrid: MAE = {}, MSE = {}, R2 Score = {}'.format(round(hybrid_mae, 5), round(hybrid_mse, 5), round(hybrid_r2, 5)))

Multi-Layer Perceptron (MLP) Regression...
Structured: MAE = 0.00473, MSE = 0.04971, R2 Score = 0.41259
Unstructured: MAE = 0.00803, MSE = 0.06596, R2 Score = 0.00233
Hybrid: MAE = 0.00499, MSE = 0.05327, R2 Score = 0.38031


In [173]:
# Gaussian Process Regression...
gauss_struct = GaussianProcessRegressor().fit(X_struct_train, y_struct_train)
y_struct_pred = gauss_struct.predict(X_struct_test)
struct_mae = mean_squared_error(y_true=y_struct_test, y_pred=y_struct_pred) 
struct_mse = mean_absolute_error(y_true=y_struct_test, y_pred=y_struct_pred)
struct_r2 = r2_score(y_true=y_struct_test, y_pred=y_struct_pred)

gauss_unstruct = GaussianProcessRegressor().fit(X_unstruct_train, y_unstruct_train)
y_unstruct_pred = gauss_unstruct.predict(X_unstruct_test)
unstruct_mae = mean_squared_error(y_true=y_unstruct_test, y_pred=y_unstruct_pred) 
unstruct_mse = mean_absolute_error(y_true=y_unstruct_test, y_pred=y_unstruct_pred)
unstruct_r2 = r2_score(y_true=y_unstruct_test, y_pred=y_unstruct_pred)

gauss_hybrid = GaussianProcessRegressor().fit(X_hybrid_train, y_hybrid_train)
y_hybrid_pred = gauss_hybrid.predict(X_hybrid_test)
hybrid_mae = mean_squared_error(y_true=y_hybrid_test, y_pred=y_hybrid_pred) 
hybrid_mse = mean_absolute_error(y_true=y_struct_test, y_pred=y_hybrid_pred)
hybrid_r2 = r2_score(y_true=y_hybrid_test, y_pred=y_hybrid_pred)

print('Gaussian Process Regression...')
print('Structured: MAE = {}, MSE = {}, R2 Score = {}'.format(round(struct_mae, 5), round(struct_mse, 5), round(struct_r2, 5)))
print('Unstructured: MAE = {}, MSE = {}, R2 Score = {}'.format(round(unstruct_mae, 5), round(unstruct_mse, 5), round(unstruct_r2, 5)))
print('Hybrid: MAE = {}, MSE = {}, R2 Score = {}'.format(round(hybrid_mae, 5), round(hybrid_mse, 5), round(hybrid_r2, 5)))

Gaussian Process Regression...
Structured: MAE = 0.12942, MSE = 0.20384, R2 Score = -15.08494
Unstructured: MAE = 19.56933, MSE = 1.58762, R2 Score = -2431.11164
Hybrid: MAE = 0.00567, MSE = 0.05642, R2 Score = 0.29515


In [188]:
# Decision Tree Regression...
dt_struct = DecisionTreeRegressor().fit(X_struct_train, y_struct_train)
y_struct_pred = dt_struct.predict(X_struct_test)
struct_mae = mean_squared_error(y_true=y_struct_test, y_pred=y_struct_pred) 
struct_mse = mean_absolute_error(y_true=y_struct_test, y_pred=y_struct_pred)
struct_r2 = r2_score(y_true=y_struct_test, y_pred=y_struct_pred)

dt_unstruct = DecisionTreeRegressor().fit(X_unstruct_train, y_unstruct_train)
y_unstruct_pred = dt_unstruct.predict(X_unstruct_test)
unstruct_mae = mean_squared_error(y_true=y_unstruct_test, y_pred=y_unstruct_pred) 
unstruct_mse = mean_absolute_error(y_true=y_unstruct_test, y_pred=y_unstruct_pred)
unstruct_r2 = r2_score(y_true=y_unstruct_test, y_pred=y_unstruct_pred)

dt_hybrid = DecisionTreeRegressor().fit(X_hybrid_train, y_hybrid_train)
y_hybrid_pred = dt_hybrid.predict(X_hybrid_test)
hybrid_mae = mean_squared_error(y_true=y_hybrid_test, y_pred=y_hybrid_pred) 
hybrid_mse = mean_absolute_error(y_true=y_struct_test, y_pred=y_hybrid_pred)
hybrid_r2 = r2_score(y_true=y_hybrid_test, y_pred=y_hybrid_pred)

print('Decision Tree Regression...')
print('Structured: MAE = {}, MSE = {}, R2 Score = {}'.format(round(struct_mae, 5), round(struct_mse, 5), round(struct_r2, 5)))
print('Unstructured: MAE = {}, MSE = {}, R2 Score = {}'.format(round(unstruct_mae, 5), round(unstruct_mse, 5), round(unstruct_r2, 5)))
print('Hybrid: MAE = {}, MSE = {}, R2 Score = {}'.format(round(hybrid_mae, 5), round(hybrid_mse, 5), round(hybrid_r2, 5)))

Decision Tree Regression...
Structured: MAE = 0.00862, MSE = 0.06398, R2 Score = -0.0715
Unstructured: MAE = 0.01823, MSE = 0.098, R2 Score = -1.2653
Hybrid: MAE = 0.0089, MSE = 0.06612, R2 Score = -0.10659


In [189]:
# Random Forest Regression...
rf_struct = RandomForestRegressor().fit(X_struct_train, y_struct_train)
y_struct_pred = rf_struct.predict(X_struct_test)
struct_mae = mean_squared_error(y_true=y_struct_test, y_pred=y_struct_pred) 
struct_mse = mean_absolute_error(y_true=y_struct_test, y_pred=y_struct_pred)
struct_r2 = r2_score(y_true=y_struct_test, y_pred=y_struct_pred)

rf_unstruct = RandomForestRegressor().fit(X_unstruct_train, y_unstruct_train)
y_unstruct_pred = rf_unstruct.predict(X_unstruct_test)
unstruct_mae = mean_squared_error(y_true=y_unstruct_test, y_pred=y_unstruct_pred) 
unstruct_mse = mean_absolute_error(y_true=y_unstruct_test, y_pred=y_unstruct_pred)
unstruct_r2 = r2_score(y_true=y_unstruct_test, y_pred=y_unstruct_pred)

rf_hybrid = RandomForestRegressor().fit(X_hybrid_train, y_hybrid_train)
y_hybrid_pred = rf_hybrid.predict(X_hybrid_test)
hybrid_mae = mean_squared_error(y_true=y_hybrid_test, y_pred=y_hybrid_pred) 
hybrid_mse = mean_absolute_error(y_true=y_struct_test, y_pred=y_hybrid_pred)
hybrid_r2 = r2_score(y_true=y_hybrid_test, y_pred=y_hybrid_pred)

print('Random Forest Regression...')
print('Structured: MAE = {}, MSE = {}, R2 Score = {}'.format(round(struct_mae, 5), round(struct_mse, 5), round(struct_r2, 5)))
print('Unstructured: MAE = {}, MSE = {}, R2 Score = {}'.format(round(unstruct_mae, 5), round(unstruct_mse, 5), round(unstruct_r2, 5)))
print('Hybrid: MAE = {}, MSE = {}, R2 Score = {}'.format(round(hybrid_mae, 5), round(hybrid_mse, 5), round(hybrid_r2, 5)))

Random Forest Regression...
Structured: MAE = 0.00391, MSE = 0.04531, R2 Score = 0.51353
Unstructured: MAE = 0.00834, MSE = 0.06686, R2 Score = -0.03603
Hybrid: MAE = 0.00397, MSE = 0.04541, R2 Score = 0.50676
