In [12]:
# Standard scientific Python imports
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns # for visualisation
import numpy as np
from scipy.stats import uniform, randint

from sklearn.preprocessing import StandardScaler, MinMaxScaler
# from sklearn.decomposition import PCA
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import random
random.seed(15)

# Default plotting parameters
font = {'size'   : 18}
plt.rc('font', **font)

In [2]:
# Load training dataset
base_dir = "/Users/Cherry0904/desktop/ArtWorldInsights/ML_modelling/" 
Xy = pd.read_csv(base_dir + 'all_data.csv', squeeze = True)

y = Xy[['logprice']]
X_cts = Xy[['database', 'medium', 'dimensions', 'Followers Per Post (FPP)', 'Instagram performance', 'ArtfactsPresence', 'InsPresence', 'WebsitePresence']]
X = Xy[['database', 'medium', 'dimensions', 'Followers Per Post (FPP)', 'Instagram performance', 'ArtfactsPresence', 'InsPresence', 'WebsitePresence']]
# print(Xy.columns)

# Create instance of one-hot-encoder
encoder = OneHotEncoder(handle_unknown='ignore')

# Perform one-hot encoding on the columns of categorical variables 
X_encoder_df1 = pd.DataFrame(encoder.fit_transform(X[['database']]).toarray())
X_encoder_df2 = pd.DataFrame(encoder.fit_transform(X[['medium']]).toarray())
# print(X_encoder_df1.columns)
X_encoder_df1.columns = ['artprice', 'artsper', 'degreeart', 'riseart', 'singulart']
X_encoder_df2.columns = ['drawing', 'painting', 'photo']
# print(X_encoder_df2)

# Merge one-hot encoded columns back with original DataFrame
X_final = X.join(X_encoder_df1)
X_final = X_final.join(X_encoder_df2)
X_final = X_final.drop(['database', 'medium'], axis=1)
# X_final.drop(['medium'], axis=1)
# print(X_final)

# Train-test split
X_tr, X_te, y_tr, y_te = train_test_split(X_final, y, test_size = 0.20 , random_state=15)

# Create version with them together
Xy_tr = pd.concat([X_tr, y_tr], axis = 1)

# Normalise according to training data
scaler = MinMaxScaler()
scaler.fit(X_tr)
X_tr_sc = scaler.transform(X_tr)
X_te_sc = scaler.transform(X_te)

# Futher split to Train-validation sets - 0.8, 0.1, 0.2
# X_train, X_val, y_train, y_val = train_test_split(X_tr, y_tr, test_size = 0.10 , random_state=250)

### Bagging (Bootstrap Aggregation) of Decision Trees

In [7]:
# Set up the bagging ensemble regressor 
from sklearn.ensemble import BaggingRegressor
regressor = BaggingRegressor()

# Grid search on the number of decision trees and the maximum tree depth
grid = {'n_estimators': [10, 25, 50, 100, 150, 200]} 

# Define search - maximise the defined regression metric
search = GridSearchCV(regressor, grid, scoring='neg_mean_squared_error', cv=6, n_jobs=-1)

# Perform the search
results = search.fit(X_tr_sc, y_tr)

# Summarize
print('Negative_mean_squared_error: %.3f' % results.best_score_)      
print('Config: %s' % results.best_params_)

  return f(*args, **kwargs)


Negative_mean_squared_error: -0.260
Config: {'n_estimators': 200}


### Random Forests


In [8]:
# Set up the random forest (RF) regressor 
# Compared to bagging, RF further decorrelates the decision trees by sampling subsets of features, thus reduce overfitting
# Since the number of features for this dataset is not large, bagging and RF give similar results
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(random_state = 0)

# Grid search on the number of decision trees 
grid = {'n_estimators': [10, 25, 50, 100, 150, 200]} #, 'max_depth': [2, 3, 4]} 

# Define search - maximise the defined regression metric
search = GridSearchCV(regressor, grid, scoring='neg_mean_squared_error', cv=6, n_jobs=-1)

# Perform the search
results = search.fit(X_tr_sc, y_tr)

# Summarize
print('Negative_mean_squared_error: %.3f' % results.best_score_)      
print('Config: %s' % results.best_params_)

  self.best_estimator_.fit(X, y, **fit_params)


Negative_mean_squared_error: -0.260
Config: {'n_estimators': 200}


### Boosting

##### AdaBoost

In [9]:
# boosting combines multiple weak learners into a strong learner

# AdaBoost - each time a new weak learner is built, each datapoint receives a new weight depending on how well it is classified
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
 
# Choosing Decision Tree with 1 level as the weak learner
DTR = DecisionTreeRegressor(max_depth = 1)
regressor = AdaBoostRegressor(base_estimator = DTR)

# The learning rate is the weight applied to each regressor at each boosting iteration
# A higher learning rate increases the contribution of each regressor
# There is a trade-off between the learning_rate and n_estimators parameters
grid = {'n_estimators': [10, 25, 50, 100, 150, 200], 'learning_rate':[0.01, 0.05, 0.1, 0.2]} 

# Define search - maximise the defined regression metric
search = GridSearchCV(regressor, grid, scoring='neg_mean_squared_error', cv=6, n_jobs=-1)

# Perform the search
results = search.fit(X_tr_sc, y_tr)

# Summarize
print('Negative_mean_squared_error: %.3f' % results.best_score_)      
print('Config: %s' % results.best_params_)

  return f(*args, **kwargs)


Negative_mean_squared_error: -0.766
Config: {'learning_rate': 0.1, 'n_estimators': 50}


##### Gradient Boosting

In [11]:
# gradient boosting - each new tree has parameters which are chosen to minimize the loss
# This optimization step is done with gradient descent
# That is, algorithms that optimize a cost function over function space by iteratively choosing a function (weak hypothesis) that points in the negative gradient direction
from sklearn.ensemble import GradientBoostingRegressor
regressor = GradientBoostingRegressor(verbose = 1)

grid = {'n_estimators': [100, 150, 200, 250, 300], 'max_depth': [2, 4, 6, 8, 10]} 

# Define search - maximise the defined regression metric
search = GridSearchCV(regressor, grid, scoring='neg_mean_squared_error', cv=6, n_jobs=-1)

# Perform the search
results = search.fit(X_tr_sc, y_tr)

# Summarize
print('Negative_mean_squared_error: %.3f' % results.best_score_)      
print('Config: %s' % results.best_params_)

  return f(*args, **kwargs)


      Iter       Train Loss   Remaining Time 
         1           1.0755           42.89s
         2           0.9504           43.13s
         3           0.8488           55.35s
         4           0.7650           51.72s
         5           0.6957           48.95s
         6           0.6394           46.86s
         7           0.5929           45.53s
         8           0.5536           44.33s
         9           0.5218           43.63s
        10           0.4928           42.78s
        20           0.3604           38.71s
        30           0.3123           37.70s
        40           0.2841           36.22s
        50           0.2678           35.02s
        60           0.2553           34.21s
        70           0.2439           33.54s
        80           0.2364           32.37s
        90           0.2308           30.79s
       100           0.2229           29.06s
       200           0.1738           14.03s
       300           0.1469            0.00s
Negative_

##### XGBoost

In [13]:
# XGBoost - Extreme Gradient Boosting
# XGBoost includes regression penalties in the boosting equation (like elastic net)
# It also leverages the structure of your hardware to speed up computing times (parallelization) and facilitate memory usage
import xgboost as xgb
xgb_model = xgb.XGBRegressor()

# Gamma is the minimum loss reduction required to make a further partition on a leaf node of the tree
# The larger gamma is, the more conservative the algorithm will be.
params = {
    "gamma": uniform(0, 0.5),
    "learning_rate": uniform(0.03, 0.3), # default 0.1 
    "max_depth": randint(2, 6), # default 3
    "n_estimators": randint(100, 200), # default 100
} 
# "subsample": uniform(0.6, 0.4)
# "colsample_bytree": uniform(0.7, 0.3),

# Define random search - maximise the defined regression metric
search = RandomizedSearchCV(xgb_model, param_distributions=params, random_state=42, n_iter=100, cv=6, verbose=1, n_jobs=1, return_train_score=True)

# Perform the search
results = search.fit(X_tr_sc, y_tr)

# Summarize
print('Negative_mean_squared_error: %.3f' % results.best_score_)      
print('Config: %s' % results.best_params_)


Fitting 6 folds for each of 100 candidates, totalling 600 fits
Negative_mean_squared_error: 0.740
Config: {'gamma': 0.22863258080686427, 'learning_rate': 0.2826069225035944, 'max_depth': 5, 'n_estimators': 193}
