##  Least Squares Regression - Ridge


In [1]:
# Standard scientific Python imports
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns # for visualisation
import numpy as np

from sklearn.preprocessing import StandardScaler, MinMaxScaler
# from sklearn.decomposition import PCA
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV

import random
random.seed(15)

# Default plotting parameters
font = {'size'   : 18}
plt.rc('font', **font)



In [2]:
# Load training dataset
base_dir = "/Users/Cherry0904/desktop/ArtWorldInsights/ML_modelling/" 
Xy = pd.read_csv(base_dir + 'all_data.csv', squeeze = True)

y = Xy[['logprice']]
X_cts = Xy[['database', 'medium', 'dimensions', 'Followers Per Post (FPP)', 'Instagram performance', 'ArtfactsPresence', 'InsPresence', 'WebsitePresence']]
X = Xy[['database', 'medium', 'dimensions', 'Followers Per Post (FPP)', 'Instagram performance', 'ArtfactsPresence', 'InsPresence', 'WebsitePresence']]
# print(Xy.columns)

# Create instance of one-hot-encoder
encoder = OneHotEncoder(handle_unknown='ignore')

# Perform one-hot encoding on the columns of categorical variables 
X_encoder_df1 = pd.DataFrame(encoder.fit_transform(X[['database']]).toarray())
X_encoder_df2 = pd.DataFrame(encoder.fit_transform(X[['medium']]).toarray())
# print(X_encoder_df1.columns)
X_encoder_df1.columns = ['artprice', 'artsper', 'degreeart', 'riseart', 'singulart']
X_encoder_df2.columns = ['drawing', 'painting', 'photo']
# print(X_encoder_df2)

# Merge one-hot encoded columns back with original DataFrame
X_final = X.join(X_encoder_df1)
X_final = X_final.join(X_encoder_df2)
X_final = X_final.drop(['database', 'medium'], axis=1)
# X_final.drop(['medium'], axis=1)
print(X_final)

# Train-test split
X_tr, X_te, y_tr, y_te = train_test_split(X_final, y, test_size = 0.20 , random_state=15)

# Create version with them together
Xy_tr = pd.concat([X_tr, y_tr], axis = 1)

# Normalise according to training data
scaler = MinMaxScaler()
scaler.fit(X_tr)
X_tr_sc = scaler.transform(X_tr)
X_te_sc = scaler.transform(X_te)

# Futher split to Train-validation sets - 0.8, 0.1, 0.2
# X_train, X_val, y_train, y_val = train_test_split(X_tr, y_tr, test_size = 0.10 , random_state=250)


       dimensions  Followers Per Post (FPP)  Instagram performance  \
0        613.2000                  0.163456                   7914   
1        630.0000                  0.163456                   7914   
2        630.0000                  0.163456                   7914   
3         63.0000                  0.000000                      0   
4          8.5000                  0.000000                      0   
...           ...                       ...                    ...   
87285  12173.0000                  0.000000                5327801   
87286  33389.0000                  0.000000                  27404   
87287  19587.0576                  0.000000                   1233   
87288  29929.0000                  0.000000                  27404   
87289  10006.4316                  1.845291                      0   

       ArtfactsPresence  InsPresence  WebsitePresence  artprice  artsper  \
0                     0            1                0       0.0      0.0   
1      

In [8]:
# Set up the least-squares regressor
clf = linear_model.Ridge(alpha = 0)
# Use 6-fold on the training set
# cv = RepeatedStratifiedKFold(n_splits=6, n_repeats=1, random_state=1)

# Grid Search    
# Define grid on alpha - the weight on L2 penalty term
grid = dict()
grid['alpha'] = (0.0001, 0.001, 0.01, np.arange(0, 5, 0.5))
# Define search - maximise the defined regression metric
search = GridSearchCV(clf, grid, scoring='neg_mean_squared_error', cv=6, n_jobs=-1)
# Perform the search
results = search.fit(X_tr_sc, y_tr)
# Summarize
print('Negative_mean_squared_error: %.3f' % results.best_score_)      
print('Config: %s' % results.best_params_)

Negative_mean_squared_error: -0.768
Config: {'alpha': 0.01}




In [9]:
# Final evaluation on the held-out test set
clf = linear_model.Ridge(alpha = 0.0001)
clf.fit(X_tr_sc, y_tr)
pred = clf.predict(X_te_sc)
rmse = np.sqrt(mean_squared_error(y_te, pred))

# print("Train R-square:", clf.score(X_tr, y_tr))
print("Test R-square:", clf.score(X_te_sc, y_te))
print("Test RMSE:", rmse)

Test R-square: 0.4292328073519055
Test RMSE: 0.8474990016270869
