In [None]:
# Standard scientific Python imports
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns # for visualisation
import numpy as np
from scipy.stats import uniform, randint

from sklearn.preprocessing import StandardScaler, MinMaxScaler
# from sklearn.decomposition import PCA
# from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import random
# Seed Python's built-in `random` module so its draws repeat between runs
random.seed(15)

# Global matplotlib font settings, applied to every figure created afterwards
font = dict(size=18)
plt.rc('font', **font)

In [None]:
# Load the full dataset (features and target live in the same CSV)
base_dir = "/Users/Cherry0904/desktop/ArtWorldInsights/ML_modelling/"
# NOTE: the `squeeze` keyword of read_csv was deprecated in pandas 1.4 and
# removed in pandas 2.0, where passing it raises TypeError. It only collapsed
# single-column results into a Series, which is never wanted here because
# several columns are selected from the frame below, so it is not passed.
Xy = pd.read_csv(base_dir + 'all_data.csv')

# Target, kept as a single-column DataFrame
y = Xy[['logprice']]

# Feature columns. 'database' and 'medium' are one-hot encoded further down;
# the remaining columns are passed through unchanged.
feature_columns = [
    'database',
    'medium',
    'dimensions',
    'Followers Per Post (FPP)',
    'Instagram performance',
    'ArtfactsPresence',
    'InsPresence',
    'WebsitePresence',
]
# X_cts and X are two independent selections of the same columns; both names
# are kept because later code may refer to either.
X_cts = Xy[feature_columns]
X = Xy[feature_columns]

# One-hot encode the two categorical feature columns ('database', 'medium')
encoder = OneHotEncoder(handle_unknown='ignore')

# The encoder returns a plain positional array. DataFrame.join aligns on index
# labels, so X's index is attached explicitly: with a fresh 0..n-1 index the
# join would silently misalign rows (or introduce NaNs) whenever X's index is
# not the default range, e.g. after rows have been filtered out.
# The column labels are hard-coded, so a different number of categories in the
# data raises ValueError here.
# NOTE(review): the labels assume the encoder's category order matches this
# list -- confirm against encoder.categories_.
X_encoder_df1 = pd.DataFrame(
    encoder.fit_transform(X[['database']]).toarray(),
    index=X.index,
    columns=['artprice', 'artsper', 'degreeart', 'riseart', 'singulart'],
)
# The same encoder instance is re-fitted, so afterwards `encoder` only holds
# the categories of 'medium'.
X_encoder_df2 = pd.DataFrame(
    encoder.fit_transform(X[['medium']]).toarray(),
    index=X.index,
    columns=['drawing', 'painting', 'photo'],
)

# Append the indicator columns to the features, then drop the raw categoricals
X_final = X.join(X_encoder_df1).join(X_encoder_df2)
X_final = X_final.drop(['database', 'medium'], axis=1)

# Hold out 20% of the rows as a test set (fixed random_state for repeatability)
X_tr, X_te, y_tr, y_te = train_test_split(
    X_final,
    y,
    test_size=0.20,
    random_state=15,
)

# Training features and target side by side in one frame
Xy_tr = pd.concat((X_tr, y_tr), axis=1)

# Min-max scaling: the ranges are learned from the training rows only and the
# same fitted scaler is then applied to the test rows
scaler = MinMaxScaler()
X_tr_sc = scaler.fit_transform(X_tr)
X_te_sc = scaler.transform(X_te)

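# Further split into train/validation sets (currently disabled). Taking 10% of
# the 80% training portion would give roughly 72% / 8% / 20% train/val/test.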
# X_train, X_val, y_train, y_val = train_test_split(X_tr, y_tr, test_size = 0.10 , random_state=250)