Resources:

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from clean_images import ImageClense
from clean_tabular import TabularCleanse
from PIL import Image
import os
from numpy import asarray
from PIL import UnidentifiedImageError

tabular_cleanse = TabularCleanse('fb_marketplace_conn.json', 'aicore2022!')
links =  pd.read_csv('/home/adamw/Documents/AiCore/fb_marketplace/images_fb/Links.csv')
img_tabular = tabular_cleanse.get_data_table('images')
img_tabular.drop(['bucket_link',	'image_ref', 'create_time'], axis=1, inplace=True)
directory = os.path.join(os.getcwd(),'images_fb/clean_images/original/')
img_list = []
file_names = []
for file in os.listdir(directory):
    try:
        img = Image.open(os.path.join(directory, file))
        img = asarray(img)
    except UnidentifiedImageError:
        img = None
    img_list.append(img)
    file_name = file.replace('.jpg', '')
    file_names.append(file_name)
img_dict = {
    'id': file_names,
    'img_array': img_list
}
img_data = pd.DataFrame(img_dict, columns=['id', 'img_array'])
img_data =  img_data.dropna(subset=['img_array'])
img_data = pd.merge(img_data, img_tabular, how='left', on='id')
# 62 rows with images in zip file but not in image database
img_data = img_data.dropna(subset=['product_id'])



Prepare list of products to include in classficiation
###### This is based on the same data included in the tabular classification

In [2]:
products = pd.read_pickle('final_cleaned_products.pkl')
products = products.dropna(subset='long')
products = tabular_cleanse.remove_rows_conditonal(products, 'price_gbp', 1000.00, '<')
products.drop(['location'], axis=1, inplace=True)
# Drop the cat0 fields to test the model
# Consider that dummy encodign has been applied and therefore on of the clasesses is missing.
# https://machinelearningmastery.com/types-of-classification-in-machine-learning/
#https://machinelearningmastery.com/discrete-probability-distributions-for-machine-learning/
products = products[products.columns.drop(list(products.filter(regex='cat_1')))]
# This puts category appliances back in as it was removed when cleaning the data using dummy encoding. This was used because i did not want to run the geocoder again due to un time.
categories = list(products.filter(regex='cat_0'))
products['cat_0__Appliances'] = np.where(products[categories].sum(axis=1) == 0, 1, 0)
Applicances = products.pop('cat_0__Appliances')
products.insert(4,'cat_0__Appliances', Applicances)
# k-Nearest Neighbors.
# Decision Trees.
# Naive Bayes.
# Random Forest.
# Gradient Boosting.
products.sum()

id                                          243809c0-9cfc-4486-ad12-3b7a16605ba91c58d3f9-8...
price_gbp                                                                            736256.0
long                                                                            344014.976773
lat                                                                             -22414.651779
cat_0__Appliances                                                                         494
cat_0__Baby & Kids Stuff                                                                  392
cat_0__Clothes, Footwear & Accessories                                                    368
cat_0__Computers & Software                                                               547
cat_0__DIY Tools & Materials                                                              505
cat_0__Health & Beauty                                                                    566
cat_0__Home & Garden                                        

In [3]:
model_data = pd.merge(img_data, products, how='right', left_on='product_id', right_on='id', suffixes=('', '_dupe')).filter(regex='^(?!.*_dupe)')
# model_data.info()
model_data

Unnamed: 0,id,img_array,product_id,price_gbp,long,lat,cat_0__Appliances,cat_0__Baby & Kids Stuff,"cat_0__Clothes, Footwear & Accessories",cat_0__Computers & Software,cat_0__DIY Tools & Materials,cat_0__Health & Beauty,cat_0__Home & Garden,"cat_0__Music, Films, Books & Games",cat_0__Office Furniture & Equipment,cat_0__Other Goods,"cat_0__Phones, Mobile Phones & Telecoms","cat_0__Sports, Leisure & Travel",cat_0__Video Games & Consoles
0,64aa79f3-e9fa-417c-a332-714b8ce933f1,"[[[66, 56, 44], [47, 39, 28], [64, 57, 47], [5...",243809c0-9cfc-4486-ad12-3b7a16605ba9,5.0,51.453489,-1.031873,0,0,0,0,0,0,1,0,0,0,0,0,0
1,a864ee52-d91e-46e7-94d1-2418e9bb2877,"[[[255, 255, 255], [255, 255, 255], [255, 255,...",1c58d3f9-8b93-47ea-9415-204fcc2a22e6,20.0,57.479012,-4.225739,0,0,0,0,0,0,1,0,0,0,0,0,0
2,4e670f9e-7feb-458f-b529-ac52547abe2b,"[[[255, 255, 255], [255, 255, 255], [255, 255,...",1c58d3f9-8b93-47ea-9415-204fcc2a22e6,20.0,57.479012,-4.225739,0,0,0,0,0,0,1,0,0,0,0,0,0
3,bfe77c38-c9eb-47fb-b3d6-31ffdefb6ff9,"[[[209, 253, 254], [206, 255, 255], [193, 255,...",860673f1-57f6-47ba-8d2f-13f9e05b8f9a,20.0,53.150228,0.329093,0,0,0,0,0,0,1,0,0,0,0,0,0
4,a92e56b7-94fc-41b4-ba6c-f2f224f42bb8,"[[[181, 180, 178], [181, 180, 178], [181, 180,...",59948726-29be-4b35-ade5-bb2fd7331856,115.0,51.291949,-2.447623,0,0,0,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11716,99374787-d4b4-4264-980c-954123446777,"[[[204, 152, 165], [206, 154, 167], [205, 155,...",88d2d66b-2685-46b8-af84-f495fd2ccb14,380.0,53.450693,-2.994883,0,0,0,0,0,0,0,0,0,0,0,0,1
11717,bcc22d62-5285-4fbc-bbe0-d75259dbff71,"[[[14, 43, 113], [12, 40, 113], [9, 39, 112], ...",8ca91ce8-49e7-4746-b06c-ac838d94ef35,650.0,53.485152,-2.898906,0,0,0,0,0,0,0,0,0,0,0,0,1
11718,04722d18-acac-4d7a-8eec-6171c064d382,"[[[46, 89, 184], [46, 89, 184], [46, 89, 184],...",8ca91ce8-49e7-4746-b06c-ac838d94ef35,650.0,53.485152,-2.898906,0,0,0,0,0,0,0,0,0,0,0,0,1
11719,c05faddc-9ae7-4043-8b39-62ff1e3bfcfe,"[[[96, 93, 88], [89, 86, 81], [91, 88, 83], [1...",df8ef910-03cc-4c9e-97a9-7f0a7e838102,10.0,51.272337,-0.721647,0,0,0,0,0,0,0,0,0,0,0,0,1


In [8]:
flatten_list = []
for i in model_data['img_array']:
    if type(i) is np.ndarray:
        flatten_list.append(i.flatten())
    else:
        flatten_list.append(None)

model_data['flatten_img_array'] = flatten_list
model_data





Unnamed: 0,id,img_array,product_id,price_gbp,long,lat,cat_0__Appliances,cat_0__Baby & Kids Stuff,"cat_0__Clothes, Footwear & Accessories",cat_0__Computers & Software,cat_0__DIY Tools & Materials,cat_0__Health & Beauty,cat_0__Home & Garden,"cat_0__Music, Films, Books & Games",cat_0__Office Furniture & Equipment,cat_0__Other Goods,"cat_0__Phones, Mobile Phones & Telecoms","cat_0__Sports, Leisure & Travel",cat_0__Video Games & Consoles,flatten_img_array
0,64aa79f3-e9fa-417c-a332-714b8ce933f1,"[[[66, 56, 44], [47, 39, 28], [64, 57, 47], [5...",243809c0-9cfc-4486-ad12-3b7a16605ba9,5.0,51.453489,-1.031873,0,0,0,0,0,0,1,0,0,0,0,0,0,"[66, 56, 44, 47, 39, 28, 64, 57, 47, 51, 47, 3..."
1,a864ee52-d91e-46e7-94d1-2418e9bb2877,"[[[255, 255, 255], [255, 255, 255], [255, 255,...",1c58d3f9-8b93-47ea-9415-204fcc2a22e6,20.0,57.479012,-4.225739,0,0,0,0,0,0,1,0,0,0,0,0,0,"[255, 255, 255, 255, 255, 255, 255, 255, 255, ..."
2,4e670f9e-7feb-458f-b529-ac52547abe2b,"[[[255, 255, 255], [255, 255, 255], [255, 255,...",1c58d3f9-8b93-47ea-9415-204fcc2a22e6,20.0,57.479012,-4.225739,0,0,0,0,0,0,1,0,0,0,0,0,0,"[255, 255, 255, 255, 255, 255, 255, 255, 255, ..."
3,bfe77c38-c9eb-47fb-b3d6-31ffdefb6ff9,"[[[209, 253, 254], [206, 255, 255], [193, 255,...",860673f1-57f6-47ba-8d2f-13f9e05b8f9a,20.0,53.150228,0.329093,0,0,0,0,0,0,1,0,0,0,0,0,0,"[209, 253, 254, 206, 255, 255, 193, 255, 250, ..."
4,a92e56b7-94fc-41b4-ba6c-f2f224f42bb8,"[[[181, 180, 178], [181, 180, 178], [181, 180,...",59948726-29be-4b35-ade5-bb2fd7331856,115.0,51.291949,-2.447623,0,0,0,0,0,0,1,0,0,0,0,0,0,"[181, 180, 178, 181, 180, 178, 181, 180, 178, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11716,99374787-d4b4-4264-980c-954123446777,"[[[204, 152, 165], [206, 154, 167], [205, 155,...",88d2d66b-2685-46b8-af84-f495fd2ccb14,380.0,53.450693,-2.994883,0,0,0,0,0,0,0,0,0,0,0,0,1,"[204, 152, 165, 206, 154, 167, 205, 155, 166, ..."
11717,bcc22d62-5285-4fbc-bbe0-d75259dbff71,"[[[14, 43, 113], [12, 40, 113], [9, 39, 112], ...",8ca91ce8-49e7-4746-b06c-ac838d94ef35,650.0,53.485152,-2.898906,0,0,0,0,0,0,0,0,0,0,0,0,1,"[14, 43, 113, 12, 40, 113, 9, 39, 112, 6, 38, ..."
11718,04722d18-acac-4d7a-8eec-6171c064d382,"[[[46, 89, 184], [46, 89, 184], [46, 89, 184],...",8ca91ce8-49e7-4746-b06c-ac838d94ef35,650.0,53.485152,-2.898906,0,0,0,0,0,0,0,0,0,0,0,0,1,"[46, 89, 184, 46, 89, 184, 46, 89, 184, 47, 90..."
11719,c05faddc-9ae7-4043-8b39-62ff1e3bfcfe,"[[[96, 93, 88], [89, 86, 81], [91, 88, 83], [1...",df8ef910-03cc-4c9e-97a9-7f0a7e838102,10.0,51.272337,-0.721647,0,0,0,0,0,0,0,0,0,0,0,0,1,"[96, 93, 88, 89, 86, 81, 91, 88, 83, 102, 99, ..."


In [8]:
model_data = model_data.sample(frac=0.01, random_state=42)
model_data

Unnamed: 0,id,img_array,product_id,price_gbp,long,lat,cat_0__Appliances,cat_0__Baby & Kids Stuff,"cat_0__Clothes, Footwear & Accessories",cat_0__Computers & Software,cat_0__DIY Tools & Materials,cat_0__Health & Beauty,cat_0__Home & Garden,"cat_0__Music, Films, Books & Games",cat_0__Office Furniture & Equipment,cat_0__Other Goods,"cat_0__Phones, Mobile Phones & Telecoms","cat_0__Sports, Leisure & Travel",cat_0__Video Games & Consoles,flatten_img_array
6119,dddd4d95-bcfc-4550-8a2a-9560a9023d51,"[[[132, 138, 126], [138, 144, 132], [140, 146,...",49e5b00b-dffc-42cf-ad47-913c0aa16f81,20.0,51.06161,-2.754857,0,0,0,0,0,0,0,1,0,0,0,0,0,"[132, 138, 126, 138, 144, 132, 140, 146, 134, ..."
8178,c26d58d9-91d9-4112-9c35-b50b1bf67ce4,"[[[169, 71, 96], [169, 65, 92], [176, 61, 92],...",bcc8ee8c-3319-406f-ac64-d6c04c8bb2d0,4.0,51.347193,-2.977892,0,0,0,0,0,0,0,0,0,0,1,0,0,"[169, 71, 96, 169, 65, 92, 176, 61, 92, 188, 6..."
3698,56fa219f-92a1-4a0e-960b-72f5307d4319,"[[[202, 220, 234], [202, 220, 234], [202, 220,...",8f7fa73a-d738-4051-92e7-7711043c825b,650.0,51.783885,-1.485286,0,0,0,0,0,0,0,0,0,0,0,1,0,"[202, 220, 234, 202, 220, 234, 202, 220, 234, ..."
6923,10f341b5-2960-43d7-98fa-0ccf761a5687,"[[[100, 83, 67], [90, 73, 57], [83, 66, 50], [...",2a684561-dc86-459d-bc51-bc2425cb702c,0.01,52.057949,1.152635,0,0,0,0,0,0,0,1,0,0,0,0,0,"[100, 83, 67, 90, 73, 57, 83, 66, 50, 86, 69, ..."
10302,2011ff95-f9ad-4b19-9fc1-cb0ded1967af,"[[[193, 211, 197], [193, 211, 197], [193, 211,...",44a5c7f8-c9d5-471f-9d2e-47ee0dd62374,10.0,52.63614,-1.133079,0,0,0,0,0,0,0,0,1,0,0,0,0,"[193, 211, 197, 193, 211, 197, 193, 211, 197, ..."
5792,14c18b9d-ffc7-4d5d-870f-dccd8cdd4102,"[[[8, 9, 13], [36, 37, 41], [146, 147, 149], [...",90f7bb12-da35-4a9b-b15a-95488444d803,60.0,53.707437,-1.67545,1,0,0,0,0,0,0,0,0,0,0,0,0,"[8, 9, 13, 36, 37, 41, 146, 147, 149, 138, 138..."
764,68eabed7-fd6c-43da-ae39-d3fe218eff78,"[[[93, 90, 71], [95, 92, 73], [99, 96, 77], [1...",e391cb7a-5b7f-4f0a-a4db-b57c8ba0a4f2,30.0,55.856009,-4.243251,0,0,0,0,0,0,1,0,0,0,0,0,0,"[93, 90, 71, 95, 92, 73, 99, 96, 77, 102, 99, ..."
9083,334acc96-ed5a-43a5-8690-4ca801f66649,"[[[243, 167, 115], [245, 175, 126], [235, 172,...",98008111-abcf-4ca5-a588-04553b85bb92,375.0,51.536468,-3.140719,0,0,0,0,0,1,0,0,0,0,0,0,0,"[243, 167, 115, 245, 175, 126, 235, 172, 129, ..."
99,d965ba6a-4424-43b1-96e9-de6ad5095ca3,"[[[51, 46, 40], [61, 56, 50], [74, 69, 63], [8...",cedbf235-b6e4-4a54-98c6-6a02e6aeba52,50.0,51.474571,-0.092542,0,0,0,0,0,0,1,0,0,0,0,0,0,"[51, 46, 40, 61, 56, 50, 74, 69, 63, 86, 81, 7..."
6702,1bc4f786-90f8-4b94-b3fd-b129be2c08f0,"[[[126, 109, 89], [114, 97, 77], [112, 95, 75]...",773f152b-90f5-44b8-8491-340271db9458,2.0,51.544951,-0.481667,0,0,0,0,0,0,0,1,0,0,0,0,0,"[126, 109, 89, 114, 97, 77, 112, 95, 75, 113, ..."


In [23]:
from sklearn.preprocessing import LabelEncoder
# X = model_data[['flatten_img_array', 'price_gbp', 'long', 'lat']]
X = list(model_data['flatten_img_array'])
y = model_data.filter(regex='cat_0')
le = LabelEncoder()
y_labels = y.idxmax(axis=1)
label = list(le.fit_transform(y_labels))
y['category']= label
y = y.filter(regex='category')
y = list(y['category'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y['category']= label


142848

In [25]:
dimensions = set([len(i) for i in X])
print(dimensions)

TypeError: object of type 'NoneType' has no len()

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

log_reg = LogisticRegression(penalty='none', 
                         tol=0.1, solver='saga',
                         multi_class='multinomial')

In [5]:
from sklearn.preprocessing import StandardScaler
# create scaler
scaler = StandardScaler()
# fit scaler to train set - which means it identifies the values for scaling only on the training set and stores this in the scaler
scaler.fit(X_train)
# The scaler can now be applied to the train, validation and test sets which holds the info from the train set only
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [16]:
log_reg.fit(X_train, y_train)


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (9376,) + inhomogeneous part.

In [6]:
from sklearn.metrics import accuracy_score
from sklearn.svm import SVR
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV


lin = LinearRegression()
# svr = SVR()
lasso = Lasso(alpha=0.01, max_iter=100, selection='random')
# elas = ElasticNet()
dt = DecisionTreeRegressor(max_depth=5, max_features='auto', max_leaf_nodes=20,
                      min_samples_leaf=8, min_weight_fraction_leaf=0.1)
knn = KNeighborsRegressor(algorithm='kd_tree', leaf_size=10, n_neighbors=35, p=1)
gbr = GradientBoostingRegressor(learning_rate=0.01, max_depth=6, n_estimators=500,
                          subsample=0.5)
# sgd = SGDRegressor()
# ker = KernelRidge()
# bayes = BayesianRidge()

regressor_dict = {
    'Linear' : [lin],
    'Lasso' : [lasso],
    'DecisionTree' : [dt],
    'KNeighbours' : [knn],
    'GradientBoost' : [gbr],
}
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred.flatten()})
# print(df.head())
# print(np.mean(y_test))


# save model weights
# joblib.dump(lin_regression, "linear_regression.joblib")

Hyperparamater tuning

In [60]:
# create scaler
scaler = StandardScaler()
# fit scaler to train set - which means it identifies the values for scaling only on the training set and stores this in the scaler
scaler.fit(X_train)
# The scaler can now be applied to the train, validation and test sets which holds the info from the train set only
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## knn

In [None]:
from sklearn.model_selection import GridSearchCV

grid_knn = {
    'n_neighbors': [5, 35],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size': [10, 20, 30],
    'p': [1, 2]
    }

gs = GridSearchCV(
    knn, 
    grid_knn, 
    cv=5,
    verbose=2
    )

gs.fit(X_train, y_train)
gs.best_estimator_


## Lasso

In [None]:
# Must scale the independent varibles for a regularisation model like this.
# https://towardsdatascience.com/hyperparameter-tuning-in-lasso-and-ridge-regressions-70a4b158ae6d
# Consider using r2 
# alphas = np.linspace(0, 0.2, 21)
# print(alphas)

grid_lasso = {
    'alpha': [0.01],
    'max_iter': [10, 10, 30, 40, 50, 70, 90, 100],
    'selection': ['random'],
    }

gs = GridSearchCV(
    lasso, 
    grid_lasso, 
    cv=5,
    verbose=2
    )

gs.fit(X_train, y_train)
gs.best_estimator_

Decision tree

In [None]:
grid_dt = {
    "splitter":["best","random"],
    "max_depth" : [5, 10],
    "min_samples_leaf":[4, 5, 8, 9, 10],
    "min_weight_fraction_leaf":[0.1,0.2],
    "max_features":["auto"],
    "max_leaf_nodes":[20] 
    }

gs = GridSearchCV(
    dt, 
    grid_dt, 
    cv=5,
    verbose=2
    )

gs.fit(X_train, y_train)
gs.best_estimator_

GrdientBoost

In [58]:
grid_gbr = {
    'learning_rate': [0.01,0.02,0.03],
    'subsample'    : [0.5, 0.2, 0.1],
    'n_estimators' : [100,500],
    'max_depth'    : [4,6,8]
    }

gs = GridSearchCV(
    gbr, 
    grid_gbr, 
    cv=5,
    verbose=2
    )

gs.fit(X_train, y_train)
gs.best_estimator_

Fitting 5 folds for each of 54 candidates, totalling 270 fits
[CV] END learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.5; total time=   0.4s
[CV] END learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.5; total time=   0.4s
[CV] END learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.5; total time=   0.4s
[CV] END learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.5; total time=   0.4s
[CV] END learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.5; total time=   0.4s
[CV] END learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.2; total time=   0.2s
[CV] END learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.2; total time=   0.2s
[CV] END learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.2; total time=   0.2s
[CV] END learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.2; total time=   0.2s
[CV] END learning_rate=0.01, max_depth=4, n_estimators=100, subsample=0.2; total time=   0.2

GradientBoostingRegressor(learning_rate=0.01, max_depth=6, n_estimators=500,
                          subsample=0.5)

Create pipeline

In [8]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, RepeatedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
# split
# create scaler
# fit scaler to train set - which means it identifies the values for scaling only on the training set and stores this in the scaler
# The scaler can now be applied to the train, validation and test sets which holds the info from the train set only
# the above is peromes using scaler.transform([dataset_name])

# if using k fold validation
mse_scorer = make_scorer(mean_squared_error)
model = knn

pipe = Pipeline(
    steps= [
        ('scaler', StandardScaler()), # is this scaling the target variable too? Will this impact output and MSE
        ('model',  model)
        ]
)
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
# do i only put the train in or with cross validation it is OK to go with all
#failing is it beucse im parsign a df and not an matrix
scores = cross_val_score(pipe, X_train, y_train, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=1)
scores = abs(scores)
print(len(scores))
print(scores)
print(np.mean(scores))

30
[157.92115882 142.61368825 159.65134259 159.77340842 136.30888519
 160.17735183 140.61373045 162.4471424  157.9658604  146.27734027
 177.27935939 137.74270237 167.37114202 159.8628136  138.84743094
 145.24414759 156.85135054 129.89196302 167.82417354 137.28230797
 146.16891276 157.31902907 167.57458003 149.68944081 158.73580393
 160.93025928 138.7066297  159.72718543 137.7429765  148.27270195]
152.22716063526875


Run an assess models

No scaling or cross validation

In [61]:
for model in regressor_dict:
    print(f'Training the {model} model')
    regressor_dict[model][0].fit(X_train, y_train)
    y_pred = regressor_dict[model][0].predict(X_test)
    y_pred_train = regressor_dict[model][0].predict(X_train)
    print('Mean Absolute Error: \n Test', metrics.mean_absolute_error(y_test, y_pred) )
    # print('Train', metrics.mean_absolute_error(y_train, y_pred_train))  
    print('Mean Squared Error: \n Test', metrics.mean_squared_error(y_test, y_pred))  
    # print('Train', metrics.mean_squared_error(y_train, y_pred_train))
    print('Root Mean Squared Error: \n Test', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    # print('Train', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train)))
    df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred.flatten()})
    print(df.head(5))

Training the Linear model
Mean Absolute Error: 
 Test 102.14207390823681
Mean Squared Error: 
 Test 27123.34663594705
Root Mean Squared Error: 
 Test 164.6916714225314
      Actual   Predicted
4783    30.0   35.967773
4662     5.0   20.959961
4089     1.0   32.624023
4965   150.0  272.327148
4713    60.0  204.624023
Training the Lasso model
Mean Absolute Error: 
 Test 102.12771252843028
Mean Squared Error: 
 Test 27116.987185352456
Root Mean Squared Error: 
 Test 164.67236314983901
      Actual   Predicted
4783    30.0   31.170155
4662     5.0   20.292309
4089     1.0   32.049758
4965   150.0  272.863243
4713    60.0  204.276196
Training the DecisionTree model
Mean Absolute Error: 
 Test 121.42765891085537
Mean Squared Error: 
 Test 33687.90488564102
Root Mean Squared Error: 
 Test 183.5426514073528
      Actual   Predicted
4783    30.0   94.777509
4662     5.0  136.923947
4089     1.0  134.319266
4965   150.0  134.319266
4713    60.0  136.923947
Training the KNeighbours model


  model = cd_fast.enet_coordinate_descent(


Mean Absolute Error: 
 Test 103.36077246283128
Mean Squared Error: 
 Test 27248.064650912856
Root Mean Squared Error: 
 Test 165.06987808474585
      Actual   Predicted
4783    30.0   16.824571
4662     5.0  264.114286
4089     1.0   10.028571
4965   150.0  222.171429
4713    60.0  196.599429
Training the GradientBoost model
Mean Absolute Error: 
 Test 103.38019721959998
Mean Squared Error: 
 Test 27665.21872382287
Root Mean Squared Error: 
 Test 166.328646732374
      Actual   Predicted
4783    30.0   70.732218
4662     5.0   53.097352
4089     1.0   28.213481
4965   150.0  209.545570
4713    60.0  189.344754


Scaling without cross validation

In [34]:
# create scaler
scaler = StandardScaler()
# fit scaler to train set - which means it identifies the values for scaling only on the training set and stores this in the scaler
scaler.fit(X_train)
# The scaler can now be applied to the train, validation and test sets which holds the info from the train set only
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# the above is peromes using scaler.transform([dataset_name])
for model in regressor_dict:
    print(f'Training the {model} model')
    regressor_dict[model][0].fit(X_train, y_train)
    y_pred = regressor_dict[model][0].predict(X_test)
    y_pred_train = regressor_dict[model][0].predict(X_train)
    print('Mean Absolute Error: \n Test', metrics.mean_absolute_error(y_test, y_pred) )
    # print('Train', metrics.mean_absolute_error(y_train, y_pred_train))  
    print('Mean Squared Error: \n Test', metrics.mean_squared_error(y_test, y_pred))  
    # print('Train', metrics.mean_squared_error(y_train, y_pred_train))
    print('Root Mean Squared Error: \n Test', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
    # print('Train', np.sqrt(metrics.mean_squared_error(y_train, y_pred_train)))
    df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred.flatten()})
    print(df.head(5))

Training the Linear model
Mean Absolute Error: 
 Test 102.14207390823681
Mean Squared Error: 
 Test 27123.34663594705
Root Mean Squared Error: 
 Test 164.6916714225314
      Actual   Predicted
4783    30.0   35.967773
4662     5.0   20.959961
4089     1.0   32.624023
4965   150.0  272.327148
4713    60.0  204.624023
Training the KNeighbours model
Mean Absolute Error: 
 Test 101.11288084464555
Mean Squared Error: 
 Test 29496.97488751131
Root Mean Squared Error: 
 Test 171.74683370447127
      Actual  Predicted
4783    30.0     27.286
4662     5.0     18.000
4089     1.0      1.000
4965   150.0    123.200
4713    60.0    247.798
Training the KNeighbours_Optimised model
Mean Absolute Error: 
 Test 103.36077246283128
Mean Squared Error: 
 Test 27248.064650912856
Root Mean Squared Error: 
 Test 165.06987808474585
      Actual   Predicted
4783    30.0   16.824571
4662     5.0  264.114286
4089     1.0   10.028571
4965   150.0  222.171429
4713    60.0  196.599429


In [9]:


for model in regressor_dict:
    print(f'Training the {model} model')
    model = regressor_dict[model][0]
    pipe = Pipeline(
    steps= [
        ('scaler', StandardScaler()), # is this scaling the target variable too? Will this impact output and MSE
        ('model',  model)
        ]
    )
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)
    # do i only put the train in or with cross validation it is OK to go with all
    #failing is it beucse im parsign a df and not an matrix
    scores = cross_val_score(pipe, X_train, y_train, scoring='neg_root_mean_squared_error', cv=cv, n_jobs=1)
    scores = abs(scores)
    print(len(scores))
    print(scores)
    print(np.mean(scores))


Training the Linear model
30
[2.51600379e+14 1.43499085e+02 4.46236720e+11 1.59273996e+02
 1.94433317e+13 1.54912901e+02 1.41013178e+02 1.59946148e+02
 1.60186482e+02 2.10760735e+12 1.74727677e+02 1.42520635e+02
 1.66226227e+02 8.54193509e+14 9.41180355e+12 1.51494473e+02
 1.57803361e+02 1.29698881e+02 1.67997305e+02 1.57133638e+14
 3.94278363e+14 1.48206286e+02 1.75624404e+02 8.10018191e+13
 1.58237794e+02 1.59125806e+02 1.42904373e+02 2.49331597e+14
 7.01735454e+13 1.49794962e+02]
69637394346105.25
Training the Lasso model


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

30
[157.04606031 143.45953383 162.85568419 158.68585355 129.87972438
 154.94837174 140.92387679 159.84410877 159.92988351 151.1249997
 174.73123062 141.80244267 165.82641349 163.10238425 132.62411876
 150.50106188 157.73092691 129.64968437 167.81797923 134.39191633
 146.151811   148.21256101 175.45750229 151.68030289 157.98065273
 159.03925431 140.99457776 157.49385402 135.02037709 148.49085101]
151.9132666457949
Training the DecisionTree model
30
[181.73259863 162.74946391 182.64800933 182.65628043 147.92569729
 170.37054683 159.91336336 174.72777427 180.95132527 170.54130993
 200.71104248 167.11959437 186.26637546 179.24709327 150.83781612
 167.29144115 171.13225986 141.68504328 185.94205838 161.77765798
 173.03736749 165.95406265 187.65672679 167.87969828 180.48830546
 174.91558899 160.31874704 177.71496024 154.59063874 172.37887458]
171.37205739566875
Training the KNeighbours model
30
[157.92115882 142.61368825 159.65134259 159.77340842 136.30888519
 160.17735183 140.61373045 162.4