In [90]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import pickle
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [None]:
# Read the semicolon-delimited table of encoded products
encoded = pd.read_csv('../Datasets/ecoded.csv', sep=';')

# Each embedding arrives as one string. Drop its first and last character
# (presumably the enclosing brackets -- TODO confirm against the CSV), split on
# single spaces, discard the empty tokens left by repeated spaces, and convert
# what remains to floats.
parsed_vectors = encoded['embedding'].map(
    lambda raw: list(map(float, filter(None, raw[1:-1].split(' '))))
)

# Spread the parsed vectors over four numeric columns (one per component) ...
embeddings = pd.DataFrame(
    parsed_vectors.to_list(),
    columns=['embedding_1', 'embedding_2', 'embedding_3', 'embedding_4'],
)

# ... and swap them in for the original string column
encoded = pd.concat([encoded.drop(columns=['embedding']), embeddings], axis=1)

In [None]:
# Target: the 'gt' column
Y = encoded['gt']

# Features: every remaining column except the 'product_id' column
X = encoded.drop(['gt', 'product_id'], axis=1)

In [96]:
# Shuffled train/test partition; the fixed seed keeps the split reproducible.
# test_size is not passed, so the library default proportion applies.
split_parts = train_test_split(X, Y, shuffle=True, random_state=69)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Restore two previously saved hyperparameter sets:
#   best_params1 <- 'best_params_xgboost.pkl'
#   best_params2 <- 'best_params_xgboost_2.pkl'
# The file names say "xgboost", but best_params2 is later unpacked into
# scikit-learn's GradientBoostingRegressor.
# NOTE: pickle.load can execute arbitrary code -- only load files you created.
with open('best_params_xgboost_2.pkl', 'rb') as params_file_2:
    with open('best_params_xgboost.pkl', 'rb') as params_file_1:
        best_params1 = pickle.load(params_file_1)
        best_params2 = pickle.load(params_file_2)

In [98]:
# Build the regressor from the second saved hyperparameter set.
# Despite the variable name, this is scikit-learn's GradientBoostingRegressor,
# not the XGBoost library.
xgboost_encoded = GradientBoostingRegressor(**best_params2)

# Train on the training partition; the value returned by fit() is kept under
# its own name as well.
result_f_sklearn_gooddf = xgboost_encoded.fit(X_train, y_train)

In [99]:
# Predict the target on both partitions with the fitted model
y_predicted_test = xgboost_encoded.predict(X_test)
y_predicted_train = xgboost_encoded.predict(X_train)

# Root mean squared error (RMSE) per partition, computed as sqrt(MSE).
# The former `mean_squared_error(..., squared=False)` spelling is deprecated in
# scikit-learn 1.4 and removed in 1.6; taking the square root explicitly gives
# the same value for a single target column and works on every version.
rms_test = np.sqrt(mean_squared_error(y_test, y_predicted_test))
rms_train = np.sqrt(mean_squared_error(y_train, y_predicted_train))

# Report both errors; a large train/test gap points to overfitting
print(f'train: {rms_train}\ntest: {rms_test}')

train: 66.68456764837829
test: 111.94892310939883


In [100]:
# Exhaustive 5-fold cross-validated search over gradient-boosting
# hyperparameters (2*2*2*1*2*2*1*2*2 = 128 candidates), scored by negated RMSE
# so that "higher is better"; all CPU cores are used (n_jobs=-1).
gsc1 = GridSearchCV(
            estimator=GradientBoostingRegressor(),
            param_grid={"n_estimators": [60, 70],
                        "learning_rate": [0.1, 0.05],
                        "max_depth": [5, 10],
                        'min_impurity_decrease': [2],
                        'min_samples_split': [50, 100],
                        'min_samples_leaf': [10, 20],
                        # None = consider every feature at each split. For this
                        # regressor it is what the old 'auto' value meant;
                        # 'auto' was deprecated in scikit-learn 1.1 and
                        # removed in 1.3.
                        'max_features': [None],
                        'max_leaf_nodes': [100, 200],
                        'ccp_alpha': [0.1, 0.5]},
            cv=5, scoring='neg_root_mean_squared_error', verbose=10, n_jobs=-1)

# Run the search on the training partition only; the test partition stays
# untouched for the final evaluation
grid_result_encoded = gsc1.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Fitting 5 folds for each of 128 candidates, totalling 640 fits


[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.6s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    6.9s
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:    7.6s
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:    8.9s
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed:   13.9s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done 157 tasks      | elapsed:   17.5s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done 197 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done 218 tasks      | elapsed:   21.2s
[Paralle

In [101]:
# Score the fitted search object once per partition and reuse the values
# (previously each score was computed twice, i.e. four full prediction passes).
# The search was configured with scoring='neg_root_mean_squared_error', so the
# reported numbers are negated RMSE values: closer to zero is better.
train_score = grid_result_encoded.score(X_train, y_train)
test_score = grid_result_encoded.score(X_test, y_test)

# Print the training and testing scores
print(f'train:{train_score}\ntest:{test_score}')

# Keep both scores together as a (train, test) tuple
results = (train_score, test_score)
# rmse.append(results)  # left disabled: `rmse` is not defined in the visible cells

train:-91.48912846917312
test:-106.76120430417743
