In [4]:
import pandas as pd
import numpy as np
import os
import warnings
warnings.filterwarnings('ignore')

In [5]:
data_folder = "../data/training"
csv_files = [] # initialize empty list for storing CSV filename
csv_dfs = [] # initialize empty list to store the individual DataFrames

for filename in os.listdir(data_folder): 
    if filename.endswith('.csv'):
        csv_files.append(filename)

# sort the list of filenames by extracting the numeric part of the filename
csv_files.sort(key=lambda x: int(x.lstrip("lidar_data").rstrip('.csv')))

for filename in csv_files:
    csv_filename = os.path.join(data_folder, filename)
    df = pd.read_csv(csv_filename)
    csv_dfs.append(df)
    print(f"Length of {filename}: {len(df)}")

# concatenate all the individual DataFrames into a single DataFrame
csv_combined_df = pd.concat(csv_dfs, ignore_index=True)

# print the length of the combined DataFrame
print(f"Total length of lidar_data_combined.csv: {len(csv_combined_df)}\n")

# # set the maximum number of columns to be displayed to None (i.e. display all columns)
# pd.set_option('display.max_columns', None)

# display the combined DataFrame
display(csv_combined_df.head())

Length of lidar_data1.csv: 2771
Length of lidar_data2.csv: 4458
Length of lidar_data3.csv: 2562
Length of lidar_data4.csv: 3859
Length of lidar_data5.csv: 4225
Length of lidar_data6.csv: 7976
Length of lidar_data7.csv: 7873
Length of lidar_data8.csv: 2913
Length of lidar_data9.csv: 2397
Length of lidar_data10.csv: 2289
Length of lidar_data11.csv: 8576
Length of lidar_data12.csv: 647
Length of lidar_data13.csv: 667
Length of lidar_data14.csv: 6615
Length of lidar_data15.csv: 7361
Length of lidar_data16.csv: 667
Length of lidar_data17.csv: 768
Length of lidar_data18.csv: 700
Total length of lidar_data_combined.csv: 67324



Unnamed: 0,Keyboard Input,Range0,Range1,Range2,Range3,Range4,Range5,Range6,Range7,Range8,...,Range118,Range119,Range120,Range121,Range122,Range123,Range124,Range125,Range126,Range127
0,,0.249914,0.250067,0.250371,0.250828,0.251437,0.252202,0.253125,0.254208,0.255455,...,3.340168,3.321843,3.305699,3.291685,3.279757,3.269877,3.262016,3.25615,3.25226,3.250335
1,,0.249914,0.250067,0.250371,0.250828,0.251437,0.252202,0.253125,0.254208,0.255455,...,3.340168,3.321843,3.305699,3.291685,3.279757,3.269877,3.262016,3.25615,3.25226,3.250335
2,,0.249914,0.250067,0.250371,0.250828,0.251437,0.252202,0.253125,0.254208,0.255455,...,3.340168,3.321843,3.305699,3.291685,3.279756,3.269877,3.262016,3.25615,3.25226,3.250335
3,,0.249914,0.250067,0.250371,0.250828,0.251437,0.252202,0.253125,0.254208,0.255455,...,3.340167,3.321843,3.305699,3.291684,3.279756,3.269876,3.262016,3.256149,3.25226,3.250335
4,,0.249914,0.250067,0.250371,0.250828,0.251437,0.252202,0.253125,0.254208,0.255455,...,3.340167,3.321843,3.305699,3.291684,3.279756,3.269876,3.262016,3.256149,3.25226,3.250335


In [6]:
csv_combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67324 entries, 0 to 67323
Columns: 129 entries, Keyboard Input to Range127
dtypes: float64(129)
memory usage: 66.3 MB


In [Webots](https://www.cyberbotics.com/doc/reference/lidar?version=released) "the `minRange` field defines the minimum range of the lidar, objects closer to the lidar than the minimum range are not detected (but still occlude other objects). If the range value is smaller than the `minRange` value then infinity is returned."

"The `maxRange` field defines the distance between the lidar and the far clipping plane of the OpenGL view frustum. This field defines the maximum range that the lidar can achieve and so the maximum possible value of the range image (in meter). If the range value is bigger than the `maxRange` value then infinity is returned."

##### Check for any 'inf' values

In [7]:
# initialize a dictionary to store the results
inf_cols = []

# iterate through the columns and check for 'inf' values
for col in csv_combined_df.columns:
    has_inf = csv_combined_df[col].isin([np.inf, -np.inf]).any()
    if has_inf:
        inf_cols.append(col)
        
# display the columns with 'inf' values
if inf_cols:  
    for col in inf_cols:
        print(f"Column '{col}' contains 'inf' values.")
else:
    print(f"No columns contain 'inf' values.")

No columns contain 'inf' values.


Previously, in the `user_control.py` controller file for Webots, we had noticed that there were 4 keys that could be pressed.
* To move forward: 'W' or 'w' (corresponding ASCII value of 87).
* To move backward: 'S' or 's' (corresponding ASCII value of 83).
* To turn right: 'D' or 'd' (corresponding ASCII value of 68).
* To turn left: 'A' or 'a' (corresponding ASCII value of 65).

However, when a key was not pressed at each 32ms 'TIME_STEP', a value of -1 was returned, we replaced this with a NaN or null value.

##### Check for any 'null' (missing) values

In [8]:
# initialize a dictionary to store the results
null_cols = []

# iterate through the columns and check for missing values
for col in csv_combined_df.columns:
    has_null = csv_combined_df[col].isnull().any() # can use isna() or isnull()
    if has_null:
        null_cols.append(col)

# display the columns with missing values
if null_cols:
    for col in null_cols:
        print(f"Column '{col}' contains missing values.")
else:
    print(f"No columns contain missing values.")

Column 'Keyboard Input' contains missing values.


In [9]:
# check for 'NaN' values in "Keyboard Input"
csv_combined_df["Keyboard Input"].isna().value_counts()

Keyboard Input
False    66059
True      1265
Name: count, dtype: int64

In [10]:
# remove rows that contain NaN values (for 'Keyboard Input' or y)
csv_combined_df = csv_combined_df.dropna().reset_index(drop=True)

In [11]:
csv_combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66059 entries, 0 to 66058
Columns: 129 entries, Keyboard Input to Range127
dtypes: float64(129)
memory usage: 65.0 MB


In [13]:
import joblib

# serialize the combined DataFrame using joblib with compression
joblib_filepath = f"../data/model/lidar_data_combined.joblib.gz"
with open(joblib_filepath, 'wb') as jlib_file_cmp:
    joblib.dump(csv_combined_df, jlib_file_cmp, compress=('gzip', 5))

# # save the combine DataFrame to a CSV file without row index value
# csv_combined_df.to_csv("../data/model/lidar_data_combined.csv", index = False)

In [16]:
display(csv_combined_df)

Unnamed: 0,Keyboard Input,Range0,Range1,Range2,Range3,Range4,Range5,Range6,Range7,Range8,...,Range118,Range119,Range120,Range121,Range122,Range123,Range124,Range125,Range126,Range127
0,87.0,0.249914,0.250067,0.250371,0.250828,0.251437,0.252202,0.253125,0.254208,0.255455,...,3.340168,3.321843,3.305699,3.291685,3.279757,3.269877,3.262016,3.256150,3.252260,3.250335
1,87.0,0.249915,0.250068,0.250373,0.250829,0.251439,0.252204,0.253126,0.254209,0.255456,...,3.340166,3.321841,3.305697,3.291683,3.279755,3.269875,3.262014,3.256148,3.252258,3.250333
2,87.0,0.249916,0.250070,0.250374,0.250830,0.251440,0.252205,0.253128,0.254211,0.255458,...,2.614563,3.321841,3.305696,3.291682,3.279754,3.269874,3.262014,3.256147,3.252257,3.250332
3,87.0,0.249918,0.250071,0.250375,0.250832,0.251441,0.252206,0.253129,0.254212,0.255459,...,2.600432,3.321839,3.305695,3.291680,3.279752,3.269872,3.262012,3.256145,3.252255,3.250330
4,87.0,0.249919,0.250072,0.250377,0.250833,0.251443,0.252208,0.253130,0.254213,0.255460,...,2.586474,3.321838,3.305694,3.291680,3.279751,3.269871,3.262011,3.256144,3.252255,3.250330
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66054,87.0,0.518966,0.519421,0.520189,0.521274,0.522678,0.524406,0.526462,0.528854,0.531587,...,0.190872,0.189874,0.189001,0.188249,0.187616,0.187099,0.186698,0.186411,0.186237,0.186176
66055,87.0,0.519002,0.519457,0.520225,0.521310,0.522714,0.524442,0.526499,0.528890,0.531624,...,0.190835,0.189838,0.188965,0.188213,0.187580,0.187063,0.186663,0.186376,0.186202,0.186140
66056,87.0,0.519038,0.519492,0.520261,0.521346,0.522750,0.524478,0.526535,0.528927,0.531660,...,0.190799,0.189801,0.188928,0.188177,0.187544,0.187028,0.186627,0.186340,0.186166,0.186104
66057,87.0,0.519074,0.519528,0.520297,0.521382,0.522786,0.524514,0.526571,0.528963,0.531697,...,0.190762,0.189765,0.188892,0.188140,0.187508,0.186992,0.186591,0.186304,0.186130,0.186069


In [14]:
csv_combined_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Keyboard Input,66059.0,84.479011,6.721298,65.000000,87.000000,87.000000,87.000000,87.000000
Range0,66059.0,0.499950,0.539576,0.135760,0.240381,0.359215,0.522851,3.355003
Range1,66059.0,0.502830,0.544385,0.134051,0.240270,0.359172,0.523250,3.351730
Range2,66059.0,0.506527,0.549989,0.132467,0.240281,0.359424,0.523998,3.358245
Range3,66059.0,0.510037,0.553358,0.131003,0.240367,0.359857,0.525016,3.345227
...,...,...,...,...,...,...,...,...
Range123,66059.0,0.608772,0.671915,0.135208,0.255463,0.457096,0.573000,3.363909
Range124,66059.0,0.607416,0.675782,0.136418,0.255286,0.455974,0.571175,3.368968
Range125,66059.0,0.605592,0.675790,0.137740,0.255173,0.455107,0.569777,3.370622
Range126,66059.0,0.603915,0.674901,0.139178,0.255246,0.454512,0.568712,3.374640


When training a machine learning model, we typically follow a series of steps such as:
1) Data collection: used to train and evaluate model.
2) Data preprocessing: 
    * Data cleaning: handling missing values, outliers, and any data inconsistencies.
    * Feature engineering: create or modify features that are relevant to the problem and the algorithm to be used.
    * Data transformation: scale, normalize, or standardize features as necessary.
3) Data splitting: divide dataset into training and test (or validation) sets, where the training set is used to train the model and the test set is used to evaluate its performance.
4) Model selection: choose an appropriate machine learning algorithm that best suits the problem, the type of data, and the goal you are trying to achieve.
5) Model training: fit the selected model to the training data so the model can learn from the data and adapt its parameters to make predictions.
6) Hyperparameter tuning: optimize the model's hyperparameters to find the best combination of settings for your problem (this can involve techniques like 'GridSearchCV').
7) Model evaluation: assess the model's performance on the test dataset using appropriate evaluation metrics (e.g. accuracy, F1-score, MSE).

In this case, we checked for data inconsistencies and handled missing values. However, it does not make sense to perform feature engineering, data transformation, or data splitting as each row of data consists of 128 lidar sensor points with the corresponding keyboard input. All the features are of the same scale and are important to help determine the robot's pose (position and orientation) at every given timestep, and splitting this data into train and test sets will most likely result in undesirable behaviour.

Due to the nature of the data, the robot will have to make decisions based on the lidar distance measurements and accordingly predict which input ('w', 'a', 's', or 'd') to issue in an attempt to travel in the direction that is furthest away, while avoiding obstacles (e.g. walls). As such, the appropriate models to train the data may involve ensemble methods like xgboost or random forest to handle these multi-class decisions. However, we will examine other machine learning models as it will enable us to perform comparisions on performance as needed.

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
import joblib

# extract features (sensor readings) and labels (control inputs)
X = csv_combined_df.iloc[:, 1:].values # sensor readings
y = csv_combined_df.iloc[:, 0].values # keyboard input

display(X)

array([[0.24991353, 0.25006682, 0.25037119, ..., 3.25614953, 3.25225997,
        3.25033474],
       [0.24991496, 0.25006825, 0.25037259, ..., 3.25614786, 3.2522583 ,
        3.25033331],
       [0.24991623, 0.25006953, 0.25037387, ..., 3.25614715, 3.25225735,
        3.25033236],
       ...,
       [0.51903802, 0.51949245, 0.52026093, ..., 0.18633986, 0.18616596,
        0.18610446],
       [0.51907372, 0.51952815, 0.52029675, ..., 0.18630403, 0.18613015,
        0.18606867],
       [0.51910949, 0.51956397, 0.52033257, ..., 0.18626842, 0.1860946 ,
        0.18603311]])

To determine optimal parameters for our machine learning models, we can use a technique such as hyperparameter tuning with grid search to systematically search through a range of hyperparameter values to find the best combination.

The best set of hyperparameters is used to improve the machine learning model performance on evaluation metrics by enchancing the model's ability to make accurate predictions or classifications. It is essential in optimizing model performance, controlling the amount of overfitting and underfitting, and so on, but it will depend on the problem. For instance, while we expect the model accuracy for random forest classifier to be higher than that of logistic regression (which deals with simple linear problems, and as such fails when making more complicated non-linear decisions), accuracy alone would not really be provide a meaningful evaluation metric. Instead, we may get a better sense of model performance through quantitative measures such as completion time and distance-to-obstacles (closeness) or qualitative measures such as smoothness of motion, collisions (can also be quantitative), if it is able to complete the track (finish or did not finish), and how often it gets stuck.

Hyperparameter tuning with grid search can be a computationally expensive and time consuming process (and can be dependent on things like CPU capabilities, amount of data to train, as well as the amount of parameters to search over). In this scenario, we will evaluate four models (logistic regression, random forest classifier, mlp classifier, and xgboost) with their "default" parameters as well as their "best" parameters (depends on the problem) but for our case will consist of hyperparameters typically seen for the model along with a the respective common/recommended combination of values.

##### Logistic regression model

In [12]:
# create and train the logistic regression model (with default values)
log_reg_model = LogisticRegression(n_jobs=-1, verbose=1)

# train the model on the entire datatset
log_reg_model.fit(X, y)

# save the trained model to a file using joblib
joblib.dump(log_reg_model, "../data/model/log_reg_model.joblib")

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    4.1s finished


['../data/model/log_reg_model.joblib']

##### Hyperparameter tuning using grid search for logistic regression model

In [19]:
# define the hyperparameter grid to search over
param_grid_log_reg = {
    'max_iter': [100, 500, 1000, 5000, 10000],
    'C': [0.001, 0.01, 0.1, 1, 10, 100], # regularization strength
    'penalty': ['l1', 'l2'], # regularization type (L1 and L2)
    'solver': ['lbfgs', 'saga', 'newton-cg'] # optimization algorithm
}

# create a GridSearchCV object with 5-fold cross-validation
grid_search_log_reg = GridSearchCV(log_reg_model, param_grid_log_reg, cv=5, n_jobs=-1, verbose=1)

# fit the grid search to data
grid_search_log_reg.fit(X, y)

# get the best hyperparameters and the best model
best_log_reg_params = grid_search_log_reg.best_params_
best_log_reg_model = grid_search_log_reg.best_estimator_

# display the best hyperparameter values for the logistic regression model
print(f"Best hyperparameter values: {best_log_reg_params}")

# save the best trained model to a file using joblib
joblib.dump(best_log_reg_model, "../data/model/best_log_reg_model.joblib")

Fitting 5 folds for each of 180 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.


convergence after 412 epochs took 63 seconds
Best hyperparameter values: {'C': 0.01, 'max_iter': 500, 'penalty': 'l1', 'solver': 'saga'}


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:  1.1min finished


['../data/model/best_log_reg_model.joblib']

##### Random forest classifier model

In [13]:
# create and train the random forest classifier model
rfc_model = RandomForestClassifier(random_state=42, n_jobs=-1, verbose=1) # n_estimators=100 is default
rfc_model.fit(X, y)

joblib.dump(rfc_model, "../data/model/rfc_model.joblib")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   14.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   43.7s finished


['../data/model/rfc_model.joblib']

##### Hyperparameter tuning using grid search for random forest classifier model

In [21]:
# define the hyperparameter grid to search over
param_grid_rfc = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10], # minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4], # minimum number of samples required to be at a leaf node
    'max_features': ['auto', 'sqrt', 'log2'], # features to consider when looks for best split
    'bootstrap': [True, False]
}

# create a GridSearchCV object with 5-fold cross-validation
grid_search_rfc = GridSearchCV(rfc_model, param_grid_rfc, cv=5, n_jobs=-1, verbose=1)

# fit the grid search to data
grid_search_rfc.fit(X, y)

# get the best hyperparameters and the best model
best_rfc_params = grid_search_rfc.best_params_
best_rfc_model = grid_search_rfc.best_estimator_

# display the best hyperparameter values for the random forest classifier model
print(f"Best hyperparameter values: {best_rfc_params}")

# save the best trained model to a file using joblib
joblib.dump(best_rfc_model, "../data/model/best_rfc_model.joblib")

Fitting 5 folds for each of 486 candidates, totalling 2430 fits


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    2.9s


Best hyperparameter values: {'bootstrap': True, 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    9.1s finished


['../data/model/best_rfc_model.joblib']

##### MLP classifier model

In [14]:
# create and train the mlp classifier model
mlpc_model = MLPClassifier(verbose=1)
mlpc_model.fit(X, y)

joblib.dump(mlpc_model, "../data/model/mlpc_model.joblib")

Iteration 1, loss = 0.29847016
Iteration 2, loss = 0.27988458
Iteration 3, loss = 0.27273794
Iteration 4, loss = 0.27096567
Iteration 5, loss = 0.26625944
Iteration 6, loss = 0.26335956
Iteration 7, loss = 0.26249401
Iteration 8, loss = 0.26048121
Iteration 9, loss = 0.25884251
Iteration 10, loss = 0.25783350
Iteration 11, loss = 0.25794304
Iteration 12, loss = 0.25562897
Iteration 13, loss = 0.25405508
Iteration 14, loss = 0.25280885
Iteration 15, loss = 0.25200316
Iteration 16, loss = 0.25132893
Iteration 17, loss = 0.25136861
Iteration 18, loss = 0.24945428
Iteration 19, loss = 0.24841270
Iteration 20, loss = 0.24786495
Iteration 21, loss = 0.24770619
Iteration 22, loss = 0.24604984
Iteration 23, loss = 0.24583560
Iteration 24, loss = 0.24441571
Iteration 25, loss = 0.24424641
Iteration 26, loss = 0.24403558
Iteration 27, loss = 0.24290834
Iteration 28, loss = 0.24264860
Iteration 29, loss = 0.24323664
Iteration 30, loss = 0.24195571
Iteration 31, loss = 0.24099179
Iteration 32, los

['../data/model/mlpc_model.joblib']

##### Hyperparameter tuning using grid search for mlp classifier model

In [20]:
# define the hyperparameter grid to search over
param_grid_mlpc = {
    'hidden_layer_sizes': [(100,), (100, 100), (1000, 100), (100, 100, 100)], # size of hidden layers
    'activation': ['logistic', 'tanh', 'relu'], # activation functions for hidden layers
    'alpha': [0.0001, 0.001, 0.01], # L2 regularization parameter
    'max_iter': [100, 200, 300],
}

# create a GridSearchCV object with 5-fold cross-validation
grid_search_mlpc = GridSearchCV(mlpc_model, param_grid_mlpc, cv=5, n_jobs=-1, verbose=1)

# fit the grid search to data
grid_search_mlpc.fit(X, y)

# get the best hyperparameters and the best model
best_mlpc_params = grid_search_mlpc.best_params_
best_mlpc_model = grid_search_mlpc.best_estimator_

# display the best hyperparameter values for the mlp classifier model
print(f"Best hyperparameter values: {best_mlpc_params}")

# save the best trained model to a file using joblib
joblib.dump(best_mlpc_model, "../data/model/best_mlpc_model.joblib")

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Iteration 1, loss = 0.33760712
Iteration 2, loss = 0.29702822
Iteration 3, loss = 0.29269088
Iteration 4, loss = 0.28887125
Iteration 5, loss = 0.28578879
Iteration 6, loss = 0.28481648
Iteration 7, loss = 0.28318810
Iteration 8, loss = 0.28157528
Iteration 9, loss = 0.27938493
Iteration 10, loss = 0.27870485
Iteration 11, loss = 0.27732901
Iteration 12, loss = 0.27767001
Iteration 13, loss = 0.27650024
Iteration 14, loss = 0.27557899
Iteration 15, loss = 0.27416534
Iteration 16, loss = 0.27422458
Iteration 17, loss = 0.27325310
Iteration 18, loss = 0.27313365
Iteration 19, loss = 0.27244905
Iteration 20, loss = 0.27159438
Iteration 21, loss = 0.27115239
Iteration 22, loss = 0.26995534
Iteration 23, loss = 0.27185859
Iteration 24, loss = 0.27034215
Iteration 25, loss = 0.26923928
Iteration 26, loss = 0.27029944
Iteration 27, loss = 0.26861567
Iteration 28, loss = 0.26895298
Iteration 29, loss = 0.26801014
Iteration 30, loss

['../data/model/best_mlpc_model.joblib']

##### XGBoost model

In [15]:
y

array([87., 87., 87., ..., 87., 87., 87.])

Note: in the training, no reversing/backward movement was performed as it contradicted what we were trying to achieve (in training, e.g. drive straight and towards the direction with furthest distance measurement while attempting to "hug" the wall), as a result "S" is obsolete and was not mapped.

In [16]:
# create a mapping for the input values to integers 0, 1, 2
input_to_int = {
    65: 0,
    68: 1,
    87: 2
}

# replace values in 'y' using the mapping
y_mapped = []
for val in y:
    y_mapped.append(input_to_int[val])

# convert the mapped list to a NumPy array 
y_xgboost = np.array(y_mapped)

# create and train the xgboost model
xgboost_model = XGBClassifier(n_jobs=-1, verbosity=1)
xgboost_model.fit(X, y_xgboost)

joblib.dump(xgboost_model, "../data/model/xgboost_model.joblib")

['../data/model/xgboost_model.joblib']

##### Hyperparameter tuning using grid search for xgboost model

In [17]:
# define the hyperparameter grid to search over
param_grid_xgboost = {
    'n_estimators': [100, 200, 300], # number of boosting rounds
    'max_depth': [3, 4, 5],
    'learning_rate': [0.1, 0.01, 0.001], # step size shrinkage to prevent overfitting
    'subsample': [0.7, 0.8, 0.9], # fraction of samples used for fitting the trees
    'colsample_bytree': [0.7, 0.8, 0.9], # fraction of features used for fitting the trees
}

# create a GridSearchCV object with 5-fold cross-validation
grid_search_xgboost = GridSearchCV(xgboost_model, param_grid_xgboost, cv=5, n_jobs=-1, verbose=1)

# fit the grid search to data
grid_search_xgboost.fit(X, y_xgboost)

# get the best hyperparameters and the best model
best_xgboost_params = grid_search_xgboost.best_params_
best_xgboost_model = grid_search_xgboost.best_estimator_

# display the best hyperparameter values for the xgboost model
print(f"Best hyperparameter values: {best_xgboost_params}")

# save the best trained model to a file using joblib
joblib.dump(best_xgboost_model, "../data/model/best_xgboost_model.joblib")

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best hyperparameter values: {'colsample_bytree': 0.7, 'learning_rate': 0.001, 'max_depth': 3, 'n_estimators': 300, 'subsample': 0.7}


['../data/model/best_xgboost_model.joblib']