In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


This dataset is designed for research and analysis of load balancing in distributed systems. It includes key features such as task size, CPU and memory demand, network latency, I/O operations, disk usage, number of connections, and priority level, along with a target variable for classification or optimization. Timestamp data is also provided for temporal analysis. It is suitable for machine learning, simulation studies, and performance optimization research. <br>

**Columns:** <br>
**task_size:** Size of the task (numeric). <br>
**cpu_demand:** CPU demand of the task (numeric). <br>
**memory_demand:** Memory demand of the task (numeric). <br>
**network_latency:** Network latency associated with the task (numeric). <br>
**io_operations:** Number of I/O operations (numeric). <br>
**disk_usage:** Disk usage for the task (numeric). <br>
**num_connections:** Number of active connections for the task (numeric). <br>
**priority_level:** Priority level assigned to the task (numeric). <br>
**target:**Target label indicating the outcome or category (binary). <br>
**timestamp:** Timestamp when the task data was recorded. <br>

In [2]:
###!pip install bayesian-optimization

In [3]:
###!pip install keras-tuner

In [4]:
###!pip uninstall tensorflow
####!pip install tensorflow==2.12.0

In [5]:

###!pip install keras==2.12.0

In [11]:
# Import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from math import floor
from sklearn.metrics import make_scorer, accuracy_score
from bayes_opt import BayesianOptimization
from sklearn.model_selection import StratifiedKFold

import warnings
warnings.filterwarnings('ignore')
pd.set_option("display.max_columns", None)

In [12]:
# Suppressing Warnings
import warnings
warnings.filterwarnings('ignore')

In [13]:
# Importing Pandas and NumPy
import pandas as pd, numpy as np

In [14]:
# Importing all datasets
LoadBalancerSystem = pd.read_csv("/content/Load Balancing Improved.csv")
LoadBalancerSystem.head(4)

Unnamed: 0,task_size,cpu_demand,memory_demand,network_latency,io_operations,disk_usage,num_connections,priority_level,target,timestamp
0,-0.152124,3.75016,-0.981182,0.251507,-0.471993,1.007026,0.31379,3.050953,1,2023-03-16 03:46:22
1,0.724624,-3.97892,2.022732,1.19453,-0.010304,-2.493867,-0.073875,-1.271258,0,2023-09-02 20:15:54
2,4.650228,1.145925,2.641659,-1.899635,1.187132,4.283652,0.572666,1.243801,1,2022-02-19 08:48:52
3,-0.138208,-0.189687,-0.820848,-3.060794,-1.982086,3.620598,-0.876702,0.77677,1,2023-12-22 11:58:26


In [15]:
from sklearn.model_selection import train_test_split

In [16]:
train, test = train_test_split(LoadBalancerSystem, test_size=0.2, random_state=1)

In [17]:
print(train.shape, test.shape)

(8542, 10) (2136, 10)


In [18]:
print("The columns in train data :", train.columns)
print("The columns in test data :", test.columns)

The columns in train data : Index(['task_size', 'cpu_demand', 'memory_demand', 'network_latency',
       'io_operations', 'disk_usage', 'num_connections', 'priority_level',
       'target', 'timestamp'],
      dtype='object')
The columns in test data : Index(['task_size', 'cpu_demand', 'memory_demand', 'network_latency',
       'io_operations', 'disk_usage', 'num_connections', 'priority_level',
       'target', 'timestamp'],
      dtype='object')


In [19]:
train.to_csv("train_load_balancer.csv")

In [20]:
test.to_csv("test_load_balancer.csv")

In [21]:
# Importing all datasets
train = pd.read_csv("/content/train_load_balancer.csv")
train.head(4)

Unnamed: 0.1,Unnamed: 0,task_size,cpu_demand,memory_demand,network_latency,io_operations,disk_usage,num_connections,priority_level,target,timestamp
0,4311,-2.884349,-0.304593,1.428882,0.641865,-1.265519,-2.042585,-2.151759,2.248828,0,2024-04-30 12:47:38
1,6133,0.163591,-0.054587,0.243658,0.631062,-2.349176,-0.106621,0.016042,2.954929,0,2024-09-15 11:42:34
2,2321,1.243694,3.099626,0.301481,0.439303,-3.356344,4.85157,-1.382724,2.085178,1,2022-10-20 08:26:25
3,1595,-1.675813,0.516609,-1.972764,1.92694,-1.554229,-2.546848,1.107627,1.629831,1,2023-10-08 10:31:30


In [22]:
# Importing all datasets
test = pd.read_csv("/content/test_load_balancer.csv")
test.head(4)

Unnamed: 0.1,Unnamed: 0,task_size,cpu_demand,memory_demand,network_latency,io_operations,disk_usage,num_connections,priority_level,target,timestamp
0,6138,0.982596,2.560753,0.309671,0.484964,-2.768847,4.956329,0.780822,0.050808,1,2024-09-22 00:03:32
1,9271,-0.291851,1.488813,1.230143,1.814476,-2.828979,2.481521,-0.64922,0.193506,1,2022-04-04 12:13:28
2,4052,0.371047,-0.104333,2.951606,-0.270315,0.201554,1.474998,0.502292,-0.247073,1,2024-07-19 04:29:14
3,3068,-0.619218,-0.772362,0.013539,-1.840224,-1.292816,3.058381,2.911729,-1.782623,0,2022-08-18 00:11:56


In [23]:
###! pip install klib

In [24]:
import klib

In [25]:
train = klib.data_cleaning(train)
test = klib.data_cleaning(test)

Shape of cleaned data: (8542, 11) - Remaining NAs: 0


Dropped rows: 0
     of which 0 duplicates. (Rows (first 150 shown): [])

Dropped columns: 0
     of which 0 single valued.     Columns: []
Dropped missing values: 0
Reduced memory by at least: 0.37 MB (-51.39%)

Shape of cleaned data: (2136, 11) - Remaining NAs: 0


Dropped rows: 0
     of which 0 duplicates. (Rows (first 150 shown): [])

Dropped columns: 0
     of which 0 single valued.     Columns: []
Dropped missing values: 0
Reduced memory by at least: 0.09 MB (-50.0%)



In [26]:
train_cleaned = klib.clean_column_names(train)
test_cleaned = klib.clean_column_names(test)

In [27]:
train_cleaned = klib.convert_datatypes(train_cleaned)
test_cleaned = klib.convert_datatypes(test_cleaned)

In [28]:
train.columns

Index(['unnamed_0', 'task_size', 'cpu_demand', 'memory_demand',
       'network_latency', 'io_operations', 'disk_usage', 'num_connections',
       'priority_level', 'target', 'timestamp'],
      dtype='object')

In [29]:
train["timestamp"]  = pd.to_datetime(train["timestamp"])

In [30]:
train["day"] = train["timestamp"].dt.day
train["month"] = train["timestamp"].dt.month
train["year"] = train["timestamp"].dt.year
train["hour"] = train["timestamp"].dt.hour

In [31]:
train.columns

Index(['unnamed_0', 'task_size', 'cpu_demand', 'memory_demand',
       'network_latency', 'io_operations', 'disk_usage', 'num_connections',
       'priority_level', 'target', 'timestamp', 'day', 'month', 'year',
       'hour'],
      dtype='object')

In [40]:
#train.drop(columns="timestamp", inplace=True)

In [41]:
#train.drop(columns="unnamed_0", inplace=True)

In [34]:
test["timestamp"] = pd.to_datetime(test["timestamp"])

In [35]:
test["day"] = test["timestamp"].dt.day
test["month"] = test["timestamp"].dt.month
test["year"] = test["timestamp"].dt.year
test["hour"] = test["timestamp"].dt.hour

In [42]:
#test.drop(columns="timestamp", inplace=True)

In [43]:
#test.drop(columns="unnamed_0", inplace=True)

In [44]:
print("The DataTypes :", train.dtypes)

The DataTypes : task_size          float32
cpu_demand         float32
memory_demand      float32
network_latency    float32
io_operations      float32
disk_usage         float32
num_connections    float32
priority_level     float32
target                int8
day                  int32
month                int32
year                 int32
hour                 int32
dtype: object


In [45]:
X_train = train.drop(columns="target")
Y_train = train["target"]

In [46]:
feature_names = X_train.columns

In [47]:
X_test = test.drop(columns="target")
Y_test = test["target"]

In [48]:
print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

(8542, 12) (8542,) (2136, 12) (2136,)


In [49]:
from sklearn.preprocessing import StandardScaler

In [50]:
scaler = StandardScaler()

In [51]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [52]:
num_var = [feature for feature in train.columns if train[feature].dtypes != 'O']
discrete_var = [feature for feature in num_var if len(train[feature].unique()) <= 25]
cont_var = [feature for feature in num_var if feature not in discrete_var]
categ_var = [feature for feature in train.columns if feature not in num_var]

In [53]:
def find_var_type(var):


    if var in discrete_var:
        print("{} is a Numerical Variable, Discrete in nature".format(var))
    elif var in cont_var :
        print("{} is a Numerical Variable, Continuous in nature".format(var))
    else :
        print("{} is a Categorical Variable".format(var))

In [54]:
print("The continuous variables are :", cont_var)
print("The categorical variables are :", discrete_var)

The continuous variables are : ['task_size', 'cpu_demand', 'memory_demand', 'network_latency', 'io_operations', 'disk_usage', 'num_connections', 'priority_level', 'day']
The categorical variables are : ['target', 'month', 'year', 'hour']


In [55]:
from sklearn.linear_model import Lasso

from sklearn.feature_selection import SelectFromModel

In [56]:
# Perform feature selection using a variance threshold
from sklearn.feature_selection import VarianceThreshold

sel = VarianceThreshold(threshold=(0.02))
sel.fit(train)

print("Feature selection", sel.get_support())
print("Selected features:", list(train.columns[sel.get_support()]))
print("Removed features:", list(train.columns[~sel.get_support()]))

Feature selection [ True  True  True  True  True  True  True  True  True  True  True  True
  True]
Selected features: ['task_size', 'cpu_demand', 'memory_demand', 'network_latency', 'io_operations', 'disk_usage', 'num_connections', 'priority_level', 'target', 'day', 'month', 'year', 'hour']
Removed features: []


In [57]:
# Function to list features that are correlated
# Adds the first of the correlated pair only (not both)
def correlatedFeatures(dataset, threshold):
    correlated_columns = set()
    correlations = dataset.corr()
    for i in range(len(correlations)):
        for j in range(i):
            if abs(correlations.iloc[i,j]) > threshold:
                correlated_columns.add(correlations.columns[i])
    return correlated_columns

In [58]:
# Get a set of correlated features, based on threshold correlation of 0.85
cf = correlatedFeatures(train, 0.85)
cf

set()

In [59]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)
Y_test = pd.DataFrame(Y_test)
Y_train = pd.DataFrame(Y_train)

In [60]:
print(X_train.shape)
print(train.shape)
print(train.columns)

(8542, 12)
(8542, 13)
Index(['task_size', 'cpu_demand', 'memory_demand', 'network_latency',
       'io_operations', 'disk_usage', 'num_connections', 'priority_level',
       'target', 'day', 'month', 'year', 'hour'],
      dtype='object')


In [68]:
X_train = X_train.rename(columns={
    0: 'task_size',
    1: 'cpu_demand',
    2: 'memory_demand',
    3: 'network_latency',
    4: 'io_operations',
    5: 'disk_usage',
    6: 'num_connections',
    7: 'priority_level',
    8: 'day',
    9: 'month',
    10: 'year',
    11: 'hour'
})


In [69]:
X_train.columns

Index(['task_size', 'cpu_demand', 'memory_demand', 'network_latency',
       'io_operations', 'disk_usage', 'num_connections', 'priority_level',
       'day', 'month', 'year', 'hour'],
      dtype='object')

In [70]:
X_test = X_test.rename(columns={
    0: 'task_size',
    1: 'cpu_demand',
    2: 'memory_demand',
    3: 'network_latency',
    4: 'io_operations',
    5: 'disk_usage',
    6: 'num_connections',
    7: 'priority_level',
    8: 'day',
    9: 'month',
    10: 'year',
    11: 'hour'
})


In [71]:
print(X_train.columns)
print(X_test.columns)

Index(['task_size', 'cpu_demand', 'memory_demand', 'network_latency',
       'io_operations', 'disk_usage', 'num_connections', 'priority_level',
       'day', 'month', 'year', 'hour'],
      dtype='object')
Index(['task_size', 'cpu_demand', 'memory_demand', 'network_latency',
       'io_operations', 'disk_usage', 'num_connections', 'priority_level',
       'day', 'month', 'year', 'hour'],
      dtype='object')


## PIPELINE  CREATION

In [73]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [77]:
pipeline_lr=Pipeline([('scalar1',StandardScaler()),
                     ('pca1',PCA(n_components=2)),
                     ('lr_classifier',LogisticRegression())])

In [78]:
pipeline_dt=Pipeline([('scalar1',StandardScaler()),
                     ('pca1',PCA(n_components=2)),
                     ('dt_classifier',DecisionTreeClassifier())])

In [79]:
pipeline_rf=Pipeline([('scalar1',StandardScaler()),
                     ('pca1',PCA(n_components=2)),
                     ('rf_classifier',RandomForestClassifier())])

In [80]:
pipeline_gradient_boost=Pipeline([('scalar4',StandardScaler()),
                     ('pca4',PCA(n_components=2)),
                     ('gb_classifier',GradientBoostingClassifier())])

In [81]:
pipeline_xgboost=Pipeline([('scalar4',StandardScaler()),
                     ('pca4',PCA(n_components=2)),
                     ('gb_classifier',XGBClassifier())])

In [82]:
## LEts make the list of pipelines
pipelines = [pipeline_lr, pipeline_dt, pipeline_rf, pipeline_gradient_boost,pipeline_xgboost]

In [83]:
best_accuracy=0.0
best_classifier=0
best_pipeline=""

In [85]:
# Dictionary of pipelines and classifier types for ease of reference
pipe_dict = {0: 'Linear Regression', 1: 'Decision Tree', 2: 'RandomForest', 3: 'Gradient Boost', 4: 'Extreme Gradient Boost'}

# Fit the pipelines
for pipe in pipelines:
	pipe.fit(X_train, Y_train)

In [86]:
for i,model in enumerate(pipelines):
    print("{} Test Value: {}".format(pipe_dict[i],model.score(X_test, Y_test)))

Linear Regression Test Value: 0.7050561797752809
Decision Tree Test Value: 0.6999063670411985
RandomForest Test Value: 0.725187265917603
Gradient Boost Test Value: 0.7598314606741573
Extreme Gradient Boost Test Value: 0.7415730337078652


### XGBClassifier

In [87]:
from xgboost import XGBClassifier

In [89]:
import time

In [92]:
from sklearn.metrics import make_scorer, accuracy_score

In [95]:
from sklearn.metrics import accuracy_score as accuracy
accuracy = make_scorer(accuracy, greater_is_better=False)

In [96]:
def xgb_cl_bo(min_child_weight, gamma, subsample, colsample_bytree, max_depth):
    params_xgb = {}
    params_xgb['min_child_weight'] = min_child_weight
    params_xgb['gamma'] = gamma
    params_xgb['subsample'] = subsample
    params_xgb['colsample_bytree'] = int(colsample_bytree)
    params_xgb['max_depth'] = int(max_depth)
    scores = cross_val_score(XGBClassifier(random_state=123, **params_xgb),
                             X_train, Y_train, scoring=accuracy, cv=5).mean()
    score = scores.mean()
    score = -score
    return score
# Run Bayesian Optimization
start = time.time()
params_xgb ={
    'min_child_weight':(1, 20),
    'gamma':(0.5, 10),
    'subsample':(0.6, 1.0),
    'colsample_bytree':(0.6, 1.0),
    'max_depth': (3, 35)
}

In [97]:
xgb_bo = BayesianOptimization(xgb_cl_bo, params_xgb)
xgb_bo.maximize(init_points=20, n_iter=4)
print('It takes %s minutes' % ((time.time() - start)/60))

|   iter    |  target   | colsam... |   gamma   | max_depth | min_ch... | subsample |
-------------------------------------------------------------------------------------
| [39m1        [39m | [39m0.8616   [39m | [39m0.6436   [39m | [39m6.793    [39m | [39m6.231    [39m | [39m18.43    [39m | [39m0.7072   [39m |
| [35m2        [39m | [35m0.8653   [39m | [35m0.9614   [39m | [35m2.461    [39m | [35m7.225    [39m | [35m3.727    [39m | [35m0.8514   [39m |
| [39m3        [39m | [39m0.8607   [39m | [39m0.9759   [39m | [39m1.768    [39m | [39m25.47    [39m | [39m12.21    [39m | [39m0.7787   [39m |
| [39m4        [39m | [39m0.8552   [39m | [39m0.9733   [39m | [39m1.382    [39m | [39m11.94    [39m | [39m10.04    [39m | [39m0.6727   [39m |
| [39m5        [39m | [39m0.8624   [39m | [39m0.9525   [39m | [39m2.397    [39m | [39m16.68    [39m | [39m18.7     [39m | [39m0.7433   [39m |
| [39m6        [39m | [39m0.8627   [39m | [

In [98]:
params_xgb = xgb_bo.max['params']
params_xgb['max_depth'] = round(params_xgb['max_depth'])
params_xgb['min_child_weight'] = round(params_xgb['min_child_weight'])
params_xgb['gamma'] = round(params_xgb['gamma'])
params_xgb['colsample_bytree'] = round(params_xgb['colsample_bytree'])
params_xgb['subsample'] = round(params_xgb['subsample'])
params_xgb

{'colsample_bytree': 1,
 'gamma': 4,
 'max_depth': 9,
 'min_child_weight': 2,
 'subsample': 1}

In [99]:
xgb_hyp =  XGBClassifier(**params_xgb, random_state=123)

In [100]:
xgb_hyp.fit(X_train, Y_train)

In [101]:
# Predict the validation data
pred_xgb = xgb_hyp.predict(X_test)

# Compute the accuracy
print('Accuracy: ' + str(accuracy_score(Y_test, pred_xgb)))

Accuracy: 0.9480337078651685


In [102]:
pred_xgb = pd.DataFrame(pred_xgb)

In [103]:
pred_xgb.rename(columns = {0 : "Predict"}, inplace=True)

In [104]:
pred_xgb.value_counts()

Unnamed: 0_level_0,count
Predict,Unnamed: 1_level_1
0,1077
1,1059


In [105]:
import pickle

# Save model to a pickle file
with open('xgb_hyp.pkl', 'wb') as file:
    pickle.dump(xgb_hyp, file)


In [106]:
import joblib

# Save model to a joblib file
joblib.dump(xgb_hyp, 'xgb_hyp.joblib')


['xgb_hyp.joblib']

#### **Random Forest Classifier - Using Bayesian Optimization**

In [107]:
from sklearn.ensemble import RandomForestClassifier

In [108]:
def rfc_cl_bo(n_estimators, max_features, max_depth, min_samples_split, min_samples_leaf):
    params_rfc = {}
    params_rfc['n_estimators'] = int(n_estimators)
    params_rfc['max_features'] = max_features
    params_rfc['max_depth'] = round(max_depth)
    params_rfc['min_samples_split'] = int(min_samples_split)
    params_rfc['min_samples_leaf'] = int(min_samples_leaf)
    scores = cross_val_score(RandomForestClassifier(random_state=123, **params_rfc),
                             X_train, Y_train, scoring=accuracy, cv=5).mean()
    score = scores.mean()
    score = -score
    return score
# Run Bayesian Optimization
start = time.time()
params_rfc ={
    'n_estimators':(80, 300),
    'max_features':(0.8, 1),
    'max_depth':(1, 250),
    'min_samples_split':(2, 20),
    'min_samples_leaf' :(1, 40)
}

In [109]:
rfc_bo = BayesianOptimization(rfc_cl_bo, params_rfc)
rfc_bo.maximize(init_points=30, n_iter=4)
print('It takes %s minutes' % ((time.time() - start)/60))

|   iter    |  target   | max_depth | max_fe... | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------------------
| [39m1        [39m | [39m0.9336   [39m | [39m47.32    [39m | [39m0.8648   [39m | [39m12.14    [39m | [39m10.75    [39m | [39m181.6    [39m |
| [39m2        [39m | [39m0.9158   [39m | [39m238.0    [39m | [39m0.9706   [39m | [39m35.4     [39m | [39m6.929    [39m | [39m125.6    [39m |
| [39m3        [39m | [39m0.9185   [39m | [39m229.7    [39m | [39m0.9272   [39m | [39m31.06    [39m | [39m17.11    [39m | [39m161.1    [39m |
| [39m4        [39m | [39m0.9279   [39m | [39m7.608    [39m | [39m0.867    [39m | [39m12.62    [39m | [39m5.561    [39m | [39m263.0    [39m |
| [39m5        [39m | [39m0.9221   [39m | [39m181.5    [39m | [39m0.984    [39m | [39m25.38    [39m | [39m18.27    [39m | [39m291.7    [39m |
| [39m6        [39m | [39m0.9312   [39m | [

In [110]:
params_rfc = rfc_bo.max['params']
params_rfc['max_depth'] = round(params_rfc['max_depth'])
params_rfc['min_samples_leaf'] = round(params_rfc['min_samples_leaf'])
params_rfc['min_samples_split'] = round(params_rfc['min_samples_split'])
params_rfc['n_estimators'] = round(params_rfc['n_estimators'])
params_rfc

{'max_depth': 240,
 'max_features': np.float64(0.8473019458277843),
 'min_samples_leaf': 1,
 'min_samples_split': 6,
 'n_estimators': 282}

In [111]:
rfc_hyp =  RandomForestClassifier(**params_rfc, random_state=123)

In [112]:
rfc_hyp.fit(X_train, Y_train)


In [113]:
pred_rfc = rfc_hyp.predict(X_test)

In [114]:
pred_rfc = pd.DataFrame(pred_rfc)

In [115]:
pred_rfc.rename(columns = {0:"Label"}, inplace=True)

In [116]:
pred_rfc.value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
0,1085
1,1051


In [117]:
import pickle

# Save model to a pickle file
with open('rfc_hyp.pkl', 'wb') as file:
    pickle.dump(rfc_hyp, file)


In [118]:
import joblib

# Save model to a joblib file
joblib.dump(rfc_hyp, 'rfc_hyp.joblib')


['rfc_hyp.joblib']

### KNN Classifier - Using Bayesian Optimization

In [119]:
from sklearn.neighbors import KNeighborsClassifier

In [120]:
# Hyperparameter-tuning: Bayesian Optimization, bayes_opt
def knn_cl_bo(n_neighbors, weights, p):
    params_knn = {}
    weightsL = ['uniform', 'distance']

    params_knn['n_neighbors'] = round(n_neighbors)
    params_knn['weights'] = weightsL[round(weights)]
    params_knn['p'] = round(p)

    score = cross_val_score(KNeighborsClassifier(**params_knn),
                             X_train, Y_train, cv=9, scoring=accuracy).mean()
    return score

In [121]:
# Set hyperparameters spaces
params_knn ={
    'n_neighbors':(3, 10),
    'weights':(0, 1),
    'p':(1, 2)}

# Run Bayesian Optimization
knn_bo = BayesianOptimization(knn_cl_bo, params_knn, random_state=111)
knn_bo.maximize(init_points=4, n_iter=35)

|   iter    |  target   | n_neig... |     p     |  weights  |
-------------------------------------------------------------
| [39m1        [39m | [39m-0.9254  [39m | [39m7.285    [39m | [39m1.169    [39m | [39m0.4361   [39m |
| [39m2        [39m | [39m-0.9289  [39m | [39m8.385    [39m | [39m1.295    [39m | [39m0.1492   [39m |
| [35m3        [39m | [35m-0.9165  [39m | [35m3.157    [39m | [35m1.42     [39m | [35m0.2387   [39m |
| [39m4        [39m | [39m-0.9213  [39m | [39m5.364    [39m | [39m1.991    [39m | [39m0.2377   [39m |
| [35m5        [39m | [35m-0.9163  [39m | [35m3.102    [39m | [35m1.057    [39m | [35m0.9962   [39m |
| [39m6        [39m | [39m-0.9184  [39m | [39m3.036    [39m | [39m1.85     [39m | [39m0.9653   [39m |
| [39m7        [39m | [39m-0.9216  [39m | [39m3.872    [39m | [39m1.003    [39m | [39m0.08092  [39m |
| [39m8        [39m | [39m-0.9185  [39m | [39m3.039    [39m | [39m1.978    [39m | [

In [122]:
# Best hyperparameters
params_knn = knn_bo.max['params']
weightsL = ['uniform', 'distance']
params_knn['n_neighbors'] = round(params_knn['n_neighbors'])
params_knn['weights'] = weightsL[round(params_knn['weights'])]
params_knn['p'] = round(params_knn['p'])
params_knn

{'n_neighbors': 3, 'p': 1, 'weights': 'distance'}

In [123]:
# Fit the training data
knn_hyp = KNeighborsClassifier(**params_knn)
knn_hyp.fit(X_train, Y_train)

# Predict the validation data
pred_knn = knn_hyp.predict(X_test)

# Compute the accuracy
print('Accuracy: ' + str(accuracy_score(Y_test, pred_knn)))

Accuracy: 0.9241573033707865


In [124]:
import pickle

# Save model to a pickle file
with open('knn_hyp.pkl', 'wb') as file:
    pickle.dump(knn_hyp, file)


In [125]:
import joblib

# Save model to a joblib file
joblib.dump(knn_hyp, 'knn_hyp.joblib')


['knn_hyp.joblib']