# data reading

In [65]:
#%%
import pandas  as pd
from pathlib import  Path
from sklearn.preprocessing import LabelEncoder

#df = pd.read_csv(Path().joinpath('data','nboit.csv'))
#%%
#1. data reading 2. label encoding 3.  smote data balancing.
def datareading(folder_name,file_name,class_name,sample_size, x_numbers):
    '''
    Folder name:It reads the folder name
    File Name  : Data File name, it is csv file. 
    class_name : class_name, it should class-1, class-2, class-3, class-4
    sample_size : sample size must be label equally distribution
    x_numbers : number of features, for Medbiot=100, N-BaIoT 115
    '''
    print("=="*40)
    data = pd.read_csv(Path().joinpath(folder_name, file_name))
    class_name = data[class_name].name
    if len(data[class_name].unique()) ==2:
        print("Binary Classification")
    else:
        print("Multi Class classification")
    print(f"class_name:{class_name}")
    print(f"class labels:{data[class_name].unique()}")
    df = data.groupby(class_name).apply(lambda x: x.sample(n=sample_size)).reset_index(drop=True)
    le = LabelEncoder()
    cols = df.columns.to_list()
    for column in cols:
        if df[column].name==class_name:
            df[column] = le.fit_transform(df[column])
    # data samples
    X = df.iloc[:, 0:x_numbers]
    y = df[class_name]
    #X_sample, y_sample = Classbalancing(X, y).smote_balancing()
    print("=="*40)
    return X, y
#%%
# X, y = datareading('data','nboit.csv','class-3')
# print(y.value_counts())


In [66]:
folder_name = '/gpfs/mariana/home/rkalak/dataset/N-Balot/'
file_name = 'nboit_sample.csv'
class_name = 'class-2'
sample_size = 4000
x_numbers  = 115
X,y  = datareading(folder_name,file_name,class_name,sample_size,x_numbers)

Multi Class classification
class_name:class-2
class labels:['mirai_attacks' 'benign_traffic' 'gafgyt_attacks']


In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y,random_state=42,test_size=0.2)
print("X_train: {0}\ny_train:{1}\nX_test: {2}\ny_test:{3}".format(X_train.shape,
                                                                  y_train.shape,
                                                                  X_test.shape,
                                                                  y_test.shape))

X_train: (6400, 115)
y_train:(6400,)
X_test: (1600, 115)
y_test:(1600,)


# class testing

In [1]:
from datetime import datetime
import joblib
import numpy as np
import pandas as pd
from numpy import mean
from scipy.stats import randint as sp_randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import lightgbm as lgb
from scipy.stats import uniform as sp_uniform
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier


class parameter_tuning:
    """
    Tuning Algorithms
    """

    def __init__(self, X, y, metric_type, file_location, search_type='grid_search'):
        """
        Tuning the algorithms

        :param file_location: resultant file location.
        :param metric_type: 1. accuracy, f1-score, recall, precision
        :param cv: Cross validation score
        :param search_type: Hyper Parameter Tuning type 1. Grid Search 2. Random Search type
        :return:
        """
        self.X = X,
        self.y = y,
        self.metric_type = metric_type,
        self.search_type = search_type,
        self.file_location = file_location

    def _fit_grid_random_search(self, ml_classifier, parameters):
        """ Training the model using Grid search or Random search hyperparameter tuning methods.
        :param X: Independent Variable
        :param y: Dependent Variable
        :param ml_classifier: Scikit learn classifier
        :param parameters: various combinations for parameters for classifier.
        :return:
        """
        metric_type = self.metric_type[0]
        print("metric:{0}\n{1}".format(metric_type, type(metric_type)))
        cv_results_df = pd.DataFrame()
        print("Tuning Type:{0}\n".format(self.search_type))
        # Classifier name
        mlclassifier_name = str(type(ml_classifier)).split(".")[-1][:-2]
        print("Classifier is {0}".format(mlclassifier_name))
        X = self.X[0]
        y = self.y[0]
        # data shape
        print("X variable: {0}\ny Variable:{1}".format(X.shape, y.shape))
        # check the Parameter type,
        cv = KFold(n_splits=5, random_state=100, shuffle=True)
        search_type = self.search_type[0]
        print("Search Type:{0}\n".format(search_type))
        # Grid search tuning.
        if search_type == 'grid_search':
            # Grid Search parameter type
            tuned_model = GridSearchCV(ml_classifier,
                                       param_grid=parameters,
                                       scoring=metric_type,
                                       verbose=10,
                                       refit=False)
            start_time = self.timer(0)
            tuned_model.fit(X, y)
            finishing_time = self.timer(start_time)
            print("Best parameters:{0}".format(tuned_model.best_params_))
            file_name = f'{self.file_location}/{mlclassifier_name}.pkl'
            joblib.dump(tuned_model, file_name)
            # saving the logs of model into a text file
            df = self.res_logs_text_file(mlclassifier_name,
                                         tuned_model,
                                         finishing_time,
                                         self.file_location)

            return cv_results_df.append(df)
        # random search
        elif search_type == 'random_search':
            # Random Search Parameter Tuning
            tuned_model = RandomizedSearchCV(estimator=ml_classifier,
                                             param_distributions=parameters,
                                             scoring=metric_type,
                                             return_train_score=True, 
                                             cv=cv,
                                             verbose=10, refit='AUC')
            # Tuning the model
            start_time = self.timer(0)
            model = tuned_model.fit(X, y)
            finishing_time = self.timer(start_time)

            file_name = f'{self.file_location}/{mlclassifier_name}.pkl'
            
            joblib.dump(model.best_estimator_, file_name)

            print("=="*40)
            print("Best parameters:{0}".format(model.best_params_))
            print("Best Estimator:{0}".format(model.best_estimator_))
            print("Best score:{0}".format(model.best_score_))
            print("=="*40)
            # saving the logs of model into a text file
            df = self.res_logs_text_file(mlclassifier_name,
                                         model,
                                         finishing_time,
                                         self.file_location)
            return cv_results_df.append(df)
        else:
            print("===========================================")
            print(f'{search_type} is wrong key word.'
                  f'Key word should be either 1.grid_search or 2.random_search')

        # save the model
        return cv_results_df

    def res_logs_text_file(self, mlclassifier_name, tuned_model, finish_time, file_location):
        """
        saving the result into a text files
        :param file_location: save resultant file location name. it must be with dataset name
        :param mlclassifier_name: classifier name
        :param tuned_model:  trained model
        :param finish_time: model finishing time
        :return: dataframe
        """
        with open(f'{file_location}/parameter_tuning.txt', 'a') as res_logs:
            res_logs.write('==' * 40)
            res_logs.write("\n")
            res_logs.write("1.Classifier:{0}\n".format(mlclassifier_name))
            res_logs.write("2.Best Parameters:{0}\n".format(str(tuned_model.best_params_)))
            res_logs.write("3.Duration:{0}\n".format(str(finish_time)))
            res_logs.write('4.Accuracy: %.5f ' % (tuned_model.best_score_ * 100 ))
            res_logs.write("\n5.Best Estimator{0}\n".format(str(tuned_model.best_estimator_)))
            res_logs.write('\n')
            res_logs.write('==' * 40)
            res_logs.write('\n')

        # cv results
        cv_results_df = pd.DataFrame(tuned_model.cv_results_)

        # save the model
        file_name = f'{self.file_location}/{mlclassifier_name}.pkl'
        joblib.dump(tuned_model, file_name)

        return cv_results_df

    # Time to  count the model for training.
    @staticmethod
    def timer(start_time=None):
        """

        :param start_time: 0
        :return: Completion time
        """
        time_list = []
        if not start_time:
            start_time = datetime.now()
            return start_time
        elif start_time:
            thour, temp_sec = divmod(
                (datetime.now() - start_time).total_seconds(), 3600)
            tmin, tsec = divmod(temp_sec, 60)
            # time_list.append(thour)
            # print("\n Time taken: %i hours %i minutes and %s seconds" % (thour, tmin, round(tsec,2)))
        return str("Time consumption: %i hours %i minutes and %s seconds" % (thour, tmin, round(tsec, 2)))

    def rf_classification(self):
        """
        Random Forest Classifier
        """
        # Initiate the classifier
        classifier = RandomForestClassifier(n_jobs=-1)
        # parameters
        rf_params = {
            'max_features': ['sqrt', 'auto', 'log2', None],
            'max_depth': list(range(5, 51)),
            'min_samples_leaf': list(range(1, 16)),
            'min_samples_split': list(range(2, 31)),
            'criterion': ['gini', 'entropy'],
            'random_state': [100]
        }

        print("Tuning Type:{0}\n".format(self.search_type))
        print("Classifier name:{0}\n".format(classifier.__class__.__name__))
        for key, value in rf_params.items():
            print("{0}:{1}".format(key, value))
        # parameters for grid search
        # fitting the grid search or random search
        cv_results = self._fit_grid_random_search(classifier, rf_params)
        return cv_results

    def dt_classification(self):
        """
        Decision Tree Classifier
        """
        # Initiate the classifier
        classifier = DecisionTreeClassifier()
        # parameters
        dt_params = {
            'max_features': ['sqrt', 'auto', 'log2', None],
            'max_depth': list(range(5, 51)),
            'min_samples_leaf': list(range(1, 16)),
            'min_samples_split': list(range(2, 31)),
            'criterion': ['gini', 'entropy'],
            'random_state': [100]
        }

        # print("Tuning Type:{0}\n".format(self.search_type))
        # print("Classifier name:{0}\n".format(classifier.__class__.__name__))
        # for key, value in dt_params.items():
        #     print("{0}:{1}".format(key, value))
        # parameters for grid search
        # fitting the grid search or random search
        cv_results = self._fit_grid_random_search(classifier, dt_params)
        return cv_results

    def knn_classification(self):
        """
K-nearest neighbor classification
        """
        # Initiate the classifier
        classifier = KNeighborsClassifier(n_jobs=-1)
        # parameters
        k_range = list(range(1, 31))
        knn_params = {
            'n_neighbors': list(range(1, 21, 1)),
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
        }
        # print("Tuning Type:{0}\n".format(self.search_type))
        # print("Classifier name:{0}\n".format(classifier.__class__.__name__))
        for key, value in knn_params.items():
            print("{0}:{1}".format(key, value))
        # parameters for grid search
        # fitting the grid search or random search
        cv_results = self._fit_grid_random_search(classifier, knn_params)
        return cv_results

    def xgboost_classification(self):
        """
xgboost
        """
        xgb_params = rf_params = {
            'num_leaves': sp_randint(6, 50),
            'min_child_samples': sp_randint(100, 500),
            'learning_rate': list(np.arange(0, 1.1, 0.4)),
            'max_depth': list(range(5, 51, 5)),
            'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
            'subsample': sp_uniform(loc=0.2, scale=0.8),
            'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
            'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
            'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]
        }
        xgb_classifier = xgb.XGBClassifier(objective='binary:logistic',
                                           use_label_encoder=False,
                                           random_state=100)

        cv_results = self._fit_grid_random_search(xgb_classifier, xgb_params)
        return cv_results

    def lgboost_classification(self):
        """
        Light gradient boosting
        """
        # parameters combinations
        lgb_params = {
            'num_leaves': sp_randint(6, 50),
            'learning_rate': list(np.arange(0, 1.1, 0.4)),
            'min_child_samples': sp_randint(100, 500),
            'max_depth': list(range(5, 51, 5)),
            'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
            'subsample': sp_uniform(loc=0.2, scale=0.8),
            'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
            'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
            'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]
        }

        lgbm_classifier = lgb.LGBMClassifier(
            random_state=314, silent=True, metric='None', n_jobs=4, n_estimators=5000)

        cv_results = self._fit_grid_random_search(lgbm_classifier, lgb_params)
        return cv_results

    def et_classification(self):
        """
        Extra tree Classification
        """
        xt_clf = ExtraTreesClassifier(verbose=10,
                                      random_state=123,
                                      n_jobs=-1)

        xt_params = {
            'n_estimators': [int(x) for x in range(200, 2000, 200)],
            'max_features': ['sqrt', 'auto', 'log2', None],
            'max_depth': [int(x) for x in np.linspace(10, 110, num=11)],
            'min_samples_leaf': sp_randint(1, 15),
            'min_samples_split': sp_randint(2, 30),
            'bootstrap': [True, False]}

        cv_results = self._fit_grid_random_search(xt_clf, xt_params)
        return cv_results

    def grdient_boosting_classification(self):
        """
        Gradient Boosting classifier
        """
        lgb_params = {
            'n_estimators': [int(x) for x in range(200, 2000, 200)],
            'max_depth': [int(x) for x in np.linspace(10, 110, num=11)],
            'learning_rate': [0.1, 0.001, 0.01]}

        gbc_clf = GradientBoostingClassifier()
        cv_results = self._fit_grid_random_search(gbc_clf, lgb_params)
        return cv_results

    def fitting_models(self):
        """
        fitting all the models
        """
        model_fitting_dict = {'dt': self.dt_classification(),
                              'rf': self.rf_classification(),
                              'ext': self.et_classification(),
                              'gbc': self.grdient_boosting_classification(),
                              'xgb': self.xgboost_classification(),
                              'lgb': self.lgboost_classification(),
                              'knn': self.knn_classification()
                              }
        return model_fitting_dict



In [2]:
import os
os.chdir(r"/gpfs/mariana/home/rkalak/xai_evaluation/project/Iot_Botnet_XAI/src/trained_models/")
scoring = {'accuracy': 'accuracy',
           'AUC': 'roc_auc',
           'F1': 'f1_micro',
           'Precision': 'precision',
           'Recall': 'recall'}

tunning = parameter_tuning(X, y,scoring, 'N-BaIoT', 'random_search')

NameError: name 'X' is not defined

In [52]:
res = tunning.xgboost_classification()

metric:{'accuracy': 'accuracy', 'AUC': 'roc_auc', 'F1': 'f1_micro', 'Precision': 'precision', 'Recall': 'recall'}
<class 'dict'>
Tuning Type:('random_search',)

Classifier is XGBClassifier
X variable: (8000, 115)
y Variable:(8000,)
Search Type:random_search

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START colsample_bytree=0.948031113846764, learning_rate=0.0, max_depth=15, min_child_samples=258, min_child_weight=10000.0, num_leaves=43, reg_alpha=2, reg_lambda=0, subsample=0.25562105422078213
Parameters: { "min_child_samples", "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 1/5; 1/10] END colsample_bytree=0.948031113846764, learning_rate=0.0, max_depth=15, min_child_samples=258, min_child_weight=100

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 2/5; 1/10] END colsample_bytree=0.948031113846764, learning_rate=0.0, max_depth=15, min_child_samples=258, min_child_weight=10000.0, num_leaves=43, reg_alpha=2, reg_lambda=0, subsample=0.25562105422078213; AUC: (train=0.500, test=0.500) F1: (train=0.497, test=0.510) Precision: (train=0.000, test=0.000) Recall: (train=0.000, test=0.000) accuracy: (train=0.497, test=0.510) total time=   0.1s
[CV 3/5; 1/10] START colsample_bytree=0.948031113846764, learning_rate=0.0, max_depth=15, min_child_samples=258, min_child_weight=10000.0, num_leaves=43, reg_alpha=2, reg_lambda=0, subsample=0.25562105422078213
Parameters: { "min_child_samples", "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 3/5; 1/10] END colsample_bytree=0.948031113846764,

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 4/5; 1/10] END colsample_bytree=0.948031113846764, learning_rate=0.0, max_depth=15, min_child_samples=258, min_child_weight=10000.0, num_leaves=43, reg_alpha=2, reg_lambda=0, subsample=0.25562105422078213; AUC: (train=0.500, test=0.500) F1: (train=0.501, test=0.497) Precision: (train=0.000, test=0.000) Recall: (train=0.000, test=0.000) accuracy: (train=0.501, test=0.497) total time=   0.1s
[CV 5/5; 1/10] START colsample_bytree=0.948031113846764, learning_rate=0.0, max_depth=15, min_child_samples=258, min_child_weight=10000.0, num_leaves=43, reg_alpha=2, reg_lambda=0, subsample=0.25562105422078213
Parameters: { "min_child_samples", "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 5/5; 1/10] END colsample_bytree=0.948031113846764,

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 1/5; 2/10] END colsample_bytree=0.9444137764884799, learning_rate=0.4, max_depth=45, min_child_samples=270, min_child_weight=1e-05, num_leaves=31, reg_alpha=50, reg_lambda=20, subsample=0.4469107185947923; AUC: (train=1.000, test=1.000) F1: (train=0.998, test=0.998) Precision: (train=0.997, test=0.995) Recall: (train=1.000, test=1.000) accuracy: (train=0.998, test=0.998) total time=   0.2s
[CV 2/5; 2/10] START colsample_bytree=0.9444137764884799, learning_rate=0.4, max_depth=45, min_child_samples=270, min_child_weight=1e-05, num_leaves=31, reg_alpha=50, reg_lambda=20, subsample=0.4469107185947923
Parameters: { "min_child_samples", "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 2/5; 2/10] END colsample_bytree=0.9444137764884799

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 2/5; 3/10] END colsample_bytree=0.7665782565593782, learning_rate=0.4, max_depth=20, min_child_samples=283, min_child_weight=1000.0, num_leaves=16, reg_alpha=0, reg_lambda=5, subsample=0.6150101711579531; AUC: (train=0.500, test=0.500) F1: (train=0.497, test=0.510) Precision: (train=0.000, test=0.000) Recall: (train=0.000, test=0.000) accuracy: (train=0.497, test=0.510) total time=   0.1s
[CV 3/5; 3/10] START colsample_bytree=0.7665782565593782, learning_rate=0.4, max_depth=20, min_child_samples=283, min_child_weight=1000.0, num_leaves=16, reg_alpha=0, reg_lambda=5, subsample=0.6150101711579531
Parameters: { "min_child_samples", "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 3/5; 3/10] END colsample_bytree=0.7665782565593782, 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 4/5; 3/10] END colsample_bytree=0.7665782565593782, learning_rate=0.4, max_depth=20, min_child_samples=283, min_child_weight=1000.0, num_leaves=16, reg_alpha=0, reg_lambda=5, subsample=0.6150101711579531; AUC: (train=0.500, test=0.500) F1: (train=0.501, test=0.497) Precision: (train=0.000, test=0.000) Recall: (train=0.000, test=0.000) accuracy: (train=0.501, test=0.497) total time=   0.1s
[CV 5/5; 3/10] START colsample_bytree=0.7665782565593782, learning_rate=0.4, max_depth=20, min_child_samples=283, min_child_weight=1000.0, num_leaves=16, reg_alpha=0, reg_lambda=5, subsample=0.6150101711579531
Parameters: { "min_child_samples", "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 5/5; 3/10] END colsample_bytree=0.7665782565593782, 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 1/5; 4/10] END colsample_bytree=0.4975643207961649, learning_rate=0.0, max_depth=15, min_child_samples=172, min_child_weight=0.1, num_leaves=39, reg_alpha=100, reg_lambda=5, subsample=0.7700835643113055; AUC: (train=0.500, test=0.500) F1: (train=0.503, test=0.486) Precision: (train=0.000, test=0.000) Recall: (train=0.000, test=0.000) accuracy: (train=0.503, test=0.486) total time=   0.3s
[CV 2/5; 4/10] START colsample_bytree=0.4975643207961649, learning_rate=0.0, max_depth=15, min_child_samples=172, min_child_weight=0.1, num_leaves=39, reg_alpha=100, reg_lambda=5, subsample=0.7700835643113055
Parameters: { "min_child_samples", "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 2/5; 4/10] END colsample_bytree=0.4975643207961649, learning_rate=0.0, max_depth=15, min_child_samples=172, min_child_weight=0.1, num_leaves=39, reg_alpha=100, reg_lambda=5, subsample=0.7700835643113055; AUC: (train=0.500, test=0.500) F1: (train=0.497, test=0.510) Precision: (train=0.000, test=0.000) Recall: (train=0.000, test=0.000) accuracy: (train=0.497, test=0.510) total time=   0.5s
[CV 3/5; 4/10] START colsample_bytree=0.4975643207961649, learning_rate=0.0, max_depth=15, min_child_samples=172, min_child_weight=0.1, num_leaves=39, reg_alpha=100, reg_lambda=5, subsample=0.7700835643113055
Parameters: { "min_child_samples", "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 3/5; 4/10] END colsample_bytree=0.4975643207961649, learning_rate=0.0, max_depth=15, min_child_samples=172, min_child_weight=0.1, num_leaves=39, reg_alpha=100, reg_lambda=5, subsample=0.7700835643113055; AUC: (train=0.500, test=0.500) F1: (train=0.499, test=0.504) Precision: (train=0.000, test=0.000) Recall: (train=0.000, test=0.000) accuracy: (train=0.499, test=0.504) total time=   0.3s
[CV 4/5; 4/10] START colsample_bytree=0.4975643207961649, learning_rate=0.0, max_depth=15, min_child_samples=172, min_child_weight=0.1, num_leaves=39, reg_alpha=100, reg_lambda=5, subsample=0.7700835643113055
Parameters: { "min_child_samples", "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 4/5; 4/10] END colsample_bytree=0.4975643207961649, learning_rate=0.0, max_depth=15, min_child_samples=172, min_child_weight=0.1, num_leaves=39, reg_alpha=100, reg_lambda=5, subsample=0.7700835643113055; AUC: (train=0.500, test=0.500) F1: (train=0.501, test=0.497) Precision: (train=0.000, test=0.000) Recall: (train=0.000, test=0.000) accuracy: (train=0.501, test=0.497) total time=   0.3s
[CV 5/5; 4/10] START colsample_bytree=0.4975643207961649, learning_rate=0.0, max_depth=15, min_child_samples=172, min_child_weight=0.1, num_leaves=39, reg_alpha=100, reg_lambda=5, subsample=0.7700835643113055
Parameters: { "min_child_samples", "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 5/5; 4/10] END colsample_bytree=0.4975643207961649, learning_rate=0.0, max_depth=15, min_child_samples=172, min_child_weight=0.1, num_leaves=39, reg_alpha=100, reg_lambda=5, subsample=0.7700835643113055; AUC: (train=0.500, test=0.500) F1: (train=0.499, test=0.502) Precision: (train=0.000, test=0.000) Recall: (train=0.000, test=0.000) accuracy: (train=0.499, test=0.502) total time=   0.3s
[CV 1/5; 5/10] START colsample_bytree=0.7351046530333595, learning_rate=0.8, max_depth=45, min_child_samples=143, min_child_weight=1000.0, num_leaves=20, reg_alpha=0, reg_lambda=0.1, subsample=0.22634254006167903
Parameters: { "min_child_samples", "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 1/5; 5/10] END colsample_bytree=0.7351046530333595

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 2/5; 5/10] END colsample_bytree=0.7351046530333595, learning_rate=0.8, max_depth=45, min_child_samples=143, min_child_weight=1000.0, num_leaves=20, reg_alpha=0, reg_lambda=0.1, subsample=0.22634254006167903; AUC: (train=0.500, test=0.500) F1: (train=0.497, test=0.510) Precision: (train=0.000, test=0.000) Recall: (train=0.000, test=0.000) accuracy: (train=0.497, test=0.510) total time=   0.1s
[CV 3/5; 5/10] START colsample_bytree=0.7351046530333595, learning_rate=0.8, max_depth=45, min_child_samples=143, min_child_weight=1000.0, num_leaves=20, reg_alpha=0, reg_lambda=0.1, subsample=0.22634254006167903
Parameters: { "min_child_samples", "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 3/5; 5/10] END colsample_bytree=0.735104653033

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 4/5; 5/10] END colsample_bytree=0.7351046530333595, learning_rate=0.8, max_depth=45, min_child_samples=143, min_child_weight=1000.0, num_leaves=20, reg_alpha=0, reg_lambda=0.1, subsample=0.22634254006167903; AUC: (train=0.500, test=0.500) F1: (train=0.501, test=0.497) Precision: (train=0.000, test=0.000) Recall: (train=0.000, test=0.000) accuracy: (train=0.501, test=0.497) total time=   0.1s
[CV 5/5; 5/10] START colsample_bytree=0.7351046530333595, learning_rate=0.8, max_depth=45, min_child_samples=143, min_child_weight=1000.0, num_leaves=20, reg_alpha=0, reg_lambda=0.1, subsample=0.22634254006167903
Parameters: { "min_child_samples", "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 5/5; 5/10] END colsample_bytree=0.735104653033

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 1/5; 6/10] END colsample_bytree=0.6487800392771721, learning_rate=0.0, max_depth=40, min_child_samples=389, min_child_weight=1000.0, num_leaves=9, reg_alpha=5, reg_lambda=50, subsample=0.9376523867859434; AUC: (train=0.500, test=0.500) F1: (train=0.503, test=0.486) Precision: (train=0.000, test=0.000) Recall: (train=0.000, test=0.000) accuracy: (train=0.503, test=0.486) total time=   0.1s
[CV 2/5; 6/10] START colsample_bytree=0.6487800392771721, learning_rate=0.0, max_depth=40, min_child_samples=389, min_child_weight=1000.0, num_leaves=9, reg_alpha=5, reg_lambda=50, subsample=0.9376523867859434
Parameters: { "min_child_samples", "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 2/5; 6/10] END colsample_bytree=0.6487800392771721, 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 3/5; 6/10] END colsample_bytree=0.6487800392771721, learning_rate=0.0, max_depth=40, min_child_samples=389, min_child_weight=1000.0, num_leaves=9, reg_alpha=5, reg_lambda=50, subsample=0.9376523867859434; AUC: (train=0.500, test=0.500) F1: (train=0.499, test=0.504) Precision: (train=0.000, test=0.000) Recall: (train=0.000, test=0.000) accuracy: (train=0.499, test=0.504) total time=   0.1s
[CV 4/5; 6/10] START colsample_bytree=0.6487800392771721, learning_rate=0.0, max_depth=40, min_child_samples=389, min_child_weight=1000.0, num_leaves=9, reg_alpha=5, reg_lambda=50, subsample=0.9376523867859434
Parameters: { "min_child_samples", "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 4/5; 6/10] END colsample_bytree=0.6487800392771721, 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV 1/5; 7/10] END colsample_bytree=0.48993496423153826, learning_rate=0.4, max_depth=20, min_child_samples=342, min_child_weight=1, num_leaves=7, reg_alpha=0.1, reg_lambda=1, subsample=0.5189426197819126; AUC: (train=1.000, test=1.000) F1: (train=1.000, test=1.000) Precision: (train=1.000, test=1.000) Recall: (train=1.000, test=1.000) accuracy: (train=1.000, test=1.000) total time=   0.1s
[CV 2/5; 7/10] START colsample_bytree=0.48993496423153826, learning_rate=0.4, max_depth=20, min_child_samples=342, min_child_weight=1, num_leaves=7, reg_alpha=0.1, reg_lambda=1, subsample=0.5189426197819126
Parameters: { "min_child_samples", "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 2/5; 7/10] END colsample_bytree=0.48993496423153826, learn

[CV 3/5; 8/10] END colsample_bytree=0.9562770851104112, learning_rate=0.4, max_depth=35, min_child_samples=132, min_child_weight=1e-05, num_leaves=47, reg_alpha=1, reg_lambda=0, subsample=0.8166615943825004; AUC: (train=1.000, test=1.000) F1: (train=1.000, test=0.999) Precision: (train=1.000, test=0.999) Recall: (train=1.000, test=1.000) accuracy: (train=1.000, test=0.999) total time=   0.2s
[CV 4/5; 8/10] START colsample_bytree=0.9562770851104112, learning_rate=0.4, max_depth=35, min_child_samples=132, min_child_weight=1e-05, num_leaves=47, reg_alpha=1, reg_lambda=0, subsample=0.8166615943825004
Parameters: { "min_child_samples", "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 4/5; 8/10] END colsample_bytree=0.9562770851104112, le

[CV 1/5; 10/10] END colsample_bytree=0.48815639187225207, learning_rate=0.4, max_depth=20, min_child_samples=143, min_child_weight=0.001, num_leaves=39, reg_alpha=100, reg_lambda=10, subsample=0.6848484029926902; AUC: (train=1.000, test=1.000) F1: (train=0.998, test=0.998) Precision: (train=0.997, test=0.995) Recall: (train=1.000, test=1.000) accuracy: (train=0.998, test=0.998) total time=   0.2s
[CV 2/5; 10/10] START colsample_bytree=0.48815639187225207, learning_rate=0.4, max_depth=20, min_child_samples=143, min_child_weight=0.001, num_leaves=39, reg_alpha=100, reg_lambda=10, subsample=0.6848484029926902
Parameters: { "min_child_samples", "num_leaves" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[CV 2/5; 10/10] END colsample_bytree=0.488156391

In [105]:
md = joblib.load(fl_name)

In [22]:
!pip show scikit-learn  

Name: scikit-learn
Version: 0.23.2
Summary: A set of python modules for machine learning and data mining
Home-page: http://scikit-learn.org
Author: None
Author-email: None
License: new BSD
Location: /gpfs/mariana/home/rkalak/.local/lib/python3.8/site-packages
Requires: numpy, scipy, joblib, threadpoolctl
Required-by: PyImpetus, lightgbm, pyLDAvis, pycaret, bamboolib, yellowbrick, ppscore, pyod, skrebate, shap, mlxtend, kmodes, imbalanced-learn, pynndescent, umap-learn, scikit-plot, Boruta, lime, aix360, cvplot, sklearn


In [9]:
!pip list 

Package                           Version                Location                                                    
--------------------------------- ---------------------- ------------------------------------------------------------
about-time                        3.1.1                  
absl-py                           1.0.0                  
aix360                            0.1.0                  
alembic                           1.8.1                  
alive-progress                    2.4.1                  
analytics-python                  1.2.9                  
anyio                             3.4.0                  
appdirs                           1.4.3                  
argon2-cffi                       21.1.0                 
asgiref                           3.5.2                  
astor                             0.8.1                  
astunparse                        1.6.3                  
attrs                             21.4.0                 
Automat   

In [10]:
import pickle

In [53]:
type(X)

pandas.core.frame.DataFrame