In [None]:
#---------------------------------------------------------------------------------------------LIBRARIES--------------------------------------------------------------------------------------------         #imports for data handling, the XGBoost classifier and the hyperparameter search
import math
import csv 
import numpy as np                                                                                     #import numpy mathematical library
import pandas as pd

from IPython.core.display import display, HTML                                    
display(HTML("<style>.container { width:100% !important; }</style>"))                                  #change width of Jupyer Notebook to use the whole window resolution availab

# import the classifier
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from bayes_opt import BayesianOptimization

In [None]:
#dataset selection and loading
#the CSV files are read without a header row, so every column is addressed by position below
train_set_name = "weather_data_2000_2019_3c"
train_dataset = pd.read_csv("./datasets/" + train_set_name + ".csv", header=None)
test_set_name = "weather_data_2020_2021_3c"
test_dataset = pd.read_csv("./datasets/" + test_set_name + ".csv", header=None)

print("Your Train Dataset is: ", train_set_name)                                                                       #display dataset name to user
print("Your Test Dataset is: ", test_set_name)                                                                       #display dataset name to user

features_selected = 5
#training features (single-column slices so that each one stays a DataFrame)
temperature_train = train_dataset.iloc[:, 4:5]
feels_like_train = train_dataset.iloc[:, 7:8]
dew_point_train = train_dataset.iloc[:, 8:9]
humidity_train = train_dataset.iloc[:, 9:10]
pressure_train = train_dataset.iloc[:, 19:20]
#uv_index_train = train_dataset.iloc[:, 24:25]

#testing data (same column positions as the training features)
temperature_test = test_dataset.iloc[:, 4:5]
feels_like_test = test_dataset.iloc[:, 7:8]
dew_point_test = test_dataset.iloc[:, 8:9]
humidity_test = test_dataset.iloc[:, 9:10]
pressure_test = test_dataset.iloc[:, 19:20]
#uv_index_test = test_dataset.iloc[:, 24:25]

X_train = pd.concat([temperature_train, feels_like_train, dew_point_train, humidity_train, pressure_train], axis=1)  #, uv_index_train], axis=1)
#replace any infinite values with nan: to_replace and value are passed as two separate arguments
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_train = X_train.to_numpy().astype('float64')

#labels are kept as (n, 1) int32 column vectors
y_train = train_dataset.iloc[:, 30:31].to_numpy().astype('int32')
y_true = test_dataset.iloc[:, 30:31].to_numpy().astype('int32')

#https://pandas.pydata.org/pandas-docs/stable/user_guide/merging.html
test_data = pd.concat([temperature_test, feels_like_test, dew_point_test, humidity_test, pressure_test], axis=1)  #, uv_index_test], axis=1)
#replace any infinite values with nan
test_data = test_data.replace([np.inf, -np.inf], np.nan)
X_test = test_data.to_numpy().astype('float64')

#change all nan values in the feature matrices with the most frequent value of each column
#the imputer learns the fill values from the training features only and applies the same
#values to the test features, so no statistics of the test set leak into preprocessing
#the label arrays are not imputed: they are already cast to int32 above and cannot hold nan
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X_train = imp.fit_transform(X_train)
X_test = imp.transform(X_test)

In [None]:
#https://www.analyticsvidhya.com/blog/2020/09/alternative-hyperparameter-optimization-technique-you-need-to-know-hyperopt/
#https://medium.com/analytics-vidhya/hyperparameter-tuning-hyperopt-bayesian-optimization-for-xgboost-and-neural-network-8aedf278a1c9
def hyperparameter_tuning(space):                                   #bayesian optimisation
    """Hyperopt objective: train one XGBoost classifier and score it.

    Parameters
    ----------
    space : dict
        One sample drawn from the search space. Keys read here:
        'n_estimators', 'max_depth', 'gamma', 'reg_alpha',
        'min_child_weight', 'colsample_bytree' and, when present,
        'reg_lambda'.

    Returns
    -------
    dict
        'loss'   : negative accuracy on (X_test, y_true); negated because
                   hyperopt's fmin minimises the loss.
        'status' : hyperopt.STATUS_OK.
        'model'  : the fitted XGBClassifier.

    Notes
    -----
    Reads the module-level arrays X_train, y_train, X_test and y_true.

    NOTE(review): the same (X_test, y_true) split drives early stopping and
    is the tuning objective, so the reported accuracy is an optimistic
    estimate; a separate validation split would avoid that.
    """
    # The label arrays are stored as (n, 1) columns; flatten them to the 1-D
    # shape the estimator and the metric expect.
    train_labels = np.ravel(y_train)
    test_labels = np.ravel(y_true)

    model = XGBClassifier(
        objective='multi:softmax',
        n_estimators=int(space['n_estimators']),
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        reg_alpha=int(space['reg_alpha']),
        # reg_lambda is sampled by the search space, so it has to reach the
        # model; fall back to 1 (the documented XGBoost default for lambda)
        # when a caller's space does not include it.
        reg_lambda=space.get('reg_lambda', 1),
        min_child_weight=space['min_child_weight'],
        colsample_bytree=space['colsample_bytree'],
    )
    evaluation = [(X_train, train_labels), (X_test, test_labels)]

    # NOTE(review): newer XGBoost releases expect eval_metric and
    # early_stopping_rounds on the constructor instead of fit() - confirm
    # against the installed version before upgrading.
    model.fit(X_train, train_labels, eval_set=evaluation, eval_metric="merror", early_stopping_rounds=20, verbose=True)

    pred = model.predict(X_test)
    accuracy = accuracy_score(test_labels, pred)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK, 'model': model}

def main():
    """Run the hyperopt TPE search over the XGBoost hyperparameters.

    Builds the search space, optionally displays the module-level datasets
    (X_train, y_train, X_test) on request, runs 300 evaluations of
    hyperparameter_tuning and prints the best hyperparameter values found.
    """
    # candidate tree counts: every integer from 10 to 499
    n_est = list(range(10, 500))

    space = {
        'max_depth': hp.quniform("max_depth", 2, 50, 1),
        'gamma': hp.uniform('gamma', 1, 20),
        'reg_alpha': hp.quniform('reg_alpha', 10, 200, 1),
        'reg_lambda': hp.uniform('reg_lambda', 0, 1),
        'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
        'min_child_weight': hp.quniform('min_child_weight', 0, 20, 1),
        'n_estimators': hp.choice("n_estimators", n_est),
    }

    answer = input("Please choose 1 to display the dataset or any button to cotinue without displaying!")
    # The prompt promises that any other entry simply continues, so an empty
    # or non-numeric answer must not raise; it is treated as "do not display".
    try:
        show_data = int(answer) == 1
    except ValueError:
        show_data = False

    if show_data:
        display(X_train)
        display(y_train.ravel())
        display(X_test)
    else:
        print("Not displaying dataset!")

    trials = Trials()
    best = fmin(fn=hyperparameter_tuning, space=space, algo=tpe.suggest, max_evals=300, trials=trials)

    # fmin reports hp.choice parameters as an index into the choice list;
    # space_eval maps the result back to the actual hyperparameter values.
    print(hyperopt.space_eval(space, best))

# Entry point: start the hyperparameter search only when this code is executed
# directly (module name "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":                                                                                               #script guard boilerplate
    main()