In [None]:
import warnings

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.exceptions import ConvergenceWarning
import xgboost as xgb

warnings.simplefilter(action='ignore', category=ConvergenceWarning)
warnings.simplefilter(action='ignore', category=UserWarning)


Notes:
- For the final presentation, maybe mention the development date of each algorithm.
- Add a visualization of the hyperparameter values that were tried.
- Look up comparable projects by others and the accuracy they reached (as a benchmark, to show that our results are good?).

# **0. Data Prep**

In [None]:
# Load the imputed BRFSS export.
df = pd.read_csv('./data/brfss_imputed.csv')

# The first column is dropped (per the original note it duplicates the index).
df = df.drop(columns=df.columns[0])

# Drop helper/indicator columns whose names match any of these patterns.
for unused_pattern in ('unk_', 'not_known_', '_was_missing'):
    matching_columns = list(df.filter(regex=unused_pattern))
    df = df.drop(columns=matching_columns)

# Drop further columns that are not used by this notebook.
unused_columns = [
    "CVDCRHD4", "CVDINFR4", "CVDSTRK3", "ASTHMA3", "CHCOCNCR", "ASTHNOW",
    "CHCSCNCR", "CHCCOPD3", "ADDEPEV3", "CHCKDNY2", "HAVARTH5",
]
df = df.drop(columns=unused_columns)

# Separate the prediction target from the feature columns.
target = df['DIABETE4']
features = df.drop(columns=['DIABETE4'])

categoric_features = [
    'SEXVAR', 'GENHLTH', 'PERSDOC3', 'MEDCOST1', 'CHECKUP1', 'EXERANY2',
    'BPMEDS', 'TOLDHI3', 'CHOLMED3', 'DIABETE4', 'VETERAN3', 'PREGNANT',
    'DEAF', 'BLIND', 'DECIDE', 'DIFFWALK',
    'DIFFDRES', 'DIFFALON', 'SMOKE100', 'FLUSHOT7', 'PNEUVAC4', 'HIVTST7',
    '_METSTAT', '_URBSTAT', '_HLTHPLN', '_INCOMG1', 'high_blood_pressure',
    'pregnant_high_blood_pressure',
    'borderline_high_blood_pressure', 'cholesterol_checked_within_year',
    'married', 'divorced', 'widowed', 'separated', 'never_married',
    'unmarried_couple', 'own_house',
    'renting', 'other_arrangement_housing', 'employed_for_wages',
    'self_employed', 'out_of_work_year_plus', 'out_of_work_year_less',
    'homemaker', 'student', 'retired', 'unable_to_work',
    'smoke_every_day', 'smoke_some_days', 'smoke_not_at_all',
    'smokeless_every_day', 'smokeless_some_days', 'smokeless_not_at_all',
    'ecig_every_day', 'ecig_some_days', 'ecig_not_at_all',
    'ecig_never_used', 'white', 'black', 'asian', 'native', 'hispanic',
    'not_graduate_high_school', 'graduated_high_school', 'attended_college',
    'graduated_college', 'DIABETE4',
]

# NOTE(review): `astype` returns a new Series and the result is discarded here,
# so no column is actually converted to the "category" dtype (and `features`
# was already split off above). The loop currently only verifies that every
# listed column exists in `df`. Really casting the feature columns would also
# require `enable_categorical=True` on every `xgb.DMatrix` built from them.
for column_name in categoric_features:
    df[column_name].astype("category")


# Hold out 20% of the rows as a test set; the seed keeps the split reproducible.
features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

# **xgboost Classification**

### **scikit-learn Interface**

In [None]:
# Grid over learning rate and tree depth for the scikit-learn XGBoost wrapper.
learning_rate_range = np.arange(0.01, 1, 0.05)
max_depth = np.arange(1, 5, 1)

# Flat score lists with one entry per (learning rate, max depth) pair, appended
# learning-rate-major (all depths for the first rate, then the second, ...).
test_XG = []
train_XG = []
train_accuracy = []
test_accuracy = []

# Track the best candidate so that `xgb_classifier` refers to the best model
# after the search, not merely to the last one that happened to be fitted.
# NOTE(review): selection uses the test split, so the reported test accuracy is
# optimistically biased; a separate validation split would avoid that.
best_classifier = None
best_params = None
best_test_score = -np.inf

for lr in learning_rate_range:
    for depth in max_depth:
        candidate = xgb.XGBClassifier(
            eta=lr, tree_method="gpu_hist", max_depth=depth, enable_categorical=True)
        candidate.fit(features_train, target_train)

        train_score = candidate.score(features_train, target_train)
        test_score = candidate.score(features_test, target_test)
        train_XG.append(train_score)
        test_XG.append(test_score)

        if best_classifier is None or test_score > best_test_score:
            best_classifier = candidate
            best_params = {"eta": float(lr), "max_depth": int(depth)}
            best_test_score = test_score

# Downstream cells (e.g. saving the model) use this name.
xgb_classifier = best_classifier

print(max(train_XG))
print(max(test_XG))
print(best_params)

In [None]:
# Persist the model currently bound to `xgb_classifier` in XGBoost's JSON format.
model_path = 'xgb_model.json'
xgb_classifier.save_model(model_path)

In [None]:
# Reload the saved model into the scikit-learn wrapper rather than a raw
# `xgb.Booster`, so `xgb_classifier` keeps its classifier interface
# (`predict` / `score` on DataFrames) after this cell runs.
xgb_classifier = xgb.XGBClassifier()
xgb_classifier.load_model("xgb_model.json")


In [None]:
# `train_XG` / `test_XG` hold one score per (learning rate, max depth) pair,
# appended learning-rate-major by the grid search, so they are longer than
# `learning_rate_range` and cannot be plotted against it directly.
# Reshape to (n_learning_rates, n_depths) and draw one curve per depth.
n_rates = len(learning_rate_range)
train_grid = np.asarray(train_XG).reshape(n_rates, -1)
test_grid = np.asarray(test_XG).reshape(n_rates, -1)

fig, ax = plt.subplots(figsize=(10, 7))
for column, depth in enumerate(max_depth):
    # Solid line = train, dashed line in the same colour = test.
    train_line, = ax.plot(learning_rate_range, train_grid[:, column],
                          linestyle='-', label=f'Train (max_depth={depth})')
    ax.plot(learning_rate_range, test_grid[:, column],
            linestyle='--', color=train_line.get_color(),
            label=f'Test (max_depth={depth})')

ax.set_xlabel('Learning rate')
ax.set_xticks(learning_rate_range)
ax.set_ylabel('Accuracy score')
ax.set_ylim(0.6, 1)
ax.legend(prop={'size': 12}, loc=3, ncol=2)
ax.set_title('Accuracy score vs. Learning rate of XGBoost', size=14)
plt.show()

### **xgboost** 

In [None]:
# The native API consumes DMatrix objects. `enable_categorical` is a DMatrix
# argument (not a booster parameter); it is harmless when no column has the
# pandas "category" dtype.
train = xgb.DMatrix(features_train, target_train, enable_categorical=True)
test = xgb.DMatrix(features_test, target_test, enable_categorical=True)

# Booster parameters are passed as a dict.
# NOTE(review): no "objective" is set, so XGBoost's default regression
# objective is used. If DIABETE4 is a 0/1 label, "binary:logistic" is probably
# what is wanted - confirm against the data before changing it.
params = {
    "learning_rate": 0.01,
    # "tree_method" : gpu_hist,
    "max_depth": 3,
}

# Stop once the metric on the last entry of `evals` ("validation") has not
# improved for this many consecutive rounds.
early_stopping_rounds = 50

model_xgb = xgb.train(
    params,
    train,
    num_boost_round=1000,
    evals=[(train, "train"), (test, "validation")],
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=50,  # log every 50th round instead of all 1000
)


In [None]:
# Number of boosting rounds up to and including the best iteration.
# `best_ntree_limit` is a legacy attribute that newer XGBoost releases no
# longer provide; `best_iteration + 1` gives the same count for a model with
# one tree per round.
print(model_xgb.best_iteration + 1)

# Largest raw prediction on the test set (vectorised instead of a Python-level max).
model_xgb.predict(test).max()

# **4. Visualizations**

In [None]:
# # tested values over all trials

# sns.set(style="whitegrid", palette="Accent")

# tids = [t['tid'] for t in trials.trials]
# n_estimators = [t['misc']['vals']['n_estimators']for t in trials]

# fig, ax = plt.subplots()
# ax.scatter(tids, n_estimators)

# ax.legend(('n_estimators'), loc='lower right')
# ax.set_ylabel('n_estimators over all trials')
# ax.set_xlabel('trialIDs')
# fig.set_size_inches(10, 5)

# fig.savefig('./visualizations/tested_values.png')


In [None]:
# # loss Improvement over Trials
# from itertools import chain

# # loss Improvement over Trials


# def plot_reached_min_losses(trials):
#     losses = [t['result']['loss'] for t in trials]
#     tids = [t['tid'] for t in trials.trials]
#     n_estimators = [t['misc']['vals']['n_estimators'] for t in trials]

#     n_estimators = list(chain.from_iterable(n_estimators))

#     best_loss = losses[0]
#     points_to_plot = []
#     points_to_plot.append(losses[0])
#     tids_with_loss_improvement = [0]
#     counter = 0
#     for i in range(1, len(losses)):
#         if losses[i] < best_loss:
#             tid = tids[i]
#             # print(tid)
#             points_to_plot.append(losses[i])
#             tids_with_loss_improvement.append(tid)
#             best_loss = losses[i]

#     # plotting with logarithmic y-scale
#     sns.set(style="whitegrid")
#     fig, ax = plt.subplots()
#     # ax.set_yscale('log')
#     ax.set_ylabel('developement of min loss')
#     ax.set_xlabel('Trial-IDs')
#     fig.set_size_inches(15, 5)
#     ax.plot(tids_with_loss_improvement, points_to_plot,
#             color="mediumpurple", linestyle='-', marker='o')
#     ax.scatter(tids, losses, color='skyblue')


# plot_reached_min_losses(trials)

# fig.savefig('./visualizations/loss_improvement.png')


### **relics & failed tests**

In [None]:
# # Optimizing n_estimators to reach the highest possible accuracy

# # N_ESTIMATORS should be max the SEARCHSPACE so all possibilities are tried once
# MAX_EVALS = 20 
# SEARCH_SPACE = [hp.uniformint('n_estimators', 2000, 8000), hp.uniformint('max_depth', 0, 25)]

# ### Optimizaion ##############################################################################################################


# def cost_function(params):
#     n_estimators = params[0]
#     max_depth = params[1]
#     if n_estimators == 0:
#         return 0
#     xgb_classifier = xgb.XGBClassifier(n_estimators=n_estimators, objective='binary:logistic', tree_method='hist', eta=0.1, max_depth=max_depth, random_state = 0).fit(features_train, target_train)
#     xgb_predictions = xgb_classifier.predict(features_test)
#     xgb_accuracy = accuracy_score(target_test, xgb_predictions)
#     return {'loss': - xgb_accuracy, 'status': STATUS_OK}


# trials = Trials()
# best = fmin(cost_function,
#             space=SEARCH_SPACE,
#             algo=tpe.suggest,
#             max_evals=MAX_EVALS,
#             trials=trials)

# print(best)


best loss:  -0.871387742140362

{'n_estimators': 2265.0}

In [None]:
# n_estimators = 5120
# max_depth = 3

# xgb_classifier = xgb.XGBClassifier(n_estimators=n_estimators, objective='binary:logistic',
#                                    tree_method='hist', eta=0.1, max_depth=max_depth).fit(features_train, target_train)
# xgb_predictions = xgb_classifier.predict(features_test)
# xgb_accuracy = accuracy_score(target_test, xgb_predictions)
# print(xgb_accuracy)


In [None]:
# # filename = 'xgb_model.pickle'

# # pickle.dump(xgb_classifier, open(filename, "wb"))


# xgb_classifier.save_model('xgb_model.json')

# xgb.Booster().load_model('xgb_model.json')

In [None]:
# model = xgb.Booster({'max_depth': 3, 'n_estimators' : 5120})  # init model
# model.load_model('xgb_model')  # load data

In [None]:
# # Usage Example' #################################################################################################

# xgb_classifier = pickle.load(open(filename, "rb"))

# example_input = features_test.iloc[576]
# # print (example_input.shape)

# def make_prediction(xgb_classifier, input):
#     prediction = xgb_classifier.predict(input.to_numpy())

#     return prediction


# prediction = make_prediction(xgb_classifier, example_input)
# print(prediction)
