In [1]:
import warnings
import random
import pickle

import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.exceptions import ConvergenceWarning
import xgboost as xgb

warnings.simplefilter(action='ignore', category=ConvergenceWarning)
warnings.simplefilter(action='ignore', category=UserWarning)


Notes: 
- for final presentation, maybe mention development date for algorithms 
- visualisierung für probierte werte 
- vergleichbare projekte von anderen + deren reached accuracy ( zum vergleich, um zu beweisen, dass unsere gut?)

# **0. Data Prep**

In [13]:
df = pd.read_csv('./data/brfss_imputed.csv')
pd.options.display.max_columns = None

# remove double indexing
df = df.drop(df.columns[0], axis=1)

# # checking for nan's
# print("No. of columns containing null values")
# print(len(df.columns[df.isna().any()]))

# print("No. of rows containing null values")
# print(df.isnull().any(axis=1).sum())

# dropping unsused columns
df = df[df.columns.drop(list(df.filter(regex='unk_')))]
df = df[df.columns.drop(list(df.filter(regex='not_known_')))]
df = df[df.columns.drop(list(df.filter(regex='_was_missing')))]
df = df[df.columns.drop(["CVDCRHD4", "CVDINFR4", "CVDSTRK3", "ASTHMA3", "CHCOCNCR",
                        "ASTHNOW", "CHCSCNCR", "CHCCOPD3", "ADDEPEV3", "CHCKDNY2", "HAVARTH5"])]

# splitting into target & features df
target = df['DIABETE4']
features = df.drop(['DIABETE4'], axis=1)

# splitting into training and test data
features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=0.2, random_state=42)

print(features_train.shape)
# print(features_test.shape)
# print(target_train.shape)
# print(target_test.shape)


(302304, 81)


## **3.1 Optimizing n_estimators & max_depth**

In [4]:
# # Optimizing n_estimators to reach the highest possible accuracy

# # N_ESTIMATORS should be max the SEARCHSPACE so all possibilities are tried once
# MAX_EVALS =
# SEARCH_SPACE = [hp.uniformint('n_estimators', 3000, 30000), hp.uniformint('max_depth', 0, 100)]

# ### Optimizaion ##############################################################################################################


# def cost_function(params):
#     n_estimators = params[0]
#     max_depth = params[1]
#     if n_estimators == 0:
#         return 0
#     xgb_classifier = xgb.XGBClassifier(n_estimators=n_estimators, objective='binary:logistic', tree_method='hist', eta=0.1, max_depth=max_depth, random_state = 0).fit(features_train, target_train)
#     sgb_predictions = xgb_classifier.predict(features_test)
#     sgb_accuracy = accuracy_score(target_test, sgb_predictions)
#     return {'loss': - sgb_accuracy, 'status': STATUS_OK}


# trials = Trials()
# best = fmin(cost_function,
#             space=SEARCH_SPACE,
#             algo=tpe.suggest,
#             max_evals=MAX_EVALS,
#             trials=trials)

# print(best)


best loss:  -0.871387742140362

{'n_estimators': 2265.0}

## **3.2 Training & saving model with optimized n_estimators**

In [3]:
n_estimators = 226

xgb_classifier = xgb.XGBClassifier(n_estimators=n_estimators, objective='binary:logistic',
                                   tree_method='hist', eta=0.1, max_depth=3).fit(features_train, target_train)
sgb_predictions = xgb_classifier.predict(features_test)
sgb_accuracy = accuracy_score(target_test, sgb_predictions)


filename = 'xgb_model.pickle'

pickle.dump(xgb_classifier, open(filename, "wb"))


In [4]:
print(sgb_accuracy)

0.8680136551286123


In [6]:
filename = 'model_final.pickle'

pickle.dump(xgb_classifier, open(filename, "wb"))


## **3.3 Importing model to predict for one person**

In [16]:
# Usage Example' #################################################################################################

xgb_classifier = pickle.load(open('model_final.pickle', "rb"))


# This would be the answers given by the user
# (here i am getting a random row from the dataset for demonstration)
example_input = features_test.iloc[576]
example_input = example_input.to_numpy()
print ( example_input.shape)

def make_prediction(xgb_classifier, input):
    prediction = xgb_classifier.predict(input)

    return prediction


prediction = make_prediction(xgb_classifier, example_input)
print(prediction)


(81,)


XGBoostError: [10:24:50] C:/buildkite-agent/builds/buildkite-windows-cpu-autoscaling-group-i-08de971ced8a8cdc6-1/xgboost/xgboost-ci-windows/src/predictor/cpu_predictor.cc:377: Check failed: m->NumColumns() == model.learner_model_param->num_feature (1 vs. 81) : Number of columns in data must equal to trained model.

# **4. Vizualisations**

In [None]:
# tested values over all trials

sns.set(style="whitegrid", palette="Accent")

tids = [t['tid'] for t in trials.trials]
n_estimators = [t['misc']['vals']['n_estimators']for t in trials]

fig, ax = plt.subplots()
ax.scatter(tids, n_estimators)

ax.legend(('n_estimators'), loc='lower right')
ax.set_ylabel('n_estimators over all trials')
ax.set_xlabel('trialIDs')
fig.set_size_inches(10, 5)

fig.savefig('./visualizations/tested_values.png')


In [None]:
# loss Improvement over Trials
from itertools import chain

# loss Improvement over Trials


def plot_reached_min_losses(trials):
    losses = [t['result']['loss'] for t in trials]
    tids = [t['tid'] for t in trials.trials]
    n_estimators = [t['misc']['vals']['n_estimators'] for t in trials]

    n_estimators = list(chain.from_iterable(n_estimators))

    best_loss = losses[0]
    points_to_plot = []
    points_to_plot.append(losses[0])
    tids_with_loss_improvement = [0]
    counter = 0
    for i in range(1, len(losses)):
        if losses[i] < best_loss:
            tid = tids[i]
            # print(tid)
            points_to_plot.append(losses[i])
            tids_with_loss_improvement.append(tid)
            best_loss = losses[i]

    # plotting with logarithmic y-scale
    sns.set(style="whitegrid")
    fig, ax = plt.subplots()
    # ax.set_yscale('log')
    ax.set_ylabel('developement of min loss')
    ax.set_xlabel('Trial-IDs')
    fig.set_size_inches(15, 5)
    ax.plot(tids_with_loss_improvement, points_to_plot,
            color="mediumpurple", linestyle='-', marker='o')
    ax.scatter(tids, losses, color='skyblue')


plot_reached_min_losses(trials)

fig.savefig('./visualizations/loss_improvement.png')
