In [27]:
import warnings
import random
import pickle

import pandas as pd

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.exceptions import ConvergenceWarning

from collections import Counter

# Silence warning categories for the whole notebook. The filters are installed
# in the same order as before (ConvergenceWarning first, then UserWarning).
# NOTE(review): ignoring every UserWarning also hides unrelated library
# warnings — confirm this breadth is intended.
for _ignored_category in (ConvergenceWarning, UserWarning):
    warnings.simplefilter(action='ignore', category=_ignored_category)

Notes:
- For the final presentation, maybe mention when each algorithm was developed.
- Add a visualization of the hyperparameter values that were tried.
- Find comparable projects by others and the accuracy they reached (as a benchmark, to show that ours is good?).
- Possibly only test `max_iter` in the range of about 200–400, and cast the value to int.

# **0. Data Prep**

In [43]:
# Read the dataset used throughout the notebook. The path is relative to the
# notebook's working directory.
DATA_CSV = './data/brfss_wo_null.csv'
df = pd.read_csv(DATA_CSV)

In [45]:
# Class balance of the target column DIABETE4 (*Ever told you had diabetes*).
diabetes_counts = Counter(df['DIABETE4'])
print(diabetes_counts)

# Last expression of the cell: the distinct label values of the target.
df['DIABETE4'].unique()

Counter({0.0: 324770, 1.0: 53110})


array([0., 1.])

In [47]:
# Drop the first column by position.
# NOTE(review): presumably a leftover index column from the CSV export — confirm.
# This step is not idempotent: re-running the cell drops one more column, so
# reload `df` before executing it again.
df = df.drop(df.columns[0], axis=1)

# Remove columns containing NaN values.
df = df.dropna(axis=1)

# Renumber the rows 0..n-1. `set_axis(..., inplace=True)` is deprecated (it
# emitted a warning here) and the `inplace` argument no longer exists in
# pandas 2.x; `reset_index(drop=True)` yields the same positional index.
df = df.reset_index(drop=True)

# Separate the target ('DIABETE4') from the features.
target = df['DIABETE4']
features = df.drop(columns=['DIABETE4'])

# Split into training and test data (80/20, fixed seed for reproducibility).
features_train, features_test, target_train, target_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Sanity check: the split must neither lose nor duplicate rows.
assert len(features_train) + len(features_test) == len(features)
assert len(target_train) + len(target_test) == len(target)

print(features_train.shape)
print(features_test.shape)
print(target_train.shape)
print(target_test.shape)


  df.set_axis(range(len(df)), inplace=True)


(302304, 174)
(75576, 174)
(302304,)
(75576,)


# **1. Support Vector Machines**

**Conclusion:**

- Using a Pipeline/StandardScaler: best reachable accuracy: 76 % with max_iter = 93

-> Don't use; approaches 2 and 3 perform better

In [None]:
# Optimizing max_iter to reach the highest possible accuracy

# MAX_EVALS is the number of trials hyperopt runs. TPE *samples* candidates
# from the search space (repeats are possible), so setting MAX_EVALS to the
# size of the space does not guarantee that every value is tried.
MAX_EVALS = 10
SEARCH_SPACE = [hp.randint('max_iter', 100)]

### Optimization #############################################################################################################
def cost_function(max_iter):
    """hyperopt objective: negative accuracy of a scaled SVC pipeline.

    Args:
        max_iter: The sample drawn from SEARCH_SPACE; because the space is a
            one-element list, the candidate value sits at index 0.

    Returns:
        A hyperopt result dict whose 'loss' is the negated accuracy, so that
        minimizing the loss maximizes accuracy.

    NOTE(review): candidates are scored on the held-out test split
    (features_test / target_test), so the best accuracy reported by the search
    is optimistically biased; consider tuning on a separate validation split.
    """
    # hyperopt may hand over a NumPy scalar; sklearn expects a plain integer.
    max_iter = int(max_iter[0])
    if max_iter == 0:
        # The search space includes 0. Score it with loss 0, which is worse
        # than any model with non-zero accuracy, using the same dict shape as
        # the regular return value.
        return {'loss': 0.0, 'status': STATUS_OK}

    svm_classifier = make_pipeline(StandardScaler(), svm.SVC(max_iter=max_iter))
    svm_classifier.fit(features_train, target_train)
    svm_predictions = svm_classifier.predict(features_test)
    svm_accuracy = accuracy_score(target_test, svm_predictions)

    return {'loss': -svm_accuracy, 'status': STATUS_OK}

trials = Trials()
best = fmin(cost_function,
    space = SEARCH_SPACE,
    algo = tpe.suggest,
    max_evals = MAX_EVALS,
    trials = trials)

print(best)

In [None]:
# Predicting for one specific 'person' #################################################################################################
# NOTE: this cell is currently disabled — every statement below is commented out.
# If it is re-enabled, the hard-coded upper bound 32581 should be derived from
# the size of the test split instead.

# max_iter = best['max_iter']
# svm_classifier = make_pipeline(StandardScaler(), svm.SVC(max_iter=max_iter)).fit(features_train, target_train)

# taking a random row from the test data to predict a result for:
# def make_test_prediction(svm_classifier):
#     rand_index = random.randint(0, 32581)
#     test_row = features_test.iloc[rand_index] #.values.flatten().tolist()
#     test_groundtruth = target_test.iloc[rand_index]
#     prediction = svm_classifier.predict([test_row])
#     
#     return (prediction, test_groundtruth)

# prediction, test_groundtruth = make_test_prediction(svm_classifier)
# print (prediction == test_groundtruth)

# **2. Stochastic Gradient Descent**

**Conclusion:**

- Using a Pipeline/StandardScaler: best reachable accuracy: 88 %, with max_iter = 14

In [None]:
# Optimizing max_iter to reach the highest possible accuracy

# MAX_EVALS is the number of trials hyperopt runs. TPE *samples* candidates
# from the search space (repeats are possible), so setting MAX_EVALS to the
# size of the space does not guarantee that every value is tried.
MAX_EVALS = 15
SEARCH_SPACE = [hp.randint('max_iter', 100)]

### Optimization #############################################################################################################
def cost_function(max_iter):
    """hyperopt objective: negative accuracy of a scaled SGDClassifier pipeline.

    Args:
        max_iter: The sample drawn from SEARCH_SPACE; because the space is a
            one-element list, the candidate value sits at index 0.

    Returns:
        A hyperopt result dict whose 'loss' is the negated accuracy, so that
        minimizing the loss maximizes accuracy.

    NOTE(review): candidates are scored on the held-out test split
    (features_test / target_test), so the best accuracy reported by the search
    is optimistically biased. SGDClassifier is also created without a
    random_state, so repeated runs can rank the candidates differently.
    """
    # hyperopt may hand over a NumPy scalar; sklearn expects a plain integer.
    max_iter = int(max_iter[0])
    if max_iter == 0:
        # The search space includes 0. Score it with loss 0, which is worse
        # than any model with non-zero accuracy, using the same dict shape as
        # the regular return value.
        return {'loss': 0.0, 'status': STATUS_OK}

    sgd_classifier = make_pipeline(StandardScaler(), SGDClassifier(max_iter=max_iter))
    sgd_classifier.fit(features_train, target_train)
    sgd_predictions = sgd_classifier.predict(features_test)
    sgd_accuracy = accuracy_score(target_test, sgd_predictions)

    return {'loss': -sgd_accuracy, 'status': STATUS_OK}

trials = Trials()
best = fmin(cost_function,
    space = SEARCH_SPACE,
    algo = tpe.suggest,
    max_evals = MAX_EVALS,
    trials = trials)

print(best)

In [None]:
# Refit the SGD pipeline on the training split with the `max_iter` value that
# `fmin` reported as best. `best` is overwritten by every tuning cell, so this
# cell must run right after the SGD search.
# fmin can report the value as a NumPy/float number; sklearn needs a plain int.
MAX_ITER = int(best['max_iter'])
sgd_classifier = make_pipeline(StandardScaler(), SGDClassifier(max_iter=MAX_ITER)).fit(features_train, target_train)

In [None]:
# Predicting for one specific 'person'  #################################################################################################

# taking a random row from the test data to predict a result for:
def make_test_prediction(sgd_classifier, features=None, targets=None):
    """Predict the label of one randomly chosen row and pair it with its ground truth.

    Args:
        sgd_classifier: Fitted estimator or pipeline exposing ``predict``.
        features: Feature frame to sample from. Defaults to the notebook-level
            ``features_test``.
        targets: Labels aligned row-by-row with ``features``. Defaults to the
            notebook-level ``target_test``.

    Returns:
        Tuple ``(prediction, test_groundtruth)``: the output of ``predict`` for
        the sampled one-row frame, and the true label at the same position.

    Raises:
        ValueError: If ``features`` contains no rows.
    """
    if features is None:
        features = features_test
    if targets is None:
        targets = target_test

    row_count = len(features)
    if row_count == 0:
        raise ValueError("cannot sample a test row from an empty feature set")

    # Derive the index range from the data instead of a hard-coded bound, so
    # every row can be drawn and no index can fall outside the frame.
    rand_index = random.randrange(row_count)
    # Double brackets keep a one-row DataFrame, so column names are preserved
    # for estimators that were fitted on a DataFrame.
    test_row = features.iloc[[rand_index]]
    test_groundtruth = targets.iloc[rand_index]
    prediction = sgd_classifier.predict(test_row)

    return (prediction, test_groundtruth)

# Spot-check the SGD pipeline on one random test row and show whether the
# prediction matches the true label.
prediction, test_groundtruth = make_test_prediction(sgd_classifier)
is_match = prediction == test_groundtruth
print(is_match)

# **3. Stochastic Gradient Boosting**

## **3.1 Optimizing n_estimators**

In [48]:
# Null-value audit of `df`: compute all figures first, then print each caption
# followed by its value.
columns_with_nulls = df.columns[df.isna().any()]
columns_without_nulls = df.columns[df.notna().all()]
null_rows = df.isnull().any(axis=1).sum()
rows = df.shape[0]
remaining_rows = rows - null_rows

audit_report = (
    ("No. of columns containing null values", len(columns_with_nulls)),
    ("No. of columns not containing null values", len(columns_without_nulls)),
    ("Total no. of columns in the dataframe", len(df.columns)),
    ("No. of rows containing null values", null_rows),
    ("Total no. of rows in the dataframe", rows),
    ("No of rows if all nans dropped", remaining_rows),
)
for caption, value in audit_report:
    print(caption)
    print(value)

No. of columns containing null values
0
No. of columns not containing null values
175
Total no. of columns in the dataframe
175
No. of rows containing null values
0
Total no. of rows in the dataframe
377880
No of rows if all nans dropped
377880


In [49]:
## Optimizing n_estimators to reach the highest possible accuracy

# MAX_EVALS is the number of trials hyperopt runs. TPE *samples* candidates
# from the search space, so it does not enumerate every value. With
# MAX_EVALS = 1 only a single sampled value is evaluated, i.e. nothing is
# actually compared — raise it for a real search (each trial refits the model).
MAX_EVALS = 1
SEARCH_SPACE = [hp.uniformint('n_estimators', 100, 200)]

### Optimization #############################################################################################################
def cost_function(n_estimators):
    """hyperopt objective: negative accuracy of a scaled gradient-boosting pipeline.

    Args:
        n_estimators: The sample drawn from SEARCH_SPACE; because the space is
            a one-element list, the candidate value sits at index 0.

    Returns:
        A hyperopt result dict whose 'loss' is the negated accuracy, so that
        minimizing the loss maximizes accuracy.

    NOTE(review): candidates are scored on the held-out test split
    (features_test / target_test), so the best accuracy reported by the search
    is optimistically biased; consider tuning on a separate validation split.
    """
    # Cast to a plain int for sklearn. No zero guard is needed here: the
    # search space starts at 100, so 0 can never be sampled.
    n_estimators = int(n_estimators[0])

    sgb_classifier = make_pipeline(
        StandardScaler(),
        GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=0.5, random_state=0),
    ).fit(features_train, target_train)
    sgb_predictions = sgb_classifier.predict(features_test)
    sgb_accuracy = accuracy_score(target_test, sgb_predictions)
    return {'loss': -sgb_accuracy, 'status': STATUS_OK}

trials = Trials()
best = fmin(cost_function,
    space = SEARCH_SPACE,
    algo = tpe.suggest,
    max_evals = MAX_EVALS,
    trials = trials)

print(best)

  0%|          | 0/1 [00:00<?, ?trial/s, best loss=?]

best loss: -0.8988102735569271

{'n_estimators': 571.0}

## **3.2 Training & saving model with optimized n_estimators**

In [None]:
# `fmin` reports the best value as a float (e.g. 571.0), but
# GradientBoostingClassifier requires an integer number of estimators.
n_estimators = int(best['n_estimators'])

# Refit the gradient-boosting pipeline on the training split with the tuned value.
sgb_classifier = make_pipeline(
    StandardScaler(),
    GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=0.5, random_state=0),
).fit(features_train, target_train)

In [None]:
filename = 'model_imputed.pickle'

# Persist the fitted pipeline. The context manager guarantees the file is
# flushed and closed even if pickling fails.
with open(filename, "wb") as model_file:
    pickle.dump(sgb_classifier, model_file)

## **3.3 importing model to predict for one person**

In [None]:
# Reload the pipeline from the path stored in `filename`.
# NOTE: unpickling can execute arbitrary code — only load files you created
# yourself or otherwise trust.
with open(filename, "rb") as model_file:
    sgb_classifier = pickle.load(model_file)

In [None]:
# Verify performance / see false negatives vs. false positives (ca: 7157 - 1855)

predictions = sgb_classifier.predict(features_test)

def calc_metrics(predictions, target_test):
    """Count correct predictions, false negatives and false positives for binary labels.

    Args:
        predictions: Predicted labels (0/1), compared position by position.
        target_test: True labels (0/1) in the same order as ``predictions``;
            a pandas Series is read positionally, regardless of its index.

    Returns:
        Tuple ``(prediction_true, false_negative, false_positive, accuracy)``
        where the first three are plain ints and
        ``accuracy = 1 - (false_negative + false_positive) / len(predictions)``.

    Raises:
        ValueError: If the inputs differ in length or are empty.
    """
    # Convert both inputs to positional arrays. The row count now comes from
    # the data itself rather than a hard-coded constant, which ran past the
    # end of the test split.
    predicted = pd.Series(predictions).to_numpy()
    actual = pd.Series(target_test).to_numpy()

    if len(predicted) != len(actual):
        raise ValueError(
            f"length mismatch: {len(predicted)} predictions vs {len(actual)} targets"
        )
    if len(predicted) == 0:
        raise ValueError("cannot compute metrics for empty input")

    prediction_true = int((actual == predicted).sum())
    false_negative = int(((actual == 1) & (predicted == 0)).sum())
    false_positive = int(((actual == 0) & (predicted == 1)).sum())
    accuracy = 1 - (false_negative + false_positive) / len(predicted)

    return (prediction_true, false_negative, false_positive, accuracy)

# Evaluate the reloaded model on the test split and print the four figures on
# one line: correct predictions, false negatives, false positives, accuracy.
metrics = calc_metrics(predictions, target_test)
prediction_true, false_negative, false_positive, accuracy = metrics
print(*metrics)

In [None]:
# Predicting for one specific 'person' #################################################################################################

# taking a random row from the test data to predict a result for:
# NOTE: this definition replaces any earlier `make_test_prediction` in the kernel.
def make_test_prediction(sgb_classifier, features=None, targets=None):
    """Predict the label of one randomly chosen row and pair it with its ground truth.

    Args:
        sgb_classifier: Fitted estimator or pipeline exposing ``predict``.
        features: Feature frame to sample from. Defaults to the notebook-level
            ``features_test``.
        targets: Labels aligned row-by-row with ``features``. Defaults to the
            notebook-level ``target_test``.

    Returns:
        Tuple ``(prediction, test_groundtruth)``: the output of ``predict`` for
        the sampled one-row frame, and the true label at the same position.

    Raises:
        ValueError: If ``features`` contains no rows.
    """
    if features is None:
        features = features_test
    if targets is None:
        targets = target_test

    row_count = len(features)
    if row_count == 0:
        raise ValueError("cannot sample a test row from an empty feature set")

    # Derive the index range from the data instead of a hard-coded bound, so
    # every row can be drawn and no index can fall outside the frame.
    rand_index = random.randrange(row_count)
    # Double brackets keep a one-row DataFrame, so column names are preserved
    # for estimators that were fitted on a DataFrame.
    test_row = features.iloc[[rand_index]]
    test_groundtruth = targets.iloc[rand_index]
    prediction = sgb_classifier.predict(test_row)

    return (prediction, test_groundtruth)

# Spot-check the gradient-boosting model on one random test row: print the
# prediction first, then the true label.
prediction, test_groundtruth = make_test_prediction(sgb_classifier)
for shown_value in (prediction, test_groundtruth):
    print(shown_value)

**Conclusion:**

- simple SGB: best reachable accuracy: 87.33042784359463 %, with max_iter = 74
- Using a Pipeline/StandardScaler: best reachable accuracy: 100 %, with max_iter = 11

# **4. Visualizations**

In [None]:
# Visualization: which n_estimators value hyperopt tried in each trial.
# Uses the `trials` object of the most recently executed search cell.
import os

import seaborn as sns
from matplotlib import pyplot as plt

sns.set(style="whitegrid", palette="Accent")

# Each trial keeps its sampled value as a one-element list under
# misc -> vals -> <label>; take element 0 to get a flat list of numbers.
tids = [t['tid'] for t in trials.trials]
n_estimators_tried = [t['misc']['vals']['n_estimators'][0] for t in trials.trials]
# Previous (misleading) name of this list, kept so cells that still read it keep working.
max_iter = n_estimators_tried

fig, ax = plt.subplots()
# Label the series on the artist itself. Passing the bare string
# ('n_estimators') to legend() was iterated character by character and
# labelled the points "n".
ax.scatter(tids, n_estimators_tried, label='n_estimators')

ax.legend(loc='lower right')
ax.set_ylabel('n_estimators over all trials')
ax.set_xlabel('trialIDs')
fig.set_size_inches(10, 5)

# savefig does not create missing directories.
os.makedirs('./visualizations', exist_ok=True)
fig.savefig('./visualizations/tested_values.png')