In [None]:
import warnings

import pandas as pd

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.exceptions import ConvergenceWarning

from collections import Counter

# Silence scikit-learn's ConvergenceWarning: the searches further down
# deliberately try very small max_iter values, which would flood the output.
warnings.simplefilter('ignore', category=ConvergenceWarning)

Notes:
- Basically, KNN and decision trees are too old for him, i.e. their accuracy is not high enough.
- Ideally, mention in the final presentation when each classifier was developed.
 
 Algorithms to try out: 

- SVM
- SGD
- SGB


**TODO:** update the performance conclusions automatically.

# **0. Data Prep**

In [None]:
# Read ./data/clean_data.csv into a DataFrame.
df = pd.read_csv(filepath_or_buffer='./data/clean_data.csv')

In [None]:
# Drop the first column of the frame.
# NOTE(review): presumably a leftover index column from an earlier export — confirm.
# Caution: re-running this cell drops one more column each time.
df = df.drop(columns=df.columns[0])

In [None]:
# Search for columns containing 'DIABETE' or 'diabete' to find the target variable.
# All matches are collected into one flat list of column names, in column order.
diabate_cols = [
    col for col in df.columns
    if any(token in col for token in ('DIABETE', 'diabete'))
]

In [None]:
# Remove every column that contains at least one NaN value.
df = df.dropna(axis='columns')

# Optional sanity checks (uncomment to inspect the result):

# print("No. of columns containing null values")
# print(len(df.columns[df.isna().any()]))

# print("No. of columns not containing null values")
# print(len(df.columns[df.notna().all()]))

# print("Total no. of columns in the dataframe")
# print(len(df.columns))

In [None]:
# Split the frame into the feature matrix (everything except 'diabetes')
# and the target vector (the 'diabetes' column).
features = df.drop(columns=['diabetes'])
target = df['diabetes']

In [None]:
# Hold out 20 % of the rows as test data; the fixed seed keeps the split reproducible.

features_train, features_test, target_train, target_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# print(features_train.shape)
# print(features_test.shape)
# print(target_train.shape)
# print(target_test.shape)

# **4. Support Vector Machine**

In [None]:
# Optimizing max_iter to reach the highest possible Accuracy

# Number of trials hyperopt runs. The search space below has 100 candidate values
# (0..99), so 20 trials sample only part of it; raise MAX_EVALS to explore more.
MAX_EVALS = 20

# One 'run' equals one fmin execution, and each run performs MAX_EVALS trials.
# hp.randint('max_iter', 100) draws an integer in [0, 100).
SEARCH_SPACE = [hp.randint('max_iter',100)]

### Optimization #############################################################################################################

def cost_function(max_iter):
    """Fit a scaled SVC with the sampled iteration limit and score it.

    Parameters
    ----------
    max_iter : sequence
        Sample drawn from SEARCH_SPACE; its first element is the iteration
        limit handed to svm.SVC.

    Returns
    -------
    dict
        {'loss': <negated accuracy>, 'status': STATUS_OK}. hyperopt minimises
        the loss, so the accuracy is negated. For max_iter == 0 no model is
        trained and the worst possible loss (0) is reported.

    NOTE(review): the accuracy is measured on features_test/target_test, so
    max_iter is tuned on the test split and the best value found is an
    optimistic estimate.
    """
    max_iter = int(max_iter[0])
    print(max_iter)
    if max_iter == 0:
        # Same loss value as before, but in the dict form used by the normal path.
        return {'loss': 0, 'status': STATUS_OK}
    svm_classifier = make_pipeline(StandardScaler(), svm.SVC(max_iter=max_iter)).fit(features_train, target_train)
    # svm_classifier = svm.SVC(kernel = "linear", max_iter = max_iter)
    svm_predictions = svm_classifier.predict(features_test)
    svm_accuracy = accuracy_score(target_test, svm_predictions)

    print(f"Accuracy : {100 * svm_accuracy}")
    return {'loss': - svm_accuracy , 'status': STATUS_OK }

# NOTE(review): fmin's own sampling is not seeded here; pass `rstate` if the
# search itself must be reproducible (expected type depends on the hyperopt version).
trials = Trials()
best = fmin(cost_function,
    space = SEARCH_SPACE,
    algo = tpe.suggest,
    max_evals = MAX_EVALS, 
    trials = trials)

print(best)

**Conclusion:**


- simple SVM: Best reachable Accuracy: 86.80559 %, with max_iter = 36
- Using a Pipeline/StandardScaler: Best reachable Accuracy: 99.668 %, with max_iter = 3

# **5. Stochastic Gradient Descent**

In [None]:
# Optimizing max_iter to reach the highest possible Accuracy

# Number of trials hyperopt runs. The search space below has 100 candidate values
# (0..99), so 40 trials sample only part of it; raise MAX_EVALS to explore more.
MAX_EVALS = 40

# One 'run' equals one fmin execution, and each run performs MAX_EVALS trials.
# hp.randint('max_iter', 100) draws an integer in [0, 100).
SEARCH_SPACE = [hp.randint('max_iter',100)]

### Optimization #############################################################################################################

def cost_function(max_iter):
    """Fit an SGDClassifier with the sampled iteration limit and score it.

    Parameters
    ----------
    max_iter : sequence
        Sample drawn from SEARCH_SPACE; its first element is the iteration
        limit handed to SGDClassifier.

    Returns
    -------
    dict
        {'loss': <negated accuracy>, 'status': STATUS_OK}. hyperopt minimises
        the loss, so the accuracy is negated. For max_iter == 0 no model is
        trained and the worst possible loss (0) is reported.

    NOTE(review): the accuracy is measured on features_test/target_test, so
    max_iter is tuned on the test split and the best value found is an
    optimistic estimate.
    """
    max_iter = int(max_iter[0])
    if max_iter == 0:
        # Same loss value as before, but in the dict form used by the normal path.
        return {'loss': 0, 'status': STATUS_OK}
    print(max_iter)
    # sgd_classifier = make_pipeline(StandardScaler(), SGDClassifier(max_iter=max_iter, random_state=42)).fit(features_train, target_train)
    # A fixed random_state makes the data shuffling inside SGD repeatable, so the
    # same max_iter always yields the same accuracy.
    sgd_classifier = SGDClassifier(max_iter = max_iter, random_state = 42).fit(features_train, target_train)
    sgd_predictions = sgd_classifier.predict(features_test)
    sgd_accuracy = accuracy_score(target_test, sgd_predictions)

    print(f"Accuracy : {100 * sgd_accuracy}")
    return {'loss': - sgd_accuracy , 'status': STATUS_OK }

# NOTE(review): fmin's own sampling is not seeded here; pass `rstate` if the
# search itself must be reproducible (expected type depends on the hyperopt version).
trials = Trials()
best = fmin(cost_function,
    space = SEARCH_SPACE,
    algo = tpe.suggest,
    max_evals = MAX_EVALS, 
    trials = trials)

print(best)

**Conclusion:**

- simple SGD: Best reachable Accuracy:  87.33042784359463%, with max_iter = 74
- Using a Pipeline/StandardScaler: Best reachable Accuracy:  100%, with max_iter = 11 

# **6. Stochastic Gradient Boosting**

In [None]:
# Optimizing max_depth to reach the highest possible Accuracy

# Number of trials hyperopt runs. The search space below has 100 candidate values
# for max_depth (0..99), so 10 trials sample only a small part of it.
MAX_EVALS = 10

# One 'run' equals one fmin execution, and each run performs MAX_EVALS trials.
# hp.randint('max_depth', 100) draws an integer in [0, 100).
SEARCH_SPACE = [hp.randint('max_depth',100)]

### Optimization #############################################################################################################

def cost_function(max_depth):
    """Fit a GradientBoostingClassifier with the sampled tree depth and score it.

    Parameters
    ----------
    max_depth : sequence
        Sample drawn from SEARCH_SPACE; its first element is the maximum tree
        depth handed to GradientBoostingClassifier.

    Returns
    -------
    dict
        {'loss': <negated accuracy>, 'status': STATUS_OK}. hyperopt minimises
        the loss, so the accuracy is negated. For max_depth == 0 no model is
        trained and the worst possible loss (0) is reported.

    NOTE(review): the accuracy is measured on features_test/target_test, so
    max_depth is tuned on the test split and the best value found is an
    optimistic estimate.
    NOTE(review): `subsample` is left at its default here; scikit-learn only
    performs *stochastic* gradient boosting when subsample < 1.0 — confirm
    whether that is intended for this section.
    """
    max_depth = int(max_depth[0])
    if max_depth == 0:
        # Same loss value as before, but in the dict form used by the normal path.
        return {'loss': 0, 'status': STATUS_OK}
    print(max_depth)
    # sgb_classifier = make_pipeline(StandardScaler(), GradientBoostingClassifier(n_estimators=10, learning_rate=0.5, max_depth=max_depth, random_state=0)).fit(features_train, target_train)
    # random_state=0 (as intended by the pipeline variant above) keeps the fit repeatable.
    sgb_classifier = GradientBoostingClassifier(n_estimators=10, learning_rate=0.5, max_depth=max_depth, random_state=0).fit(features_train, target_train)
    sgb_predictions = sgb_classifier.predict(features_test)
    sgb_accuracy = accuracy_score(target_test, sgb_predictions)
    # Element-wise comparison: True where the prediction matches the label.
    # The Counter shows how many test rows were classified correctly / wrongly.
    comp = target_test == sgb_predictions
    print(Counter(comp))
    print(f"Accuracy : {100 * sgb_accuracy}")
    return {'loss': - sgb_accuracy , 'status': STATUS_OK }

# NOTE(review): fmin's own sampling is not seeded here; pass `rstate` if the
# search itself must be reproducible (expected type depends on the hyperopt version).
trials = Trials()
best = fmin(cost_function,
    space = SEARCH_SPACE,
    algo = tpe.suggest,
    max_evals = MAX_EVALS, 
    trials = trials)

print(best)

**Conclusion:**

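- simple SGB: Best reachable Accuracy:  87.33042784359463%, with max_iter = 74
- Using a Pipeline/StandardScaler: Best reachable Accuracy:  100%, with max_iter = 11 

**TODO:** these figures are identical to the SGD conclusion and refer to `max_iter`, whereas this section tunes `max_depth`; re-run the cell above and update them.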