<b>Dependencies:</b> <br>
    import pandas as pd <br><br>
    from sklearn.neighbors import KNeighborsClassifier <br>
    from sklearn.naive_bayes import GaussianNB <br>
    from sklearn.svm import SVC <br>
    from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier <br><br>
    from sklearn.model_selection import train_test_split <br>
    from sklearn.model_selection import KFold<br>
    from sklearn.model_selection import cross_validate<br><br>
    from sklearn.metrics import accuracy_score <br>
    from sklearn.metrics import precision_score <br>
    from sklearn.metrics import recall_score <br>
    from sklearn.metrics import f1_score <br>



In [7]:
# BASIC IMPLEMENTATION OF DIFFERENT EVALUATION MODELS AND METHODS 75-25

import pandas as pd

# Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

# Evaluation methods
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

# tp: True Positive
# fp: False Positive
# tn: True Negative
# fn: false negative

# Number of correct predictions / Total number of predictions
from sklearn.metrics import accuracy_score

# tp / (tp + fp) -> Important when the cost of False Positive is high
from sklearn.metrics import precision_score

# tp / (tp + fn) -> Important when the cost of False Negative is high
from sklearn.metrics import recall_score

# (2* precision * recall) / (precision + recall) -> When looking for a balance between Precision and Recall AND
#                                                   there is an uneven class distribution (large number of negatives)
from sklearn.metrics import f1_score


diabetes_cleaned = pd.read_csv('../datasets/diabetes_cleaned.csv')
features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
            'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
X = diabetes_cleaned[features]
y = diabetes_cleaned.Outcome
random_seed = 3


# Models
models = [KNeighborsClassifier(), GaussianNB(), SVC(random_state=random_seed),
          GradientBoostingClassifier(random_state=random_seed), RandomForestClassifier(random_state=random_seed)]
model_names = ['KNN', 'GNB', 'SVC', 'GB', 'RF']


# Train/Test Split: Training-Evaluation
# Since we are not going to adjust the parameters for this first test, it is not necessary to use a validation set
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = diabetes_cleaned.Outcome, random_state=random_seed)
accuracy_train_test = []
precision_train_test = []
recall_train_test = []
f1_train_test = []

for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy_train_test.append(accuracy_score(y_test, y_pred))
    precision_train_test.append(precision_score(y_test, y_pred))
    recall_train_test.append(recall_score(y_test, y_pred))
    f1_train_test.append(f1_score(y_test, y_pred))

print("Train/Test Split")
results_train_test = pd.DataFrame({'Model': model_names,
                                   'Accuracy': accuracy_train_test,
                                   'Precision ': precision_train_test,
                                   'Recall': recall_train_test,
                                   'f1': f1_train_test,})
print(results_train_test, "\n\n")


# Cross-Validation
accuracy_cross_val = []
precision_cross_val = []
recall_cross_val = []
f1_cross_val = []

kfold = KFold(n_splits=10)
scoring = {'accuracy': 'accuracy',
           'precision': 'precision',
           'recall': 'recall',
           'f1': 'f1',}

for model in models:
    scores = cross_validate(model, X, y, cv=kfold, scoring=scoring)
    accuracy_cross_val.append(scores['test_accuracy'].mean())
    precision_cross_val.append(scores['test_precision'].mean())
    recall_cross_val.append(scores['test_recall'].mean())
    f1_cross_val.append(scores['test_f1'].mean())

print("Cross-Validation")
results_cross_val = pd.DataFrame({'Model': model_names,
                                  'Accuracy': accuracy_cross_val,
                                  'Precision': precision_cross_val,
                                  'Recall': recall_cross_val,
                                  'f1': f1_cross_val})
print(results_cross_val)


Train/Test Split
  Model  Accuracy  Precision     Recall        f1
0   KNN  0.701657    0.571429  0.516129  0.542373
1   GNB  0.762431    0.646154  0.677419  0.661417
2   SVC  0.718232    0.622222  0.451613  0.523364
3    GB  0.790055    0.700000  0.677419  0.688525
4    RF  0.773481    0.684211  0.629032  0.655462 


Cross-Validation
  Model  Accuracy  Precision    Recall        f1
0   KNN  0.719787   0.597252  0.513385  0.545803
1   GNB  0.757021   0.658534  0.621551  0.632747
2   SVC  0.758581   0.723262  0.461847  0.560023
3    GB  0.775057   0.695959  0.606272  0.643752
4    RF  0.773744   0.707577  0.591173  0.637477


In [1]:
# AVERAGE RESULTS OF BASIC IMPLEMENTATIONS 75-25
# In order to determine the best models their implementations will be repeated with random but equal seeds for each
# option and their results will be averaged.

import pandas as pd
import random

# Models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

# Evaluation methods
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate

# tp: True Positive
# fp: False Positive
# tn: True Negative
# fn: false negative

# Number of correct predictions / Total number of predictions
from sklearn.metrics import accuracy_score

# tp / (tp + fp) -> Important when the cost of False Positive is high
from sklearn.metrics import precision_score

# tp / (tp + fn) -> Important when the cost of False Negative is high
from sklearn.metrics import recall_score

# (2* precision * recall) / (precision + recall) -> When looking for a balance between Precision and Recall AND
#                                                   there is an uneven class distribution (large number of negatives)
from sklearn.metrics import f1_score


def lists_average(results_dict):
    """ Provides a list with the average values of the lists in the given dictionary

        :param results_dict: Dictionary with lists containing the values obtained after executing different ML models

        :return: list with the average values
    """

    result = []
    for model_results in results_dict:
        result.append(sum(results_dict[model_results])/len(results_dict[model_results]))

    return result


diabetes_cleaned = pd.read_csv('../datasets/diabetes_cleaned.csv')
features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age']
model_names = ['KNN', 'GNB', 'SVC', 'GB', 'RF']
X = diabetes_cleaned[features]
y = diabetes_cleaned.Outcome

accuracy_train_test = {'KNN': [], 'GNB': [], 'SVC': [], 'GB': [], 'RF': []}
precision_train_test = {'KNN': [], 'GNB': [], 'SVC': [], 'GB': [], 'RF': []}
recall_train_test = {'KNN': [], 'GNB': [], 'SVC': [], 'GB': [], 'RF': []}
f1_train_test = {'KNN': [], 'GNB': [], 'SVC': [], 'GB': [], 'RF': []}

accuracy_cross_val = {'KNN': [], 'GNB': [], 'SVC': [], 'GB': [], 'RF': []}
precision_cross_val = {'KNN': [], 'GNB': [], 'SVC': [], 'GB': [], 'RF': []}
recall_cross_val = {'KNN': [], 'GNB': [], 'SVC': [], 'GB': [], 'RF': []}
f1_cross_val = {'KNN': [], 'GNB': [], 'SVC': [], 'GB': [], 'RF': []}

kfold = KFold(n_splits=10)
scoring = {'accuracy': 'accuracy',
           'precision': 'precision',
           'recall': 'recall',
           'f1': 'f1',}


times_repeated = 100
for _ in range(times_repeated):
    random_seed = random.randint(0, 1000)

    # Models
    models = [KNeighborsClassifier(), GaussianNB(), SVC(random_state=random_seed),
              GradientBoostingClassifier(random_state=random_seed), RandomForestClassifier(random_state=random_seed)]

    # Train/Test Split: Training-Evaluation
    # Since we are not going to adjust the parameters for this first test, it is not necessary to use a validation set
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = diabetes_cleaned.Outcome, random_state=random_seed)

    for model, model_name in zip(models, model_names):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy_train_test[model_name].append(accuracy_score(y_test, y_pred))
        precision_train_test[model_name].append(precision_score(y_test, y_pred))
        recall_train_test[model_name].append(recall_score(y_test, y_pred))
        f1_train_test[model_name].append(f1_score(y_test, y_pred))


    # Cross-Validation
    for model, model_name in zip(models, model_names):
        scores = cross_validate(model, X, y, cv=kfold, scoring=scoring)
        accuracy_cross_val[model_name].append(scores['test_accuracy'].mean())
        precision_cross_val[model_name].append(scores['test_precision'].mean())
        recall_cross_val[model_name].append(scores['test_recall'].mean())
        f1_cross_val[model_name].append(scores['test_f1'].mean())


# Average results
dicts_to_average = [accuracy_train_test, precision_train_test, recall_train_test, f1_train_test,
                   accuracy_cross_val, precision_cross_val, recall_cross_val, f1_cross_val]
results = []

for dict_to_average in dicts_to_average:
    results.append(lists_average(dict_to_average))
results.reverse()


print("Train/Test Split")
results_train_test = pd.DataFrame({'Model': model_names,
                                   'Accuracy': results.pop(),
                                   'Precision ': results.pop(),
                                   'Recall': results.pop(),
                                   'f1': results.pop()})
print(results_train_test, "\n\n")

print("Cross-Validation")
results_cross_val = pd.DataFrame({'Model': model_names,
                                  'Accuracy': results.pop(),
                                  'Precision': results.pop(),
                                  'Recall': results.pop(),
                                  'f1': results.pop()})
print(results_cross_val)


Train/Test Split
  Model  Accuracy  Precision     Recall        f1
0   KNN  0.716685    0.600252  0.524677  0.558368
1   GNB  0.757735    0.660076  0.609839  0.632323
2   SVC  0.757459    0.731904  0.466290  0.567534
3    GB  0.757514    0.669016  0.586129  0.622743
4    RF  0.765801    0.692429  0.576774  0.626631 


Cross-Validation
  Model  Accuracy  Precision    Recall        f1
0   KNN  0.719787   0.597252  0.513385  0.545803
1   GNB  0.757021   0.658534  0.621551  0.632747
2   SVC  0.758581   0.723262  0.461847  0.560023
3    GB  0.775835   0.694017  0.613426  0.647162
4    RF  0.772065   0.700827  0.589435  0.635943
