This notebook evaluated classification data None, Pre-Diabetes, and Diabetes
Due to kernel crashes, 97 models were generated and saved.
Models were tested before and after pyCaret tune_model

Samplers: 
    TomekLinks
    SMOTE
    RandomOverSampler
    NearMiss
    AllKNN
    ADASYN

Weights
    Narrowed down to 3%

Models:
    Light Gradient Boosting Machine
    Gradient Boosting Classifier
    Ada Boost Classifier
    Logistic Regression
    Ridge Classifier
    Linear Discriminant Analysis
    Quadratic Discriminant Analysis
    Dummy Classifier
    SVM - Linear Kernel
    Random Forest Classifier
    Extra Trees Classifier
    K Neighbors Classifier
    Decision Tree Classifier
    Naive Bayes


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from pycaret.classification import *
from imblearn.over_sampling import *
from imblearn.under_sampling import *
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
import os
import pycaret


In [2]:
TARGET = 'diabetes'
ANSWER = 42

In [3]:
data3 = pd.read_csv(r"..\3label2.csv")
data2 = pd.read_csv(r"..\binary.csv")

data = pd.read_csv("..\diabetes_cleaned.csv")
data.drop(['Unnamed: 0','heart_attack','angina_or_chd','chd_mi'], axis=1, inplace=True)

In [186]:
#display distribution of label records
print(data.diabetes.value_counts())

#convert the solution to binary
data.loc[data.diabetes.isin([2]), "diabetes"] = 1

#display distribution of label records
print(data.diabetes.value_counts())

0.0    254681
2.0     41479
1.0      6793
Name: diabetes, dtype: int64
0.0    254681
1.0     48272
Name: diabetes, dtype: int64


In [183]:
pd.set_option('display.max_rows', None)
print("Binary\nNone, Pre+Diabetes")
high = data2.sort_values("Total", ascending=False)
high.sort_values("Total", ascending=False)

Binary
None, Pre+Diabetes


Unnamed: 0,None,Diab,Classifier,Sampler,Mode,Total,Diab2
19,69.835287,78.371659,GradientBoostingClassifier,RandomOverSampler,auto,148.206946,78.371659
18,69.334668,78.868863,LGBMClassifier,RandomOverSampler,auto,148.203531,78.868863
20,71.266466,76.299979,AdaBoostClassifier,RandomOverSampler,auto,147.566446,76.299979
23,71.091741,75.750984,LogisticRegression,RandomOverSampler,auto,146.842725,75.750984
21,70.257377,76.527864,RidgeClassifier,RandomOverSampler,auto,146.785241,76.527864
22,70.257377,76.527864,LinearDiscriminantAnalysis,RandomOverSampler,auto,146.785241,76.527864
37,71.036771,75.699192,LogisticRegression,SMOTE,all,146.735963,75.699192
36,69.717494,76.993992,LinearDiscriminantAnalysis,SMOTE,all,146.711486,76.993992
35,69.717494,76.993992,RidgeClassifier,SMOTE,all,146.711486,76.993992
8,67.9349,78.568469,LinearDiscriminantAnalysis,ADASYN,auto,146.503369,78.568469


In [98]:
print("Classification\nNone, Pre, Diabetes")
high_3 =data3[((data3.Total > 140) | (data3.Sampler=='Base'))].sort_values("Total", ascending=False)
high_3.sort_values("Total", ascending=False)

Classification
None, Pre, Diabetes


Unnamed: 0,None,Pre,Diab,Classifier,Sampler,Mode,Total,Diab2
47,62.284043,32.450331,63.090646,GradientBoostingClassifier,RandomOverSampler,auto,157.82502,95.540977
46,62.325271,28.550405,65.923337,LGBMClassifier,RandomOverSampler,auto,156.799012,94.473741
48,64.202136,31.714496,60.342334,AdaBoostClassifier,RandomOverSampler,auto,156.258966,92.05683
63,63.833045,33.995585,57.581967,LogisticRegression,SMOTE,all,155.410598,91.577552
49,64.469138,31.420162,59.365959,LogisticRegression,RandomOverSampler,auto,155.255259,90.786121
50,62.884797,35.246505,56.231919,LinearDiscriminantAnalysis,RandomOverSampler,auto,154.36322,91.478424
64,62.142689,37.380427,54.689007,LinearDiscriminantAnalysis,SMOTE,all,154.212122,92.069434
8,62.117167,36.129507,55.810029,LinearDiscriminantAnalysis,ADASYN,auto,154.056703,91.939536
7,63.64261,32.00883,58.208775,LogisticRegression,ADASYN,auto,153.860216,90.217605
9,66.862337,27.740986,58.570395,RidgeClassifier,ADASYN,auto,153.173718,86.311381


In [6]:
train_df, test_df = train_test_split(data, test_size=0.2, random_state=ANSWER, stratify=data['diabetes'])
test_labels = test_df[TARGET]
test_features = test_df.drop(columns=TARGET)

In [8]:
#Categorical Values
melt = data.describe()[7:8].melt()
categorical_features = melt[((melt.value > 1) & (melt.value < 24))][['variable']].drop(0).to_numpy().reshape(11)
categorical_features

array(['bmi', 'smoker', 'any_healthcare_insurance',
       'general_health_status', 'mental_health_status',
       'physical_health_status', 'age', 'education', 'income', 'race',
       'routine_checkup'], dtype=object)

In [9]:
diabetes = setup(data = train_df, target = TARGET, categorical_features = ['bmi', 'smoker', 'any_healthcare_insurance',
       'general_health_status', 'mental_health_status',
       'physical_health_status', 'age', 'education', 'income', 'race',
       'routine_checkup'], session_id=ANSWER)

Unnamed: 0,Description,Value
0,Session id,42
1,Target,diabetes
2,Target type,Multiclass
3,Original data shape,"(242362, 22)"
4,Transformed data shape,"(242362, 63)"
5,Transformed train set shape,"(169653, 63)"
6,Transformed test set shape,"(72709, 63)"
7,Numeric features,10
8,Categorical features,11
9,Preprocess,True


In [12]:
import matplotlib.pyplot as plt

def print_test_heat_map3(model, sampler, strategy):
    predictions = predict_model(model, data=test_features)
    cm = confusion_matrix(test_labels, predictions["prediction_label"])
    cm_total = cm.sum(axis=1)
    result = [[0.0,0.0,0.0],[0.0,0.0,0.0],[0.0,0.0,0.0]]    
    for row in range(0,3):
        for column in range(0,3):
            result[row][column] = cm[row][column]/(cm_total[row])
            
    total_percent = result[0][0] + result[1][1] + result[2][2]
    diabetes_percent = result[1][1] + result[2][2]
    
    print(f"{result[0][0]*100},{result[1][1]*100},{result[2][2]*100},{type(model).__name__},{sampler},{strategy},{total_percent*100},{diabetes_percent*100}")

    
def print_test_heat_map2(model, sampler, strategy):
    predictions = predict_model(model, data=test_features)
    cm = confusion_matrix(test_labels, predictions["prediction_label"])
    cm_total = cm.sum(axis=1)
    result = [[0.0,0.0,0.0],[0.0,0.0,0.0]]    
    for row in range(0,2):
        for column in range(0,2):
            result[row][column] = cm[row][column]/(cm_total[row])
            
    total_percent = result[0][0] + result[1][1]
    diabetes_percent = result[1][1]
    
    print(f"{result[0][0]*100},{result[1][1]*100},{type(model).__name__},{sampler},{strategy},{total_percent*100},{diabetes_percent*100}")
 

In [13]:
def savemodel(model, sampler, strategy):
    model_name = f'model_{type(model).__name__}_{sampler}_{strategy}'
    save_model(model, model_name)

In [14]:
samplers_working = pd.read_csv(r"..\3label.csv")
samplers_working = samplers_working.drop_duplicates(subset=['Sampler','type'])[['Sampler','type']].sort_values(by='Sampler')
samplers_working.drop([28], axis =0, inplace=True)  #duplicate
samplers_working.drop([98], axis =0, inplace=True)  #duplicate
samplers_working.drop([70], axis =0, inplace=True)  #duplicate
samplers_working

Unnamed: 0,Sampler,type
42,NearMiss,all
56,RandomOverSampler,auto
14,SMOTE,all
0,TomekLinks,all


In [15]:
def Sampler(df, target, sampler_name, strategy):
    sampler = globals()[sampler_name](sampling_strategy=strategy)

    under_train_labels = df[target]
    under_train_features = df.drop(columns=target)
    
    X, y = sampler.fit_resample(under_train_features, under_train_labels)
    
    sampled_data = X
    sampled_data[target] = y
    
    print(sampled_data[target].value_counts())
    return sampled_data    

In [99]:
samplers_working


Unnamed: 0,Sampler,type
42,NearMiss,all
56,RandomOverSampler,auto
14,SMOTE,all
0,TomekLinks,all


In [16]:
#Print working types.
for index, row in samplers_working.iterrows():
    sampled_data = Sampler(train_df, TARGET, row.Sampler, row.type)
    sampled_data.diabetes.value_counts()
    s = setup(sampled_data, target = TARGET, session_id = 123,verbose=False)
    best = compare_models(n_select=14,verbose=False)
    for model in best:   
        print(row.Sampler, row.type) 
        try:
            savemodel(model, row.Sampler, row.type) 
        except:
            print("An exception occurred")
          
        #print(f'\nSampler: {sampler}, Strategy: {strategy}')
        #print_test_heat_map(model, row.Sampler, row.type)

0.0    5434
1.0    5434
2.0    5434
Name: diabetes, dtype: int64
NearMiss all
Transformation Pipeline and Model Successfully Saved
NearMiss all
Transformation Pipeline and Model Successfully Saved
NearMiss all
Transformation Pipeline and Model Successfully Saved
NearMiss all
Transformation Pipeline and Model Successfully Saved
NearMiss all
Transformation Pipeline and Model Successfully Saved
NearMiss all
Transformation Pipeline and Model Successfully Saved
NearMiss all
Transformation Pipeline and Model Successfully Saved
NearMiss all
Transformation Pipeline and Model Successfully Saved
NearMiss all
Transformation Pipeline and Model Successfully Saved
NearMiss all
Transformation Pipeline and Model Successfully Saved
NearMiss all
Transformation Pipeline and Model Successfully Saved
NearMiss all
Transformation Pipeline and Model Successfully Saved
NearMiss all
Transformation Pipeline and Model Successfully Saved
NearMiss all
Transformation Pipeline and Model Successfully Saved
0.0    2037

In [17]:
import os

In [None]:
# for filename in os.listdir(r'.'):
#     if ".pkl" in filename:
#         # checking if it is a file
#         if os.path.isfile(filename):
#             print(filename)
#             load_model(filename)
            

In [None]:
#  gbc_r_a = load_model("model_GradientBoostingClassifier_RandomOverSampler_auto")
#  gbc_s_a = load_model('model_GradientBoostingClassifier_SMOTE_all')
#  blender = blend_models([gbc_r_a,gbc_s_a])
#  print_test_heat_map3(blender,"a","b")

#  #98.40780587403802,0.0,12.704918032786885,VotingClassifier,a,b,111.1127239068249,12.704918032786885


In [None]:
# models = []
# results = []

# for i in np.arange(0.1,1,0.1):
#     model = create_model('lightgbm', learning_rate = i)
#     model_results = pull().loc[['Mean']]
#     models.append(model)
#     results.append(model_results)
    
# results = pd.concat(results, axis=0)
# results.index = np.arange(0.1,1,0.1)
# results.plot()

In [40]:
model = load_model('model_LogisticRegression_SMOTE_all')
print_test_heat_map3(model,"SMOTE","all")

result = tune_model(model)
result
print_test_heat_map3(result,"SMOTE","all")

Transformation Pipeline and Model Successfully Loaded


63.8330453902937,33.99558498896248,57.58196721311475,Pipeline,SMOTE,all,155.41059759237092,91.57755220207721


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.857,0.8055,0.857,0.8035,0.8149,0.1318,0.1765
1,0.86,0.8113,0.86,0.8104,0.8175,0.1421,0.195
2,0.8565,0.7979,0.8565,0.8009,0.8123,0.1165,0.162
3,0.8601,0.8007,0.8601,0.8105,0.8183,0.1474,0.1995
4,0.8596,0.8137,0.8596,0.8091,0.8177,0.1452,0.1958
5,0.8576,0.8043,0.8576,0.8029,0.814,0.1267,0.1744
6,0.859,0.8086,0.859,0.8058,0.8147,0.1285,0.1806
7,0.8594,0.8063,0.8594,0.8084,0.8169,0.1401,0.1915
8,0.861,0.807,0.861,0.8131,0.8178,0.1418,0.1989
9,0.8591,0.8039,0.8591,0.8083,0.8181,0.1494,0.1976


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


98.14080414637976,0.0,12.270973963355834,LogisticRegression,SMOTE,all,110.4117781097356,12.270973963355834


In [41]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier


In [43]:
test_labels = test_df[TARGET]
test_features = test_df.drop(columns=TARGET)

In [54]:
#Valid parameters are: ['ccp_alpha', 'criterion', 'init', 'learning_rate', 'loss', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_iter_no_change', 'random_state', 'subsample', 'tol', 'validation_fraction', 'verbose', 'warm_start']
params = [{
    'learning_rate': [0.16,0.15,0.14,0.13, 0.12],
}]

gs_knn = GridSearchCV(GradientBoostingClassifier(),
                      param_grid=params,
                      scoring='accuracy',
                      cv=5)

gs_knn.fit(test_features, test_labels)
gs_knn.best_params_

{'learning_rate': 0.13}

In [59]:
from joblib import parallel_backend
#Valid parameters are: ['ccp_alpha', 'criterion', 'init', 'learning_rate', 'loss', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'n_estimators', 'n_iter_no_change', 'random_state', 'subsample', 'tol', 'validation_fraction', 'verbose', 'warm_start']
params = [{
    'learning_rate': [0.22,0.21,0.20,0.19],
    'n_estimators': [1,2,3],
    'min_samples_split':[2],
    'min_samples_leaf':[1],
    'max_depth':[4,5,6,7,8,9],
    'max_features':[3,4,5,6,7]
}]

gs_knn = GridSearchCV(GradientBoostingClassifier(),
                      param_grid=params,
                      scoring='accuracy',
                      cv=5)

with parallel_backend('threading'):
    gs_knn.fit(test_features, test_labels)
    
gs_knn.best_params_

gs_knn.param_grid
clf = GradientBoostingClassifier(params={'learning_rate': 0.19,
 'max_depth': 6,
 'max_features': 5,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 2})
print_test_heat_map3(clf,"Test","Test")

{'learning_rate': 0.19,
 'max_depth': 6,
 'max_features': 5,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 2}

In [66]:
clf = GradientBoostingClassifier(**gs_knn.best_params_)
clf.fit(test_features, test_labels)
print_test_heat_map3(clf,"Test","Test")

100.0,0.14716703458425312,0.0,GradientBoostingClassifier,Test,Test,100.14716703458426,0.14716703458425312


0.14 recall not good.

In [None]:
#Compare 97 models, pre to post
#Due to Kernel crashes the data was saved in a file to be evaluated later.
#The results were printed in the notebook and saved in a file.

import time
for filename in os.listdir(r'.'):
    if ".pkl" in filename:
        # checking if it is a file
        if os.path.isfile(filename):
            print(f"Model:{filename}")
            start_time = time.time()
            filename = filename[:-4] #remove .pkl
            model=load_model(filename)
            print_test_heat_map3(model,"filename","Pre")
            result = tune_model(model)            
            print_test_heat_map3(result,"filename","Post") 
            print( time.time() - start_time, "seconds"          )

In [130]:
weights = [
    {0:0.10, 1:0.45, 2:0.45},
    {0:0.05, 1:0.55, 2:0.45},
]

for weight in weights:
    lightgbm = create_model('lightgbm', class_weight = weight, verbose=False)
    print_test_heat_map3(lightgbm,"lightgbm",f'{weight[0]},{weight[1]},{weight[2]}' )

79.2268729385896,0.07358351729212656,68.70781099324977,LGBMClassifier,lightgbm,0.1,0.45,0.45,148.0082674491315,68.78139451054189


64.27084969373331,0.07358351729212656,85.39054966248794,LGBMClassifier,lightgbm,0.05,0.55,0.45,149.7349828735134,85.46413317978008


In [177]:
classifiers = ['lr','ridge','lightgbm']
#classifiers=['lightgbm']

In [None]:
#identify best weights by calculating recall on each 
#Started with 10 per step, then 5 then 3.
#Resuls were saved to a file.
for clf in classifiers:
    for i in range(15,26,3):
        for j in range(20,51,5):  
            k = 100-i-j
            lightgbm = create_model(clf, class_weight = {0:i,1:j,2:k}, verbose=False)
            print_test_heat_map3(lightgbm,clf,f'{i},{j},{k},{sampler_name},{sampler_type}' )
        
            