In [7]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder

In [8]:
# Load the raw dataset from the working directory and preview its first rows.
df = pd.read_csv(filepath_or_buffer='diabetes_prediction_dataset.csv')
df.head(n=5)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [9]:
# List the distinct values of every object-dtype (text) column,
# padding the column name with dashes to a width of 50 characters.
for columns in df.select_dtypes(include='object').columns:
    print(f'{columns:-<50}{df[columns].unique()}')

gender--------------------------------------------['Female' 'Male' 'Other']
smoking_history-----------------------------------['never' 'No Info' 'current' 'former' 'ever' 'not current']


In [10]:
# List the distinct values of every int64 column,
# padding the column name with dashes to a width of 50 characters.
for columns in df.select_dtypes(include='int64').columns:
    print(f'{columns:-<50}{df[columns].unique()}')

hypertension--------------------------------------[0 1]
heart_disease-------------------------------------[1 0]
blood_glucose_level-------------------------------[140  80 158 155  85 200 145 100 130 160 126 159  90 260 220 300 280 240]
diabetes------------------------------------------[0 1]


# Train/Test Split and Encoding

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
trainset, testset = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

In [13]:
trainset['diabetes'].value_counts()

diabetes
0    73203
1     6797
Name: count, dtype: int64

In [14]:
testset['diabetes'].value_counts()

diabetes
0    18297
1     1703
Name: count, dtype: int64

In [15]:
"""
def encode_gender(df):
    code={
        'female':0,
        'male':1,
        'other':2
    }
    df[columns]=df[columns].map(code)
    for columns in df.select_dtypes('object').columns:
        df[columns]=df[columns].map(code)
"""

"\ndef encode_gender(df):\n    code={\n        'female':0,\n        'male':1,\n        'other':2\n    }\n    df[columns]=df[columns].map(code)\n    for columns in df.select_dtypes('object').columns:\n        df[columns]=df[columns].map(code)\n"

In [16]:
# Merge list for encoder
def list_merge(one,two):
    """Pair two equal-length sequences into a ``{one[i]: two[i]}`` dictionary.

    Parameters
    ----------
    one : sequence
        Items used as dictionary keys (must be hashable).
    two : sequence
        Items used as dictionary values, aligned with ``one`` by position.

    Returns
    -------
    dict
        Mapping from each element of ``one`` to the element of ``two`` at the
        same position. If a key occurs more than once, the last pairing wins.

    Raises
    ------
    ValueError
        If the two sequences do not have the same length. Failing loudly here
        avoids handing a non-dict sentinel to callers that expect a mapping.
    """
    if len(one) != len(two):
        raise ValueError(
            f'list_merge needs sequences of equal length, got {len(one)} and {len(two)}'
        )
    return dict(zip(one, two))

In [17]:
# Auto encoding of all object type variables
code = LabelEncoder()
def encoder(df, mappings=None):
    """Integer-encode every object-dtype column of ``df`` in place.

    Each category is replaced by its rank in the sorted list of categories
    (0, 1, 2, ...). Missing values are not given a code and stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose object-dtype columns are overwritten with integer codes.
    mappings : dict, optional
        ``{column: {category: code}}`` as returned by a previous call. When a
        column has an entry here, that mapping is applied instead of deriving
        a new one from ``df``. Pass the mappings learned on the training frame
        when encoding the test frame so both use identical codes even if one
        of them lacks a rare category. Columns without an entry are fitted
        from ``df``.

    Returns
    -------
    dict
        The ``{column: {category: code}}`` mappings that were applied. The
        ``mappings`` argument itself is not modified.

    Raises
    ------
    ValueError
        If a column contains a category that its supplied mapping does not
        know; mapping it would otherwise silently produce NaN.
    """
    fitted = dict(mappings) if mappings is not None else {}
    for column in df.select_dtypes('object').columns:
        present = df[column].dropna().unique()
        if column not in fitted:
            # Sorted order makes the codes deterministic for a given category set.
            fitted[column] = {
                category: position
                for position, category in enumerate(sorted(present))
            }
        mapping = fitted[column]
        unseen = [category for category in present if category not in mapping]
        if unseen:
            raise ValueError(
                f'column {column!r} has categories missing from its mapping: {unseen}'
            )
        df[column] = df[column].map(mapping)
    return fitted

In [18]:
df.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0


In [19]:
def preprocessing(df):
    """Encode ``df`` and separate the features from the ``diabetes`` target.

    ``encoder`` is applied to ``df`` first, so the frame passed in is modified.
    The class counts of the target are printed as a quick balance check.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``diabetes`` column.

    Returns
    -------
    tuple
        ``(X, y)``: ``df`` without the ``diabetes`` column, and the
        ``diabetes`` column itself.
    """
    encoder(df)
    target = df['diabetes']
    features = df.drop(columns='diabetes')
    print(target.value_counts())
    return features, target

In [20]:
X_train, y_train = preprocessing(trainset)

diabetes
0    73203
1     6797
Name: count, dtype: int64


In [21]:
X_test, y_test = preprocessing(testset)

diabetes
0    18297
1     1703
Name: count, dtype: int64


In [22]:
X_train.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
10382,0,2.0,0,0,0,16.45,6.2,159
73171,0,55.0,0,0,4,24.59,6.0,130
30938,0,24.0,0,0,0,21.77,4.5,130
99310,1,30.0,0,0,4,27.32,6.2,159
58959,1,13.0,0,0,0,18.37,6.5,130


In [23]:
X_test.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
3582,0,52.0,0,0,4,27.32,4.8,140
60498,1,56.0,0,0,4,27.32,4.8,100
53227,0,22.0,0,0,1,37.16,6.6,85
21333,0,49.0,0,0,0,43.83,5.0,160
3885,1,10.0,0,0,0,14.18,4.0,155


# Modeling

In [24]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.preprocessing import PolynomialFeatures , StandardScaler
from sklearn.decomposition import PCA

In [25]:
model1 = RandomForestClassifier(random_state=0)

In [26]:
preprocessor = make_pipeline(SelectKBest(f_classif,k='all'))

In [27]:
# One pipeline per candidate classifier; each starts with the `preprocessor` step.
# NOTE(review): the same `preprocessor` object is passed to all three pipelines;
# presumably make_pipeline does not copy it, so they would share one fitted
# selector — confirm.
RandomForest = make_pipeline(preprocessor,RandomForestClassifier(random_state=0))
AdaBoost = make_pipeline(preprocessor, AdaBoostClassifier(random_state=0))
# Only the SVM pipeline standardises the features before the classifier.
Svm = make_pipeline(preprocessor, StandardScaler(), SVC(random_state=0))

In [28]:
# Display name -> pipeline, in the order the models are evaluated.
dict_of_models = dict(
    RandomForest=RandomForest,
    AdaBoost=AdaBoost,
    Svm=Svm,
)

In [31]:
# Fit and score every candidate pipeline in turn, printing its name first.
# `evaluation` is defined in the "Evaluation" section further down the notebook,
# so that cell has to be executed before this one.
for name, model in dict_of_models.items():
    print(name)
    evaluation(model)

RandomForest
[[18235    62]
 [  520  1183]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     18297
           1       0.95      0.69      0.80      1703

    accuracy                           0.97     20000
   macro avg       0.96      0.85      0.89     20000
weighted avg       0.97      0.97      0.97     20000

AdaBoost
[[18267    30]
 [  521  1182]]
              precision    recall  f1-score   support

           0       0.97      1.00      0.99     18297
           1       0.98      0.69      0.81      1703

    accuracy                           0.97     20000
   macro avg       0.97      0.85      0.90     20000
weighted avg       0.97      0.97      0.97     20000

Svm
[[18270    27]
 [  684  1019]]
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     18297
           1       0.97      0.60      0.74      1703

    accuracy                           0.96     20000
   macro 

KeyboardInterrupt: 

# Evaluation

In [29]:
from sklearn.metrics import f1_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import learning_curve

In [30]:
def evaluation(model, show_learning_curve=False):
    """Fit ``model`` on the training split and report its test performance.

    The model is fitted on the notebook-level ``X_train`` / ``y_train`` and
    scored on ``X_test`` / ``y_test``. The confusion matrix and the
    classification report are printed.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``.
    show_learning_curve : bool, default False
        When True, additionally compute a 5-fold, F1-scored learning curve
        over ten training sizes (10% to 100%) and plot the mean train and
        validation scores. It is off by default because it refits the model
        once per fold and per training size, and its result was previously
        computed but never displayed.

    Returns
    -------
    None
    """
    model.fit(X_train, y_train)
    ypred = model.predict(X_test)

    print(confusion_matrix(y_test, ypred))
    print(classification_report(y_test, ypred))

    if not show_learning_curve:
        return

    N, train_score, val_score = learning_curve(
        model, X_train, y_train,
        cv=5, scoring='f1', train_sizes=np.linspace(0.1, 1, 10),
    )

    plt.figure(figsize=(12, 8))
    plt.plot(N, train_score.mean(axis=1), label='train score')
    plt.plot(N, val_score.mean(axis=1), label='validation score')
    plt.xlabel('training set size')
    plt.ylabel('f1')
    plt.legend()

# Optimisation

In [37]:
from sklearn.model_selection import GridSearchCV

In [38]:
# Hyper-parameter grid searched for AdaBoostClassifier.
# NOTE(review): 'SAMME.R' may be deprecated or removed in recent scikit-learn
# releases — confirm against the installed version.
parameters_AdaBoost = dict(
    n_estimators=[2, 3, 4, 11, 12, 15, 17, 20],
    learning_rate=[0.0001, 0.001, 0.01, 0.1, 1.0],
    algorithm=['SAMME', 'SAMME.R'],
)

In [36]:
# Recall-scored, 4-fold grid search over the AdaBoost grid, then a report on the test split.
# The grid is named `parameters_AdaBoost`; no variable called `parameters` exists
# in the notebook, so the previous reference failed on a fresh kernel.
grid = GridSearchCV(AdaBoostClassifier(), parameters_AdaBoost, scoring='recall', cv=4)
grid.fit(X_train, y_train)

print(grid.best_params_)
y_pred = grid.predict(X_test)
print(classification_report(y_test, y_pred))

{'algorithm': 'SAMME.R', 'learning_rate': 1.0, 'n_estimators': 12}
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     18297
           1       0.95      0.69      0.80      1703

    accuracy                           0.97     20000
   macro avg       0.96      0.84      0.89     20000
weighted avg       0.97      0.97      0.97     20000



In [39]:
# Hyper-parameter grid searched for RandomForestClassifier.
parameters_RandomForest = dict(
    n_estimators=[200, 500],
    max_depth=[4, 5, 6, 7, 8],
    criterion=['gini', 'entropy'],
)

In [40]:
def GSCV(classifier,parameters):
    """Grid-search ``classifier`` over ``parameters`` and report the winner.

    Runs a 4-fold cross-validated grid search scored on recall against the
    notebook-level ``X_train`` / ``y_train``, prints the best parameter
    combination, then prints the classification report of the refitted best
    model on ``X_test`` / ``y_test``.

    Parameters
    ----------
    classifier : estimator
        Unfitted estimator to tune.
    parameters : dict
        Parameter grid handed to ``GridSearchCV``.

    Returns
    -------
    None
    """
    search = GridSearchCV(
        estimator=classifier,
        param_grid=parameters,
        scoring='recall',
        cv=4,
    )
    search.fit(X_train, y_train)

    print(search.best_params_)
    print(classification_report(y_test, search.predict(X_test)))

In [33]:
GSCV(AdaBoostClassifier(),parameters_AdaBoost)

{'algorithm': 'SAMME.R', 'learning_rate': 1.0, 'n_estimators': 12}
              precision    recall  f1-score   support

           0       0.97      1.00      0.98     18297
           1       0.95      0.69      0.80      1703

    accuracy                           0.97     20000
   macro avg       0.96      0.84      0.89     20000
weighted avg       0.97      0.97      0.97     20000



In [34]:
GSCV(RandomForestClassifier(random_state=0),parameters_RandomForest)

{'criterion': 'gini', 'max_depth': 4, 'n_estimators': 200}
              precision    recall  f1-score   support

           0       0.97      1.00      0.99     18297
           1       1.00      0.67      0.81      1703

    accuracy                           0.97     20000
   macro avg       0.99      0.84      0.90     20000
weighted avg       0.97      0.97      0.97     20000



In [32]:
# Hyper-parameter grid searched for SVC (RBF kernel only).
parameters_SVC = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf'],
)

In [None]:
GSCV(SVC(random_state=0),parameters_SVC)

In [None]:
# Pipelines rebuilt with the best hyper-parameters printed by the grid searches.
# Note that `RandomForest` is rebound here and replaces the untuned pipeline.
AdaBoost_GSV = make_pipeline(
    preprocessor,
    AdaBoostClassifier(
        random_state=0,
        algorithm='SAMME.R',
        learning_rate=1.0,
        n_estimators=12,
    ),
)
RandomForest = make_pipeline(
    preprocessor,
    RandomForestClassifier(
        random_state=0,
        criterion='gini',
        max_depth=4,
        n_estimators=200,
    ),
)
