In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,f1_score,auc


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [8]:
df = pd.read_csv("G:\Projects\Sugar_Prediction\Dataset\diabetes.csv")

In [None]:
df.head()

   Pregnancies  Glucose  BloodPressure  ...  DiabetesPedigreeFunction  Age  Outcome
0            6      148             72  ...                     0.627   50        1
1            1       85             66  ...                     0.351   31        0
2            8      183             64  ...                     0.672   32        1
3            1       89             66  ...                     0.167   21        0
4            0      137             40  ...                     2.288   33        1

[5 rows x 9 columns]


In [None]:
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [None]:
df.describe()

       Pregnancies     Glucose  ...         Age     Outcome
count   768.000000  768.000000  ...  768.000000  768.000000
mean      3.845052  120.894531  ...   33.240885    0.348958
std       3.369578   31.972618  ...   11.760232    0.476951
min       0.000000    0.000000  ...   21.000000    0.000000
25%       1.000000   99.000000  ...   24.000000    0.000000
50%       3.000000  117.000000  ...   29.000000    0.000000
75%       6.000000  140.250000  ...   41.000000    1.000000
max      17.000000  199.000000  ...   81.000000    1.000000

[8 rows x 9 columns]


In [None]:
pd.set_option('display.max_columns',None)

In [None]:
df.describe()

       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000000                  

In [None]:
df.duplicated().sum()

0


In [None]:
df.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


In [9]:
col_names = ['Glucose','BloodPressure','SkinThickness','BMI','Insulin']

for i in col_names:
    df[i] = df[i].replace(df[i].min(),df[i].median())

In [None]:
df.describe()

       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  121.751302      72.449219      27.376302   94.674805   
std       3.369578   30.309174      11.969650       9.172990  105.532247   
min       0.000000   56.000000      30.000000       8.000000   15.000000   
25%       1.000000  100.000000      64.000000      23.000000   30.500000   
50%       3.000000  117.000000      72.000000      23.000000   31.625000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    32.504818                  0.471876   33.240885    0.348958  
std      6.817199                  0.331329   11.760232    0.476951  
min     18.400000                  

In [10]:
df['Outcome'].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [11]:
# split the data into X and Y
x = df.drop(columns=['Outcome'])
y = df['Outcome']

In [12]:
# Split X and Y into Train test
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=0)

In [13]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [14]:
# Baseline Model
models = {
    'LogisticRegression':LogisticRegression(),
    'DecsionTreeClassifier': DecisionTreeClassifier(),
    'RandomForestClassifier':RandomForestClassifier()
}

In [15]:
for name,model in models.items():

    print(f'==================={name}==============')
    model.fit(X_train,y_train)

    y_pred = model.predict(X_test)
    print(f'Confusion Matrix for {name} is \n {confusion_matrix(y_test,y_pred)}')

    print(f'Accuracy for {name} is {accuracy_score(y_test,y_pred)}')

    print(f'Precision Score for {name} is {precision_score(y_test,y_pred)}')

    print(f'Recall Score for {name} is {recall_score(y_test,y_pred)}')

    print(f'F1 Score for {name} is {f1_score(y_test,y_pred)}')

    print(f'======================================')

    print()


    

Confusion Matrix for LogisticRegression is 
 [[98  9]
 [19 28]]
Accuracy for LogisticRegression is 0.8181818181818182
Precision Score for LogisticRegression is 0.7567567567567568
Recall Score for LogisticRegression is 0.5957446808510638
F1 Score for LogisticRegression is 0.6666666666666666

Confusion Matrix for DecsionTreeClassifier is 
 [[86 21]
 [15 32]]
Accuracy for DecsionTreeClassifier is 0.7662337662337663
Precision Score for DecsionTreeClassifier is 0.6037735849056604
Recall Score for DecsionTreeClassifier is 0.6808510638297872
F1 Score for DecsionTreeClassifier is 0.64

Confusion Matrix for RandomForestClassifier is 
 [[95 12]
 [16 31]]
Accuracy for RandomForestClassifier is 0.8181818181818182
Precision Score for RandomForestClassifier is 0.7209302325581395
Recall Score for RandomForestClassifier is 0.6595744680851063
F1 Score for RandomForestClassifier is 0.6888888888888889



In [16]:
rfc = RandomForestClassifier()
rfc.fit(X_train,y_train)

In [18]:
import pickle
pickle.dump(rfc ,open('G:\Projects\Sugar_Prediction\pickle_files\model.pkl','wb'))
pickle.dump(sc ,open('G:\Projects\Sugar_Prediction\pickle_filesscaler.pkl','wb'))



In [19]:
# Chnage Threshold Technique should be equal or greater than 0.3
y_proba = rfc.predict_proba(X_test)
y_predict_proba = (y_proba[:,1] >= 0.35).astype('int')
print(y_predict_proba)


[1 0 0 1 0 0 1 1 0 1 1 1 0 0 0 1 1 0 1 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 1
 0 0 1 0 0 0 1 1 0 0 0 1 0 1 0 1 1 0 0 0 1 0 1 1 1 0 1 1 1 1 0 1 0 0 0 0 1
 1 1 0 1 0 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 1 1 1 0 0 0 0 1 1 0 1 0 1 0
 1 1 1 1 1 0 1 0 1 1 1 0 0 0 0 0 1 0 1 1 0 0 1 0 1 0 1 0 0 0 1 0 0 1 0 0 0
 0 1 0 1 0 0]


In [20]:

print(f'Confusion Matrix for {name} is \n {confusion_matrix(y_test,y_predict_proba)}')

print(f'Accuracy for {name} is {accuracy_score(y_test,y_predict_proba)}')

print(f'Precision Score for {name} is {precision_score(y_test,y_predict_proba)}')

print(f'Recall Score for {name} is {recall_score(y_test,y_predict_proba)}')

print(f'F1 Score for {name} is {f1_score(y_test,y_predict_proba)}')

print(f'======================================')

Confusion Matrix for RandomForestClassifier is 
 [[81 26]
 [ 6 41]]
Accuracy for RandomForestClassifier is 0.7922077922077922
Precision Score for RandomForestClassifier is 0.6119402985074627
Recall Score for RandomForestClassifier is 0.8723404255319149
F1 Score for RandomForestClassifier is 0.7192982456140351


In [21]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
x_resampled,y_resampled = smote.fit_resample(x,y)

x_tr_smote, x_ts_smote,y_tr_smote, y_ts_smote = train_test_split(x_resampled,y_resampled,test_size=0.2,random_state=0)


sc_1 =StandardScaler()
x_tr_smote = sc_1.fit_transform(x_tr_smote)
x_ts_smote = sc_1.fit_transform(x_ts_smote)



In [22]:
rfc_smote = RandomForestClassifier()
rfc_smote.fit(x_tr_smote,y_tr_smote)

y_pred_smote = rfc_smote.predict(x_ts_smote)


print(f'Confusion Matrix for {name} is \n {confusion_matrix(y_ts_smote,y_pred_smote)}')

print(f'Accuracy for {name} is {accuracy_score(y_ts_smote,y_pred_smote)}')

print(f'Precision Score for {name} is {precision_score(y_ts_smote,y_pred_smote)}')

print(f'Recall Score for {name} is {recall_score(y_ts_smote,y_pred_smote)}')

print(f'F1 Score for {name} is {f1_score(y_ts_smote,y_pred_smote)}')

print(f'======================================')


Confusion Matrix for RandomForestClassifier is 
 [[82 23]
 [ 4 91]]
Accuracy for RandomForestClassifier is 0.865
Precision Score for RandomForestClassifier is 0.7982456140350878
Recall Score for RandomForestClassifier is 0.9578947368421052
F1 Score for RandomForestClassifier is 0.8708133971291866


In [24]:
pickle.dump(rfc_smote,open('G:\Projects\Sugar_Prediction\pickle_filesrfc_model_smote.pkl','wb'))
pickle.dump(sc_1,open('G:\Projects\Sugar_Prediction\pickle_files\sc_1_smote.pkl','wb'))


In [26]:
# Hyperparameter Tuning
param_grid = {
    'bootstrap':[True],
    'max_depth':[80,90,100,110],
    'max_features':[2,3]

}
grid = GridSearchCV(estimator=rfc,param_grid=param_grid,cv = 5,scoring='f1')

grid.fit(X_train,y_train)
model = grid.best_estimator_
print(grid.best_params_)

model.fit(X_train,y_train)
y_pred_tuning = model.predict(X_test)


print(f'Confusion Matrix for {name} is \n {confusion_matrix(y_test,y_pred_tuning)}')

print(f'Accuracy for {name} is {accuracy_score(y_test,y_pred_tuning)}')

print(f'Precision Score for {name} is {precision_score(y_test,y_pred_tuning)}')

print(f'Recall Score for {name} is {recall_score(y_test,y_pred_tuning)}')

print(f'F1 Score for {name} is {f1_score(y_test,y_pred_tuning)}')

print(f'======================================')


{'bootstrap': True, 'max_depth': 100, 'max_features': 3}
Confusion Matrix for RandomForestClassifier is 
 [[94 13]
 [15 32]]
Accuracy for RandomForestClassifier is 0.8181818181818182
Precision Score for RandomForestClassifier is 0.7111111111111111
Recall Score for RandomForestClassifier is 0.6808510638297872
F1 Score for RandomForestClassifier is 0.6956521739130435


In [28]:
pickle.dump(model, open(r'G:\Projects\Sugar_Prediction\pickle_files\rfc_model_tuning.pkl', 'wb'))
