# Data Description

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
import warnings
warnings.filterwarnings('ignore')
%load_ext autotime

time: 0 ns (started: 2023-03-04 23:37:24 +05:30)


In [2]:
diabetes = pd.read_csv('Pimadiabetes.csv')

time: 0 ns (started: 2023-03-04 23:37:24 +05:30)


In [3]:
print(diabetes.columns)

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')
time: 0 ns (started: 2023-03-04 23:37:24 +05:30)


In [4]:
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


time: 16 ms (started: 2023-03-04 23:37:24 +05:30)


In [5]:
diabetes.shape

(768, 9)

time: 0 ns (started: 2023-03-04 23:37:24 +05:30)


In [6]:
print(diabetes.groupby('Outcome').size())

Outcome
0    500
1    268
dtype: int64
time: 0 ns (started: 2023-03-04 23:37:24 +05:30)


In [7]:
diabetes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB
time: 15 ms (started: 2023-03-04 23:37:24 +05:30)


In [8]:
X=diabetes.drop('Outcome',axis=1)
y=diabetes.Outcome
scaler=StandardScaler()
X=scaler.fit_transform(X)

time: 0 ns (started: 2023-03-04 23:37:24 +05:30)


In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,y)
encoder = MinMaxScaler()
X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

time: 0 ns (started: 2023-03-04 23:37:24 +05:30)


# K-Nearest Neighbors to Predict Diabetes

In [10]:
knn = KNeighborsClassifier()

params={
    'n_neighbors':[8,9,10],
    'weights':['uniform','distance'],
    'n_jobs':[1]
}

GSknn=GridSearchCV(knn,param_grid=params)
GSknn.fit(X_train,y_train)
print(GSknn.best_params_)
print(GSknn.best_score_)

{'n_jobs': 1, 'n_neighbors': 8, 'weights': 'distance'}
0.7361019490254873
time: 141 ms (started: 2023-03-04 23:37:24 +05:30)


# Decision Tree Classifier

In [11]:
tree = DecisionTreeClassifier()

params={
    'max_depth':[2,3,4,5],
     'min_samples_split':[2,4,6],
    "min_samples_leaf":[1,2,3,4],
}

GStree=GridSearchCV(tree,param_grid=params)
GStree.fit(X_train,y_train)
print(GStree.best_params_)
print(GStree.best_score_)

{'max_depth': 5, 'min_samples_leaf': 3, 'min_samples_split': 6}
0.7534782608695652
time: 453 ms (started: 2023-03-04 23:37:24 +05:30)


# MLP to Predict Diabetes

In [12]:
mlp = MLPClassifier()

params={
    'max_iter':[500,1000],
    'alpha':[1,2,3],
    'activation':['identity', 'logistic', 'tanh', 'relu']
}

GSmlp=GridSearchCV(mlp,param_grid=params)
GSmlp.fit(X_train, y_train)
print(GSmlp.best_params_)
print(GSmlp.best_score_)

{'activation': 'relu', 'alpha': 1, 'max_iter': 1000}
0.7621139430284858
time: 55.5 s (started: 2023-03-04 23:37:25 +05:30)


# Random Forest Classification

In [13]:
rf=RandomForestClassifier()

params={
    'min_samples_split':[2,4,6],
    "min_samples_leaf":[1,2,3,4],
    'max_features':['auto','log2',None]
}

GSrf=GridSearchCV(rf,param_grid=params)
GSrf.fit(X_train,y_train)
print(GSrf.best_params_)
print(GSrf.best_score_)

{'max_features': 'log2', 'min_samples_leaf': 3, 'min_samples_split': 4}
0.7725337331334333
time: 29.8 s (started: 2023-03-04 23:38:20 +05:30)


# Support Vector Machine (Support Vector Classifier)

In [14]:
svc=SVC()

params={
    'C':[1,2,3,4,5],
    'kernel':['poly','rbf','sigmoid'],
    'gamma':['auto','scale']
}
GSsvm=GridSearchCV(svc,param_grid=params,n_jobs=1)
GSsvm.fit(X_train,y_train)
print(GSsvm.best_params_)
print(GSsvm.best_score_)

{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
0.7656071964017991
time: 1.31 s (started: 2023-03-04 23:38:50 +05:30)


# LogisticRegression

In [15]:
lr=LogisticRegression()

params={
    'C':[1,2,3,4,5],
    'penalty':['l1', 'l2', 'elasticnet', None],
    "solver":["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"],
    'max_iter':[100,500,1000]
}

GSlr=GridSearchCV(lr,param_grid=params)
GSlr.fit(X_train,y_train)
print(GSlr.best_params_)
print(GSlr.best_score_)

{'C': 1, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
0.7604197901049476
time: 3.92 s (started: 2023-03-04 23:38:51 +05:30)


In [16]:
results={
    'Algorithm':['KNeighborsClassifier','DecisionTreeClassifier','MLPClassifier','RandomForestClassifier','SVC','LogisticRegression'],
    'HyperParameters':[GSknn.best_params_,GStree.best_params_,GSmlp.best_params_,GSrf.best_params_,GSsvm.best_params_,GSlr.best_params_],
    'Best_Score':[GSknn.best_score_,GStree.best_score_,GSmlp.best_score_,GSrf.best_score_,GSsvm.best_score_,GSlr.best_score_],
    'Accuracy':[GSknn.score(X_test,y_test),GStree.score(X_test,y_test),GSmlp.score(X_test,y_test),GSrf.score(X_test,y_test),GSsvm.score(X_test,y_test),GSlr.score(X_test,y_test)]
}

resultsdf=pd.DataFrame(results)
resultsdf.sort_values(by=['Best_Score'],ascending=False,inplace=True)
resultsdf

Unnamed: 0,Algorithm,HyperParameters,Best_Score,Accuracy
3,RandomForestClassifier,"{'max_features': 'log2', 'min_samples_leaf': 3...",0.772534,0.78125
4,SVC,"{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}",0.765607,0.765625
2,MLPClassifier,"{'activation': 'relu', 'alpha': 1, 'max_iter':...",0.762114,0.786458
5,LogisticRegression,"{'C': 1, 'max_iter': 100, 'penalty': 'l1', 'so...",0.76042,0.791667
1,DecisionTreeClassifier,"{'max_depth': 5, 'min_samples_leaf': 3, 'min_s...",0.753478,0.760417
0,KNeighborsClassifier,"{'n_jobs': 1, 'n_neighbors': 8, 'weights': 'di...",0.736102,0.729167


time: 31 ms (started: 2023-03-04 23:38:55 +05:30)


In [17]:
MLP=cross_val_score(GSmlp,X_train,y_train,cv=5)
SVC=cross_val_score(GSsvm,X_train,y_train,cv=5)
print(f'MLP {np.average(MLP)}')
print(f'SVC {np.average(SVC)}')

MLP 0.7656971514242878
SVC 0.7656371814092953
time: 5min 18s (started: 2023-03-04 23:02:03 +05:30)


In [18]:
print(round(GSsvm.score(X_train,y_train),2),round(GSsvm.score(X_test,y_test),2))
print(round(GSmlp.score(X_train,y_train),2),round(GSmlp.score(X_test,y_test),2))

0.8 0.72
0.79 0.73
time: 32 ms (started: 2023-03-04 23:07:21 +05:30)


In [21]:
import pickle

pickle.dump(mlp,open('Pimadiabetes.pkl','wb'))

time: 0 ns (started: 2023-03-02 17:39:18 +05:30)


# Experimentation to improve Train Scores

In [20]:
diabetes.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


time: 15 ms (started: 2023-03-03 12:46:51 +05:30)


In [19]:
def shuffle(model):
    X=diabetes.drop('Outcome',axis=1)
    y=diabetes.Outcome
    print(X.columns)
    scaler=StandardScaler()
    dropped=[]
    train_res=[]
    test_res=[]
    for i in range(len(diabetes.columns)-2):
        X=diabetes.drop('Outcome',axis=1)
        X.drop(X.columns[i],axis=1,inplace=True)
        X_train,X_test,y_train,y_test=train_test_split(X,y,shuffle=False,random_state=20)
        X_train=scaler.fit_transform(X_train)
        X_test=scaler.fit_transform(X_test)
        model.fit(X_train,y_train)
        train_score=model.score(X_train,y_train)
        test_score=model.score(X_test,y_test)
        col=X.columns[i]
        dropped.append(col)
        train_res.append(train_score)
        test_res.append(test_score)

    scores=pd.DataFrame({'Dropped Columns':dropped,'Train_Scores':train_res,'Test_Scores':test_res})
    scores=scores.sort_values(by='Train_Scores',ascending=False)
    return scores

time: 0 ns (started: 2023-03-04 23:07:21 +05:30)


In [22]:
GSsvm.best_params_

{'C': 5, 'gamma': 'auto', 'kernel': 'rbf'}

time: 0 ns (started: 2023-03-03 12:47:09 +05:30)


## SVC

In [30]:
model=SVC(C=5,gamma='auto',kernel='rbf')
shuffle(model)

time: 16 ms (started: 2023-03-04 23:11:26 +05:30)


In [26]:
GSmlp.best_params_

{'activation': 'tanh', 'alpha': 1, 'max_iter': 1000}

time: 15 ms (started: 2023-03-03 12:47:34 +05:30)


## MLP

In [22]:
model=MLPClassifier(activation='tanh',alpha=1,max_iter=1000)
shuffle(model)

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')


Unnamed: 0,Dropped Columns,Train_Scores,Test_Scores
0,Glucose,0.788194,0.786458
3,Insulin,0.788194,0.78125
2,SkinThickness,0.784722,0.796875
4,BMI,0.779514,0.776042
6,Age,0.774306,0.78125
5,DiabetesPedigreeFunction,0.762153,0.817708
1,BloodPressure,0.710069,0.677083


time: 9.8 s (started: 2023-03-04 23:08:47 +05:30)


In [29]:
GSrf.best_params_

{'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 6}

time: 15 ms (started: 2023-03-04 23:10:36 +05:30)


## RFC

In [23]:
model=RandomForestClassifier(max_features=None,min_samples_leaf=1,min_samples_split=6)
shuffle(model)

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')


Unnamed: 0,Dropped Columns,Train_Scores,Test_Scores
1,BloodPressure,0.987847,0.734375
4,BMI,0.987847,0.786458
3,Insulin,0.982639,0.786458
0,Glucose,0.980903,0.776042
2,SkinThickness,0.980903,0.755208
5,DiabetesPedigreeFunction,0.980903,0.755208
6,Age,0.970486,0.760417


time: 1.91 s (started: 2023-03-04 23:09:03 +05:30)


In [32]:
GSlr.best_params_

{'C': 2, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}

time: 0 ns (started: 2023-03-03 12:49:00 +05:30)


## LR

In [25]:
model=LogisticRegression(C=2,max_iter=100,penalty='l1',solver='liblinear')
shuffle(model)

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')


Unnamed: 0,Dropped Columns,Train_Scores,Test_Scores
2,SkinThickness,0.78125,0.796875
4,BMI,0.779514,0.776042
3,Insulin,0.777778,0.786458
0,Glucose,0.772569,0.791667
6,Age,0.767361,0.791667
5,DiabetesPedigreeFunction,0.751736,0.802083
1,BloodPressure,0.701389,0.682292


time: 63 ms (started: 2023-03-04 23:09:21 +05:30)


In [34]:
GSknn.best_params_

{'n_jobs': 1, 'n_neighbors': 10, 'weights': 'uniform'}

time: 0 ns (started: 2023-03-03 12:49:21 +05:30)


In [31]:
model=RandomForestClassifier(max_features=None,min_samples_leaf=1,min_samples_split=6)
shuffle(model)

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')


Unnamed: 0,Dropped Columns,Train_Scores,Test_Scores
0,Glucose,0.989583,0.791667
1,BloodPressure,0.987847,0.734375
2,SkinThickness,0.987847,0.760417
3,Insulin,0.986111,0.796875
4,BMI,0.982639,0.776042
5,DiabetesPedigreeFunction,0.982639,0.770833
6,Age,0.970486,0.760417


time: 1.84 s (started: 2023-03-04 23:13:10 +05:30)


## KNN

In [26]:
model=KNeighborsClassifier(n_jobs=1,n_neighbors=10,weights='uniform')
shuffle(model)

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')


Unnamed: 0,Dropped Columns,Train_Scores,Test_Scores
2,SkinThickness,0.789931,0.760417
3,Insulin,0.789931,0.776042
4,BMI,0.784722,0.755208
6,Age,0.782986,0.729167
0,Glucose,0.763889,0.776042
5,DiabetesPedigreeFunction,0.755208,0.734375
1,BloodPressure,0.751736,0.677083


time: 218 ms (started: 2023-03-04 23:09:24 +05:30)


# Without Scaling

In [27]:
X=diabetes.drop('Outcome',axis=1)
y=diabetes.Outcome

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)

time: 0 ns (started: 2023-03-04 23:09:49 +05:30)


In [28]:
knn = KNeighborsClassifier()

params={'n_jobs': [1], 'n_neighbors': [10], 'weights': ['uniform']}


GSknn=GridSearchCV(knn,param_grid=params)
GSknn.fit(X_train,y_train)
print(GSknn.best_params_)
print(GSknn.best_score_)

tree = DecisionTreeClassifier()

params={'max_depth': [2], 'min_samples_leaf': [1], 'min_samples_split': [2]}

GStree=GridSearchCV(tree,param_grid=params)
GStree.fit(X_train,y_train)
print(GStree.best_params_)
print(GStree.best_score_)

rf=RandomForestClassifier()

params={'max_features': ['auto'], 'min_samples_leaf': [2], 'min_samples_split': [6]}


GSrf=GridSearchCV(rf,param_grid=params)
GSrf.fit(X_train,y_train)
print(GSrf.best_params_)
print(GSrf.best_score_)


lr=LogisticRegression()

params={'C': [1], 'max_iter': [100], 'penalty': ['l2'], 'solver': ['liblinear']}


GSlr=GridSearchCV(lr,param_grid=params)
GSlr.fit(X_train,y_train)
print(GSlr.best_params_)
print(GSlr.best_score_)


mlp = MLPClassifier()

params={
    'activation': ['logistic'], 'alpha':[2], 'max_iter': [1000]
}
GSmlp=GridSearchCV(mlp,param_grid=params)
GSmlp.fit(X_train, y_train)
print(GSmlp.best_params_)
print(GSmlp.best_score_)


results={
    'Algorithm':['KNeighborsClassifier','DecisionTreeClassifier','MLPClassifier','RandomForestClassifier','LogisticRegression'],
    'HyperParameters':[GSknn.best_params_,GStree.best_params_,GSmlp.best_params_,GSrf.best_params_,GSlr.best_params_],
    'Best_Score':[GSknn.best_score_,GStree.best_score_,GSmlp.best_score_,GSrf.best_score_,GSlr.best_score_],
    'Accuracy':[GSknn.score(X_test,y_test),GStree.score(X_test,y_test),GSmlp.score(X_test,y_test),GSrf.score(X_test,y_test),GSlr.score(X_test,y_test)]
}

resultsdf=pd.DataFrame(results)
resultsdf.sort_values(by=['Best_Score'],ascending=False,inplace=True)
resultsdf

{'n_jobs': 1, 'n_neighbors': 10, 'weights': 'uniform'}
0.730079612322603
{'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}
0.7543267566632054
{'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 6}
0.787712011076497
{'C': 1, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.7672377985462098
{'activation': 'logistic', 'alpha': 2, 'max_iter': 1000}
0.7188127379716164


Unnamed: 0,Algorithm,HyperParameters,Best_Score,Accuracy
3,RandomForestClassifier,"{'max_features': 'auto', 'min_samples_leaf': 2...",0.787712,0.735931
4,LogisticRegression,"{'C': 1, 'max_iter': 100, 'penalty': 'l2', 'so...",0.767238,0.731602
1,DecisionTreeClassifier,"{'max_depth': 2, 'min_samples_leaf': 1, 'min_s...",0.754327,0.748918
0,KNeighborsClassifier,"{'n_jobs': 1, 'n_neighbors': 10, 'weights': 'u...",0.73008,0.748918
2,MLPClassifier,"{'activation': 'logistic', 'alpha': 2, 'max_it...",0.718813,0.744589


time: 5.5 s (started: 2023-03-04 23:09:50 +05:30)


## Since model is performing well without Glucose trainScore->0.989583 & testScore->0.791667

In [32]:
diabetes.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

time: 0 ns (started: 2023-03-04 23:16:08 +05:30)


In [38]:
X=diabetes.drop(['Glucose','Outcome'],axis=1)
y=diabetes.Outcome
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.33)
model=RandomForestClassifier(max_features=None,min_samples_leaf=1,min_samples_split=6)
model.fit(X_train,y_train)
model.score(X_train,y_train),model.score(X_test,y_test)

(0.9902723735408561, 0.7322834645669292)

time: 282 ms (started: 2023-03-04 23:18:25 +05:30)


In [39]:
import pickle

pickle.dump(model,open('Pimadiabetes.pkl','wb'))

time: 15 ms (started: 2023-03-04 23:18:37 +05:30)
