# Diabetes Prediction

In [64]:
import numpy as np
import pandas as pd

# IMPORTING DATASET

In [65]:
dataset = pd.read_csv("Diabities-210331-154610.csv")
dataset.head(11)

Unnamed: 0.1,Unnamed: 0,Pregnancies,Glucose,blood pressure,skin thickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,,6.0,148.0,72.0,35.0,0.0,33.6,0.627,50.0,1.0
1,,1.0,85.0,66.0,29.0,0.0,26.6,0.351,31.0,0.0
2,,8.0,183.0,64.0,0.0,0.0,23.3,0.672,32.0,1.0
3,,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21.0,0.0
4,,0.0,137.0,40.0,35.0,168.0,43.1,2.288,33.0,1.0
5,,5.0,116.0,74.0,0.0,0.0,25.6,0.201,30.0,0.0
6,,3.0,78.0,50.0,32.0,88.0,31.0,0.248,26.0,1.0
7,,10.0,115.0,0.0,0.0,0.0,35.3,0.134,29.0,0.0
8,,2.0,197.0,70.0,45.0,543.0,30.5,0.158,53.0,1.0
9,,8.0,125.0,96.0,0.0,0.0,0.0,0.232,54.0,1.0


# ANALYZING DATASET

In [66]:
dataset.info()                       # all of the data is useful so we cant remove any columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 769 entries, 0 to 768
Data columns (total 10 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Unnamed: 0                0 non-null      float64
 1   Pregnancies               768 non-null    float64
 2   Glucose                   768 non-null    float64
 3   blood pressure            768 non-null    float64
 4   skin thickness            768 non-null    float64
 5   Insulin                   768 non-null    float64
 6   BMI                       768 non-null    float64
 7   DiabetesPedigreeFunction  768 non-null    float64
 8   Age                       768 non-null    float64
 9   Outcome                   768 non-null    float64
dtypes: float64(10)
memory usage: 60.2 KB


In [67]:
dataset.drop(dataset.columns[0], axis=1, inplace = True)# removing the unnamed column 
                                                            # axis 0 represents rows whereas axis 1 represents columns

In [68]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 769 entries, 0 to 768
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    float64
 1   Glucose                   768 non-null    float64
 2   blood pressure            768 non-null    float64
 3   skin thickness            768 non-null    float64
 4   Insulin                   768 non-null    float64
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    float64
 8   Outcome                   768 non-null    float64
dtypes: float64(9)
memory usage: 54.2 KB


In [69]:
dataset.isnull().sum()

Pregnancies                 1
Glucose                     1
blood pressure              1
skin thickness              1
Insulin                     1
BMI                         1
DiabetesPedigreeFunction    1
Age                         1
Outcome                     1
dtype: int64

In [70]:
dataset.dropna(axis=0, how="any", thresh=None, subset=None, inplace=True)

In [71]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    float64
 1   Glucose                   768 non-null    float64
 2   blood pressure            768 non-null    float64
 3   skin thickness            768 non-null    float64
 4   Insulin                   768 non-null    float64
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    float64
 8   Outcome                   768 non-null    float64
dtypes: float64(9)
memory usage: 60.0 KB


In [72]:
dataset.isnull().sum()

Pregnancies                 0
Glucose                     0
blood pressure              0
skin thickness              0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [73]:
x = dataset.iloc[:,:-1]
y = dataset.iloc[:,-1]

# SPLITTING DATA

In [74]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=25,random_state=0)

# APPLYING CLASSIFIERS AND EVALUATION

# RANDOM FOREST

In [75]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 6, criterion='entropy', random_state=0)
classifier.fit(x_train,y_train)

y_pred = classifier.predict(x_test)

In [76]:
# n_estimators is the number of trees we want to build before taking the maximun voting or average number of predictions

# criterion is used to Measures the quality of each split. It can either be “gini” or “entropy”. “gini” uses the Gini impurity
    #while “entropy” makes the split based on the information gain.
    
#Since the bootstrapping generates random samples it is often hard to exactly duplicate results. 
   #This parameter makes it easy for others to replicate your results if given the same training data and parameters.    

In [77]:
from sklearn.metrics import accuracy_score
acc_randomforest = round(accuracy_score(y_pred,y_test) ,2)*100
print("Accuracy:" , acc_randomforest)

Accuracy: 88.0


# LOGISTIC REGRESSION

In [78]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,r2_score,classification_report
logreg = LogisticRegression(solver = 'lbfgs',max_iter = 1000)
logreg.fit(x_train,y_train)
y_pred = logreg.predict(x_test)
acc_logreg = round(accuracy_score(y_pred,y_test) , 2)*100
print("Accuracy:" , acc_logreg)

Accuracy: 96.0


In [79]:
# Solver is the Algorithm to use in the optimization problem.
   #lbfgs is used to solve multiclass problem
    
# max_iter: Maximum number of iterations taken for the solvers to converge(come together).

# r2(coefficient of determination) regression score function.

 #Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse).
    #A constant model that always predicts the expected value of y, disregarding the input features, 
    #would get a r2 score of 0.0.

# K NEIGHBOUR CLASSIFIER

In [80]:
from sklearn.neighbors import KNeighborsClassifier
knn =  KNeighborsClassifier(n_neighbors=3)
knn.fit(x_train,y_train)
y_pred = knn.predict(x_test)
acc_knn = round(accuracy_score(y_pred,y_test) , 2)*100
print("Accuracy:" , acc_knn)

Accuracy: 88.0


In [None]:
# Number of neighbors to use for kneighbors queries.