In [284]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [285]:
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Class' ]

In [286]:
data = pd.read_csv('C:\\Users\\Deepak\\Downloads\\pima-indians-diabetes.csv', names = columns)

# Preprocessing Stage

In [287]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [288]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [289]:
data[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']] = data[['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']].replace(0, np.NaN)

In [290]:
data.fillna(data.mean(), inplace = True)  #Filling all null values with mean of that column

In [291]:
data.head(15)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
0,6,148.0,72.0,35.0,155.548223,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.548223,26.6,0.351,31,0
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116.0,74.0,29.15342,155.548223,25.6,0.201,30,0
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115.0,72.405184,29.15342,155.548223,35.3,0.134,29,0
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125.0,96.0,29.15342,155.548223,32.457464,0.232,54,1


In [292]:
#Creating Train and Test set
X = data.drop(columns = 'Class')
y = data['Class']

In [293]:
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72.0,35.0,155.548223,33.6,0.627,50
1,1,85.0,66.0,29.0,155.548223,26.6,0.351,31
2,8,183.0,64.0,29.15342,155.548223,23.3,0.672,32
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33


In [294]:
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Class, dtype: int64

In [295]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y)

In [296]:
#Random Forest Classifier

model = RandomForestClassifier()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.7135416666666666

In [297]:
# Finding Important features

feature_importances = pd.DataFrame(model.feature_importances_, index = X.columns, columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances)

                          importance
Glucose                     0.260971
BMI                         0.136814
DiabetesPedigreeFunction    0.131750
Insulin                     0.100211
BloodPressure               0.099836
Age                         0.099221
Pregnancies                 0.096409
SkinThickness               0.074788


In [298]:
#After usage of Random forest, we got to know that below 4 features are more relevant

col_subset = ['Glucose', 'BMI', 'Age', 'DiabetesPedigreeFunction', 'Class' ]
data_subset = data[col_subset]

In [299]:
data_subset.head()

Unnamed: 0,Glucose,BMI,Age,DiabetesPedigreeFunction,Class
0,148.0,33.6,50,0.627,1
1,85.0,26.6,31,0.351,0
2,183.0,23.3,32,0.672,1
3,89.0,28.1,21,0.167,0
4,137.0,43.1,33,2.288,1


In [300]:
#Creating Train and Test set again
X = data_subset.drop(columns = 'Class')
y = data_subset['Class']

In [301]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y)

In [302]:
X_train.head()

Unnamed: 0,Glucose,BMI,Age,DiabetesPedigreeFunction
290,78.0,36.9,21,0.434
600,108.0,27.1,24,0.4
525,87.0,21.8,21,0.444
92,81.0,46.7,42,0.261
467,97.0,36.8,25,0.6


In [303]:
y_train.value_counts()

0    375
1    201
Name: Class, dtype: int64

In [304]:
#Random Forest Classifier

model = RandomForestClassifier()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.75