# Let's start by importing essential libraries

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error

import matplotlib.pyplot as plt

### Read the csv file

In [265]:
data = pd.read_csv("diabetes.csv")
data.head(20)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


### Checking for missing value columns

In [3]:
missing_cols = [mis for mis in data.columns if data[mis].isnull().any()]
missing_cols

[]

## That's Good!. 

#### There are no missing values. Let's move forward.


In [4]:
num_cols = [num for num in data.columns if data[num].dtype in ['int64', 'float64']]
num_cols

['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age',
 'Outcome']

#### The Features that could affect if a person is having diabetes or not are,

In [594]:
features = ['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age']

### We didn't include the Insulin factor as it shows uncommon behavior in the charts.

### The target variable y

In [595]:
y = data.Outcome
X = data[features]

In [707]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, train_size=0.8)

In [708]:
model = RandomForestRegressor(n_estimators=100, random_state=1)

In [709]:
model.fit(X_train, y_train)

RandomForestRegressor(random_state=1)

In [710]:
preds = model.predict(X_valid)
A = np.around(preds)
predictions = A.astype(int)

In [711]:
accuracy = accuracy_score(y_valid, predictions)
print("Accuracy = ", accuracy * 100 , "%")

Accuracy =  76.62337662337663 %


In [700]:
y_valid

520    0
209    1
207    1
575    0
36     0
      ..
381    0
507    0
42     0
65     0
125    1
Name: Outcome, Length: 154, dtype: int64

In [701]:
predictions

array([0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1])