<h2>
    Importing necessary libraries
 </h2>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Render matplotlib figures inline, directly below the cell that draws them.
%matplotlib inline

<h2>
    Loading the Diabetes Dataset
</h2>

In [None]:
# Read the diabetes CSV into a DataFrame, then preview its first five rows.
dia_data = pd.read_csv(filepath_or_buffer="../input/diabetes/diabetes.csv")
dia_data.head(n=5)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple
dia_data.shape

In [None]:
# Per-column overview: non-null counts and dtypes
dia_data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
dia_data.describe()

In [None]:
# Preview the first rows again (same view as shown right after loading)
dia_data.head()

<h2>
    Checking null values in the dataset
</h2>

In [None]:
# Number of NaN/None entries in each column
dia_data.isna().sum()

<h2> 
    Checking Duplicate data
</h2>

In [None]:
# Number of rows that exactly repeat an earlier row
dia_data.duplicated().sum()

In [None]:
# Drop rows that exactly repeat an earlier row, keeping the first occurrence.
# Rebinding the name (rather than mutating in place) leaves the cell safe to re-run.
dia_data = dia_data.drop_duplicates()

In [None]:
# Shape after duplicate removal, for comparison with the shape printed earlier
dia_data.shape

<h2>
    Checking missing values (recorded as 0) in the data
</h2>

In [None]:
# A value of 0 is treated here as the marker for a missing measurement.
# Each entry maps the label printed in the report to the column it is counted from.
zero_checks = {
    "Pregnancies": "Pregnancies",
    "glucose": "Glucose",
    "bp": "BloodPressure",
    "insulin": "Insulin",
    "SkinThickness": "SkinThickness",
    "DiabetesPedigreeFunction": "DiabetesPedigreeFunction",
    "bmi": "BMI",
    "age": "Age",
}

print("total no of rows :: {} ".format(len(dia_data)))
for report_label, source_column in zero_checks.items():
    # Summing the boolean mask counts the rows whose value is exactly 0
    zero_rows = int((dia_data[source_column] == 0).sum())
    print("total no of rows missing {} :: {} ".format(report_label, zero_rows))

In [None]:
# Names of every column except the last one
dia_data.iloc[:,:-1].columns

In [None]:
# Count the zero entries (the "missing" placeholder) of every feature column at once.
# Counts and labels come from the same Series, so each bar is always paired with
# its own column; a hand-ordered list of counts can drift out of step with the
# frame's actual column order and mislabel the bars.
zero_counts = (dia_data.iloc[:, :-1] == 0).sum()

fig, ax = plt.subplots(figsize = (10,5))
ax.set_title("Ploting of missing values")
# x and y are passed by keyword: recent seaborn releases no longer accept them positionally.
# Counts on x and names on y keeps the horizontal-bar orientation.
sns.barplot(x = zero_counts.values, y = list(zero_counts.index), ax = ax)
ax.set_xlabel("rows with value 0")
ax.set_ylabel("feature")
plt.show()

In [None]:
# Grid position (row, col) -> (column to plot, name as written in the panel title)
histogram_panels = {
    (0, 0): ("Pregnancies", "pregnancies"),
    (1, 0): ("Glucose", "Glucose"),
    (2, 0): ("BloodPressure", "BloodPressure"),
    (0, 1): ("Insulin", "Insulin"),
    (1, 1): ("SkinThickness", "SkinThickness"),
    (2, 1): ("BMI", "BMI"),
}

# 3 x 2 grid with one histogram per selected feature
fig, s = plt.subplots(nrows = 3, ncols = 2, figsize = (15,10))
for (grid_row, grid_col), (source_column, shown_name) in histogram_panels.items():
    panel = s[grid_row][grid_col]
    panel.set_title("Histogram of {} column".format(shown_name))
    # rwidth < 1 leaves a small gap between neighbouring bars
    panel.hist(dia_data[source_column], rwidth = 0.8)
plt.show()

In [None]:
# Age vs. Pregnancies, points coloured by Outcome
scatter_fig, scatter_ax = plt.subplots(figsize=(15,5))
sns.scatterplot(data = dia_data, x = 'Age', y = 'Pregnancies', hue = 'Outcome', ax = scatter_ax)
plt.show()

In [None]:
# Age vs. Glucose, points coloured by Outcome
scatter_fig, scatter_ax = plt.subplots(figsize=(15,5))
sns.scatterplot(data = dia_data, x = 'Age', y = 'Glucose', hue = 'Outcome', ax = scatter_ax)
plt.show()

In [None]:
# Age vs. BloodPressure, points coloured by Outcome
scatter_fig, scatter_ax = plt.subplots(figsize=(15,5))
sns.scatterplot(data = dia_data, x = 'Age', y = 'BloodPressure', hue = 'Outcome', ax = scatter_ax)
plt.show()

In [None]:
# Age vs. SkinThickness, points coloured by Outcome
scatter_fig, scatter_ax = plt.subplots(figsize=(15,5))
sns.scatterplot(data = dia_data, x = 'Age', y = 'SkinThickness', hue = 'Outcome', ax = scatter_ax)
plt.show()

In [None]:
# Age vs. Insulin, points coloured by Outcome
scatter_fig, scatter_ax = plt.subplots(figsize=(15,5))
sns.scatterplot(data = dia_data, x = 'Age', y = 'Insulin', hue = 'Outcome', ax = scatter_ax)
plt.show()

In [None]:
# Age vs. BMI, points coloured by Outcome
scatter_fig, scatter_ax = plt.subplots(figsize=(15,5))
sns.scatterplot(data = dia_data, x = 'Age', y = 'BMI', hue = 'Outcome', ax = scatter_ax)
plt.show()

In [None]:
# Age vs. DiabetesPedigreeFunction, points coloured by Outcome
scatter_fig, scatter_ax = plt.subplots(figsize=(15,5))
sns.scatterplot(data = dia_data, x = 'Age', y = 'DiabetesPedigreeFunction', hue = 'Outcome', ax = scatter_ax)
plt.show()

# Correlation between features

In [None]:
# Pairwise correlation matrix of the columns (DataFrame.corr uses Pearson by default)
corr_data = dia_data.corr()
# Row labels of the correlation matrix, i.e. the names of the correlated columns
top_corr_features = corr_data.index

In [None]:
# Display the correlation matrix as a table
corr_data

In [None]:
# Column names covered by the correlation matrix
top_corr_features

<h2>
    Plotting a heat map of the correlation matrix
</h2>

In [None]:
# Heat map of the correlation matrix.
#   annot = True writes each coefficient inside its cell
#   cmap selects the colour map used to shade the cells
heatmap_fig, heatmap_ax = plt.subplots(figsize = (8,5))
sns.heatmap(corr_data, annot = True, cmap = 'RdYlGn', ax = heatmap_ax)
heatmap_ax.set_title("Correlation between features")
# An explicit show() renders the figure without echoing the Axes repr,
# in line with the other plotting cells of this notebook.
plt.show()

In [None]:
# Model inputs: every column except the last.
# Target: 'Outcome' (this assumes 'Outcome' is that last column).
features_column = dia_data.columns[:-1].tolist()
predicted_column = ['Outcome']
print("features columns :: {} \n predicted columns :: {}".format(features_column,predicted_column))

In [None]:
# NumPy arrays for scikit-learn:
#   X -> 2-D feature matrix
#   y -> 2-D single-column target (selected with a list, so it keeps a column axis)
X = dia_data.loc[:, features_column].to_numpy()
y = dia_data.loc[:, predicted_column].to_numpy()

In [None]:
# Shapes of the feature matrix and the target column
X.shape  , y.shape

<h2>
    Splitting the data into training and testing sets
</h2>

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 10)

In [None]:
# Shapes of the four arrays produced by the split
x_train.shape, y_train.shape  , x_test.shape, y_test.shape

<h2>
    Filling missing values in the training and test sets
</h2>

In [None]:
from sklearn.impute import SimpleImputer

# Zeros are treated as "missing" and replaced with the column mean.
# NOTE(review): this also replaces zeros in columns where 0 may be a genuine
# value (e.g. 'Pregnancies') — confirm that is intended.
fill_null_value = SimpleImputer(missing_values = 0, strategy = 'mean')

# Learn the column means from the training rows only ...
x_train = fill_null_value.fit_transform(x_train)
# ... and reuse them for the test rows. Calling fit_transform here would
# re-fit the imputer on the test set, so test rows would be filled with
# test-set statistics instead of the values the model was trained with.
x_test = fill_null_value.transform(x_test)

## Creating Random Forest model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# The target is handled as a binary 0/1 label downstream (predictions are mapped
# to 0/1 and scored with classification metrics), so a classifier is the matching
# estimator: predict() returns class labels directly instead of a continuous
# score that has to be thresholded by hand.
#   n_estimators ==> number of decision trees in the forest
#   random_state ==> seeds the randomness used while building the trees, so
#                    fitting again on the same data rebuilds the same forest
rf = RandomForestClassifier(n_estimators = 10, random_state = 10)

In [None]:
# Train the model; ravel() flattens the single-column target into a 1-D vector
rf.fit( x_train , y_train.ravel())

In [None]:
# Predict the target for the held-out test rows
y_pred = rf.predict(x_test)

In [None]:
# Map predictions to class labels: values below 0.5 become 0, everything else 1
y_pred = np.where(y_pred < 0.5, 0, 1)

In [None]:
# Predictions and flattened true labels should have the same length
y_pred.shape, y_test.ravel().shape

<h2>
    Checking accuracy score of our model
</h2>

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

# Confusion matrix: rows are true classes, columns are predicted classes
cm = confusion_matrix(y_test, y_pred)
# Fraction of test rows whose predicted label equals the true label
acc = accuracy_score(y_test, y_pred)

In [None]:
from sklearn.metrics import classification_report
# Text report with precision, recall, F1 and support per class
class_report = classification_report(y_test, y_pred)

In [None]:
# Print the confusion matrix, accuracy and per-class report together
print("confusion matrix :: {} \n\n Accuracy = {} \n\n classification report :: \n{}".format(cm,acc,class_report))

<h1>
    Applying the model to another testing dataset
</h1>

In [None]:
# Load a second diabetes CSV to score the trained model on.
# NOTE(review): the file name matches the training CSV — confirm these rows
# were not also used for training, otherwise the comparison below is optimistic.
test_data2 = pd.read_csv('../input/diabites-dataset/diabetes.csv')

In [None]:
# Features: every column except the last (kept as a DataFrame)
x_test2 = test_data2.iloc[:, :-1]
# Labels: the last column, copied into a 1-D NumPy array
y_test2 = test_data2.iloc[:, -1].to_numpy(copy = True)

In [None]:
# (rows, feature columns) of the second test set
x_test2.shape

In [None]:
# Preprocess these rows exactly like the data the model was trained on:
# reuse the already-fitted imputer (transform only) so zeros are replaced
# before prediction instead of being fed to the model as raw values.
# Converting to a plain array also matches how the model was fitted
# (on NumPy arrays, without column names).
x_test2_imputed = fill_null_value.transform(np.asarray(x_test2))
y_pred2 = rf.predict(x_test2_imputed)
# Map predictions to class labels: values below 0.5 become 0, everything else 1
y_pred2 = np.where(y_pred2 < 0.5, 0, 1)

In [None]:
# Spot check: every 5th prediction among the first 100 rows next to its true label
y_pred2[:100:5], y_test2[:100:5]