In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn

In [None]:
# Load diabetes.csv straight from the project's GitHub repository (raw file URL).
dataset = pd.read_csv(r'https://raw.githubusercontent.com/Devansh-Trivedi/Diabetes-Prediction/main/diabetes.csv')

# Step 1: Descriptive Statistics

In [None]:
# Preview the first rows to sanity-check the columns and values that were loaded.
dataset.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
dataset.shape

In [None]:
# Column dtypes and non-null counts for every column.
dataset.info()

In [None]:
# Statistical summary
# Transposed so each dataset column becomes one row of the summary table.
dataset.describe().T

In [None]:
# Number of missing (NaN) entries per column.
dataset.isnull().sum()

# Data Visualization

In [None]:
# Bar chart of how many rows carry each value of the 'Outcome' column.
seaborn.countplot(x = 'Outcome',data = dataset)

In [None]:
# Heatmap
# Pairwise correlation matrix of the dataset columns, with each coefficient
# written inside its cell (annot = True).
seaborn.heatmap(dataset.corr(), annot = True)
plt.show()

# From the above visualizations we can observe
1. The data is imbalanced.
2. The number of patients who have diabetes is smaller than the number who do not.
3. We can also see that Outcome is highly correlated with Glucose, BMI, Age, and Insulin.


# Data Preprocessing

In [None]:
# Work on an independent copy: a plain `dataset_new = dataset` only creates a
# second name for the same object, so every preprocessing step applied to
# dataset_new would also rewrite the raw `dataset` shown by the cells above.
dataset_new = dataset.copy()

In [None]:
# Replacing zero values with NaN
# In these columns a stored 0 is treated as a missing-value placeholder, so it
# is converted to NaN before imputation.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
zero_as_missing_columns = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
dataset_new[zero_as_missing_columns] = dataset_new[zero_as_missing_columns].replace(0, np.nan)

In [None]:
# Count of NaN
# Per-column number of NaN entries in dataset_new.
dataset_new.isnull().sum()

In [None]:
# Fill the NaNs of each listed column with that column's smallest observed value.
# The result is assigned back to the frame instead of calling
# fillna(inplace = True) on the dataset_new[column] selection: that chained
# in-place form triggers a FutureWarning on recent pandas and does not update
# dataset_new at all once copy-on-write is enabled.
# NOTE(review): minimum-value imputation pulls these features towards their
# lower bound; median or mean is the more common choice - confirm it is intended.
for column in ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]:
    dataset_new[column] = dataset_new[column].fillna(dataset_new[column].min())

In [None]:
# Summary statistics of dataset_new, one row per column.
dataset_new.describe().T

In [None]:
# Feature scaling using MinMaxScaler
# Every column of dataset_new is rescaled into the [0, 1] range.
# NOTE(review): the scaler is fitted on the whole frame (target column
# included) before the train/test split, so test rows influence the scaling.
# Consider fitting on the training rows only.
from sklearn.preprocessing import MinMaxScaler

sc = MinMaxScaler(feature_range=(0, 1))
sc.fit(dataset_new)
dataset_scaled = sc.transform(dataset_new)

In [None]:
# The scaler returns a bare NumPy array. Rebuild a DataFrame and restore the
# column labels of dataset_new so the scaled columns stay identifiable by name
# instead of only by 0..n-1 positions. Positional .iloc access is unaffected.
dataset_scaled = pd.DataFrame(dataset_scaled, columns = dataset_new.columns)

In [None]:
# Selecting features - [Glucose, SkinThickness, Insulin, BMI, Age]
# Columns are picked by position in the scaled frame.
feature_positions = [1, 3, 4, 5, 7]
# Position of the target column (presumably 'Outcome' - verify against the CSV header).
target_position = 8

X = dataset_scaled.iloc[:, feature_positions].to_numpy()
Y = dataset_scaled.iloc[:, target_position].to_numpy()

In [None]:
# Splitting X and Y
# 80/20 train/test split with a fixed seed, stratified on the 'Outcome' column
# of dataset_new.
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=42,
    stratify=dataset_new['Outcome'],
)

In [None]:
# Checking dimensions
# Print the shape of each of the four arrays produced by the split.
for label, split in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("Y_train shape:", Y_train),
    ("Y_test shape:", Y_test),
):
    print(label, split.shape)

# Data Modelling

In [None]:
# Logistic Regression Algorithm
# Fit a logistic-regression classifier on the training split; random_state is
# pinned so repeated runs use the same seed.
from sklearn.linear_model import LogisticRegression
logistic_regression = LogisticRegression(random_state = 42)
logistic_regression.fit(X_train, Y_train)

In [None]:
# Plotting a graph for n_neighbors
# Fit a k-nearest-neighbours classifier for every n_neighbors from 1 to 30 and
# plot the test-set accuracy obtained for each value.
# NOTE(review): the best n_neighbors is chosen on the test split here, so the
# reported maximum is an optimistic estimate; a validation split or
# cross-validation would be the stricter choice.
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier

X_axis = list(range(1, 31))

# Scores are collected in a plain list: Series.append was removed in pandas 2.0,
# and growing a Series one element at a time is quadratic anyway.
accuracy_per_k = []
for i in X_axis:
    knn_model = KNeighborsClassifier(n_neighbors = i)
    knn_model.fit(X_train, Y_train)
    prediction = knn_model.predict(X_test)
    # accuracy_score takes (y_true, y_pred), in that order.
    accuracy_per_k.append(metrics.accuracy_score(Y_test, prediction))

# `acc` stays a Series, now indexed by the n_neighbors value behind each score.
acc = pd.Series(accuracy_per_k, index = X_axis, dtype = float)

plt.plot(X_axis, acc)
plt.xticks(X_axis)
# The swept hyper-parameter is n_neighbors (the labels used to say n_estimators).
plt.title("Finding best value for n_neighbors")
plt.xlabel("n_neighbors")
plt.ylabel("Accuracy")
plt.grid()
plt.show()
print('Highest value: ',acc.values.max())
print('Best n_neighbors: ', acc.idxmax())

In [None]:
# Support Vector Classifier Algorithm
# Fit a support vector classifier with a linear kernel on the training split.
from sklearn.svm import SVC
svc = SVC(kernel = 'linear', random_state = 42)
svc.fit(X_train, Y_train)

In [None]:
# Naive Bayes Algorithm
# Fit a Gaussian naive Bayes classifier (default settings) on the training split.
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(X_train, Y_train)

In [None]:
# Decision tree Algorithm
# Fit a decision tree that uses the entropy split criterion; random_state is
# pinned so repeated runs use the same seed.
from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier(criterion = 'entropy', random_state = 42)
decision_tree.fit(X_train, Y_train)

In [None]:
# Random forest Algorithm
# Fit a random forest of 11 trees using the entropy split criterion;
# random_state is pinned so repeated runs use the same seed.
from sklearn.ensemble import RandomForestClassifier
random_forest = RandomForestClassifier(n_estimators = 11, criterion = 'entropy', random_state = 42)
random_forest.fit(X_train, Y_train)

In [None]:
# Making predictions on test dataset
# Each fitted classifier predicts on X_test; the results are unpacked in the
# same order as the models are listed.
(
    Y_pred_logistic_regression,
    Y_pred_svc,
    Y_pred_nb,
    Y_pred_decision_tree,
    Y_pred_random_forest,
) = [
    fitted_model.predict(X_test)
    for fitted_model in (logistic_regression, svc, nb, decision_tree, random_forest)
]

# Model Evaluation

In [None]:
# Evaluating using accuracy_score metric
# Score every model's test predictions against Y_test; the results are unpacked
# in the same order as the prediction arrays are listed.
from sklearn.metrics import accuracy_score

(
    accuracy_logistic_regression,
    accuracy_svc,
    accuracy_nb,
    accuracy_decision_tree,
    accuracy_random_forest,
) = [
    accuracy_score(Y_test, predicted)
    for predicted in (
        Y_pred_logistic_regression,
        Y_pred_svc,
        Y_pred_nb,
        Y_pred_decision_tree,
        Y_pred_random_forest,
    )
]

In [None]:
# Accuracy on test set
# Report each model's accuracy as a percentage, one line per model.
for label, accuracy in (
    ("Logistic Regression: ", accuracy_logistic_regression),
    ("Support Vector Classifier: ", accuracy_svc),
    ("Naive Bayes: ", accuracy_nb),
    ("Decision tree: ", accuracy_decision_tree),
    ("Random Forest: ", accuracy_random_forest),
):
    print(label + str(accuracy * 100))

In [None]:
# Confusion matrix
# Built from the random forest's test-set predictions; the bare `cm` on the
# last line displays the matrix as the cell output.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(Y_test, Y_pred_random_forest)
cm

In [None]:
# Heatmap of Confusion matrix
# fmt="d" prints the integer counts verbatim; seaborn's default ".2g" format
# would render any count of 100 or more in scientific notation (e.g. 1e+02).
ax = seaborn.heatmap(pd.DataFrame(cm), annot=True, fmt="d")
# confusion_matrix(y_true, y_pred) puts true labels on rows, predictions on columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
ax.set_title("Random Forest confusion matrix")
plt.show()

In [None]:
# Classification report
# Text report comparing the random forest's test-set predictions with Y_test.
from sklearn.metrics import classification_report
print(classification_report(Y_test, Y_pred_random_forest))

# Save the Trained Random Forest Model

In [None]:
import pickle

# Serialize the fitted random forest to model.pkl in the working directory.
# The context manager guarantees the file is flushed and closed even if
# pickling fails; the previous bare open() call never closed its handle.
# NOTE: only ever pickle.load() this file from a trusted location, since
# unpickling can execute arbitrary code.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(random_forest, model_file)