<a href="https://colab.research.google.com/github/Aravindh-dasari/chronicdisease/blob/diabetes/diabetes_Model_Code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import numpy as np
import pandas as pd
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

import joblib

In [2]:
df = pd.read_csv(r"/content/drive/MyDrive/Chronic_Project/diabetes.csv")

In [3]:
df['Glucose'].replace(0, np.nan, inplace=True)
df['BloodPressure'].replace(0, np.nan, inplace=True)
df['SkinThickness'].replace(0, np.nan, inplace=True)
df['Insulin'].replace(0, np.nan, inplace=True)
df['BMI'].replace(0, np.nan, inplace=True)
df.drop(['SkinThickness' ,'Insulin'], axis=1, inplace=True)
df = df.dropna(axis=0)

cols = ['Pregnancies', 'Glucose', 'BloodPressure', 'BMI',
       'DiabetesPedigreeFunction', 'Age', 'Outcome']

df_outcome_1 = df[df['Outcome'] == 1].copy()
i = len(df_outcome_1)
df_outcome_0 = df[df['Outcome'] == 0].sample(i, random_state=1)
df_balanced = df_outcome_0.append(df_outcome_1)
df_balanced = df[cols]

In [9]:
X = df_balanced.drop('Outcome', axis=1)
y = df_balanced['Outcome']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2
                                                    , stratify=y, random_state=1)
print('Shape training set: X:{}, y:{}'.format(X_train.shape, y_train.shape))
print('Shape test set: X:{}, y:{}'.format(X_test.shape, y_test.shape))

Shape training set: X:(579, 6), y:(579,)
Shape test set: X:(145, 6), y:(145,)


In [21]:
models = {
    "                   Logistic Regression": LogisticRegression(),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(),
    "Support Vector Machine (Linear Kernel)": LinearSVC(max_iter=1500),
    "   Support Vector Machine (RBF Kernel)": SVC(max_iter=1500),
    "                        Neural Network": MLPClassifier(max_iter=2500),
    "                         Random Forest": RandomForestClassifier(),
    "                     Gradient Boosting": GradientBoostingClassifier()
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")
print('--------------------------------')

# model = ensemble.RandomForestClassifier()
# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# print('Accuracy : {}'.format(accuracy_score(y_test, y_pred)))

# clf_report = classification_report(y_test, y_pred)
# print('Classification report')
# print("---------------------")
# print(clf_report)
# print("_____________________")

# joblib.dump(model,r"/content/drive/MyDrive/Chrnoic_Models/diabetes_model.pkl")

                   Logistic Regression trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.




                        Neural Network trained.
                         Random Forest trained.
                     Gradient Boosting trained.
--------------------------------


In [22]:
for name, model in models.items():
  print(name + ": {:.2f}%".format(model.score(X_test, y_test) * 100))
print("------------------------------")

                   Logistic Regression: 72.41%
                   K-Nearest Neighbors: 68.28%
                         Decision Tree: 71.03%
Support Vector Machine (Linear Kernel): 40.69%
   Support Vector Machine (RBF Kernel): 75.17%
                        Neural Network: 70.34%
                         Random Forest: 75.17%
                     Gradient Boosting: 77.24%
------------------------------


In [23]:
svm = models["   Support Vector Machine (RBF Kernel)"]

In [24]:
pred = svm.predict(X_test)

cm = confusion_matrix(y_test,pred)
print(cm)

print("Accuracy:", accuracy_score(y_test,pred))
print(classification_report(y_test,pred))

[[85 10]
 [26 24]]
Accuracy: 0.7517241379310344
              precision    recall  f1-score   support

           0       0.77      0.89      0.83        95
           1       0.71      0.48      0.57        50

    accuracy                           0.75       145
   macro avg       0.74      0.69      0.70       145
weighted avg       0.75      0.75      0.74       145



In [25]:
joblib.dump(model,r"/content/drive/MyDrive/Chrnoic_Models/diabetes_model.pkl")

['/content/drive/MyDrive/Chrnoic_Models/diabetes_model.pkl']