In [47]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

In [48]:
data = pd.read_csv('..\datasets\diabetes.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [49]:
data['Outcome'].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [50]:
data.shape

(768, 9)

In [51]:
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [52]:
data.corr()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
Pregnancies,1.0,0.129459,0.141282,-0.081672,-0.073535,0.017683,-0.033523,0.544341,0.221898
Glucose,0.129459,1.0,0.15259,0.057328,0.331357,0.221071,0.137337,0.263514,0.466581
BloodPressure,0.141282,0.15259,1.0,0.207371,0.088933,0.281805,0.041265,0.239528,0.065068
SkinThickness,-0.081672,0.057328,0.207371,1.0,0.436783,0.392573,0.183928,-0.11397,0.074752
Insulin,-0.073535,0.331357,0.088933,0.436783,1.0,0.197859,0.185071,-0.042163,0.130548
BMI,0.017683,0.221071,0.281805,0.392573,0.197859,1.0,0.140647,0.036242,0.292695
DiabetesPedigreeFunction,-0.033523,0.137337,0.041265,0.183928,0.185071,0.140647,1.0,0.033561,0.173844
Age,0.544341,0.263514,0.239528,-0.11397,-0.042163,0.036242,0.033561,1.0,0.238356
Outcome,0.221898,0.466581,0.065068,0.074752,0.130548,0.292695,0.173844,0.238356,1.0


In [53]:
''' import seaborn as sns
 plt.figure(figsize=(10,10))
 sns.heatmap(data.corr(), annot = True)
plt.show()'''

In [54]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [55]:
!pip install imblearn



In [56]:
from imblearn.over_sampling import SMOTE, RandomOverSampler

# Initialize SMOTE
smote = SMOTE(random_state=1)

# Apply SMOTE to the training data
X_resampled_smote, y_resampled_smote = smote.fit_resample(data[data.columns.to_list()[:-1]], data['Outcome'])

# Initialize RandomOverSampler
ros = RandomOverSampler(random_state=1)

# Apply Random Over-Sampling to the SMOTE-resampled data
X_resampled_final, y_resampled_final = ros.fit_resample(X_resampled_smote, y_resampled_smote)

X, y = X_resampled_final, y_resampled_final

In [57]:
y.value_counts()

Outcome
1    500
0    500
Name: count, dtype: int64

In [58]:
X.shape, y.shape

((1000, 8), (1000,))

In [59]:
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [60]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 10)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 8), (200, 8), (800,), (200,))

In [61]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [62]:
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(X_train_scaled)
explained_variance_ratio = pca.explained_variance_ratio_
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)
cumulative_variance_ratio

array([0.26229183, 0.48295783, 0.61357391, 0.7247507 , 0.81566759,
       0.90359023, 0.95746794, 1.        ])

In [63]:
from sklearn.decomposition import PCA
pca = PCA(n_components=8)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [64]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(criterion='gini', n_estimators=20)
model.fit(X_train_pca, y_train)
y_pred = model.predict(X_test_pca)

In [65]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_pred=y_pred, y_true=y_test)*100)

75.0


In [67]:
import pickle
pickle.dump(model, open("../models/diabetes-model.pkl",'wb'))
pickle.dump(scaler, open("../scalar/diabetes-scaler.pkl",'wb'))
pickle.dump(pca, open("../pca/diabetes-pca.pkl",'wb'))

In [1]:
inputs = ['Pregnancies','Glucose','BloodPressure',
        	'SkinThickness',	'Insulin',	'BMI',
                	'DiabetesPedigreeFunction',	'Age']
output = ['outcome']