In [3]:
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from joblib import dump, load

In [10]:
# Machine-specific absolute location of the diabetes CSV; edit this one
# constant when running the notebook on another machine.
DATA_PATH = 'C:\\Users\\Abhinav\\Desktop\\Fiverr\\Classification\\diabetes.csv'
data = pd.read_csv(DATA_PATH)

In [11]:
# Treat a value of 0 in these measurement columns as "missing" by turning it
# into NaN, so the null counts and the later median imputation can see the gaps.
# np.nan is used rather than the np.NaN alias, which NumPy 2.0 removed.
zero_as_missing_cols = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
data[zero_as_missing_cols] = data[zero_as_missing_cols].replace(0, np.nan)

In [12]:
# Number of missing entries in each column
data.isna().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [13]:
#function to impute the missing values with median based on Outcome class
def impute_median(data, var):
  """Fill missing values of column `var` with the median of the row's Outcome class.

  Parameters
  ----------
  data : pandas.DataFrame
      Frame containing the column `var` and an 'Outcome' column. It is
      modified in place.
  var : str
      Name of the column whose missing values are filled.

  Returns
  -------
  pandas.DataFrame
      The same `data` object, returned so calls can be written as
      ``data = impute_median(data, col)``.

  Notes
  -----
  * Works for any set of Outcome labels, not only 0 and 1. A class that has
    no observed value for `var` keeps its NaNs instead of raising KeyError.
  * The fill value is derived from the target column, so the imputed feature
    carries label information; it cannot be reproduced for rows whose
    Outcome is unknown.
  """
  # Per-row median of `var` within the row's Outcome group (NaNs are skipped
  # when the median is computed); the result is aligned to data's index.
  class_medians = data.groupby('Outcome')[var].transform('median')
  # Only the missing cells are replaced; observed values are left untouched.
  data[var] = data[var].fillna(class_medians)
  return data

# Fill the gaps in each measurement column with its per-class median
for column_name in ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI'):
    data = impute_median(data, column_name)

# Separate the predictors (x) from the target (y) and remember the column order
x = data.drop(columns='Outcome')
y = data['Outcome']
columns = x.columns

In [17]:
# Standardise every predictor column with a StandardScaler.
# NOTE(review): the scaler is fitted on all rows of x, i.e. before the
# train/test split, so the held-out rows influence the scaling statistics.
scaler = StandardScaler().fit(x)
X = scaler.transform(x)

# Wrap the scaled array back into a DataFrame carrying the original column names
features = pd.DataFrame(data=X, columns=columns)

In [22]:
# Persist the fitted scaler next to the notebook so it can be reloaded for scoring
dump(scaler, filename='scaler.joblib')

['scaler.joblib']

In [18]:
# One seed shared by the split and the forest, so that Restart & Run All
# reproduces the same model and the same classification report.
SEED = 42

#split data into training and test sets (80% train / 20% test)
x_train, x_test, y_train, y_test = train_test_split(features, y, test_size = 0.2, random_state = SEED)

#define the model; random_state pins the bootstrap samples and feature draws
model = RandomForestClassifier(n_estimators=300, bootstrap = True, max_features = 'sqrt', random_state = SEED)

#fit model to training data
model.fit(x_train, y_train)

#predict on test data
y_pred = model.predict(x_test)

#evaluate performance
# NOTE(review): imputation and scaling were both computed on the full data set
# before this split, so these held-out metrics are likely optimistic.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.88      0.90        99
           1       0.80      0.85      0.82        55

    accuracy                           0.87       154
   macro avg       0.86      0.87      0.86       154
weighted avg       0.87      0.87      0.87       154



In [21]:
# Persist the trained random forest so it can be reloaded for scoring
dump(model, filename='model.joblib')

['model.joblib']

In [23]:


# Column order of the scaled feature frame the model was trained on
feat_cols = features.columns

# Hand-entered measurements for the single person to be scored
pregnancies = 2
glucose = 13
bloodpressure = 30
skinthickness = 4
insulin = 5
bmi = 5
dpf = 0.55
age = 34

# One record, with values listed in the same order as feat_cols
row = [
    pregnancies,
    glucose,
    bloodpressure,
    skinthickness,
    insulin,
    bmi,
    dpf,
    age,
]


In [24]:
# Reload the persisted scaler and model, replacing the in-memory objects.
# NOTE: joblib files are pickle-based; only load artifacts from a trusted source.
scaler = load(filename='scaler.joblib')
model = load(filename='model.joblib')

In [25]:
# Display the feature column names, in the order `row` is paired with them
feat_cols

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [26]:


# Build a one-row frame from the sample, scale it with the reloaded scaler,
# and re-wrap the result with the feature column names.
# NOTE(review): `X` and `features` are overwritten here; they previously held
# the scaled training data, so earlier cells cannot be re-run after this one
# without re-running the scaling cell first.
df = pd.DataFrame(data=[row], columns=feat_cols)
X = scaler.transform(df)
features = pd.DataFrame(data=X, columns=feat_cols)



In [27]:


# Predicted class for the scaled sample.
# NOTE(review): the truth test below assumes `features` holds exactly one row.
predicted_class = model.predict(features)
if predicted_class == 0:
    print("This is a healthy person!")
else:
    print("This person has high chances of having diabetics!")



This is a healthy person!
