In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import warnings
warnings.filterwarnings('ignore')


### Read data

In [3]:
df  = pd.read_csv('../data/diabetes.csv')

### Check missing values

In [4]:
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

### Check for information

In [5]:
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


### Lets us model the data

#### Scale the data

In [6]:
# scale the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df.drop('Outcome', axis=1))
scaled_features = scaler.transform(df.drop('Outcome', axis=1))
df_feat = pd.DataFrame(scaled_features, columns=df.columns[:-1])
df_feat['Outcome'] = df['Outcome']


#### Split the data

In [7]:
# split the data into train and test data
from sklearn.model_selection import train_test_split
train , test = train_test_split(df_feat, test_size = 0.4)


#### Create a model using random forest and xgboost and compare the results

In [8]:
from sklearn.metrics import classification_report, confusion_matrix
# import randomforest
from sklearn.ensemble import RandomForestClassifier
# import xgboost
from xgboost import XGBClassifier
# import lightgbm


In [9]:
# model of random forest
model_randomforest = RandomForestClassifier(n_estimators=100)
model_randomforest.fit(train.drop(columns=['Outcome']), train.Outcome)

# model of xgboost
model_xgboost = XGBClassifier(eval_metric='auc')
model_xgboost.fit(train.drop(columns=['Outcome']), train.Outcome)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              eval_metric='auc', gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

##### Score of RandomForest

In [10]:
print(classification_report(test.Outcome, model_randomforest.predict(test.drop(columns=['Outcome']))))

              precision    recall  f1-score   support

           0       0.80      0.85      0.83       202
           1       0.68      0.60      0.64       106

    accuracy                           0.77       308
   macro avg       0.74      0.73      0.73       308
weighted avg       0.76      0.77      0.76       308



##### Score of XGBoost

In [11]:
print(classification_report(test.Outcome, model_xgboost.predict(test.drop(columns=['Outcome']))))

              precision    recall  f1-score   support

           0       0.80      0.80      0.80       202
           1       0.62      0.62      0.62       106

    accuracy                           0.74       308
   macro avg       0.71      0.71      0.71       308
weighted avg       0.74      0.74      0.74       308



### Save the model

In [13]:
# save the model_xgboost as pickle file for future use
import pickle
pickle.dump(model_xgboost, open('../model/model_xgboost.pkl', 'wb'))