# Experiments on Diabetes dataset:
*  Pima diabetes dataset.


In [None]:
# Necessary packages
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

from sklearn.impute import KNNImputer
from sklearn.model_selection import cross_validate

from sklearn.metrics import recall_score, accuracy_score
from sklearn.preprocessing import StandardScaler


In [None]:
# Load the diabetes CSV into the working DataFrame `df`.
# NOTE(review): absolute path that looks like a Colab Google Drive mount —
# it will only resolve in an environment where that location exists.
csv_path = "/content/drive/MyDrive/PhD UCO 062022/step by step/Datasets 12062022/diabetes.csv"
df = pd.read_csv(csv_path)

# Data Cleaning



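*   Some medical measurements are recorded as zero; these zeros stand for missing values
*   Find them and mark them as NaN so they can be imputed (e.g. with nearest-neighbour imputation)
*   Then compare the ways of handling them: mean imputation, MissForest, dropping rows/columns, and KNN imputation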



In [None]:
# (rows, columns) of the loaded data
df.shape

(768, 9)

In [None]:
# Preview the first rows of the data as loaded
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
#   Columns in which a zero denotes a missing value: "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"

# Handling Missing Values

In [None]:
# Replace the zeroes with NaN.
# In these measurement columns a zero stands for a missing value.
#
# The result is assigned back through `df[...] = ...` instead of calling
# `df[col].replace(..., inplace=True)`: an in-place call on a column obtained
# via `df[col]` acts on an intermediate object and does not update `df` under
# pandas copy-on-write. `np.nan` is used because the `np.NaN` alias was
# removed in NumPy 2.0.
zero_as_missing_cols = ["Insulin", "BMI", "Glucose", "BloodPressure", "SkinThickness"]
df[zero_as_missing_cols] = df[zero_as_missing_cols].replace(0, np.nan)

In [None]:
# Preview again: zeros in the measurement columns should now appear as NaN
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [None]:
# Summary statistics per column; `count` is the number of non-missing values
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,763.0,733.0,541.0,394.0,757.0,768.0,768.0,768.0
mean,3.845052,121.686763,72.405184,29.15342,155.548223,32.457464,0.471876,33.240885,0.348958
std,3.369578,30.535641,12.382158,10.476982,118.775855,6.924988,0.331329,11.760232,0.476951
min,0.0,44.0,24.0,7.0,14.0,18.2,0.078,21.0,0.0
25%,1.0,99.0,64.0,22.0,76.25,27.5,0.24375,24.0,0.0
50%,3.0,117.0,72.0,29.0,125.0,32.3,0.3725,29.0,0.0
75%,6.0,141.0,80.0,36.0,190.0,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [None]:
# Check missing values: number of NaN entries in each column
df.isnull().sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

# Fill NaN with the mean, as the features with missing data are numerical

In [None]:
# method 1
# fillna returns a new frame with each NaN replaced by its column mean.
# The result is only displayed here, not assigned, so `df` is left unchanged.
df.fillna(df.mean())

In [None]:
# method 2
# Fill every affected column of `df` with that column's own mean.
#
# The filled columns are assigned back with `df[...] = ...` rather than
# `df[col].fillna(..., inplace=True)`: an in-place call on a column obtained
# via `df[col]` acts on an intermediate object and does not update `df` under
# pandas copy-on-write (and emits a FutureWarning on pandas 2.x).
mean_fill_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
df[mean_fill_cols] = df[mean_fill_cols].fillna(df[mean_fill_cols].mean())
# NOTE(review): from here on `df` contains no NaN in these columns, so any
# later cell that expects missing values in `df` (other imputers, dropna)
# needs a freshly loaded frame — confirm the intended execution order.

In [None]:
# Features are all columns but the last; the target is the last column.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
# method 3
# Mean imputation with scikit-learn: learn the column means from X and
# replace the NaN entries in one step. X is rebound to the transformed array.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imputer.fit_transform(X)

In [None]:
# Random Forest
# 3-fold cross-validation of a Random Forest on the mean-imputed features,
# collecting every metric listed in `scoring`.
# random_state is fixed so the reported scores are reproducible; without it
# every run grows a different forest and the table below changes.
scoring = ['accuracy', 'precision', 'recall', 'roc_auc', 'balanced_accuracy', 'f1']
score_rf = cross_validate(
    RandomForestClassifier(random_state=42), X, y, scoring=scoring, cv=3
)

In [None]:
# Evaluate the built RF
from tabulate import tabulate

# Average each metric over the CV folds (read from score_rf["test_<metric>"])
# and round to 3 decimals, then pair metric names with their averages.
scoring_val = [round(np.mean(score_rf["test_" + metric]), 3) for metric in scoring]
eval_rf = [[metric, mean_score] for metric, mean_score in zip(scoring, scoring_val)]
print(tabulate(eval_rf, headers=["metric", "score of standard RF using mean imputation"], tablefmt='fancy_grid'))

╒═══════════════════╤══════════════════════════════════════════════╕
│ metric            │   score of standard RF using mean imputation │
╞═══════════════════╪══════════════════════════════════════════════╡
│ accuracy          │                                        0.753 │
├───────────────────┼──────────────────────────────────────────────┤
│ precision         │                                        0.671 │
├───────────────────┼──────────────────────────────────────────────┤
│ recall            │                                        0.582 │
├───────────────────┼──────────────────────────────────────────────┤
│ roc_auc           │                                        0.822 │
├───────────────────┼──────────────────────────────────────────────┤
│ balanced_accuracy │                                        0.713 │
├───────────────────┼──────────────────────────────────────────────┤
│ f1                │                                        0.619 │
╘═══════════════════╧═════════════

# Miss Forest imputation

In [None]:
pip install missingpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting missingpy
  Downloading missingpy-0.2.0-py3-none-any.whl (49 kB)
[?25l[K     |██████▊                         | 10 kB 32.5 MB/s eta 0:00:01[K     |█████████████▍                  | 20 kB 36.1 MB/s eta 0:00:01[K     |████████████████████            | 30 kB 25.6 MB/s eta 0:00:01[K     |██████████████████████████▊     | 40 kB 23.3 MB/s eta 0:00:01[K     |████████████████████████████████| 49 kB 6.0 MB/s 
[?25hInstalling collected packages: missingpy
Successfully installed missingpy-0.2.0


In [None]:
pip install sklearn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn: filename=sklearn-0.0-py2.py3-none-any.whl size=1310 sha256=4f6f05888e083a836e2f27ecbbb1e2ea629f18a72eb5e8a75d11e305b8b937fb
  Stored in directory: /root/.cache/pip/wheels/46/ef/c3/157e41f5ee1372d1be90b09f74f82b10e391eaacca8f22d33e
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0


In [None]:
# Register the private module sklearn.neighbors._base under the legacy name
# 'sklearn.neighbors.base', so that an import of the old path resolves to it.
# NOTE(review): presumably required by missingpy, which is imported in the
# following cell — confirm. This must run before that import.
import sys
import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

In [None]:

from missingpy import MissForest

# Impute the feature matrix with MissForest.
# random_state is fixed because the imputer is forest-based; without a seed
# the imputed values (and every score derived from them) vary between runs.
# NOTE(review): at this point X has already been passed through
# SimpleImputer.transform (mean imputation), so it may contain no NaN left for
# MissForest to fill — confirm that X should be rebuilt from un-imputed data.
imputer = MissForest(random_state=42)
X_imputed = imputer.fit_transform(X)

In [None]:
# Random Forest +  MissForest
# 3-fold cross-validation of a Random Forest on the MissForest output
# (X_imputed), collecting every metric listed in `scoring`.
# random_state is fixed so the reported scores are reproducible across runs.
scoring = ['accuracy', 'precision', 'recall', 'roc_auc', 'balanced_accuracy', 'f1']
score_rf = cross_validate(
    RandomForestClassifier(random_state=42), X_imputed, y, scoring=scoring, cv=3
)

In [None]:
# Evaluate the built RF
from tabulate import tabulate

# Average each metric over the CV folds (read from score_rf["test_<metric>"])
# and round to 3 decimals, then pair metric names with their averages.
scoring_val = [round(np.mean(score_rf["test_" + metric]), 3) for metric in scoring]
eval_rf = [[metric, mean_score] for metric, mean_score in zip(scoring, scoring_val)]
print(tabulate(eval_rf, headers=["metric", "score of standard RF using Miss Forest imputation"], tablefmt='fancy_grid'))

╒═══════════════════╤═════════════════════════════════════════════════════╕
│ metric            │   score of standard RF using Miss Forest imputation │
╞═══════════════════╪═════════════════════════════════════════════════════╡
│ accuracy          │                                               0.758 │
├───────────────────┼─────────────────────────────────────────────────────┤
│ precision         │                                               0.685 │
├───────────────────┼─────────────────────────────────────────────────────┤
│ recall            │                                               0.593 │
├───────────────────┼─────────────────────────────────────────────────────┤
│ roc_auc           │                                               0.832 │
├───────────────────┼─────────────────────────────────────────────────────┤
│ balanced_accuracy │                                               0.719 │
├───────────────────┼─────────────────────────────────────────────────────┤
│ f1        

# Drop Missing values

In [None]:
# Drop missing values

# Remove, in place, every row of `df` that contains at least one NaN
# (axis=0 selects rows, which is also the default for dropna).
# NOTE(review): if the in-place mean fill earlier in the notebook has already
# run on this same `df`, there are no NaN left and this drops nothing —
# confirm that `df` is reloaded before this section.
df.dropna(axis=0, inplace=True)


In [None]:
# Features are all columns but the last; the target is the last column.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
# Random Forest on the data with incomplete rows removed.
# 3-fold cross-validation, collecting every metric listed in `scoring`.
# random_state is fixed so the reported scores are reproducible across runs.
scoring = ['accuracy', 'precision', 'recall', 'roc_auc', 'balanced_accuracy', 'f1']
score_rf = cross_validate(
    RandomForestClassifier(random_state=42), X, y, scoring=scoring, cv=3
)

In [None]:
# Evaluate the built RF
from tabulate import tabulate

# Average each metric over the CV folds (read from score_rf["test_<metric>"])
# and round to 3 decimals, then pair metric names with their averages.
scoring_val = [round(np.mean(score_rf["test_" + metric]), 3) for metric in scoring]
eval_rf = [[metric, mean_score] for metric, mean_score in zip(scoring, scoring_val)]
print(tabulate(eval_rf, headers=["metric", "score of standard RF with dropping rows containing missing data"], tablefmt='fancy_grid'))

╒═══════════════════╤═══════════════════════════════════════════════════════════════════╕
│ metric            │   score of standard RF with dropping rows containing missing data │
╞═══════════════════╪═══════════════════════════════════════════════════════════════════╡
│ accuracy          │                                                             0.799 │
├───────────────────┼───────────────────────────────────────────────────────────────────┤
│ precision         │                                                             0.751 │
├───────────────────┼───────────────────────────────────────────────────────────────────┤
│ recall            │                                                             0.601 │
├───────────────────┼───────────────────────────────────────────────────────────────────┤
│ roc_auc           │                                                             0.86  │
├───────────────────┼───────────────────────────────────────────────────────────────────┤
│ balanced

In [None]:
# Drop, in place, every column of `df` that contains at least one NaN (axis=1).
# NOTE(review): the row drop earlier in this section already removed, in place,
# every row containing a NaN from this same `df`, so no column can still hold
# one and this call removes nothing. Confirm that the column-drop experiment
# should start from a freshly loaded frame.
df.dropna(axis=1, inplace=True)

In [None]:
# Train the RF on the data where we removed columns containing missing values.
# X and y are rebuilt from the current `df` in this cell: the split still in
# scope was taken before the column drop, so reusing it would score the
# previous experiment's feature set again.
# random_state is fixed so the reported scores are reproducible across runs.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
scoring = ['accuracy', 'precision', 'recall', 'roc_auc', 'balanced_accuracy', 'f1']
score_rf = cross_validate(
    RandomForestClassifier(random_state=42), X, y, scoring=scoring, cv=3
)

In [None]:
# Evaluate the built RF
from tabulate import tabulate

# Average each metric over the CV folds (read from score_rf["test_<metric>"])
# and round to 3 decimals, then pair metric names with their averages.
scoring_val = [round(np.mean(score_rf["test_" + metric]), 3) for metric in scoring]
eval_rf = [[metric, mean_score] for metric, mean_score in zip(scoring, scoring_val)]
print(tabulate(eval_rf, headers=["metric", "score of standard RF with dropping columns containing missing data"], tablefmt='fancy_grid'))

╒═══════════════════╤══════════════════════════════════════════════════════════════════════╕
│ metric            │   score of standard RF with dropping columns containing missing data │
╞═══════════════════╪══════════════════════════════════════════════════════════════════════╡
│ accuracy          │                                                                0.793 │
├───────────────────┼──────────────────────────────────────────────────────────────────────┤
│ precision         │                                                                0.741 │
├───────────────────┼──────────────────────────────────────────────────────────────────────┤
│ recall            │                                                                0.593 │
├───────────────────┼──────────────────────────────────────────────────────────────────────┤
│ roc_auc           │                                                                0.863 │
├───────────────────┼─────────────────────────────────────────────────

In [None]:
# Features are all columns but the last; the target is the last column.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

**Handling missing values using KNNImputer from sklearn**

In [None]:
from sklearn.impute import KNNImputer

# define imputer: each missing entry is estimated from the 5 nearest rows,
# weighted equally, with distances computed by the NaN-aware Euclidean metric
imputer = KNNImputer(
    n_neighbors=5,
    weights='uniform',
    metric='nan_euclidean',
)

In [None]:
# print total missing
# np.asarray (rather than X.values) keeps this cell re-runnable: after the
# first run X is already an ndarray, which has no `.values` attribute, so the
# original form raised AttributeError on a second execution.
X = np.asarray(X)
# Count the NaN entries with NumPy's own reduction instead of the Python
# builtin sum over a flattened array.
print('Missing: %d' % np.isnan(X).sum())

Missing: 652


In [None]:
# fit on the dataset (the cell output is the fitted imputer's repr)
imputer.fit(X)
# the transform itself is applied in a later cell


KNNImputer()

In [None]:
# transform the dataset: apply the fitted KNN imputer to X; the imputed
# array is kept separately in Xtrans so X still holds the un-imputed values
Xtrans = imputer.transform(X)

In [None]:
# Number of NaN entries remaining after imputation
remaining_missing = np.count_nonzero(np.isnan(Xtrans))
print('Missing: %d' % remaining_missing)

Missing: 0


In [None]:

# Random Forest
# 3-fold cross-validation of a Random Forest on the KNN-imputed features
# (Xtrans), collecting every metric listed in `scoring`.
# random_state is fixed so the reported scores are reproducible across runs.
scoring = ['accuracy', 'precision', 'recall', 'roc_auc', 'balanced_accuracy', 'f1']
score_rf = cross_validate(
    RandomForestClassifier(random_state=42), Xtrans, y, scoring=scoring, cv=3
)

In [None]:
# Evaluate Random Forest with cv=3
from tabulate import tabulate

# Average each metric over the CV folds (read from score_rf["test_<metric>"])
# and round to 3 decimals, then pair metric names with their averages.
scoring_val = [round(np.mean(score_rf["test_" + metric]), 3) for metric in scoring]
eval_rf = [[metric, mean_score] for metric, mean_score in zip(scoring, scoring_val)]
print(tabulate(eval_rf, headers=["metric", "score of standard RF based on KNN inputer"], tablefmt='fancy_grid'))

╒═══════════════════╤═════════════════════════════════════════════╕
│ metric            │   score of standard RF based on KNN inputer │
╞═══════════════════╪═════════════════════════════════════════════╡
│ accuracy          │                                       0.755 │
├───────────────────┼─────────────────────────────────────────────┤
│ precision         │                                       0.678 │
├───────────────────┼─────────────────────────────────────────────┤
│ recall            │                                       0.593 │
├───────────────────┼─────────────────────────────────────────────┤
│ roc_auc           │                                       0.826 │
├───────────────────┼─────────────────────────────────────────────┤
│ balanced_accuracy │                                       0.717 │
├───────────────────┼─────────────────────────────────────────────┤
│ f1                │                                       0.627 │
╘═══════════════════╧═══════════════════════════