In [17]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, f1_score, precision_score, recall_score, roc_auc_score, roc_curve, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import cudf
from cuml.neighbors import KNeighborsClassifier as cuKNN
from cuml.svm import SVC as cuSVC
from cuml.ensemble import RandomForestClassifier as cuRF

In [2]:
df = pd.read_csv("https://github.com/YBIFoundation/Dataset/raw/main/Diabetes.csv")
df.head()

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age,diabetes
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pregnancies  768 non-null    int64  
 1   glucose      768 non-null    int64  
 2   diastolic    768 non-null    int64  
 3   triceps      768 non-null    int64  
 4   insulin      768 non-null    int64  
 5   bmi          768 non-null    float64
 6   dpf          768 non-null    float64
 7   age          768 non-null    int64  
 8   diabetes     768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
df.shape

(768, 9)

In [5]:
df.describe()

Unnamed: 0,pregnancies,glucose,diastolic,triceps,insulin,bmi,dpf,age,diabetes
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [6]:
df.columns

Index(['pregnancies', 'glucose', 'diastolic', 'triceps', 'insulin', 'bmi',
       'dpf', 'age', 'diabetes'],
      dtype='object')

In [10]:
df['diabetes'].value_counts()

Unnamed: 0_level_0,count
diabetes,Unnamed: 1_level_1
0,500
1,268


In [7]:
X = df.drop("diabetes", axis = 1)
y = df['diabetes']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 420)

In [9]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((614, 8), (154, 8), (614,), (154,))

# Scaling the data

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Trying differnet Classification models

## Logisitic Regression on CPU

In [34]:
model = LogisticRegression()
model.fit(X_train_scaled,y_train)
y_pred = model.predict(X_test_scaled)

In [35]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_pred))

Confusion Matrix:
 [[88 11]
 [28 27]]

Classification Report:
               precision    recall  f1-score   support

           0       0.76      0.89      0.82        99
           1       0.71      0.49      0.58        55

    accuracy                           0.75       154
   macro avg       0.73      0.69      0.70       154
weighted avg       0.74      0.75      0.73       154

Precision: 0.7105263157894737
Recall: 0.4909090909090909
F1 Score: 0.5806451612903226
ROC-AUC: 0.6898989898989898


In [28]:
model.intercept_

array([-8.10937057])

In [29]:
model.coef_

array([[ 0.09982045,  0.03259172, -0.01572359,  0.00515704, -0.00124251,
         0.09216341,  0.61585825,  0.02192193]])

## KNN on GPU

In [36]:
import torch
torch.cuda.is_available()

True

In [18]:
X_train_pd = pd.DataFrame(X_train_scaled, columns=X.columns)
X_test_pd = pd.DataFrame(X_test_scaled, columns=X.columns)

y_train_pd = pd.Series(y_train)

# Convert pandas to cuDF
X_train_cu = cudf.DataFrame.from_pandas(X_train_pd)
X_test_cu = cudf.DataFrame.from_pandas(X_test_pd)
y_train_cu = cudf.Series(y_train_pd)

# Train KNN on GPU
knn = cuKNN(n_neighbors=1, weights="uniform")
knn.fit(X_train_cu, y_train_cu)

# Predict and convert back to pandas
y_pred_cu = knn.predict(X_test_cu).to_pandas()

In [19]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_cu))
print("\nClassification Report:\n", classification_report(y_test, y_pred_cu))

print("Precision:", precision_score(y_test, y_pred_cu))
print("Recall:", recall_score(y_test, y_pred_cu))
print("F1 Score:", f1_score(y_test, y_pred_cu))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_cu))

Confusion Matrix:
 [[83 16]
 [31 24]]

Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.84      0.78        99
           1       0.60      0.44      0.51        55

    accuracy                           0.69       154
   macro avg       0.66      0.64      0.64       154
weighted avg       0.68      0.69      0.68       154

Precision: 0.6
Recall: 0.43636363636363634
F1 Score: 0.5052631578947369
ROC-AUC: 0.6373737373737374


## SVM on GPU

In [20]:
svm = cuSVC(kernel='rbf', C=1.0)
svm.fit(X_train_cu, y_train_cu)
y_pred_svm = svm.predict(X_test_cu).to_pandas()

In [21]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))

print("Precision:", precision_score(y_test, y_pred_svm))
print("Recall:", recall_score(y_test, y_pred_svm))
print("F1 Score:", f1_score(y_test, y_pred_svm))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_svm))

Confusion Matrix:
 [[90  9]
 [30 25]]

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.91      0.82        99
           1       0.74      0.45      0.56        55

    accuracy                           0.75       154
   macro avg       0.74      0.68      0.69       154
weighted avg       0.74      0.75      0.73       154

Precision: 0.7352941176470589
Recall: 0.45454545454545453
F1 Score: 0.5617977528089888
ROC-AUC: 0.6818181818181819


## Decision Tree on GPU

In [22]:
# For Decision Tree we will use n_estimators=1
dt = cuRF(n_estimators=1, max_depth=10, random_state=42)
dt.fit(X_train_cu, y_train_cu)

y_pred_dt = dt.predict(X_test_cu).to_pandas()


In [23]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))
print("\nClassification Report:\n", classification_report(y_test, y_pred_dt))

print("Precision:", precision_score(y_test, y_pred_dt))
print("Recall:", recall_score(y_test, y_pred_dt))
print("F1 Score:", f1_score(y_test, y_pred_dt))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_dt))

Confusion Matrix:
 [[79 20]
 [28 27]]

Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.80      0.77        99
           1       0.57      0.49      0.53        55

    accuracy                           0.69       154
   macro avg       0.66      0.64      0.65       154
weighted avg       0.68      0.69      0.68       154

Precision: 0.574468085106383
Recall: 0.4909090909090909
F1 Score: 0.5294117647058824
ROC-AUC: 0.6444444444444446


## Random Forest on GPU

In [24]:
rf = cuRF(n_estimators=100, max_depth=10, random_state=42)
rf.fit(X_train_cu, y_train_cu)
y_pred_rf = rf.predict(X_test_cu).to_pandas()

In [25]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))

print("Precision:", precision_score(y_test, y_pred_rf))
print("Recall:", recall_score(y_test, y_pred_rf))
print("F1 Score:", f1_score(y_test, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_rf))

Confusion Matrix:
 [[88 11]
 [27 28]]

Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.89      0.82        99
           1       0.72      0.51      0.60        55

    accuracy                           0.75       154
   macro avg       0.74      0.70      0.71       154
weighted avg       0.75      0.75      0.74       154

Precision: 0.717948717948718
Recall: 0.509090909090909
F1 Score: 0.5957446808510638
ROC-AUC: 0.6989898989898989


## XGBoost on GPU

In [166]:
model = xgb.XGBClassifier(
    n_estimators=40,
    max_depth=4,
    learning_rate=0.01,
    scale_pos_weight=(len(y_train) - sum(y_train)) / sum(y_train),
    tree_method='gpu_hist',
    random_state=42
)

model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled)
y_proba = model.predict_proba(X_test_scaled)[:, 1]



    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


In [167]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

Confusion Matrix:
 [[71 28]
 [11 44]]

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.72      0.78        99
           1       0.61      0.80      0.69        55

    accuracy                           0.75       154
   macro avg       0.74      0.76      0.74       154
weighted avg       0.77      0.75      0.75       154

Precision: 0.6111111111111112
Recall: 0.8
F1 Score: 0.6929133858267716
ROC-AUC: 0.8229568411386593


# Now I ill use smote to solve the imbalance in the dataset and try again

In [72]:
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_res_scaled, y_res = smote.fit_resample(X_train_scaled, y_train)

## KNN with smote

In [69]:
# Convert NumPy arrays to pandas DataFrame
X_train_pd = pd.DataFrame(X_res_scaled, columns=X.columns)
X_test_pd = pd.DataFrame(X_test_scaled, columns=X.columns)

y_train_pd = pd.Series(y_res)

# Convert pandas to cuDF
X_train_cu = cudf.DataFrame.from_pandas(X_train_pd)
X_test_cu = cudf.DataFrame.from_pandas(X_test_pd)
y_train_cu = cudf.Series(y_train_pd)

# Train KNN on GPU
knn = cuKNN(n_neighbors=1, weights="uniform")
knn.fit(X_train_cu, y_train_cu)

# Predict and convert back to pandas
y_pred_cu = knn.predict(X_test_cu).to_pandas()

In [70]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_cu))
print("\nClassification Report:\n", classification_report(y_test, y_pred_cu))

print("Precision:", precision_score(y_test, y_pred_cu))
print("Recall:", recall_score(y_test, y_pred_cu))
print("F1 Score:", f1_score(y_test, y_pred_cu))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_cu))

Confusion Matrix:
 [[79 20]
 [28 27]]

Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.80      0.77        99
           1       0.57      0.49      0.53        55

    accuracy                           0.69       154
   macro avg       0.66      0.64      0.65       154
weighted avg       0.68      0.69      0.68       154

Precision: 0.574468085106383
Recall: 0.4909090909090909
F1 Score: 0.5294117647058824
ROC-AUC: 0.6444444444444446


## SVM

In [74]:
svm = cuSVC(kernel='rbf', C=1.0)
svm.fit(X_train_cu, y_train_cu)
y_pred_svm = svm.predict(X_test_cu).to_pandas()

In [75]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))

print("Precision:", precision_score(y_test, y_pred_svm))
print("Recall:", recall_score(y_test, y_pred_svm))
print("F1 Score:", f1_score(y_test, y_pred_svm))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_svm))

Confusion Matrix:
 [[81 18]
 [18 37]]

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.82      0.82        99
           1       0.67      0.67      0.67        55

    accuracy                           0.77       154
   macro avg       0.75      0.75      0.75       154
weighted avg       0.77      0.77      0.77       154

Precision: 0.6727272727272727
Recall: 0.6727272727272727
F1 Score: 0.6727272727272727
ROC-AUC: 0.7454545454545455


## Decision Tree

In [76]:
# For Decision Tree we will use n_estimators=1
dt = cuRF(n_estimators=1, max_depth=10, random_state=42)
dt.fit(X_train_cu, y_train_cu)

y_pred_dt = dt.predict(X_test_cu).to_pandas()


In [77]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_dt))
print("\nClassification Report:\n", classification_report(y_test, y_pred_dt))

print("Precision:", precision_score(y_test, y_pred_dt))
print("Recall:", recall_score(y_test, y_pred_dt))
print("F1 Score:", f1_score(y_test, y_pred_dt))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_dt))

Confusion Matrix:
 [[67 32]
 [23 32]]

Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.68      0.71        99
           1       0.50      0.58      0.54        55

    accuracy                           0.64       154
   macro avg       0.62      0.63      0.62       154
weighted avg       0.66      0.64      0.65       154

Precision: 0.5
Recall: 0.5818181818181818
F1 Score: 0.5378151260504201
ROC-AUC: 0.6292929292929292


## Random Forest

In [79]:
rf = cuRF(n_estimators=100, max_depth=10, random_state=42)
rf.fit(X_train_cu, y_train_cu)
y_pred_rf = rf.predict(X_test_cu).to_pandas()

In [80]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))

print("Precision:", precision_score(y_test, y_pred_rf))
print("Recall:", recall_score(y_test, y_pred_rf))
print("F1 Score:", f1_score(y_test, y_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, y_pred_rf))

Confusion Matrix:
 [[83 16]
 [17 38]]

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.84      0.83        99
           1       0.70      0.69      0.70        55

    accuracy                           0.79       154
   macro avg       0.77      0.76      0.77       154
weighted avg       0.78      0.79      0.79       154

Precision: 0.7037037037037037
Recall: 0.6909090909090909
F1 Score: 0.6972477064220184
ROC-AUC: 0.7646464646464647


## XGBoost

In [186]:
model = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=4,
    learning_rate=0.01,
    tree_method='gpu_hist',
    random_state=42
)

model.fit(X_res_scaled, y_res)

y_pred = model.predict(X_test_scaled)
y_proba = model.predict_proba(X_test_scaled)[:, 1]


    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


In [187]:
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))

Confusion Matrix:
 [[81 18]
 [15 40]]

Classification Report:
               precision    recall  f1-score   support

           0       0.84      0.82      0.83        99
           1       0.69      0.73      0.71        55

    accuracy                           0.79       154
   macro avg       0.77      0.77      0.77       154
weighted avg       0.79      0.79      0.79       154

Precision: 0.6896551724137931
Recall: 0.7272727272727273
F1 Score: 0.7079646017699115
ROC-AUC: 0.8604224058769514


# Conclusion as we see our best model was the XGBoost with F1 score of 0.707
