In [1]:
# Import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the diabetes CSV into a DataFrame and print a preview of its first
# rows. The path is relative to the notebook's working directory.
DATA_PATH = "../diabetes.csv"
pima = pd.read_csv(DATA_PATH)
print(pima.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [2]:
# Separate the label from the predictors: `y` is the 'Outcome' column and
# `X` holds every other column of `pima`.
target_column = 'Outcome'
y = pima[target_column]
X = pima.drop(columns=[target_column])

In [3]:
# Class balance of the full dataset: count the rows for each 'Outcome' value
# and print every count together with its share of all rows.
num_obs = len(pima)
outcome = pima['Outcome']
negative = int((outcome == 0).sum())
positive = int((outcome == 1).sum())
negative_pct = negative / num_obs * 100
positive_pct = positive / num_obs * 100
# NOTE(review): "positve" in the second message is misspelled; it is left
# as-is here so the printed text stays unchanged.
print("Number of negative cases:  {0} ({1:2.2f}%)".format(negative, negative_pct))
print("Number of positve cases:  {0} ({1:2.2f}%)".format(positive, positive_pct))

Number of negative cases:  500 (65.10%)
Number of positve cases:  268 (34.90%)


In [4]:
# Standardise the features with StandardScaler. This is standardisation,
# not min-max scaling.
# NOTE(review): the scaler is fitted on every row of X, i.e. before the
# train/test split done in a later cell, so test rows contribute to the
# scaling statistics. Consider fitting on the training rows only.
from sklearn.preprocessing import StandardScaler
scaler_x = StandardScaler().fit(X)  # fit() returns the estimator itself
print(scaler_x)
xscale = scaler_x.transform(X)

StandardScaler()


In [5]:
# Hold out 20% of the scaled rows for testing; the fixed random_state makes
# the partition repeatable between runs.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    xscale,
    y,
    test_size=0.2,
    random_state=0,
)

In [6]:
# Compare the class distribution of the full dataset with the training and
# test partitions. Each entry is (message template, label series, class
# value); a None entry prints the blank separator line between groups.
distribution_report = [
    ("Original negative : {0} ({1:0.2f}%)", pima['Outcome'], 0),
    ("Original positive : {0} ({1:0.2f}%)", pima['Outcome'], 1),
    None,
    ("Training negative : {0} ({1:0.2f}%)", y_train, 0),
    ("Training positive : {0} ({1:0.2f}%)", y_train, 1),
    None,
    ("Test negative     : {0} ({1:0.2f}%)", y_test, 0),
    ("Test positive     : {0} ({1:0.2f}%)", y_test, 1),
]
for entry in distribution_report:
    if entry is None:
        print("")
        continue
    template, labels, class_value = entry
    # Summing the boolean mask counts the rows that carry this class value.
    class_count = int((labels == class_value).sum())
    print(template.format(class_count, class_count / len(labels) * 100.0))

Original negative : 500 (65.10%)
Original positive : 268 (34.90%)

Training negative : 393 (64.01%)
Training positive : 221 (35.99%)

Test negative     : 107 (69.48%)
Test positive     : 47 (30.52%)


In [7]:
from xgboost import XGBClassifier

# NOTE(review): `A` is not used anywhere in this cell. Its keys ('criterion',
# 'loss') are presumably settings for a different estimator; it is kept only
# in case another cell reads it. Confirm and remove if it is dead.
A = {'criterion': 'friedman_mse', 'loss': 'exponential', 'n_estimators': 25}

# Build the classifier with only the arguments XGBoost recognises. The
# previous call also passed `criterion` and `lass` (a typo), which XGBoost
# reported as "not used"; dropping them removes that warning without changing
# the fitted model.
xgb = XGBClassifier(objective='binary:logistic', n_estimators=40)

# Train on the training partition.
xgb.fit(X_train, y_train)

Parameters: { "criterion", "lass" } are not used.



In [8]:
# Predict labels for the held-out test features with the fitted model;
# `y_pred` is reused by the evaluation cells below.
y_pred = xgb.predict(X_test)

In [9]:
# Accuracy on the training partition (printed first) and on the test
# partition (printed second). A large gap between the two suggests the model
# is overfitting the training rows.
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
train_acc = accuracy_score(y_train, xgb.predict(X_train))
print(train_acc)
# NOTE(review): the original cell had a bare "# 30" comment at this point;
# its meaning is unclear.
xgb_acc = accuracy_score(y_test, y_pred)
print(xgb_acc)

0.998371335504886
0.8571428571428571


In [10]:
# Break the test-set predictions down per class: raw confusion-matrix counts
# first, then the precision / recall / F1 report.
from sklearn.metrics import confusion_matrix, classification_report
test_confusion = confusion_matrix(y_test, y_pred)
print("Confusion Matrix: \n", test_confusion)
test_report = classification_report(y_test, y_pred)
print(test_report)

Confusion Matrix: 
 [[95 12]
 [10 37]]
              precision    recall  f1-score   support

           0       0.90      0.89      0.90       107
           1       0.76      0.79      0.77        47

    accuracy                           0.86       154
   macro avg       0.83      0.84      0.83       154
weighted avg       0.86      0.86      0.86       154

