In [1]:
# Import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset
pima = pd.read_csv("./cl2.csv")
print(pima.head())

   Unnamed: 0  Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin  \
0         192            7      159             66              0        0   
1         193           11      135              0              0        0   
2         194            8       85             55             20        0   
3         195            5      158             84             41      210   
4         196            1      105             58              0        0   

    BMI  DiabetesPedigreeFunction  Age  Outcome  
0  30.4                     0.383   36        1  
1  52.3                     0.578   40        1  
2  24.4                     0.136   42        0  
3  39.4                     0.395   29        1  
4  24.3                     0.187   21        0  


In [2]:
# Split the data into features (X) and target variable (y)
X = pima.drop(columns='Outcome')
y = pima['Outcome']

In [3]:
# Number of negative and positive cases in the data
num_obs = len(pima)
negative = len(pima.loc[pima['Outcome'] == 0])
positive = len(pima.loc[pima['Outcome'] == 1])
print("Number of negative cases:  {0} ({1:2.2f}%)".format(negative, ((1.00 * negative)/(1.0 * num_obs)) * 100))
print("Number of positve cases:  {0} ({1:2.2f}%)".format(positive, ((1.00 * positive)/(1.0 * num_obs)) * 100))

Number of negative cases:  377 (65.45%)
Number of positve cases:  199 (34.55%)


In [3]:
# Split xscale
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state = 0) 

In [5]:
# Number of each case in the data training and testing
print("Original negative : {0} ({1:0.2f}%)".format(len(pima.loc[pima['Outcome'] == 0]), (len(pima.loc[pima['Outcome'] == 0])/len(pima.index)) * 100.0))
print("Original positive : {0} ({1:0.2f}%)".format(len(pima.loc[pima['Outcome'] == 1]), (len(pima.loc[pima['Outcome'] == 1])/len(pima.index)) * 100.0))
print("")
print("Training negative : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 0]), (len(y_train[y_train[:] == 0])/len(y_train) * 100.0)))
print("Training positive : {0} ({1:0.2f}%)".format(len(y_train[y_train[:] == 1]), (len(y_train[y_train[:] == 1])/len(y_train) * 100.0)))
print("")
print("Test negative     : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 0]), (len(y_test[y_test[:] == 0])/len(y_test) * 100.0)))
print("Test positive     : {0} ({1:0.2f}%)".format(len(y_test[y_test[:] == 1]), (len(y_test[y_test[:] == 1])/len(y_test) * 100.0)))

Original negative : 377 (65.45%)
Original positive : 199 (34.55%)

Training negative : 293 (63.70%)
Training positive : 167 (36.30%)

Test negative     : 84 (72.41%)
Test positive     : 32 (27.59%)


In [6]:
print(X_train.shape,X_test.shape)

(460, 9) (116, 9)


In [7]:
import argparse
import warnings
from typing import Union
from logging import INFO
from datasets import Dataset, DatasetDict
import xgboost as xgb
import numpy as np
import flwr as fl
from flwr_datasets import FederatedDataset
from flwr.common.logger import log
from flwr.common import (
    Code,
    EvaluateIns,
    EvaluateRes,
    FitIns,
    FitRes,
    GetParametersIns,
    GetParametersRes,
    Parameters,
    Status,
)
from flwr_datasets.partitioner import IidPartitioner


warnings.filterwarnings("ignore", category=UserWarning)

# def transform_dataset_to_dmatrix(data: Union[Dataset, DatasetDict]) -> xgb.core.DMatrix:
#     """Transform dataset to DMatrix format for xgboost."""
#     x = data["inputs"]
#     y = data["label"]
#     new_data = xgb.DMatrix(x, label=y)
#     return new_data

# # Train/test splitting
# train_data, valid_data, num_train, num_val = X_train, X_test, y_train, y_test
num_train = 460
num_val = 116


xgb_train = xgb.DMatrix(X_train, y_train, enable_categorical=True)
xgb_test = xgb.DMatrix(X_test, y_test, enable_categorical=True)

# Reformat data to DMatrix for xgboost
log(INFO, "Reformatting data...")
# train_dmatrix = transform_dataset_to_dmatrix(train_data)
# valid_dmatrix = transform_dataset_to_dmatrix(valid_data)
train_dmatrix = xgb_train
valid_dmatrix = xgb_test

from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report


# Hyper-parameters for xgboost training
num_local_round = 5
params = {
    "objective": "binary:logistic",
    "eta": 0.5,  # Learning rate
    "max_depth": 8,
    "eval_metric": "error",
    "nthread": 16,
    "num_parallel_tree": 1,
    "subsample": 1,
    "tree_method": "hist",
}


# Define Flower client
class XgbClient(fl.client.Client):
    def __init__(self):
        self.bst = None
        self.config = None

    def get_parameters(self, ins: GetParametersIns) -> GetParametersRes:
        _ = (self, ins)
        return GetParametersRes(
            status=Status(
                code=Code.OK,
                message="OK",
            ),
            parameters=Parameters(tensor_type="", tensors=[]),
        )

    def _local_boost(self):
        # Update trees based on local training data.
        for i in range(num_local_round):
            self.bst.update(train_dmatrix, self.bst.num_boosted_rounds())

        # Extract the last N=num_local_round trees for sever aggregation
        bst = self.bst[
            self.bst.num_boosted_rounds()
            - num_local_round : self.bst.num_boosted_rounds()
        ]

        return bst

    def fit(self, ins: FitIns) -> FitRes:
        if not self.bst:
            # First round local training
            log(INFO, "Start training at round 1")
            bst = xgb.train(
                params,
                train_dmatrix,
                num_boost_round=num_local_round,
                evals=[(valid_dmatrix, "validate"), (train_dmatrix, "train")],
            )
            self.config = bst.save_config()
            self.bst = bst
        else:
            for item in ins.parameters.tensors:
                global_model = bytearray(item)

            # Load global model into booster
            self.bst.load_model(global_model)
            self.bst.load_config(self.config)

            bst = self._local_boost()

        local_model = bst.save_raw("json")
        local_model_bytes = bytes(local_model)

        return FitRes(
            status=Status(
                code=Code.OK,
                message="OK",
            ),
            parameters=Parameters(tensor_type="", tensors=[local_model_bytes]),
            num_examples=num_train,
            metrics={},
        )

    def evaluate(self, ins: EvaluateIns) -> EvaluateRes:
        eval_results = self.bst.eval_set(
            evals=[(valid_dmatrix, "valid")],
            iteration=self.bst.num_boosted_rounds() - 1,
        )
        auc = round(float(eval_results.split("\t")[1].split(":")[1]), 4)
        
        preds = self.bst.predict(valid_dmatrix)
        y_pred = np.multiply(preds,100)
        y_pred = y_pred.astype(int)
        a = [1 if i >= 50 else 0 for i in y_pred]
        print("Confusion Matrix: \n", confusion_matrix(y_test, a))
        print(metrics.classification_report(y_test,a))
        

        return EvaluateRes(
            status=Status(
                code=Code.OK,
                message="OK",
            ),
            loss=0.0,
            num_examples=num_val,
            metrics={"error": auc},
        )


# Start Flower client
fl.client.start_client(server_address="127.0.0.1:8080", client=XgbClient().to_client())

# Confusion Matrix: 
#  [[69 15]
#  [ 7 25]]
#               precision    recall  f1-score   support

#            0       0.91      0.82      0.86        84
#            1       0.62      0.78      0.69        32

#     accuracy                           0.81       116
#    macro avg       0.77      0.80      0.78       116
# weighted avg       0.83      0.81      0.82       116

# 78

INFO flwr 2024-05-13 18:08:51,185 | 1497266632.py:44 | Reformatting data...
INFO flwr 2024-05-13 18:08:51,188 | grpc.py:52 | Opened insecure gRPC connection (no certificates were passed)
DEBUG flwr 2024-05-13 18:08:51,202 | connection.py:55 | ChannelConnectivity.IDLE
DEBUG flwr 2024-05-13 18:08:51,205 | connection.py:55 | ChannelConnectivity.READY
INFO flwr 2024-05-13 18:08:51,208 | 1497266632.py:100 | Start training at round 1


[0]	validate-error:0.22414	train-error:0.13043
[1]	validate-error:0.25000	train-error:0.08478
[2]	validate-error:0.18103	train-error:0.05217
[3]	validate-error:0.16379	train-error:0.03261
[4]	validate-error:0.17241	train-error:0.02174
Confusion Matrix: 
 [[73 11]
 [ 9 23]]
              precision    recall  f1-score   support

           0       0.89      0.87      0.88        84
           1       0.68      0.72      0.70        32

    accuracy                           0.83       116
   macro avg       0.78      0.79      0.79       116
weighted avg       0.83      0.83      0.83       116

Confusion Matrix: 
 [[66 18]
 [ 8 24]]
              precision    recall  f1-score   support

           0       0.89      0.79      0.84        84
           1       0.57      0.75      0.65        32

    accuracy                           0.78       116
   macro avg       0.73      0.77      0.74       116
weighted avg       0.80      0.78      0.78       116

Confusion Matrix: 
 [[67 17]
 [ 8

DEBUG flwr 2024-05-13 18:08:52,155 | connection.py:220 | gRPC channel closed
INFO flwr 2024-05-13 18:08:52,155 | app.py:398 | Disconnect and shut down


Confusion Matrix: 
 [[66 18]
 [13 19]]
              precision    recall  f1-score   support

           0       0.84      0.79      0.81        84
           1       0.51      0.59      0.55        32

    accuracy                           0.73       116
   macro avg       0.67      0.69      0.68       116
weighted avg       0.75      0.73      0.74       116



In [9]:
from xgboost import XGBClassifier 

xgb = XGBClassifier(objective = 'binary:logistic')

xgb.fit(X_train, y_train)
print()
y_pred = xgb.predict(X_test)

print("Confusion Matrix: \n", confusion_matrix(y_test, y_pred))
print(metrics.classification_report(y_test,y_pred))


Confusion Matrix: 
 [[62 22]
 [10 22]]
              precision    recall  f1-score   support

           0       0.86      0.74      0.79        84
           1       0.50      0.69      0.58        32

    accuracy                           0.72       116
   macro avg       0.68      0.71      0.69       116
weighted avg       0.76      0.72      0.74       116

