Federated XGBoost based on FedAvg - as proposed by McMahan

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
file = 'RELATHE.csv'
df_main = pd.read_csv(file)
df_main = df_main.sample(frac = 1)
df_main.shape

(1427, 4323)

In [3]:
df_main.head

<bound method NDFrame.head of       V1  V2  V3  V4  V5  V6  V7  V8  V9  V10  ...  V4314  V4315  V4316  \
33     0   0   0   0   0   0   0   0   0    0  ...      0      0      0   
1005   0   0   0   0   0   0   0   0   0    0  ...      0      0      0   
853    0   0   0   0   0   0   0   0   0    0  ...      0      0      0   
368    0   0   1   0   0   0   0   0   0    0  ...      0      0      0   
1288   0   0   0   0   0   0   0   0   0    0  ...      0      0      0   
...   ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ...    ...    ...    ...   
310    0   0   0   0   0   0   0   0   0    0  ...      0      0      0   
425    0   0   0   0   0   0   0   0   0    0  ...      0      0      0   
1148   0   0   0   0   0   0   0   0   0    0  ...      0      0      0   
256    0   0   0   0   0   0   0   0   0    0  ...      0      0      0   
218    0   0   0   0   0   0   0   0   0    0  ...      0      0      0   

      V4317  V4318  V4319  V4320  V4321  V4322  TARGET  
33        0 

In [4]:
#DATA DIVISION

df_1 = df_main.iloc[:300, :]
x_1 = df_1.drop('TARGET', axis = 1)
y_1 = df_1['TARGET']
df_2 = df_main.iloc[300:600, :]
x_2 = df_2.drop('TARGET', axis = 1)
y_2 = df_2['TARGET']
df_3 = df_main.iloc[600:900, :]
x_3 = df_3.drop('TARGET', axis = 1)
y_3 = df_3['TARGET']
df_4 = df_main.iloc[900:1200, :]
x_4 = df_4.drop('TARGET', axis = 1)
y_4 = df_4['TARGET']
df_5 = df_main.iloc[1200:, :]
x_5 = df_5.drop('TARGET', axis = 1)
y_5 = df_5['TARGET']
df_list = [df_1, df_2, df_3, df_4, df_5]
x_list = [x_1, x_2, x_3, x_4, x_5]

In [5]:
global_model = pickle.load(open("xgb_clf.dat", "rb"))

In [6]:
class Server():
    def __init__(self, df, model):
        self.df = df
        self.model = model
    def update_weights():
        return

server = Server(df_main, global_model)

In [7]:
class Client():
    def __init__(self, cli, df, model):
        self.cli = cli
        self.df = df
        self.model = model
    def generate_weights(self, model):
        df = self.df
        X = df.drop('TARGET', axis = 1)
        y = df['TARGET']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
        eval_set = [(X_test, y_test)]
        model.fit(X_train, 
                y_train, 
                early_stopping_rounds=15,                  
                eval_set=eval_set,
                verbose = False)
        results = model.evals_result()
        return model, sum(results['validation_0']['logloss'])/len(results['validation_0']['logloss'])
    def inference(self, model):    #return the acc, loss of given model on local dataset
        df = self.df
        X = df.drop('TARGET', axis = 1)
        y = df['TARGET']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
        y_pred = model.predict(X_test)
        return accuracy_score(y_test, y_pred), 

In [8]:
def update_model(global_model, local_weights):    #change the weights in the model itself
    # local weights is also a model
    global_new_model = global_model
    return global_new_model

In [9]:
cl1 = Client(1, df_1, global_model)
cl2 = Client(2, df_2, global_model)
cl3 = Client(3, df_3, global_model)
cl4 = Client(4, df_4, global_model)
cl5 = Client(5, df_5, global_model)

cli_list = [cl1, cl2, cl3, cl4, cl5]

In [18]:
c = 1    #Fraction of Clients to be used
global_training_rounds = 4
train_loss = []
train_accuracy = []
for i in range(global_training_rounds):
    m = max(c*5, 1)
    st = np.random.choice(5, m, replace = False)    #random set of m clients
    local_w = []
    local_loss = []
    print('Global Training Round: ', end = "")
    print(i+1)
    for k in st:
        print('Training | Client index: ', end = "")
        print(k+1)
        w, loss = cli_list[k].generate_weights(global_model)
        local_w.append(w)
        local_loss.append(loss)
        #w(t+1)k = clientupdate(k, wt)
        #w(t+1) = sum((nk/n) * w(t+1)k)
    global_model = update_model(global_model, local_w)
    train_loss.append(sum(local_loss)/len(local_loss))
    # Calculate avg training accuracy over all users at every epoch
    list_acc = []
    for j in range(5):
        acc = cli_list[j].inference(global_model)
        list_acc.append(sum(acc))
    train_accuracy.append(sum(list_acc)/len(list_acc))
    print("Training loss for Global Training Round: %2d is : %2.4f" % (i+1, train_loss[i]))
    print("Training accuracy for Global Training Round: %2d is : %2.4f" % (i+1, train_accuracy[i]))

Global Training Round: 1
Training | Client index: 2




Training | Client index: 3




Training | Client index: 1




Training | Client index: 4




Training | Client index: 5




Training loss for Global Training Round:  1 is : 0.5218
Training accuracy for Global Training Round:  1 is : 0.7394
Global Training Round: 2
Training | Client index: 5




Training | Client index: 1




Training | Client index: 4




Training | Client index: 2




Training | Client index: 3




Training loss for Global Training Round:  2 is : 0.5218
Training accuracy for Global Training Round:  2 is : 0.8606
Global Training Round: 3
Training | Client index: 3




Training | Client index: 5




Training | Client index: 2




Training | Client index: 4




Training | Client index: 1




Training loss for Global Training Round:  3 is : 0.5218
Training accuracy for Global Training Round:  3 is : 0.7742
Global Training Round: 4
Training | Client index: 1




Training | Client index: 4




Training | Client index: 3




Training | Client index: 2




Training | Client index: 5




Training loss for Global Training Round:  4 is : 0.5218
Training accuracy for Global Training Round:  4 is : 0.7394


In [11]:
#Find test accuracy and loss

In [12]:
trees = global_model.get_booster().trees_to_dataframe()

In [13]:
trees

Unnamed: 0,Tree,Node,ID,Feature,Split,Yes,No,Missing,Gain,Cover,Category
0,0,0,0-0,V288,1.0,0-1,0-2,0-1,17.150059,35.750000,
1,0,1,0-1,V1714,1.0,0-3,0-4,0-3,6.059588,28.750000,
2,0,2,0-2,V3167,2.0,0-5,0-6,0-5,2.571428,7.000000,
3,0,3,0-3,V287,1.0,0-7,0-8,0-7,5.368132,27.000000,
4,0,4,0-4,Leaf,,,,,-0.127273,1.750000,
...,...,...,...,...,...,...,...,...,...,...,...
1050,92,6,92-6,Leaf,,,,,0.049298,1.513802,
1051,92,7,92-7,V3740,2.0,92-9,92-10,92-9,0.455433,11.856306,
1052,92,8,92-8,Leaf,,,,,0.040641,1.025197,
1053,92,9,92-9,Leaf,,,,,-0.028623,9.931289,


In [14]:
trees.loc[trees['Feature'] == 'Leaf']["Gain"]

4      -0.127273
5      -0.171429
6      -0.000000
8      -0.100000
9       0.058333
          ...   
1048    0.015782
1050    0.049298
1052    0.040641
1053   -0.028623
1054    0.017634
Name: Gain, Length: 574, dtype: float64

In [15]:
#Possible Loophole: Using full batch instead of splitting in client_update [fedavg, mcmahan]

In [16]:
dump_list = global_model.get_booster().get_dump()
len(dump_list)

93