Federated XGBoost based on FedAvg - as proposed by McMahan

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [26]:
file = 'RELATHE.csv'
df_main = pd.read_csv(file)
df_main = df_main.sample(frac = 1)
df_main.shape

(1427, 4323)

In [27]:
df_main.head

<bound method NDFrame.head of      V1  V2  V3  V4  V5  V6  V7  V8  V9  V10  ...  V4314  V4315  V4316  V4317  \
425   0   0   0   0   0   0   0   0   0    0  ...      0      0      0      0   
27    0   0   0   0   0   0   0   0   0    0  ...      0      0      0      0   
51    0   0   0   0   0   0   0   0   0    0  ...      0      0      0      0   
813   0   0   0   0   0   0   0   0   0    0  ...      0      0      0      0   
371   0   0   0   0   0   0   0   0   0    0  ...      0      0      0      0   
..   ..  ..  ..  ..  ..  ..  ..  ..  ..  ...  ...    ...    ...    ...    ...   
983   0   0   0   0   0   0   0   0   0    0  ...      0      0      0      0   
736   0   0   0   0   0   0   0   0   0    0  ...      0      0      0      0   
726   0   0   0   0   0   0   0   0   0    0  ...      0      0      0      0   
134   0   0   0   0   0   0   0   0   0    0  ...      0      0      0      0   
515   0   0   0   0   0   0   0   0   0    0  ...      0      0      0      0  

In [28]:
# DATA DIVISION

df_1 = df_main.iloc[:300, :]
df_2 = df_main.iloc[300:600, :]
df_3 = df_main.iloc[600:900, :]
df_4 = df_main.iloc[900:1200, :]
df_5 = df_main.iloc[1200:, :]

In [29]:
global_model = pickle.load(open("xgb_clf.dat", "rb"))

In [53]:
class Client():
    def __init__(self, cli, df, model):
        self.cli = cli
        self.df = df
        self.model = model
    def generate_weights(self):
        df = self.df
        model = self.model
        X = df.drop('TARGET', axis = 1)
        y = df['TARGET']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
        eval_set = [(X_test, y_test)]
        model.fit(X_train, 
                y_train, 
                early_stopping_rounds=15,                  
                eval_set=eval_set,
                verbose = False)
        results = model.evals_result()
        return model.get_booster().trees_to_dataframe(), sum(results['validation_0']['logloss'])/len(results['validation_0']['logloss'])
    def inference(self, model):    #return the acc, loss of given model on local dataset
        df = self.df
        self.model = model
        X = df.drop('TARGET', axis = 1)
        y = df['TARGET']
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
        y_pred = model.predict(X_test)
        return accuracy_score(y_test, y_pred), 

In [54]:
def update_model(global_model, local_weights):    #change the weights in the model itself
    # local weights is a trees to dataframe
    
    global_new_model = global_model
    return global_new_model

In [55]:
cl1 = Client(1, df_1, global_model)
cl2 = Client(2, df_2, global_model)
cl3 = Client(3, df_3, global_model)
cl4 = Client(4, df_4, global_model)
cl5 = Client(5, df_5, global_model)

cli_list = [cl1, cl2, cl3, cl4, cl5]

# DRIVER CODE

In [56]:
c = 1    #Fraction of Clients to be used
global_training_rounds = 1
train_loss = []
train_accuracy = []
for i in range(global_training_rounds):
    m = max(c*5, 1)
    st = np.random.choice(5, m, replace = False)    #random set of m clients
    local_w = []
    local_loss = []
    print('Global Training Round: ', end = "")
    print(i+1)
    for k in st:
        print('Training | Client index: ', end = "")
        print(k+1)
        w, loss = cli_list[k].generate_weights()    #w is a trees to dataframe element
        local_w.append(w)
        local_loss.append(loss)
        #w(t+1)k = clientupdate(k, wt)
        #w(t+1) = sum((nk/n) * w(t+1)k)
    global_model = update_model(global_model, local_w)
    train_loss.append(sum(local_loss)/len(local_loss))
    # Calculate avg training accuracy over all users at every epoch
    list_acc = []
    for j in range(5):
        acc = cli_list[j].inference(global_model)
        list_acc.append(sum(acc))
    train_accuracy.append(sum(list_acc)/len(list_acc))
    print("Training loss for Global Training Round: %2d is : %2.4f" % (i+1, train_loss[i]))
    print("Training accuracy for Global Training Round: %2d is : %2.4f" % (i+1, train_accuracy[i]))

Global Training Round: 1
Training | Client index: 1




1
Training | Client index: 2




2
Training | Client index: 4




3
Training | Client index: 3




4
Training | Client index: 5




5
Training loss for Global Training Round:  1 is : 0.5411
Training accuracy for Global Training Round:  1 is : 0.7284


In [11]:
#Find test accuracy and loss

In [14]:
trees.loc[trees['Feature'] == 'Leaf']["Gain"]

4      -0.127273
5      -0.171429
6      -0.000000
8      -0.100000
9       0.058333
          ...   
1048    0.015782
1050    0.049298
1052    0.040641
1053   -0.028623
1054    0.017634
Name: Gain, Length: 574, dtype: float64

In [61]:
for i in range(5):
    print(local_w[i])

      Tree  Node     ID Feature  Split    Yes     No Missing       Gain  \
0        0     0    0-0    V288    1.0    0-1    0-2     0-1  22.504303   
1        0     1    0-1   V2070    1.0    0-3    0-4     0-3   7.828083   
2        0     2    0-2   V2969    1.0    0-5    0-6     0-5   2.211075   
3        0     3    0-3   V1594    1.0    0-7    0-8     0-7   5.604492   
4        0     4    0-4   V3817    1.0    0-9   0-10     0-9   2.103593   
...    ...   ...    ...     ...    ...    ...    ...     ...        ...   
1169   107     4  107-4    Leaf    NaN    NaN    NaN     NaN  -0.045068   
1170   107     5  107-5   V2411    2.0  107-7  107-8   107-7   0.342338   
1171   107     6  107-6    Leaf    NaN    NaN    NaN     NaN  -0.013421   
1172   107     7  107-7    Leaf    NaN    NaN    NaN     NaN   0.040409   
1173   107     8  107-8    Leaf    NaN    NaN    NaN     NaN  -0.010202   

          Cover  Category  
0     35.750000       NaN  
1     28.500000       NaN  
2      7.250000

In [64]:
trees = local_w
trees

[      Tree  Node     ID Feature  Split    Yes     No Missing       Gain  \
 0        0     0    0-0    V288    1.0    0-1    0-2     0-1  22.504303   
 1        0     1    0-1   V2070    1.0    0-3    0-4     0-3   7.828083   
 2        0     2    0-2   V2969    1.0    0-5    0-6     0-5   2.211075   
 3        0     3    0-3   V1594    1.0    0-7    0-8     0-7   5.604492   
 4        0     4    0-4   V3817    1.0    0-9   0-10     0-9   2.103593   
 ...    ...   ...    ...     ...    ...    ...    ...     ...        ...   
 1169   107     4  107-4    Leaf    NaN    NaN    NaN     NaN  -0.045068   
 1170   107     5  107-5   V2411    2.0  107-7  107-8   107-7   0.342338   
 1171   107     6  107-6    Leaf    NaN    NaN    NaN     NaN  -0.013421   
 1172   107     7  107-7    Leaf    NaN    NaN    NaN     NaN   0.040409   
 1173   107     8  107-8    Leaf    NaN    NaN    NaN     NaN  -0.010202   
 
           Cover  Category  
 0     35.750000       NaN  
 1     28.500000       NaN  

In [None]:
avg_col = trees[0].loc[trees['Feature'] == 'Leaf']["Gain"]
for i in range(1, 5):
    avg_col += trees[i].loc[trees['Feature'] == 'Leaf']["Gain"]
    trees[i].loc[trees['Feature'] == 'Leaf']["Gain"] =

In [None]:
trees.loc[trees['Feature'] == 'Leaf']["Gain"]

In [15]:
#Possible Loophole: Using full batch instead of splitting in client_update [fedavg, mcmahan]

In [16]:
dump_list = global_model.get_booster().get_dump()
len(dump_list)

93