**This notebook serves to train and save a global model**

Mount Google Drive and authorize access

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# PyDrive has to be installed before the pydrive imports below can succeed.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE: this rebinds `drive` from the google.colab module (used for the mount
# above) to the PyDrive client; later cells call drive.ListFile(...) and
# drive.CreateFile(...) on this client object.
drive = GoogleDrive(gauth)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Common imports and setting up code source paths

In [None]:
import numpy as np # linear algebra
import os
import sys
import time
import gc
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import datetime
####################################
# Turning off the debug options to speed up the execution.
# The three calls below are currently disabled (commented out), so the torch
# autograd debug settings are left at whatever the runtime already uses.
# torch.autograd.set_detect_anomaly(False)
# torch.autograd.profiler.profile(False)
# torch.autograd.profiler.emit_nvtx(False)

# Add the project checkout on the mounted Drive to the import path.
# Presumably this is where Data, Models, WeightTools and federationarguments
# (imported by later cells) live -- verify if the folder is moved.
sys.path.append('/content/drive/MyDrive/AdaVFL-GitHub')

Upload the data first

In [None]:
import tensorflow as tf
from Data import MakeTrainingTimes, CheckLocalPredictionData, GenerateRandomSamples, LocalSequentialDataset, LocalSampledDataset, GlobalSequentialDataset, GlobalSampledDataset, build_adjMatrix, MakeAttackTimes, initialize_ratio_contribution
from federationarguments import arguments

# Build the run configuration, then fill in the dataset-specific settings:
# the training interval and the grid dimensions (x_axis, y_axis).
args = arguments()

# 24*60*60*1000 is one day expressed in milliseconds (timestamps are divided
# by 1000 before being passed to datetime.fromtimestamp further down).
MS_PER_DAY = 24*60*60*1000

if args.dataset == "bikeNYC":
  # bikeNYC: one-day interval on an 8 x 16 grid.
  args.trainingInterval = MS_PER_DAY
  args.x_axis, args.y_axis = 8, 16
elif args.dataset == "Yelp":
  # Yelp: eight-day interval on an 8 x 8 grid.
  args.trainingInterval = 8*MS_PER_DAY
  args.x_axis, args.y_axis = 8, 8
else:
  # Any other dataset name is rejected outright.
  raise SystemError('Invalid data folder')

# Pick the compute device by asking PyTorch directly.
# The previous probe went through TensorFlow (tf.test.gpu_device_name()),
# which initialises TensorFlow's GPU runtime just to answer the question and
# can reserve GPU memory that the PyTorch models below need. It could also
# disagree with what the installed PyTorch build can actually use.
if torch.cuda.is_available():
  # Kept with the same value the TensorFlow probe reported, in case other
  # code still reads `device_name`.
  device_name = '/device:GPU:0'
  print('Found GPU at: {}'.format(device_name))
  args.device_name = 'cuda'
  torch.cuda.set_device(0)
else:
  device_name = ''
  print('No GPU found, running on CPU')
  args.device_name = 'cpu'

# Extract the Drive folder id from the shared link stored in args.dataset_link
# and list the (non-trashed) files inside that folder.
#
# Shared links frequently carry a query string ("...folders/<id>?usp=sharing")
# or a trailing slash. Only the id itself may go into the query, otherwise
# the listing silently matches nothing.
link_prefix, separator, link_tail = args.dataset_link.partition('folders/')
if not separator:
  # Same exception type the original tuple-unpacking produced for this case.
  raise ValueError("dataset_link does not contain 'folders/': %s" % args.dataset_link)
folder = link_tail.split('?')[0].split('/')[0].strip()
if not folder:
  raise ValueError("dataset_link has no folder id after 'folders/': %s" % args.dataset_link)

# PyDrive search query: direct children of the folder that are not trashed.
filePath = "'%s' in parents and trashed=false" %folder
print(filePath)
# `drive` is the PyDrive client created in the first cell.
downloaded = drive.ListFile({'q':filePath}).GetList()

# Containers filled while iterating over the files of the Drive folder.
LocalTrainData = []            # one LocalSequentialDataset per downloaded file
LocalPredictionSamples = {}    # "<x>X<y>" -> LocalSampledDataset
GlobalPredictionSamples = GlobalSampledDataset(args)
GlobalTestData = GlobalSequentialDataset(args,args.testRatioBegin,args.testRatioEnd)
GlobalTrainData = GlobalSequentialDataset(args,args.trainRatioBegin,args.trainRatioEnd)

sampled_list = []

if len(downloaded) == 0:
  raise SystemError('empty data folder')

# The random sample list is generated once, from the first file, and then
# reused for every other file so all local prediction sets share it.
sampledPrediction = True
for file in downloaded:
  try:
    # Fetch the file content into the working directory under its Drive title.
    # A separate name is used so the list being iterated is not rebound.
    remote_file = drive.CreateFile({'id':file['id']})
    remote_file.GetContentFile(file['title'])
    timestamps = pd.read_csv(file['title'])

    # The title is parsed as "<x>X<y>.<extension>" to get the grid cell.
    grid = file['title'].split(".")
    axis = grid[0].split("X")
    x_axis = int(axis[0])
    y_axis = int(axis[1])

    train = LocalSequentialDataset(timestamps,x_axis, y_axis, args.trainRatioBegin,args.trainRatioEnd, args)
    train.make_data()
    LocalTrainData.append(train)

    if sampledPrediction:
      sampled_list = GenerateRandomSamples(timestamps, args)
      sampledPrediction = False

    predictsample = LocalSampledDataset(timestamps, x_axis, y_axis, args)
    predictsample.make_data(sampled_list)
    sample_ID = str(x_axis)+"X"+str(y_axis)
    LocalPredictionSamples[sample_ID] = predictsample
    GlobalTestData.add_data(file['title'], timestamps)
    GlobalTrainData.add_data(file['title'], timestamps)
    GlobalPredictionSamples.add_data(file['title'], predictsample)
  except Exception as e:
    print("hit an exception when making data ",e)
    # Calling exit() inside a Jupyter kernel only requests a kernel shutdown;
    # it does not raise, so the loop would keep running on half-built data
    # and the traceback would be lost. Re-raise with the cause chained.
    raise RuntimeError('hit an exception when making data') from e

# Finalise the global test split and run its check.
GlobalTestData.make_data()
print("--- Checking global test data ---- ")
GlobalTestData.check_data()

# Finalise the global training split and run its check.
# (The label used to read "local test data", which is not what is checked.)
GlobalTrainData.make_data()
print("--- Checking global train data ---- ")
GlobalTrainData.check_data()

print("--- Checking local prediction data ---- ")
CheckLocalPredictionData(LocalPredictionSamples, sampled_list)

print("--- Checking global prediction data ---- ")
GlobalPredictionSamples.make_data()
GlobalPredictionSamples.check_data()
print("len(LocalPredictionSamples)",len(LocalPredictionSamples))

## TODO: need to set the attacker start time and end time
# MakeTrainingTimes is expected to populate args.beginTrainingTimestamp and
# args.endTrainingTimestamp, which are read right below -- confirm in Data.py.
MakeTrainingTimes(LocalTrainData, args)
print("--- making the timestamp training boundaries ---- ")
print("args.beginTrainingTimestamp",args.beginTrainingTimestamp)
print("args.endTrainingTimestamp",args.endTrainingTimestamp)

# Adjacency matrix for the global model, moved to the selected device.
adj = build_adjMatrix(args)
adj = adj.to(args.device_name)

# Number of training-interval steps that fit in the training window
# (both ends counted, hence the +1).
beginningTime = args.beginTrainingTimestamp
endingTime = args.endTrainingTimestamp
iterations = ((endingTime - beginningTime)//args.trainingInterval)+1

# The /1000 converts the stored timestamps to the seconds that
# datetime.fromtimestamp expects.
# NOTE(review): these two attributes are not assigned anywhere in this
# notebook; presumably they are set inside MakeTrainingTimes -- confirm.
print("starting training time", datetime.datetime.fromtimestamp(args.start_training_time/1000).strftime('%Y-%m-%d %H:%M:%S'))
print("ending training time ", datetime.datetime.fromtimestamp(args.start_ending_time/1000).strftime('%Y-%m-%d %H:%M:%S'))

# Release the large intermediates that are no longer needed.
del sampled_list
del timestamps
gc.collect()
print(adj.shape)


**Only when calculating the metric contribution for each dataset:**
* Pre-train a model to use for the metric calculation

In [None]:
from Models import GRU, MyGAT
from tqdm import tqdm
import copy
from Data import build_map,build_adjMatrix
from WeightTools import train_global_weights, test_model
import datetime
import ast 
import torchvision.models as models
import random 

# Pre-train a global model on GlobalTrainData; the next cell uses it to
# measure per-feature contributions.
adj = adj.to(args.device_name)
beginningTime = args.beginTrainingTimestamp
endingTime = args.endTrainingTimestamp

# Fail loudly on an unsupported model. exit() inside a Jupyter kernel only
# requests a shutdown and lets the rest of the cell run, which would have
# continued here with globalModel = None.
if args.global_model != 'GNN':
  raise ValueError('Error: unrecognized global model')
globalModel = MyGAT(args,adj)
globalModel.to(args.device_name)
globalModel.train()

start_time = time.time()
global_training_loss = []

for epoch in tqdm(range(args.pretrainepochs)):
  # The sampling window start moves forward by one interval per epoch. It is
  # kept in a cell-local name: overwriting the notebook-global beginningTime
  # changed the window the later training cell computes from it.
  # NOTE(review): once epoch_begin reaches endingTime, random.randrange
  # raises ValueError (empty range) -- confirm pretrainepochs stays below
  # the number of intervals in the window.
  epoch_begin = args.beginTrainingTimestamp + epoch*args.trainingInterval
  for i in tqdm(range(iterations)):
    # Draw a random interval-aligned timestamp inside the current window.
    timestamp = random.randrange(epoch_begin, endingTime, args.trainingInterval)
    _,global_loss_epoch = train_global_weights(timestamp, globalModel,GlobalTrainData,args, adj)
    print("global_loss_epoch:",global_loss_epoch)
    global_training_loss.append(global_loss_epoch)
    gc.collect()
global_loss_training = sum(global_training_loss) / len(global_training_loss)

### testing the global model:
globalModel.eval()

with torch.no_grad():
  global_test_output, test_acc, test_loss = test_model(globalModel, adj, GlobalTestData,args)
  now = datetime.datetime.now()
  # One result record formatted as a Python-list literal.
  adding_line = '[\'{}\',{},\'{}\',\'{}\',\'{}\',\'{}\',{},{},{},\'{}\',{},{},{},{},{},{},{},{},{}]'.\
      format(args.dataset, args.input_length,args.local_model,args.local_model_loss, args.global_model,args.global_model_loss, args.epochs, args.x_axis, args.y_axis,
              now.strftime("%Y-%m-%d %H:%M:%S"), test_acc["MAE"],test_acc["RMSE"],
              test_acc["AE"],test_acc["WMAPE"],args.scratch_prediction,args.test_normalization,args.train_normalization,
              test_loss,global_loss_training)
  # This loop ran args.pretrainepochs epochs, so report that number rather
  # than args.epochs (which belongs to the main training cell).
  print(f' \n Results after {args.pretrainepochs} pre-training epochs:',adding_line)

**Use the pretrained model to compute the contribution**

In [None]:
from tqdm import tqdm
import copy

from Data import RandomFeature, marginal
from WeightTools import predict_model 

# Per-grid-cell contribution of each feature, measured with the pre-trained
# global model: for every test sequence the cell (i, j) is replaced by random
# values and the prediction is compared with the untouched one; the largest
# marginal value seen over the test set is stored for that cell.
# Cells start at the sentinel -1.0.
feature_contribution_metrics = np.full((args.x_axis,args.y_axis), -1.)

with torch.no_grad():
  # np.ndindex walks the grid row by row, i.e. i outer and j inner.
  for i, j in np.ndindex(args.x_axis, args.y_axis):
    print("processing ",i," and ",j)
    max_marginal = -1.0
    for seq, label in GlobalTestData.sequence:
      # Prediction on the unmodified sequence.
      original_output = predict_model(globalModel,seq,adj,args)
      # Prediction with feature (i, j) replaced by random values.
      ranseq = RandomFeature(seq, i,j,args)
      random_output = predict_model(globalModel,ranseq,adj,args)
      marginal_contribution = marginal(random_output,original_output)
      # Keep the running maximum.
      if marginal_contribution > max_marginal:
        max_marginal = marginal_contribution
    feature_contribution_metrics[i, j] = max_marginal

# Dump the resulting grid, one value per line.
for i, j in np.ndindex(args.x_axis, args.y_axis):
  print("feature_contribution_metrics[i][j]:",feature_contribution_metrics[i, j])

**Build the feature contribution**

In [None]:
import sys
sys.path.append('/content/drive/MyDrive/AdaVFL-GitHub')
from Data import compute_feature_contribution,initialize_ratio_contribution

# Turn the raw marginal metrics into the final per-cell contribution values.
# NOTE: feature_contribution_metrics is both an input and the output here, so
# running this cell twice feeds already-transformed values back in; re-run the
# previous cell first if this one has to be repeated.
organization_contribution_distribution = initialize_ratio_contribution(
    args,
    feature_contribution_metrics,
)
feature_contribution_metrics = compute_feature_contribution(
    organization_contribution_distribution,
    feature_contribution_metrics,
    args,
)

# Print the transformed grid row by row.
for i in range(args.x_axis):
  row = feature_contribution_metrics[i]
  for j in range(args.y_axis):
    print("feature_contribution_metrics[i][j]:",row[j])

**Initialize the budgets**

In [None]:
from Data import generate_local_prediction
from Models import GRU
import random 
from WeightTools import (rho_to_sigma, sigma_to_rho,compute_advcomp_sigma, compute_advcomp_budget, 
                         rho_to_dp,compute_cumulated_budget, output_results, update_weights, pertub_weights, 
                         update_global_weights, Pair, test_model, update_budget_training, 
                         update_budget_accuracy , update_budget_increase, calculate_validation_accuracy,  
                         dp_to_zcdp, grad_func, noisyMax, perturb_gradients, compute_epsilon,
                         build_candidates, loss_score, override_model, grad_avg, sigma_to_epsilon, epsilon_to_sigma)


# Assign an initial noise scale (sigma) to every local dataset.
# privacy_budgets[k] belongs to LocalTrainData[k]; nothing is appended when
# args.PrivacyMode is "None".
args.tracked_error = []
privacy_budgets = []
for data in LocalTrainData:
  if args.PrivacyMode == "None":
    continue

  if args.featureMode == "Uniform":
    ## initialize with advanced composition: the same total budget
    ## (epsilon_1, delta_0) for every dataset, split over `iterations` rounds.
    total_epsilon = args.epsilon_1
    total_delta = args.delta_0
    total_rho = dp_to_zcdp(total_epsilon,total_delta)
    rho_t = total_rho/iterations
    epsilon_t, delta_t = compute_advcomp_budget(total_epsilon,total_delta,iterations)
    sigma_t = compute_advcomp_sigma(total_epsilon,total_delta,iterations)
    privacy_budgets.append(sigma_t)
  elif args.featureMode in ("InvContribution", "LinContribution"):
    # Contribution of this dataset's grid cell, computed in the cells above.
    contribution = feature_contribution_metrics[data.x_axis][data.y_axis]
    if args.featureMode == "InvContribution":
      # 1/epsilon is an affine function of the contribution.
      alpha = (1./args.epsilon_0) - (1./args.epsilon_1)
      beta = 1./(args.epsilon_1+1.-(args.epsilon_1/args.epsilon_0))
      total_epsilon = 1./((alpha*contribution)+((1.-alpha)*beta))
    else:
      # epsilon is an affine function of the contribution.
      beta = args.epsilon_1
      alpha = args.epsilon_0-beta
      total_epsilon = (alpha*contribution)+beta
    # NOTE(review): a contribution of 0 makes this division blow up
    # (inf or ZeroDivisionError depending on the value's type) -- confirm
    # the contribution grid is strictly positive.
    total_delta = args.delta_0/contribution
    # The total budget is split evenly over the training rounds.
    total_rho = dp_to_zcdp(total_epsilon,total_delta)
    rho_t = total_rho/iterations
    sigma_t = rho_to_sigma(rho_t)
    delta_t = total_delta/iterations
    # NOTE(review): rho_to_dp receives sigma_t, not rho_t -- confirm this is
    # the intended argument. The result is informational only here.
    epsilon_t = rho_to_dp(sigma_t,delta_t)
    privacy_budgets.append(sigma_t)
    print("initial_sigma",sigma_t)
  else:
    print("a problem initializing the privacy budget ")
    # exit() inside a Jupyter kernel does not stop the cell; raise instead so
    # the loop cannot continue with a missing budget entry.
    raise ValueError('a problem initializing the privacy budget ')

for budget in privacy_budgets:
    print("Assigned budget",budget)

**Train the model that will be tested against the privacy attack**

In [None]:
from Models import  GRU, MyGAT
from tqdm import tqdm
import copy
import random 
from Data import generate_local_prediction, build_map,build_adjMatrix
from WeightTools import (output_results, update_weights, pertub_weights, update_global_weights, Pair, 
                         test_model, update_budget_training, update_budget_accuracy , update_budget_increase, 
                         calculate_validation_accuracy, dp_to_zcdp, grad_func, noisyMax, perturb_gradients, 
                         compute_epsilon, build_candidates, loss_score, override_model, grad_avg, sigma_to_epsilon, epsilon_to_sigma)

import datetime
import ast
import torch.multiprocessing as mp

device_models = []
global_test_output=[]
minimum_training_accuracy = []
Loop_accuracy = []

# Validate the configuration up front. exit() inside a Jupyter kernel only
# requests a shutdown and lets the cell continue, which previously went on to
# append an undefined or stale `model`.
if args.local_model != 'GRU':
  raise ValueError('Error: unrecognized local model')
if args.global_model != 'GNN':
  raise ValueError('Error: unrecognized global model')

# One local GRU per local training dataset (same order as LocalTrainData).
for data in LocalTrainData:
  model = GRU(args)
  # for the parallelism: the parameters live in shared memory so updates made
  # by the worker processes are visible to this process.
  model.share_memory()
  model.train()
  device_models.append(model)

# instantiate the global model to train:
# NOTE(review): unlike the pre-training cell, the model is not moved to
# args.device_name here although `adj` is -- confirm that MyGAT or
# update_global_weights handles device placement.
globalModel = MyGAT(args,adj)
globalModel.train()

# Bookkeeping for the training run.
start_time = time.time()

# Histories of the global model, one entry appended per training round.
global_training_loss = []
global_training_accuracy_loss_RMSE = []
global_training_accuracy_loss_WMAPE = []
global_training_accuracy_loss_AE = []

# Zero-filled float arrays: per device and round, or per device.
num_devices = len(device_models)
local_training_accuracy = np.zeros((num_devices, iterations))
local_training_loss = np.zeros((num_devices, iterations))
average_local_training_loss = np.zeros(num_devices)

## device side, training locally
# timestamp -> list of entries collected for that round
weight_dict = {}
assigned_budget = np.zeros((num_devices, iterations))
minimum_training_accuracy = np.zeros(num_devices)
total_budget = np.zeros(num_devices)



# Main training loop: each round trains every local model in its own process
# at a randomly drawn timestamp, then updates the global model on the local
# models' predictions.
global_loss_epoch = 10

# force=True keeps the cell re-runnable: without it a second call raises
# RuntimeError because the start method has already been set.
mp.set_start_method('fork', force=True)

# Recompute the window from args instead of trusting the notebook globals;
# the optional pre-training cell overwrites beginningTime.
beginningTime = args.beginTrainingTimestamp
endingTime = args.endTrainingTimestamp
number_of_iterations = ((endingTime - beginningTime)//args.trainingInterval)+1
number_of_training_rounds = 0

# Validate the mode once, before any process is spawned.
known_feature_modes = ("None", "InvContribution", "LinContribution", "Uniform")
if args.featureMode not in known_feature_modes:
  raise ValueError('Error: unrecognized featureMode %r' % (args.featureMode,))
use_privacy_budget = args.featureMode != "None"
if use_privacy_budget and len(privacy_budgets) < len(device_models):
  raise ValueError('privacy_budgets has %d entries for %d local models'
                   % (len(privacy_budgets), len(device_models)))

for epoch in tqdm(range(args.epochs)):
  for round_index in tqdm(range(number_of_iterations)):
    # Random interval-aligned timestamp inside the training window.
    timestamp = random.randrange(beginningTime, endingTime, args.trainingInterval)
    processes = []
    weight_dict[timestamp] = []

    # iterate over the local models (own index name; the original reused the
    # round loop's `i` here)
    for device_index in range(len(device_models)):
      # train the local model
      device_models[device_index].train()

      if use_privacy_budget:
        # Uniform, InvContribution and LinContribution all perturb the
        # weights with the sigma assigned to this device.
        p = mp.Process(target=pertub_weights,
                       args=(timestamp, LocalTrainData[device_index], device_models[device_index],
                             privacy_budgets[device_index], args))
      else:
        p = mp.Process(target=update_weights,
                       args=(timestamp, LocalTrainData[device_index], device_models[device_index], args))
      # Start and track the worker in every mode. These two lines used to
      # sit inside the `else` branch, so with featureMode == "None" the
      # processes were created but never started and no local model trained.
      p.start()
      processes.append(p)
    for p in processes:
      p.join()

    # Collect the (shared-memory) local models for this round.
    for device_index in range(len(device_models)):
      device_models[device_index].eval()
      weight_dict[timestamp].append(
          Pair(LocalTrainData[device_index].x_axis, LocalTrainData[device_index].y_axis, device_models[device_index]))

    # train the global model
    number_of_training_rounds += 1
    torch.cuda.empty_cache()
    weights = weight_dict[timestamp]
    predictiondata = {}
    with torch.no_grad():
      # Local predictions on the shared sample set, keyed by "<x>X<y>".
      for weight_pair in weights:
        sample_ID = str(weight_pair.x_axis)+"X"+str(weight_pair.y_axis)
        predictiondataset = LocalPredictionSamples.get(sample_ID)
        predictiondata[sample_ID]= generate_local_prediction(weight_pair,predictiondataset,args)
      PredictionGlobalMap = build_map(predictiondata,args)
    global_acc_epoch,global_loss_epoch = update_global_weights(globalModel,GlobalPredictionSamples,PredictionGlobalMap,args,adj)
    global_training_loss.append(global_loss_epoch)
    print("global_loss_epoch:",global_loss_epoch)
    global_training_accuracy_loss_RMSE.append(global_acc_epoch["RMSE"])
    global_training_accuracy_loss_WMAPE.append(global_acc_epoch["WMAPE"])
    global_training_accuracy_loss_AE.append(global_acc_epoch["AE"])
    # Free the per-round intermediates before the next round.
    del PredictionGlobalMap
    del predictiondata
    gc.collect()

# Average the per-round global training metrics gathered by the training loop.
global_acc_training_RMSE = sum(global_training_accuracy_loss_RMSE) / len(global_training_accuracy_loss_RMSE)
global_acc_training_WMAPE = sum(global_training_accuracy_loss_WMAPE) / len(global_training_accuracy_loss_WMAPE)
global_acc_training_AE = sum(global_training_accuracy_loss_AE) / len(global_training_accuracy_loss_AE)


### testing the global model:
globalModel.eval()
# Evaluate without gradient tracking, as the pre-training cell does for the
# same test_model call; no autograd graph is needed for a test pass.
with torch.no_grad():
  global_test_output, test_acc, test_loss = test_model(globalModel, adj, GlobalTestData,args)
print("global RMSE",test_acc["RMSE"],"global WMAPE",test_acc["WMAPE"],"global AE",test_acc["AE"])
print("global test_loss",test_loss)


import numpy as np
import matplotlib.pyplot as plt

# Training curves of the global model, drawn into the first four slots of a
# 3 x 2 subplot grid: (series, legend label, panel title) per panel.
training_curves = [
  (global_training_loss, 'training loss', 'global loss'),
  (global_training_accuracy_loss_AE, 'training AE', 'training AE'),
  (global_training_accuracy_loss_RMSE, 'training RMSE', 'training RMSE'),
  (global_training_accuracy_loss_WMAPE, 'training WMAPE', 'training WMAPE'),
]

for panel_number, (history, legend_label, panel_title) in enumerate(training_curves, start=1):
  plt.subplot(3, 2, panel_number)
  plt.plot(history, color='g', label = legend_label)
  plt.legend(loc="upper left")
  plt.title(panel_title)
  # Each series gets one point per training round.
  plt.xlabel('Epoch')


**Save the trained model**

In [None]:
import os
import sys
import torch

sys.path.append('/content/drive/MyDrive/AdaVFL-GitHub')

# Save the trained global model's state_dict to Drive. The folder depends on
# the dataset and the file name on the feature mode (plus the feature ratio
# for the two contribution-based modes).
ratio = args.featureRatio

if args.dataset == "Yelp":
  directory_name = "/content/drive/MyDrive/Colab Notebooks/Models/Yelp/"
else:
  directory_name = "/content/drive/MyDrive/Colab Notebooks/Models/BikeNYC/"

# featureMode -> checkpoint file name.
checkpoint_names = {
  "None": "NoPrivacyGAN.pth",
  "Uniform": "UniformPrivacyGAN.pth",
  "InvContribution": 'InvConPrivacyGAN{}.pth'.format(ratio),
  "LinContribution": 'LinConPrivacyGAN{}.pth'.format(ratio),
}
if args.featureMode not in checkpoint_names:
  # Previously an unknown mode left file_name undefined and the save failed
  # with a NameError that did not say why.
  raise ValueError('cannot name the checkpoint: unrecognized featureMode %r' % (args.featureMode,))
file_name = checkpoint_names[args.featureMode]

# torch.save does not create missing folders.
os.makedirs(directory_name, exist_ok=True)

print('Saving global model...')
torch.save(globalModel.state_dict(), directory_name+file_name)
print('global saved successfully.')


filename is  /content/drive/MyDrive/Colab Notebooks/Models/BikeNYC/ConPrivacyGAN30.pth
Saving global model...
global saved successfully.
