In [None]:
# 说明：这个notebook演示了如何使用quantification方法（目前实现了3种方法进行隐私量化）
# dFIL
# distance correlation
# mutual information

In [21]:
# 导包
import torch
import os
import argparse
import pandas as pd
import tqdm
import numpy as np
os.environ['NUMEXPR_MAX_THREADS'] = '48'

# 导入各个指标
import sys
sys.path.append('/home/dengruijun/data/FinTech/PP-Split/')
from ppsplit.quantification.distance_correlation.distCor import distCorMetric
from ppsplit.quantification.fisher_information.dFIL_inverse import dFILInverseMetric
from ppsplit.quantification.shannon_information.mutual_information import MuInfoMetric

# 导入各个baseline模型及其数据集预处理方法
# 模型
from target_model.models.splitnn_utils import split_weights_client
from target_model.models.VGG import VGG,VGG5Decoder,model_cfg
from target_model.models.BankNet import BankNet1
from target_model.models.CreditNet import CreditNet1
from target_model.models.PurchaseNet import PurchaseClassifier1
# 数据预处理方法
from target_model.data_preprocessing.preprocess_cifar10 import get_cifar10_normalize,get_one_data,deprocess
from target_model.data_preprocessing.preprocess_bank import bank_dataset,preprocess_bank
from target_model.data_preprocessing.preprocess_credit import preprocess_credit
from target_model.data_preprocessing.preprocess_purchase import preprocess_purchase


In [22]:
# 基本参数：
# 硬件
# device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

# 参数
# parser = argparse.ArgumentParser()
# parser.add_argument('--dataset', type = str, default = 'CIFAR10')
# parser.add_argument('--device', type = str, default = 'cuda:1')
# parser.add_argument('--batch_size',type=int, default=1) # muinfo最小为8，# distcor最小为2
# args = parser.parse_args()
args = {'dataset':'CIFAR10',
        # 'device':torch.device("cpu"),
        'device':torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
        'batch_size':1}
print(args['device'])

cuda:0


# 数据集及其模型加载

In [23]:
# 加载模型和数据集，并从unit模型中切割出client_model
if args['dataset']=='CIFAR10':
    save_inverse_dir  = f'../results/VGG5/'
    testset_len = 10000 # 10000个数据一次
    trainloader,testloader = get_cifar10_normalize(batch_size = args['batch_size'])
    one_data_loader = get_one_data(testloader,batch_size = 8) #拿到第一个测试数据

    client_net_route = '/home/dengruijun/data/project/Inverse_efficacy/trained_models/VGG5/BN+Tanh/VGG5-20ep.pth' # VGG5-BN+Tanh # 存储的是模型参数，不包括模型结构
    # VGG5 unit模型
    # vgg5_unit = VGG('Unit', 'VGG5', len(model_cfg['VGG5'])-1, model_cfg) # 加载模型结构
    # vgg5_unit.load_state_dict(torch.load(client_net_route)) # 加载模型参数
    # vgg5_unit.load_state_dict(torch.load(client_net_route,map_location=torch.device('cpu'))) # 完整的模型
    split_layer_list = list(range(len(model_cfg['VGG5'])))

    # 切割成client model
    split_layer = 3
    client_net = VGG('Client','VGG5',split_layer,model_cfg)
    pweights = torch.load(client_net_route)
    if split_layer < len(model_cfg['VGG5']):
        pweights = split_weights_client(pweights,client_net.state_dict())
    client_net.load_state_dict(pweights)

elif args.dataset=='credit':
    save_inverse_dir  = f'../results/Credit/'

    testset_len = 61503 # for the mutual information

    client_net_route = '../results/1-7/credit-20ep.pth'
    dataPath = '/home/dengruijun/data/FinTech/DATASET/kaggle-dataset/home_credit/dataset/application_train.csv'
    train_data, test_data = preprocess_credit(dataPath)
    test_dataset = bank_dataset(test_data)
    testloader = torch.utils.data.DataLoader(test_dataset, batch_size=args['batch_size'], shuffle=False,
                                            num_workers=8, drop_last=False)
    # one_data_loader = get_one_data(testloader,batch_size = batch_size) #拿到第一个测试数据
    # split_layer_list = ['linear1', 'linear2']
    split_layer_list = [0,3,6,9]
    split_layer = 3

    client_net = CreditNet1(layer=split_layer)
    pweights = torch.load('../results/1-7/credit-20ep.pth').state_dict()
    if split_layer < 9:
        pweights = split_weights_client(pweights,client_net.state_dict())
    client_net.load_state_dict(pweights)

elif args['dataset']=='bank':
    save_inverse_dir  = f'../results/Bank/MI/'

    testset_len=8238

    client_net_route = '../results/1-8/bank-20ep.pth'
    dataPath = '/home/dengruijun/data/FinTech/DATASET/kaggle-dataset/bank/bank-additional-full.csv'
    
    train_data, test_data = preprocess_bank(dataPath)
    test_dataset = bank_dataset(test_data)
    testloader = torch.utils.data.DataLoader(test_dataset, batch_size=args['batch_size'], shuffle=False,
                                            num_workers=8, drop_last=False)
    # one_data_loader = get_one_data(testloader,batch_size = batch_size) #拿到第一个测试数据
    # split_layer_list = ['linear1', 'linear2']
    split_layer_list = [0,2,4,6]
    split_layer = 2

    client_net = BankNet1(layer=split_layer)
    pweights = torch.load('../results/1-8/bank-20ep.pth').state_dict()
    if split_layer < 6:
        pweights = split_weights_client(pweights,client_net.state_dict())
    client_net.load_state_dict(pweights)

elif args['dataset']=='purchase':
    save_inverse_dir = f'../results/Purchase100/MI/'

    testset_len = 39465 # test len
    client_net_route = '../results/1-9/epoch_train0.pth'
    dataPath = '/home/dengruijun/data/FinTech/DATASET/kaggle-dataset/Purchase100/'

    trainloader, testloader = preprocess_purchase(data_path=dataPath, batch_size=args['batch_size'])
    # one_data_loader = get_one_data(testloader,batch_size = 1) #拿到第一个测试数据
    split_layer_list = [0,1,2,3,4,5,6,7,8]
    split_layer = 2

    # 读取（load）模型
    client_net = PurchaseClassifier1(layer=split_layer)
    pweights  = torch.load(client_net_route)['state_dict']
    if split_layer < 8: # 
        pweights = split_weights_client(pweights ,client_net.state_dict())
    client_net.load_state_dict(pweights)

else:
    exit(-1)

client_net = client_net.to(args['device'])
client_net.eval()

features.0.weight
features.0.bias
features.1.weight
features.1.bias
features.1.running_mean
features.1.running_var
features.1.num_batches_tracked
features.4.weight
features.4.bias
features.5.weight
features.5.bias
features.5.running_mean
features.5.running_var
features.5.num_batches_tracked


VGG(
  (features): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): Tanh()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): Tanh()
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (denses): Sequential()
)

# 各种指标计算

## 1.dFIL-inverse

In [24]:
# dFIL inverse指标计算

eta_same_layer_list = []
eta_diff_layer_list=[]

metric = dFILInverseMetric()
# 对traingloader遍历计算所有 inverse dFIL
# for j, data in enumerate(tqdm.tqdm(testloader)):
for j, data in enumerate(tqdm.tqdm(one_data_loader)): # 测试第一个testloader
    # if j < 31705:
        # continue
    inputs, labels = data
    inputs, labels = inputs.to(args['device']), labels.to(args['device'])
    inputs.requires_grad_(True) # 需要求导
    
    # inference
    outputs = client_net(inputs)

    eta = metric.quantify(model=client_net, inputs=inputs, outputs=outputs, with_outputs=True)
    # 打印
    # print(str(j)+": "+str(eta.item()))
    eta_same_layer_list.append(eta)
eta_diff_layer_list.append(eta_same_layer_list)

# 结果储存到csv中
matrix = np.array(eta_diff_layer_list) # 有点大，x
transpose = matrix.T # 一行一条数据，一列代表一个layer 
# pd.DataFrame(data=transpose, columns=[i for i in split_layer_list]).to_csv(save_img_dir + f'dFIL-1.csv',index=False)
pd.DataFrame(data=transpose, columns=[split_layer]).to_csv(save_inverse_dir + f'dFIL.csv',index=False)




[A

RuntimeError: shape '[8, 32768, 24576]' is invalid for input of size 805306368

## 2. distance correlation

In [25]:
# distance correlation指标计算


distCorr_diff_layer_list = []
distCorr_same_layer_list = []
metric = distCorMetric()

# for j, data in enumerate(tqdm.tqdm(testloader)): # 对testloader遍历
for j, data in enumerate(tqdm.tqdm(one_data_loader)): # 测试第一个testloader
    tab, labels = data
    tab, labels = tab.to(args['device']), labels.to(args['device'])
    with torch.no_grad():
        pred = client_net(tab).cpu().detach()
        inputs = tab.cpu().detach()

        distCorr = metric.quantify(inputs=inputs,outputs=pred) # x,z
        distCorr_same_layer_list.append(distCorr)


print(f"Layer {split_layer} Avg distCorr: {sum(distCorr_same_layer_list)/len(distCorr_same_layer_list)}")
distCorr_diff_layer_list.append(distCorr_same_layer_list)

# 保存到csv中
matrix = np.array(distCorr_diff_layer_list) # 有点大，x
transpose = matrix.T # 一行一条数据，一列代表一个layer 
# pd.DataFrame(data=transpose, columns=[i for i in range (len(split_layer_list))]).to_csv(save_img_dir + f'DLoss-bs{batch_size}.csv',index=False)
pd.DataFrame(data=transpose, columns=[split_layer]).to_csv(save_inverse_dir + f'DLoss.csv',index=False)





[A[A

100%|██████████| 1/1 [00:00<00:00,  4.35it/s]

Layer 3 Avg distCorr: 0.9707143902778625





## 3. mutual information

In [26]:
# mutual information指标计算

MI_diff_layer_list = []
MI_same_layer_list = []
metric = MuInfoMetric()

# for j, data in enumerate(tqdm.tqdm(testloader)): # 对testloader遍历
for j, data in enumerate(tqdm.tqdm(one_data_loader)): # 测试第一个testloader
    images, labels = data
    images, labels = images.to(args['device']), labels.to(args['device'])
    with torch.no_grad():
        # inference
        outputs = client_net(images).clone().detach()
        inputs = images.cpu().detach()
        mi = metric.quantify(inputs=inputs, outputs = outputs)
        MI_same_layer_list.append(mi)
        
print(f"Layer {split_layer} MI: {sum(MI_same_layer_list)/len(MI_same_layer_list)}")
MI_diff_layer_list.append(MI_same_layer_list)

# 保存到csv中
matrix = np.array(MI_diff_layer_list) # 有点大，x
transpose = matrix.T # 一行一条数据，一列代表一个layer 
# pd.DataFrame(data=transpose, columns=[i for i in split_layer_list]).to_csv(save_inverse_dir + f'MI-bs{batch_size}.csv',index=False)
pd.DataFrame(data=transpose, columns=[split_layer]).to_csv(save_inverse_dir + f'MILoss.csv',index=False)




[A[A

100%|██████████| 1/1 [00:00<00:00,  5.08it/s]


Layer 3 MI: -97.71402831489374
