# Hetero NN 自定义: 自定义Top/Bottom Model 和 自定义数据集

在该版本中，整个nn的架构有很大的调整：nn模块新增了dataset与model_zoo模块，旨在提供数据集和模型的自定义功能。

Hetero-NN的模型与数据集的自定义，与Homo-NN十分相似，建议也阅读下Homo-NN自定义的教程，但是Hetero-NN在定义数据集时，对接口实现会多一些要求

在这个教程中， 我们将会介绍hetero-nn下使用dataset,model_zoo的方式

# 使用FATE自带数据集&数据集的自定义

Fate中nn.dataset下提供了一个Dataset基类，基于Pytorch Dataset开发。基于Dataset实现的数据集类，将其更新到nn.dataset模块中，FATE在运行时便可根据参数导入您自定义的数据集，进行训练。

在纵向联邦学习Hetero-NN中，对Dataset的开发使用，相比于Homo-NN会有更多要求：
- 考虑到guest与host方的id对齐问题，Dataset需要提供正确的样本id(sample id)，并确保guest/host方的数据集样本数量相同，sample id集合相同，这样才能保证您算法运行的正确性
- 设计host方使用的数据集时，\_\_getitem\_\_方法仅仅返回数据，不返回label，否则算法流程会报错
- 当使用自定义数据集时，Hetero-NN便无法使用为FATE Table设计的交集算法，你需要另外上传sample id进行样本对齐。

因此，在继承Dataset模块开发时，除了\_\_getitem\_\_， \_\_len\_\_，load，你还需额外实现两个要求: 1. 实现 get_classes 2. 初始化或者load时调用set_sample_ids方法设置样本id

FATE自带的图像，文本数据集模块: image和nlp_tokenizer都实现了不返回label的参数return_label，并且会自动解析设置sample_id

## 样例：实现一个简单的图像数据集，用于Hetero-NN任务

为了更好理解Hetero-NN下Dataset定制的一些要求，这里我们实现一个简单的图片数据集，读取MNIST图像，完成一个Hetero-NN场景下的图片分类任务
这里为了方便，我们用save_to_fate的jupyter接口，把代码更新到federatedml.nn.dataset下，名为mnist_dataset.py；当然，你也可以手动把代码文件拷贝到该目录下

In [46]:
from pipeline.component.homo_nn import save_to_fate

In [70]:
%%save_to_fate dataset mnist_dataset.py
import numpy as np
from federatedml.nn.dataset.base import Dataset
from torchvision.datasets import ImageFolder
from torchvision import transforms

class MNISTDataset(Dataset):
    """Minimal MNIST image-folder dataset usable by both Hetero-NN parties.

    The guest owns the labels and should construct it with
    ``return_label=True``; the host has no labels and must use
    ``return_label=False`` so that ``__getitem__`` yields features only.
    """

    def __init__(self, return_label=True):
        super(MNISTDataset, self).__init__()  # the base Dataset must be initialised
        self.return_label = return_label
        self.image_folder = None  # populated by load()

    def load(self, path):
        """Implement the ``load`` interface: read images from ``path`` and set sample ids.

        Returns ``self`` so that construction and loading can be chained.
        """
        import os  # local import keeps the saved dataset module self-contained

        self.image_folder = ImageFolder(root=path, transform=transforms.Compose([transforms.ToTensor()]))
        # The sample id is the image file name without directory and extension.
        # os.path handles the platform separator and any image extension,
        # not only '/' and '.jpg'.
        ids = [
            os.path.splitext(os.path.basename(image_path))[0]
            for image_path, _ in self.image_folder.imgs
        ]
        self.set_sample_ids(ids)

        return self

    def get_classes(self, ):
        """Return the sorted list of distinct class indices (needed on the guest side)."""
        return np.unique(self.image_folder.targets).tolist()

    def __len__(self,):
        """Return the number of images found by ``load``."""
        return len(self.image_folder)

    def __getitem__(self, idx):
        """Return ``(features, label)`` or, when ``return_label`` is False, only ``features``."""
        ret = self.image_folder[idx]
        # Keep the first channel only and flatten it (784 values for a 28x28 MNIST image)
        img = ret[0][0].flatten()
        if self.return_label:
            return img, ret[1]  # image & label
        else:
            return img  # no label, for the host side

In [71]:
# Smoke-test the dataset on the guest side
! ls ../examples/data/mnist_guest/  # ten class folders
ds = MNISTDataset().load('../examples/data/mnist_guest/')
print(len(ds))
print(ds[0][0].shape, ds[0][1]) # guest items carry a label
print(ds.get_classes())
print(ds.get_sample_ids()[0: 10])

0  1  2  3  4  5  6  7	8  9
1309
torch.Size([784]) 0
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
['img_1', 'img_1029', 'img_1046', 'img_1047', 'img_1076', 'img_108', 'img_1091', 'img_1093', 'img_1096', 'img_110']


In [72]:
# Smoke-test the dataset on the host side
! ls ../examples/data/mnist_host/  # all images sit in a single folder, unlabeled
ds = MNISTDataset(return_label=False).load('../examples/data/mnist_host/')
print(len(ds))
print(ds[0].shape) # host items are features only, no label

not_labeled
1309
torch.Size([784])


Good! 可以用了。那我们现在用这个开发的数据集跑一个Hetero-NN模型，双方用id对齐的两个数据集mnist_guest & mnist_host进行一次纵向联邦训练

在使用时，我们不再遵循常规FATE组件的用法，而是直接绑定数据集地址到一个FATE的name&namespace，通过reader传递给Hetero-NN组件；Hetero-NN通过你设置的DatasetParam调用你自定义的数据集，从path读取数据，进行训练:

### pipeline 初始化 绑定path到name&namespace

In [88]:
import os
import torch as t
from torch import nn
from pipeline import fate_torch_hook
from pipeline.component import HeteroNN
from pipeline.component.hetero_nn import DatasetParam
from pipeline.component.nn.backend.torch.cust_model import CustModel
from pipeline.backend.pipeline import PipeLine
from pipeline.component import Reader, Evaluation, DataTransform
from pipeline.interface import Data, Model
from pipeline.component.homo_nn import save_to_fate

# Apply the FATE hook to torch before any model/optimizer is defined
fate_torch_hook(t)

# Party ids and the FATE project root (one directory above this notebook)
guest, host = 10000, 9999
fate_project_path = os.path.abspath('../')

pipeline_img = (
    PipeLine()
    .set_initiator(role='guest', party_id=guest)
    .set_roles(guest=guest, host=host)
)

# FATE name & namespace each party's data path is bound to
guest_data = {"name": "mnist_guest", "namespace": "experiment"}
host_data = {"name": "mnist_host", "namespace": "experiment"}

guest_data_path = fate_project_path + '/examples/data/mnist_guest/'
host_data_path = fate_project_path + '/examples/data/mnist_host/'

# Bind each local path to its name & namespace
pipeline_img.bind_table(path=guest_data_path, **guest_data)
pipeline_img.bind_table(path=host_data_path, **host_data)

{'namespace': 'experiment', 'table_name': 'mnist_host'}

### 定义HeteroNN模型

In [89]:
# Table descriptors consumed by the Reader. The namespace has to be the one the
# paths were bound under with bind_table() ("experiment"); any other namespace
# points the Reader at a table that was never bound.
guest_data = {"name": "mnist_guest", "namespace": "experiment"}
host_data = {"name": "mnist_host", "namespace": "experiment"}
reader_0 = Reader(name="reader_0")
reader_0.get_party_instance(role='guest', party_id=guest).component_param(table=guest_data)
reader_0.get_party_instance(role='host', party_id=host).component_param(table=host_data)

In [90]:
# Hetero-NN component for the MNIST classification job
hetero_nn_0 = HeteroNN(name="hetero_nn_0", epochs=5,
                       interactive_layer_lr=0.01, batch_size=128, validation_freqs=None, task_type='classification', seed=114514)

# Per-party handles, used to set each party's model structure and dataset
guest_nn_0 = hetero_nn_0.get_party_instance(role='guest', party_id=guest)
host_nn_0 = hetero_nn_0.get_party_instance(role='host', party_id=host)

# Define the models
# Guest bottom: single linear layer over the 784 image features
guest_bottom = t.nn.Sequential(
    nn.Linear(784, 32),
    nn.ReLU()
)
# Host bottom: single linear layer over the 784 image features
host_bottom = t.nn.Sequential(
    nn.Linear(784,32),
    nn.ReLU()
)

# The top model is the classifier; its input size (16) matches the
# interactive layer's out_dim below
# NOTE(review): Softmax output is passed to CrossEntropyLoss - confirm the
# hooked FATE loss expects probabilities rather than raw logits
guest_top = t.nn.Sequential(
    nn.Linear(16, 10), # 10 classes
    nn.Softmax(dim=1)
)


# After fate_torch_hook, the nn module exposes InteractiveLayer;
# guest_dim/host_dim match the 32-dim outputs of the two bottom models
interactive_layer = t.nn.InteractiveLayer(out_dim=16, guest_dim=32, host_dim=32)

# Attach the models to their parties
guest_nn_0.add_top_model(guest_top)
guest_nn_0.add_bottom_model(guest_bottom)
host_nn_0.add_bottom_model(host_bottom)

# Optimizer and loss function
optimizer = t.optim.Adam(lr=0.01) # Note: after fate_torch_hook the optimizer can be created without the parameters argument
loss = t.nn.CrossEntropyLoss()

# Set the datasets with DatasetParam: dataset_name is the module name, the
# remaining arguments are forwarded to the dataset's __init__
# The host side must not return labels, hence return_label=False
guest_nn_0.add_dataset(DatasetParam(dataset_name='mnist_dataset', return_label=True))
host_nn_0.add_dataset(DatasetParam(dataset_name='mnist_dataset', return_label=False))

hetero_nn_0.set_interactve_layer(interactive_layer)
hetero_nn_0.compile(optimizer=optimizer, loss=loss)

In [91]:
# Wire the components: the reader's output data is the Hetero-NN training data
pipeline_img.add_component(reader_0)
pipeline_img.add_component(hetero_nn_0, data=Data(train_data=reader_0.output.data))
pipeline_img.compile()

<pipeline.backend.pipeline.PipeLine at 0x7f5eda94b580>

In [None]:
# Run the compiled pipeline (trains the Hetero-NN model)
pipeline_img.fit()

In [126]:
pipeline_img.get_component('hetero_nn_0').get_output_data()  # fetch the output data of the hetero_nn_0 component

Unnamed: 0,id,label,predict_result,predict_score,predict_detail,type
0,img_1,0,0,0.9976044297218323,"{'0': 0.9976044297218323, '1': 1.8047569028567...",train
1,img_3,4,3,0.9073200821876526,"{'0': 0.0010162688558921218, '1': 0.0317202284...",train
2,img_4,0,0,0.9976639747619629,"{'0': 0.9976639747619629, '1': 5.3269786803866...",train
3,img_5,0,5,0.6787770986557007,"{'0': 0.15763959288597107, '1': 3.596510214265...",train
4,img_6,7,7,0.99482262134552,"{'0': 6.393105422830558e-07, '1': 1.4373620160...",train
...,...,...,...,...,...,...
1304,img_32537,1,1,0.9985225796699524,"{'0': 1.5916774032120884e-07, '1': 0.998522579...",train
1305,img_32558,1,1,0.9956049919128418,"{'0': 3.0502631034323713e-06, '1': 0.995604991...",train
1306,img_32563,1,1,0.9994334578514099,"{'0': 4.608472181644174e-08, '1': 0.9994334578...",train
1307,img_32565,1,5,0.36305826902389526,"{'0': 0.0028015582356601954, '1': 0.0293765980...",train


# 模型的自定义

Hetero-NN与Homo-NN 共用model_zoo模块，因此自定义模型的方法与Homo-NN没任何区别，在使用时，请注意Bottom,interactive, Top模型之间的
输入输出是否能对上，以及Top模型的输出与label的shape，和数据类型是否能正确的算出loss

在本节，我们以flicker数据集为例，guest方有图像，以及2分类标签；而host方，有对图像的文本描述，因此，我们guest方使用fate自带的图像数据集和处理图像的模型，
而host使用一个lstm模型，和fate自带的nlp数据集

这里自定义了guest bottom/top以及host bottom的模型，interactive layer不支持自定义，我们使用save_to_fate进行快捷保存，或者你可以手动把他们都放到nn.model_zoo下

### Guest Model 自定义

In [94]:
%%save_to_fate model guest_bottom_image.py
from torch import nn
import torch as t
from torch.nn import functional as F

class ImgBottomNet(nn.Module):
    """Guest bottom model: a small CNN that maps an image batch to 8 features.

    The fully-connected head expects 1176 flattened features, i.e. the
    6x14x14 output the convolutional stack produces for 3x224x224 inputs.
    """

    def __init__(self):
        super().__init__()

        # Convolution / pooling stack (layer order fixes the parameter names)
        conv_stack = [
            nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5),
            nn.MaxPool2d(kernel_size=3),
            nn.Conv2d(in_channels=6, out_channels=6, kernel_size=3),
            nn.AvgPool2d(kernel_size=5),
        ]
        self.seq = nn.Sequential(*conv_stack)

        # Fully-connected head reducing the flattened features to 8 outputs
        head = [
            nn.Linear(1176, 32),
            nn.ReLU(),
            nn.Linear(32, 8),
        ]
        self.fc = nn.Sequential(*head)

    def forward(self, x):
        """Run the CNN, flatten everything but the batch dimension, apply the head."""
        feature_maps = self.seq(x)
        flat = t.flatten(feature_maps, start_dim=1)
        return self.fc(flat)


In [156]:
%%save_to_fate model guest_top_image.py

from torch import nn
import torch as t
from torch.nn import functional as F

class ImgTopNet(nn.Module):
    """Guest top model: one linear unit plus sigmoid over a 4-dim input.

    The result is flattened to one dimension, giving one score per sample.
    """

    def __init__(self):
        super().__init__()
        self.fc = nn.Sequential(nn.Linear(4, 1), nn.Sigmoid())

    def forward(self, x):
        """Return the sigmoid scores as a flat 1-D tensor."""
        scores = self.fc(x)
        return t.flatten(scores)

### Host Model 自定义

In [96]:
%%save_to_fate model host_bottom_lstm.py
from torch import nn
import torch as t
from torch.nn import functional as F

class LSTMBottom(nn.Module):
    """Host bottom model: embedding -> 2-layer LSTM -> sum over time -> linear -> ReLU.

    Args:
        vocab_size: number of rows in the embedding table; index 0 is the padding index.
    """

    def __init__(self, vocab_size):
        super().__init__()
        self.word_embed = nn.Embedding(vocab_size, 16, padding_idx=0)
        # The LSTM stays wrapped in a Sequential so its parameter names keep the "lstm.0." prefix
        self.lstm = nn.Sequential(
            nn.LSTM(16, 16, num_layers=2, batch_first=True),
        )
        self.act = nn.ReLU()
        self.linear = nn.Linear(16, 8)

    def forward(self, x):
        """Map a batch of token-index sequences to 8 non-negative features each."""
        token_vectors = self.word_embed(x)
        # The LSTM returns (outputs, state); only the per-step outputs are used
        step_outputs = self.lstm(token_vectors)[0]
        pooled = step_outputs.sum(dim=1)  # sum over the sequence dimension
        return self.act(self.linear(pooled))

### 本地测试数据集，模型

这里我们测试一下我们的数据，还有定义的模型是否能正常工作

In [97]:
from federatedml.nn.dataset.image import ImageDataset
from federatedml.nn.dataset.nlp_tokenizer import TokenizerDataset

In [98]:
# flicker image dataset (guest side)
img_ds = ImageDataset(center_crop=True, center_crop_shape=(224, 224), return_label=True) # the guest side returns labels
img_ds.load('../examples/data/flicker_toy_data/flicker/images/')
# text dataset (host side)
txt_ds = TokenizerDataset(return_label=False) # the host side has no label
txt_ds.load('../examples/data/flicker_toy_data/text.csv')

In [100]:
# Inspect the image dataset: size, first item, classes and the first ten sample ids
print(len(img_ds))
print(img_ds[0])
print(img_ds.get_classes())
print(img_ds.get_sample_ids()[0: 10])

215
(tensor([[[0.5059, 0.5176, 0.5137,  ..., 0.4941, 0.5020, 0.5059],
         [0.4980, 0.5020, 0.4980,  ..., 0.4824, 0.5020, 0.5059],
         [0.5059, 0.4863, 0.4902,  ..., 0.4980, 0.4980, 0.5137],
         ...,
         [0.7843, 0.7922, 0.7529,  ..., 0.1412, 0.2078, 0.2196],
         [0.9922, 0.9922, 0.9647,  ..., 0.1176, 0.0941, 0.1333],
         [0.9961, 0.9922, 1.0000,  ..., 0.1647, 0.1294, 0.1373]],

        [[0.5765, 0.5882, 0.5843,  ..., 0.5490, 0.5569, 0.5608],
         [0.5686, 0.5804, 0.5765,  ..., 0.5490, 0.5529, 0.5529],
         [0.5608, 0.5569, 0.5647,  ..., 0.5569, 0.5490, 0.5529],
         ...,
         [0.7961, 0.8039, 0.7490,  ..., 0.1373, 0.1882, 0.2000],
         [0.9961, 0.9961, 0.9608,  ..., 0.1137, 0.1137, 0.1529],
         [0.9922, 0.9922, 1.0000,  ..., 0.1608, 0.1059, 0.1216]],

        [[0.6235, 0.6353, 0.6314,  ..., 0.5922, 0.6000, 0.6118],
         [0.6078, 0.6235, 0.6196,  ..., 0.5804, 0.5882, 0.6000],
         [0.6039, 0.6118, 0.6196,  ..., 0.5843, 0.584

In [103]:
# Inspect the text dataset
print(len(txt_ds))
print(txt_ds[0]) # word indices
print(txt_ds.get_vocab_size()) # vocabulary size

215
tensor([  101,  1037,  2158,  1998,  2450,  2729,  2005,  2019, 10527,  2247,
         1996,  2217,  1997,  1037,  2303,  1997,  2300,  1012,   102,     0,
            0,     0,     0,     0,     0,     0])
30522


In [104]:
# Check that the custom models can be instantiated
img_bottom = ImgBottomNet()
lstm_bottom = LSTMBottom(vocab_size=txt_ds.get_vocab_size())

In [114]:
lstm_bottom(t.vstack([txt_ds[0], txt_ds[1]]))  # check that forward works on a batch of two samples

tensor([[0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2519, 1.6196],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1977, 1.0931]],
       grad_fn=<ReluBackward0>)

In [125]:
img_bottom(t.vstack([img_ds[0][0].unsqueeze(dim=0), img_ds[1][0].unsqueeze(dim=0)])) # forward works on a batch of two images

tensor([[ 0.0045, -0.0910,  0.1303,  0.0460,  0.0381, -0.0751,  0.0361,  0.1149],
        [ 0.0267, -0.1280,  0.1166,  0.0720,  0.0644, -0.0472,  0.0101,  0.0964]],
       grad_fn=<AddmmBackward0>)

### 提交pipeline 
本地初步测试OK，我们提交一个Pipeline任务

In [174]:
import os
import torch as t
from torch import nn
from pipeline import fate_torch_hook
from pipeline.component import HeteroNN
from pipeline.component.hetero_nn import DatasetParam
from pipeline.component.nn.backend.torch.cust_model import CustModel
from pipeline.backend.pipeline import PipeLine
from pipeline.component import Reader, Evaluation, DataTransform
from pipeline.interface import Data, Model
from pipeline.component.homo_nn import save_to_fate

# Apply the FATE hook to torch before any model/optimizer is defined
fate_torch_hook(t)

# Party ids and the FATE project root (one directory above this notebook)
guest, host = 10000, 9999
fate_project_path = os.path.abspath('../')

pipeline_mix = (
    PipeLine()
    .set_initiator(role='guest', party_id=guest)
    .set_roles(guest=guest, host=host)
)

# FATE name & namespace each party's data path is bound to
guest_data = {"name": "flicker_guest", "namespace": "experiment"}
host_data = {"name": "flicker_host", "namespace": "experiment"}

guest_data_path = fate_project_path + '/examples/data/flicker_toy_data/flicker/images'
host_data_path = fate_project_path + '/examples/data/flicker_toy_data/text.csv'

# Bind each local path to its name & namespace
pipeline_mix.bind_table(path=guest_data_path, **guest_data)
pipeline_mix.bind_table(path=host_data_path, **host_data)

{'namespace': 'experiment', 'table_name': 'flicker_host'}

In [175]:
# Reader component: each party reads the table descriptor bound in the previous cell
reader_0 = Reader(name="reader_0")
reader_0.get_party_instance(role='guest', party_id=guest).component_param(table=guest_data)
reader_0.get_party_instance(role='host', party_id=host).component_param(table=host_data)

In [176]:
# Hetero-NN component for the flicker job, plus the per-party handles used below
hetero_nn_0 = HeteroNN(name="hetero_nn_0", epochs=5,
                       interactive_layer_lr=0.001, batch_size=64, validation_freqs=1, task_type='classification')
guest_nn_0 = hetero_nn_0.get_party_instance(role='guest', party_id=guest)
host_nn_0 = hetero_nn_0.get_party_instance(role='host', party_id=host)

In [177]:
# Guest bottom model, referenced by the name it was saved under
guest_bottom = nn.CustModel(name='guest_bottom_image')

# Wrapping a custom model in a Sequential works as well
guest_top = t.nn.Sequential(
    nn.CustModel(name='guest_top_image')
)
# Host bottom model; vocab_size is taken from the locally loaded txt_ds
host_bottom = nn.CustModel(name='host_bottom_lstm', vocab_size=txt_ds.get_vocab_size())

# guest_dim/host_dim match the 8-dim bottom outputs, out_dim the 4-dim top input
interactive_layer = t.nn.InteractiveLayer(out_dim=4, guest_dim=8, host_dim=8, host_num=1)

In [178]:
# Attach the models to their parties
guest_nn_0.add_top_model(guest_top)
guest_nn_0.add_bottom_model(guest_bottom)
host_nn_0.add_bottom_model(host_bottom)
# Optimizer and binary cross-entropy loss (the top model ends in a Sigmoid)
optimizer = t.optim.Adam(lr=0.001)
loss = t.nn.BCELoss()

hetero_nn_0.set_interactve_layer(interactive_layer)
hetero_nn_0.compile(optimizer=optimizer, loss=loss)

In [179]:
# Add the datasets: guest uses the image dataset with labels, host the tokenizer dataset without labels
guest_nn_0.add_dataset(DatasetParam(dataset_name='image', return_label=True, center_crop=True, center_crop_shape=(224, 224), label_dtype='float'))
host_nn_0.add_dataset(DatasetParam(dataset_name='nlp_tokenizer', return_label=False))

In [180]:
# Wire the components: the reader's output data is the Hetero-NN training data
pipeline_mix.add_component(reader_0)
pipeline_mix.add_component(hetero_nn_0, data=Data(train_data=reader_0.output.data))
pipeline_mix.compile()

<pipeline.backend.pipeline.PipeLine at 0x7f5ece42a310>

In [None]:
# Run the compiled pipeline (trains the Hetero-NN model)
pipeline_mix.fit()