## Model/data owner
The data owner will train and save a model (this notebook) and ensure the model reproductivity (Model_Reproduction.ipynb)


## Get the dataset

In [3]:
# Setup
from IPython.display import clear_output
import warnings
warnings.filterwarnings('ignore')

In [4]:
# define dataset path
dataset_path = '../Datasets/CIFAR10'
img_path = '../Datasets/CIFAR10/images'

In [5]:
# Get the data file and all of the labels
import pandas as pd
data_df = pd.read_csv(f'{dataset_path}/data.csv')
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')
label2id = {i:k for i, k in enumerate(classes)}
print('\n'.join([f'{k}\t{i}' for i, k in enumerate(classes)]))

plane	0
car	1
bird	2
cat	3
deer	4
dog	5
frog	6
horse	7
ship	8
truck	9


In [9]:
# get training device
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [10]:
# User A has 2000 images of frog(6), 1000 images of bird(2), and 1000 images of others
data = data_df.copy()
data_A_idx = data.query('label == 6').sample(2000).index.tolist() + \
             data.query('label == 2').sample(1000).index.tolist() + \
             data.query('label not in (2,6)').sample(1000).index.tolist()
data_A = data.iloc[data_A_idx].reset_index(drop=True)
data_df__A = data.iloc[[i for i in data.index if i not in data_A_idx]].reset_index(drop=True) # Get the data without A

A_label_map = {
    0: 'others',
    1: 'frog',
    2: 'bird'
}
A_label2id = {v:k for k,v in A_label_map.items()}
data_A.label = data_A.label_name.apply(lambda x: A_label2id[x] if x in A_label2id else 0)

In [11]:
# In future, the above process of selecting dataset would use the get_dataset function from Dataset_Splitting.ipynb

In [12]:
# User B has 3000 images of frog(6), 2000 images of ship(8), 1000 images of horses(7), and 1000 others
# data = data_df__A.copy()
# data_B_idx = data.query('label == 6').sample(3000).index.tolist() + \
#              data.query('label == 8').sample(2000).index.tolist() + \
#              data.query('label == 7').sample(1000).index.tolist() + \
#              data.query('label not in (6,7,8)').sample(1000).index.tolist()
# data_B = data.iloc[data_B_idx].reset_index(drop=True)
# data_df__AB = data.iloc[[i for i in data.index if i not in data_B_idx]].reset_index(drop=True) # Get the data without A and B

# B_label_map = {
#     0: 'others',
#     1: 'frog',
#     2: 'ship',
#     3: 'horse',
# }
# B_label2id = {v:k for k,v in B_label_map.items()}
# data_B.label = data_B.label_name.apply(lambda x: B_label2id[x] if x in B_label2id else 0)

In [13]:
# User C has 1000 images of frog(6), 1000 images of ship(8), 1000 images of horses(7), 2000 images of planes(0) and 1000 others
# data = data_df__AB.copy()
# data_C_idx = data.query('label == 6').sample(1000).index.tolist() + \
#              data.query('label == 8').sample(1000).index.tolist() + \
#              data.query('label == 7').sample(1000).index.tolist() + \
#              data.query('label == 0').sample(2000).index.tolist() + \
#              data.query('label not in (0,6,7,8)').sample(1000).index.tolist()
# data_C = data.iloc[data_C_idx].reset_index(drop=True)
# data_df__ABC = data.iloc[[i for i in data.index if i not in data_C_idx]].reset_index(drop=True) # Get the data without A, B and C

# C_label_map = {
#     0: 'others',
#     1: 'frog',
#     2: 'ship',
#     3: 'horse',
# }
# C_label2id = {v:k for k,v in C_label_map.items()}
# data_C.label = data_C.label_name.apply(lambda x: C_label2id[x] if x in C_label2id else 0)

In [14]:
# download vision transformer model 
from transformers import ViTFeatureExtractor, ViTForImageClassification
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224-in21k')
vit_model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224', output_hidden_states=True).to(device)

# freeze the pre-trained model
vit_model.eval()
for param in vit_model.parameters():
    param.requires_grad = False

In [15]:
import torch
from torch import nn
import numpy as np
from PIL import Image

import albumentations as A

# create custom dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, data, feature_extractor, vit_model, transform):
        self.images = [f'{img_path}/{i}' for i in data.image]
        self.labels = data.label.tolist()
        self.transform = transform
        self.feature_extractor = feature_extractor

    def __len__(self):
        return len(self.images)

    def __getitem__(self, index):
        image = Image.open(self.images[index])
        if self.transform is not None:
            image = np.array(image)
            image = Image.fromarray(self.transform(image=image)['image'], 'RGB')
        image = self.feature_extractor(image, return_tensors='pt')['pixel_values'][0]
        logits = vit_model(image.unsqueeze(0).to(device)).hidden_states[-1][:,-1,:].cpu().squeeze().detach()
        return logits, self.labels[index]

def get_loader(data, feature_extractor, vit_model, transform, batch_size=32, shuffle=True):
    return torch.utils.data.DataLoader(Dataset(data, feature_extractor, vit_model, transform), batch_size=batch_size, shuffle=shuffle)

transform = A.Compose([
    A.Resize(256, 256),
    A.RandomCrop(224, 224),
    A.Flip(p=0.5),
    A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5)
])

A_loader = get_loader(data_A, feature_extractor, vit_model, transform)
# B_loader = get_loader(data_B, feature_extractor, vit_model, transform)
# C_loader = get_loader(data_C, feature_extractor, vit_model, transform)

In [16]:
# initialize model, criterion and optimizer
fc_net = nn.Sequential(
    nn.LazyBatchNorm1d(),
    nn.LazyLinear(128),
    nn.GELU(),
    nn.LazyLinear(len(A_label_map))
).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(fc_net.parameters(), lr=0.001)

In [17]:
# A simple training process for 5 epochs
from tqdm import tqdm
fc_net.train()
for i in range(5):
    for batch in tqdm(A_loader):
        logits = fc_net(batch[0].to(device))
        true_labels = batch[1].long().to(device)
        optimizer.zero_grad()
        loss = criterion(logits, true_labels)
        loss.backward()
        optimizer.step()


100%|██████████| 125/125 [02:30<00:00,  1.20s/it]
100%|██████████| 125/125 [02:30<00:00,  1.21s/it]
100%|██████████| 125/125 [02:27<00:00,  1.18s/it]
100%|██████████| 125/125 [02:38<00:00,  1.27s/it]
100%|██████████| 125/125 [02:30<00:00,  1.21s/it]


In [45]:
# TODO: model evaluation
sample_batch_A = next(iter(A_loader))
fc_net.eval()
with torch.no_grad():
    logits = fc_net(sample_batch_A[0].to(device))
    y_pred = logits.argmax(1).detach().cpu().numpy()
    y_true = sample_batch_A[1]
    y_true = np.array(y_true)
    print('Batch accuracy:', np.mean(y_pred == y_true))


Batch accuracy: 0.6875


In [50]:
import base
uploader = base.login("info@openmined.org", "changethis",8082)


Anyone can login as an admin to your node right now because your password is still the default PySyft username and password!!!

Connecting to localhost... done! 	 Logging into server_domain... done!


In [75]:
import syft as sy
from syft.core.adp.data_subject import DataSubject
dataset={}
data_subject = DataSubject('image_data')
dataset['image_data'] = sy.Tensor(sample_batch_A[0]).private(min_val=0, max_val=0,data_subjects=[data_subject] * len(sample_batch_A[0]))
data_subject = DataSubject('label')
dataset['label'] = sy.Tensor(sample_batch_A[1]).private(min_val=0, max_val=0,data_subjects=[data_subject] * len(sample_batch_A[1]))

uploader.load_dataset(
    assets=dataset,
    name=" CIFAR10",
    description="none",
    metadata="none",
)

Converting PyTorch tensor to numpy tensor for internal representation...
Converting PyTorch tensor to numpy tensor for internal representation...
Loading dataset...Loading dataset... checking assets...Loading dataset... checking dataset name for uniqueness...Loading dataset... checking dataset name for uniqueness...                                                                                                                    Loading dataset... checking asset types...                              Loading dataset... uploading...🚀                        

Uploading `image_data`: 100%|[32m█████████████████████████████████████████[0m| 1/1 [00:00<00:00, 20.83it/s][0m
Uploading `label`: 100%|[32m██████████████████████████████████████████████[0m| 1/1 [00:00<00:00, 62.50it/s][0m


Dataset is uploaded successfully !!! 🎉

Run `<your client variable>.datasets` to see your new dataset loaded into your machine!


In [71]:
ds_domain = sy.login(
    email = "sheldon@caltech.edu",
    password = "bazinga",
    port = 8082
)

User created successfully!
Connecting to localhost... done! 	 Logging into server_domain... done!


In [77]:
sent=ds_domain.datasets[0]
asset= sent.assets
for i in asset:
    result = sent[i['name']]
    result.request(reason='research')

In [79]:
for i in uploader.requests:
    i.accept()

In [81]:
result= sent['image_data']
x=result.get().child.child
x=torch.tensor(x).to(dtype=torch.float32)
result= sent['label']
y=result.get().child.child
y=torch.tensor(y).to(dtype=torch.float32)
    
print(x,y)


tensor([[  1.4194,  -8.9307,   1.1547,  ...,  11.7206,  12.0427,  -7.0161],
        [ -1.5749,  -7.4393,   6.1821,  ...,  -1.6212,   6.9088,   1.5066],
        [ -2.6264,   0.7515,   3.2751,  ...,   4.3273,   8.7533, -11.1464],
        ...,
        [ -3.0625,  -7.9869,   4.0079,  ...,   4.9463,  -1.0552,  -7.4823],
        [ -4.8830, -17.7617, -11.2055,  ...,  13.5408,   7.7301,  -4.0789],
        [ -0.8447, -11.0745,   3.1785,  ...,  -1.3299,   7.4800,  -4.7089]]) tensor([1., 2., 2., 2., 1., 0., 1., 1., 1., 1., 0., 1., 2., 1., 2., 2., 0., 1.,
        2., 2., 1., 2., 1., 1., 1., 1., 1., 1., 0., 2., 1., 1.])
