<a href="https://colab.research.google.com/github/CCTQL/weiqi/blob/master/datamining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import torch 
from torchvision import transforms, datasets
from torch import nn, optim
import torch.nn.functional as F
import os


In [0]:
# Final two steps shared by both pipelines: convert to a tensor, then
# normalise each pixel as (value - 0.5) / 0.5.
to_normalized_tensor = [
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
]

# Training pipeline: random affine (up to 15 degrees of rotation) and random
# perspective augmentation before the shared tensor conversion.
# NOTE(review): ToPILImage expects tensor/ndarray input, while torchvision's
# MNIST presumably hands PIL images to `transform` -- confirm this pipeline is
# only ever applied to raw tensors.
train_transforms = transforms.Compose([
    transforms.ToPILImage(),
    transforms.RandomAffine(15),
    transforms.RandomPerspective(),
    *to_normalized_tensor,
])

# Evaluation pipeline: no augmentation.
test_transforms = transforms.Compose(list(to_normalized_tensor))

train_dataset = datasets.MNIST(
    './data/train', train=True, download=True, transform=train_transforms
)
test_dataset = datasets.MNIST(
    './data/test', train=False, download=True, transform=test_transforms
)

# A single batch holding the entire test set, in its original order.
testloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=len(test_dataset), shuffle=False
)

In [0]:
num_teachers = 10

# Split the training set into equal-sized, *disjoint* partitions, one per teacher.
# A random permutation of all indices is cut into consecutive chunks of
# `split_size`; any remainder (when the dataset size is not a multiple of
# `num_teachers`) is left unused.
idxs = np.random.permutation(len(train_dataset))
split_size = len(train_dataset) // num_teachers

# Teacher k receives idxs[k*split_size:(k+1)*split_size]. The chunk start must
# advance by `split_size` (not by 1), otherwise all teachers would share
# almost exactly the same samples.
trainsplits = tuple(
    (train_dataset.data[chunk], train_dataset.targets[chunk])
    for chunk in (
        idxs[k * split_size:(k + 1) * split_size] for k in range(num_teachers)
    )
)

In [0]:
class TeacherModel(nn.Module):
    """Seven-layer fully connected network mapping 28x28 inputs to 10 raw scores.

    The forward pass returns the un-normalised output of the last linear
    layer; turning those scores into probabilities or classes is left to
    the caller.
    """

    def __init__(self):
        super().__init__()

        # Progressively narrower linear layers: 784 -> 512 -> 256 -> 128 -> 64 -> 32 -> 16 -> 10.
        self.fc1 = nn.Linear(28 * 28, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 64)
        self.fc5 = nn.Linear(64, 32)
        self.fc6 = nn.Linear(32, 16)
        self.fc7 = nn.Linear(16, 10)

    def forward(self, x):
        """Flatten `x` to rows of 784 values and return the 10 output scores per row."""
        hidden = x.view(-1, 784)

        # Every layer except the last is followed by a ReLU.
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4, self.fc5, self.fc6):
            hidden = F.relu(layer(hidden))

        # The final layer stays linear; classification happens outside the model.
        return self.fc7(hidden)

In [0]:
# One freshly initialised model per teacher. A list (rather than a tuple) is
# used because individual entries have to be reassigned later.
teacher_models = [TeacherModel() for _ in range(num_teachers)]

# Each teacher trains on its own split.
trainsets = trainsplits

In [0]:
# Train every teacher that has no checkpoint on disk yet (or all of them when
# `retrain` is True) and save its weights to `teacher_<i>_chkpt.pth`.
retrain = False
epochs = 20

# Use the GPU when one is available; model and batches must live on the same device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
criterion = nn.NLLLoss()

for i in range(num_teachers):
    chkpt_path = f'teacher_{i}_chkpt.pth'
    if os.path.exists(chkpt_path) and not retrain:
        continue

    # Each split is a (raw images, targets) pair of tensors, not a Dataset.
    # Convert the images to float, scale and normalise them the same way the
    # test pipeline does (ToTensor + Normalize(0.5, 0.5)), add the channel
    # axis, and wrap everything in a TensorDataset so DataLoader yields
    # (image batch, label batch) pairs.
    # Assumes the raw images are 0-255 valued, as in torchvision's MNIST `.data`.
    raw_images, targets = trainsets[i]
    inputs = raw_images.float().div(255).sub(0.5).div(0.5).unsqueeze(1)
    trainloader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(inputs, targets),
        shuffle=True,
        batch_size=64,
    )

    model = teacher_models[i].to(device)
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for e in range(epochs):
        running_loss = 0
        steps = 0
        for images, labels in trainloader:
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()

            # Use log_softmax over the class axis for local classification
            log_ps = F.log_softmax(model(images), dim=1)
            loss = criterion(log_ps, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            steps += 1

            if steps % 20 == 0:
                # Report the mean loss over the batches seen so far in this epoch.
                print(f'Teacher {i}/{num_teachers} | Epoch: {e}/{epochs} | Loss: {np.round(running_loss/steps, 3)}')

    # Save CPU copies of the weights so the checkpoint loads on any machine.
    torch.save({k: v.cpu() for k, v in model.state_dict().items()}, chkpt_path)

In [0]:
# Rebuild the teachers from scratch and restore their trained weights from disk.
# Models are stored in a list since we have to reassign them later.
teacher_models = [TeacherModel() for _ in range(num_teachers)]

for i, teacher in enumerate(teacher_models):
    chkpt_path = f'teacher_{i}_chkpt.pth'
    if os.path.exists(chkpt_path):
        # map_location='cpu' lets checkpoints written on a GPU machine load anywhere.
        # NOTE: torch.load unpickles the file -- only load checkpoints you trust.
        state_dict = torch.load(chkpt_path, map_location='cpu')
        teacher.load_state_dict(state_dict)
    else:
        # Keep going (best effort), but make the untrained teacher visible.
        print(f'Warning: {chkpt_path} not found; teacher {i} keeps its random initial weights')

    # The teachers are only used for inference from here on.
    teacher.eval()



In [0]:
# Ask every teacher for its predicted class on the (treated-as-unlabeled) test batch.
# `opinions` ends up with one row per data point and one column per teacher.
unlabeled_data, labels = next(iter(testloader))

teacher_votes = []
# Pure inference: no autograd graph is needed for the forward passes.
with torch.no_grad():
    for i in range(num_teachers):
        teacher = teacher_models[i]
        teacher.eval()

        # The model returns raw scores; the top-scoring class is the teacher's opinion.
        scores = teacher(unlabeled_data)
        _, top_class = scores.topk(1, dim=1)
        teacher_votes.append(top_class)

# Concatenate all opinions column-wise in one go (None when there are no teachers).
opinions = torch.cat(teacher_votes, dim=1) if teacher_votes else None

In [0]:
def noisy_argmax(x, epsilon=0.1, num_classes=10):
    """Aggregate per-teacher votes into one label per data point with Laplace noise.

    Args:
        x: 2-D integer tensor or array; row ``i`` holds the class voted by each
            teacher for data point ``i``. Any number of rows and columns is accepted.
        epsilon: Positive number; the Laplace noise added to every vote count
            has scale ``1 / epsilon``.
        num_classes: Number of possible classes (size of the vote histogram).

    Returns:
        1-D ``torch.int64`` tensor with, for every row of ``x``, the index of the
        class whose noisy vote count is highest.

    Raises:
        IndexError: if a vote lies outside ``[-num_classes, num_classes)``.
        ZeroDivisionError: if ``epsilon`` is 0.
    """
    votes = x.detach().cpu().numpy() if torch.is_tensor(x) else np.asarray(x)
    n_points = votes.shape[0]

    # First get the vote count for each datapoint. np.add.at accumulates
    # repeated (row, class) pairs, so every teacher vote adds exactly 1.
    count = np.zeros((n_points, num_classes))
    rows = np.arange(n_points)[:, None]
    np.add.at(count, (rows, votes), 1)

    # Add Laplacian noise to the votecount.
    count = torch.from_numpy(count)
    beta = 1 / epsilon
    noise = torch.from_numpy(np.random.laplace(0, beta, count.shape))
    n_labels = count.double() + noise

    # Then get the highest votecount index
    return torch.argmax(n_labels, dim=1)

In [16]:
# Show the collected votes: one row per data point, one column per teacher
print(opinions)

tensor([[7, 7, 7,  ..., 7, 7, 7],
        [2, 2, 2,  ..., 2, 2, 2],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [4, 4, 4,  ..., 4, 4, 4],
        [5, 5, 5,  ..., 5, 5, 5],
        [6, 6, 6,  ..., 6, 6, 6]])


In [0]:
# Aggregate the teachers' votes into one label per data point; with epsilon=1
# the Laplace noise added to the vote counts has scale 1/epsilon = 1
noisy_labels = noisy_argmax(opinions, epsilon=1)

In [56]:
# Show the noisy aggregated labels
print(noisy_labels)

tensor([7, 2, 1,  ..., 4, 5, 6])


In [65]:
# Share of data points whose noisy aggregated label matches the true label,
# reported as a whole percentage (truncated, not rounded).
equals = noisy_labels == labels
accuracy = equals.float().mean()
print(f"Noisy Argmax Accuracy: {int(accuracy*100)}%")

Noisy Argmax Accuracy: 95%
