<a href="https://colab.research.google.com/github/dchu1/AI_P2_cl/blob/master/PathNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
n_tasks = 20

# Imports

In [0]:
import math
import random

import torch
from torch.nn.parameter import Parameter
from torch.nn import init
from torch.nn import Module

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchvision
from torchvision import datasets, transforms

import torch.utils.data as data_utils

import numpy as np
import subprocess
import os
import random
from PIL import Image

import matplotlib.pyplot as plt
from IPython.core.debugger import set_trace

import pickle

# Constructing Dataset

In [0]:
def rotate_dataset(d, rotation):
  result = torch.FloatTensor(d.size(0), 784)
  tensor = transforms.ToTensor()

  for i in range(d.size(0)):
    img = Image.fromarray(d[i].numpy(), mode="L")
    result[i] = tensor(img.rotate(rotation)).view(784)
  return result

mnist_path = "mnist.npz"
if not os.path.exists(os.path.join("/content", mnist_path)):
  subprocess.call("wget https://s3.amazonaws.com/img-datasets/mnist.npz", shell=True)

f = np.load(mnist_path)
x_tr = torch.from_numpy(f["x_train"])
y_tr = torch.from_numpy(f["y_train"]).long()
x_te = torch.from_numpy(f["x_test"])
y_te = torch.from_numpy(f["y_test"]).long()
f.close()

# Rotate Dataset
tasks_tr = []
tasks_te = []
mnist_rot_path = "mnist_rotations.pt"
if not os.path.exists(os.path.join("/content", mnist_rot_path)):
    torch.manual_seed(0)

    for t in range(n_tasks):
      min_rot = 1.0 * t / n_tasks * (180.0 - 0.0) + 0.0
      max_rot = 1.0 * (t + 1) / n_tasks * (180.0 - 0.0) + 0.0
      rot = random.random() * (max_rot - min_rot) + min_rot

      tasks_tr.append([rot, rotate_dataset(x_tr, rot), y_tr])
      tasks_te.append([rot, rotate_dataset(x_te, rot), y_te])

    torch.save([tasks_tr, tasks_te], 'mnist_rotations.pt')
else:
    tasks_tr, tasks_te = torch.load('/content/mnist_rotations.pt')

# Genotype Definition

In [0]:
import random
import numpy as np
import copy

class Genotype():

    def __init__(self, L, M, N):
        self.genotype = np.random.randint(0, M, (L,N))
        self.L = L
        self.M = M
        self.N = N

    def apply_mutation(self, i, j):
        gene = self.genotype[i][j] + random.randint(-2, 2)
        if gene < 0:
            gene += self.M
        elif gene > self.M - 1:
            gene -= self.M
        self.genotype[i][j] = gene

    def mutate(self):
        for i in range(self.L):
            for j in range(self.N):
                if random.random() < 1.0 / (self.L * self.N):
                    self.apply_mutation(i, j)

    def return_genotype(self):
        return self.genotype

    def overwrite(self, genotype):
        self.genotype = copy.deepcopy(genotype)


class Genetic():

    def __init__(self, L, M, N, pop): 
        """L: layers, M: units in each layer, N: number of active units, pop: number of gene"""
        self.genotypes = [Genotype(L, M, N) for _ in range(pop)]
        self.pop = pop
        self.control_fixed = random.sample(self.genotypes,1)[0]

    def return_all_genotypes(self):
        genotypes = [gene.return_genotype() for gene in self.genotypes]
        return genotypes

    def return_control(self):
        return self.control_fixed

    def return_control_genotype(self):
        return self.control_fixed.return_genotype()

    def sample(self):
        return random.sample(self.genotypes, 2)

    def overwrite(self, genotypes, fitnesses):
        win = genotypes[fitnesses.index(max(fitnesses))]
        lose = genotypes[fitnesses.index(min(fitnesses))]
        genotype = win.return_genotype()
        lose.overwrite(genotype)
        lose.mutate()

# PathNet Model 2 Layer Conv

In [0]:
from __future__ import division

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.autograd import gradcheck

from IPython.core.debugger import set_trace

class Net(nn.Module):

    def __init__(self, L, M, N, num_neurons, lr, use_cuda):
        super(Net, self).__init__()
        self.L = L
        self.M = M
        self.N = N
        self.lr = lr
        self.num_neurons = num_neurons
        self.use_cuda = use_cuda
        self.final_layers = []
        #self.init(None, None)

    def init(self, best_path, task):
        nochange_modules = [[]] * self.L
        if len(best_path) == 0:
            nochange_modules = [[None] * self.M] * self.L
        else:
            for i in range(len(best_path)):
                nochange_modules = np.concatenate((nochange_modules,best_path[i]), axis=1)
        num_neurons = self.num_neurons
        module_num = [self.M] * self.L
  
        # Construct our network. Don't touch modules that are frozen (nochange_modules)
        self.fc1 = []
        self.fc2 = []
        # self.fc3 = []

        for i in range(module_num[0]):
            if not i in nochange_modules[0]:
                """All parameters should be declared as member variable, so I think this is the simplest way to do so"""
                exec("self.m1" + str(i) + " = nn.Conv2d(1, 32, 3, 1)")
            exec("self.fc1.append(self.m1" + str(i) + ")")

        for i in range(module_num[1]):
            if not i in nochange_modules[1]:
                exec("self.m2" + str(i) + " = nn.Conv2d(32, 64, 3, 1)")
            exec("self.fc2.append(self.m2" + str(i) + ")")

        # for i in range(module_num[2]):
        #     if not i in best_path[2]:
        #         #exec("self.m3" + str(i) + " = nn.Linear(" + str(neuron_num) + ", 10)")
        #         exec("self.m3" + str(i) + " = nn.Linear(" + str(neuron_num) + "," + str(neuron_num) + ")")
        #     exec("self.fc3.append(self.m3" + str(i) + ")")


        if task != None:
            exec("self.final_layer" + str(task) + " = nn.Linear(9216, 10)")
            exec("self.final_layers.append(" + "self.final_layer" + str(task) + ")")

        # Get our trainable params for the optimizer
        trainable_params = []
        params_set = [self.fc1, self.fc2]
        for path, params in zip(nochange_modules, params_set):
            #print("Fixing Layer parameters")
            for i, param in enumerate(params):
                if  i in path:
                    param.requires_grad = False
                    #print("Fixing Module {} parameters".format(i))
                else:
                    p = {'params': param.parameters()}
                    trainable_params.append(p)
                    
        p = {'params': self.final_layers[-1].parameters()}
        trainable_params.append(p)
        self.optimizer = optim.SGD(trainable_params, lr=self.lr)
        if self.use_cuda:
            self.cuda()

    def forward(self, x, path, last):
        # flatten the 28*28 images into 1d array for linear layers
        #x = x.view(-1, 28*28)
        # reshape 1d array into 28*28 images
        #x = x.view(16,1,28,28)

        #M = self.M
        y = F.relu(self.fc1[path[0][0]](x))
        for j in range(1,len(path[0])):
            y += F.relu(self.fc1[path[0][j]](x))
        x = y
        y = F.relu(self.fc2[path[1][0]](x))
        for j in range(1,len(path[0])):
            y += F.relu(self.fc2[path[1][j]](x))
        x = y
        x = F.max_pool2d(x, 2)

        # y = F.relu(self.fc3[path[2][0]](x))
        # for j in range(1,self.N):
        #     y += F.relu(self.fc3[path[2][j]](x))
        # x = y
        
        x = torch.flatten(x, 1)
        x = self.final_layers[last](x)
        output = F.log_softmax(x, dim=1)
        return output

    def train_model(self, train_loader, path, num_batch):
        self.train()
        fitness = float(0)
        train_len = 0
        for batch_idx, (data, target) in enumerate(train_loader):
            if self.use_cuda:
                data, target = data.cuda(), target.cuda()
            data, target = Variable(data), Variable(target)
            self.optimizer.zero_grad()
            #x = data.view(16,1,28,28)
            output = self(data.view(16,1,28,28), path, -1)
            pred = output.data.max(1)[1] # get the index of the max log-probability
            correct = pred.eq(target.data).cpu().sum()
            fitness += correct
            train_len += len(target.data)
            loss = F.cross_entropy(output, target)
            loss.backward()
            #print("Batch:", batch_idx, "Acc:", str(correct.item()) + "/" + str(len(target.data)))
            self.optimizer.step()
            if not batch_idx < num_batch -1:
                break
        #set_trace()
        fitness = fitness / train_len
        return fitness

    def test_model(self, test_loader, path, last):
        self.eval()

        # For now we will throw out the path given to us and just run on the
        # entire network.
        path = [np.arange(self.M)] * 2

        fitness = float(0)
        train_len = 0
        for batch_idx, (data, target) in enumerate(test_loader):
            if self.use_cuda:
                data, target = data.cuda(), target.cuda()
            data, target = Variable(data), Variable(target)
            self.optimizer.zero_grad()
            #x = data.view(1,1,28,28)
            output = self(data.view(1,1,28,28), path, last)
            pred = output.data.max(1)[1] # get the index of the max log-probability
            fitness += pred.eq(target.data).cpu().sum()
            train_len += len(target.data)
            if batch_idx > 1000:
                break
        fitness = fitness / train_len
        return fitness


# Run Experiment

In [0]:
def train_pathnet(model, gene, train_loader, num_batch, best_fitness, best_path, gen):
    pathways = gene.sample()
    fitnesses = []
    train_data = [(data, target) for (data,target) in train_loader]
    for pathway in pathways:
        path = pathway.return_genotype()
        fitness = model.train_model(train_data, path, num_batch)
        fitnesses.append(fitness)
    #print("Generation {} : Fitnesses = {} vs {}".format(gen, fitnesses[0], fitnesses[1]))

    gene.overwrite(pathways, fitnesses)
    genes = gene.return_all_genotypes()
    #visualizer.show(genes, vis_color)
    if max(fitnesses) > best_fitness:
        best_fitness = max(fitnesses)
        best_path = pathways[fitnesses.index(max(fitnesses))].return_genotype()
    #print("Generation {} : Best Fitness = {}".format(gen, best_fitness.item()))
    return best_fitness, best_path, max(fitnesses)

def train_control(model, gene, train_loader, gen):        
    path = gene.return_control_genotype()
    train_data = [(data, target) for (data,target) in train_loader]
    fitness = model.train_model(train_data, path, args.num_batch)
    #print("Generation {} : Fitness = {}".format(gen, fitness))
    genes = [gene.return_control_genotype()] * args.pop
    #visualizer.show(genes, 'm')
    return fitness

def evaluate_on_tasks(model, tasks_trained, test_loaders):
    print("Evaluating on task test sets")
    test_acc = []
    for k in range(n_tasks):
        fitness = model.test_model(test_loaders[k], None, k if k <= tasks_trained else -1)
        print("Test Accuracy on Task Set {}: {}".format(k, fitness))
        test_acc.append(fitness.item())
    print("Average Test Accuracy: {}".format(i,np.mean(test_acc)))
    return test_acc

In [0]:
def main():
    # Training settings
    L = 2
    M = 10
    N = 4
    pop = 64
    batch_size = 16
    batch_limit = 150
    lr = 0.01
    num_neurons = 20
    generation_limit = 50
    control = False
    fine_tune = False
    use_cuda = True
    seed = 0

    cuda = use_cuda and torch.cuda.is_available()
    kwargs = {'num_workers': 1, 'pin_memory': True} if cuda else {}

    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed(seed)

    model = Net(L, M, N, num_neurons, lr, cuda)
    #module_num = [args.M] * args.L

    if cuda:
        model.cuda()

    if not os.path.isdir('./result'):
        os.makedirs("./result")

    if os.path.exists('./result/result_mnist.pickle'):
        f = open('./result/result_mnist.pickle','rb')
        result = pickle.load(f)
        f.close()
    else:
        result = []

    test_loaders = []
    for i in range(n_tasks):
        test_loaders.append(data_utils.DataLoader(data_utils.TensorDataset(tasks_te[i][1], tasks_te[i][2]), shuffle = False))

    best_paths = []
    best_path = [[None] * N] * L
    total_test_acc = []
    tasks_fitness = []
    gene = Genetic(L, M, N, pop)

    # Initialize (or reinitialise) our model
    model.init(best_paths, 0)
    # First we will evaluate before doing any training
    #total_test_acc.append(evaluate_on_tasks(model, i, test_loaders))

    # Then train across tasks
    for i in range(n_tasks):
        gen = 0
        best_fitness = 0.0
        task_fitness = []

        # If not control, generate a new gene which controls the permutations of
        # pathways
        if not control:
          gene = Genetic(L, M, N, pop)

        # Initialize (or reinitialise) our model
        model.init(best_paths, i+1)
        
        # Load our training data
        train = data_utils.TensorDataset(tasks_tr[i][1], tasks_tr[i][2])
        train_loader = data_utils.DataLoader(train, batch_size=batch_size, shuffle = True)
        print("Training Task {} started...".format(i))
        
        # Begin our training tournament
        for gen in range(generation_limit):
            if not control:
                best_fitness, best_path, max_fitness = train_pathnet(model, gene, train_loader, batch_limit, best_fitness, best_path, gen)
                task_fitness.append(max_fitness)
            else: ##control experiment
                fitness = train_control(model, gene, train_loader, gen)
                task_fitness.append(fitness)

        # Store our best fitness and path
        tasks_fitness.append(task_fitness)
        best_paths.append(best_path)
        #print("Best Paths:", best_paths)
        # append the new best path to the existing best path (if its not the first)
        # iteration. Otherwise the new best path is the best path
        # if i == 0:
        #     set_trace()
        #     best_path = new_best_path
        #     print(best_path)
        # else:
        #     best_path = np.concatenate((best_path, new_best_path), axis=1)
        #     print(best_path)

        # Evaluate on our test sets
        test_acc = evaluate_on_tasks(model, i, test_loaders)
        total_test_acc.append(test_acc)

        # test_acc = []
        # for k in range(n_tasks):
        #     fitness = model.test_model(tests[k], best_path, k if k <= i else -1)
        #     #fitness = model.test_model(tests[k], best_path, -1)
        #     print("After Training Task {}: Test Accuracy on Task {}: {}".format(i, k, fitness))
        #     test_acc.append(fitness.item())
        # total_test_acc.append(test_acc)
        # print("Average Test Accuracy After Training Task {}: {}".format(i,np.mean(test_acc)))
        print("Task {} done.".format(i))

    average_acc = np.mean(total_test_acc[n_tasks-1])
    print("Accuracy:", average_acc)
    print("Confusion matrix:")
    print('\n'.join([','.join([str(item) for item in row]) for row in total_test_acc]))
    # print(total_test_acc)
    # for i in range(len(total_test_acc)):
    #     print("{}(mean={})".format(total_test_acc[i], np.mean(total_test_acc[i])))

    # Save our results
    if control:
        if fine_tune:
            result.append(('fine_tune', total_test_acc))
        else:
            result.append(('control', total_test_acc))
    else:
        result.append(('connfustion_matrix', total_test_acc))
    f = open('./result/result_mnist.pickle', 'wb')
    pickle.dump(result, f)
    f.close()

In [0]:
main()

Training Task 0 started...
Evaluating on task test sets
Test Accuracy on Task Set 0: 0.038922157138586044
Test Accuracy on Task Set 1: 0.9590818285942078
Test Accuracy on Task Set 2: 0.8582834601402283
Test Accuracy on Task Set 3: 0.6866267323493958
Test Accuracy on Task Set 4: 0.538922131061554


# Plotting

In [0]:
import argparse
import os
import pickle
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')

parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
parser.add_argument('--mnist', action='store_true', default=True,
                    help='open mnist result')
args = parser.parse_args(args=[])


def subplot(subplot, data_first, data_second, data_third, data_first_control, data_second_control, data_third_control, title):
    plt.subplot(subplot)
    if args.mnist:
        x = np.arange(0,100)
    else:
        x = np.arange(0,500)
    y_first = np.mean(data_first, axis=0)
    y_second = np.mean(data_second, axis=0)
    y_third = np.mean(data_third, axis=0)
    y_first_control = np.mean(data_first_control, axis=0)
    y_second_control = np.mean(data_second_control, axis=0)
    y_third_control = np.mean(data_third_control, axis=0)
    #y_first_err = np.std(data_first, axis=0) / 2.
    #y_second_err = np.std(data_second, axis=0) / 2. 
    #y_third_err = np.std(data_third, axis=0) / 2. 
    
    #plt.fill_between(x, y_first - y_first_err, y_first + y_first_err, color='m', alpha=0.3)
    #plt.fill_between(x, y_second - y_second_err, y_second + y_second_err, color='y', alpha=0.3)
    #plt.fill_between(x, y_third - y_third_err, y_third + y_third_err, color='c', alpha=0.3)
    plt.plot(x, y_first, color='r', label='Task A')
    plt.plot(x, y_second, color='g', label='Task B')
    plt.plot(x, y_third, color='b', label='Task C')
    plt.plot(x, y_first_control, linestyle='dashed', color='r', label='Task A (control')
    plt.plot(x, y_second_control, linestyle='dashed', color='g', label='Task B (control)')
    plt.plot(x, y_third_control, linestyle='dashed', color='b', label='Task C (control)')
    plt.legend(bbox_to_anchor=(0.8, 0.3), loc=2, ncol=1, fontsize=15)
    axes = plt.gca()

    if args.mnist:
        axes.set_xlim([0, 100])
        axes.set_ylim([0, 1.2])
    else:
        axes.set_xlim([0, 500])
        axes.set_ylim([0, 0.6])
    plt.title(title, fontsize=20, y = 0.9)
    plt.ylabel('Accuracy',fontsize=15)
    plt.xlabel('Generations',fontsize=15)
    plt.grid(True)


try: 
    if args.mnist:
        f = open(os.path.join('./result/result_mnist.pickle'), 'rb')
        result = pickle.load(f)
        f.close()
        pathnet_first = [result[1][2]]
        pathnet_second = [result[1][3]]
        pathnet_third = [result[1][4]]

        pathnet_first_control = [result[0][2]]
        pathnet_second_control = [result[0][3]]
        pathnet_third_control = [result[0][4]]

        #for res in result:
            #pathnet_first.append(res[2])
            #pathnet_second.append(res[3])
            #pathnet_third.append(res[4])

        subplot('111', pathnet_first, pathnet_second, pathnet_third, pathnet_first_control, pathnet_second_control, pathnet_third_control, 'MNIST')
        plt.show()

    else:
        f = open(os.path.join('./result/result_cifar_svhn.pickle'))
        result = pickle.load(f)
        f.close()

        cifar_first = []
        cifar_second = []
        svhn_first = []
        svhn_second = []

        for res in result:
            if res[0] == 'pathnet_cifar_first':
                cifar_first.append(res[2])
                svhn_second.append(res[3])
            else:
                svhn_first.append(res[2])
                cifar_second.append(res[3])

        subplot('211', cifar_first, cifar_second,'CIFAR-10')
        subplot('212', svhn_first, svhn_second,'cSVHN')

        plt.show()


except IOError:
    print("Result file does not exist")
