# ¿How well are the first task representations preserved?

In [1]:
import torch
from functools import partial
import torch.nn as nn
from torchvision import transforms
import torch.nn.functional as F
import torch.optim as optim
import torchvision
from tqdm import tqdm
import matplotlib.pyplot as plt


# reproducibility
seed = 1993
torch.manual_seed(1)
torch.cuda.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
device = 'cuda' if torch.cuda.is_available() else 'cpu'
data_dir = "/home/studio-lab-user/CIL_Survey/data"

## Load and preprocess data

In [2]:
train_dataset_gpu = {}
eval_dataset_gpu = {}

# dataset
train = torchvision.datasets.CIFAR100(root=data_dir, download=True, transform=transforms.ToTensor())
eval = torchvision.datasets.CIFAR100(root=data_dir, train=False, transform=transforms.ToTensor())

# move dataset to gpu
train_dataset_gpu_loader = torch.utils.data.DataLoader(train, batch_size=len(train), drop_last=True,
                                            shuffle=True, num_workers=2, persistent_workers=False)
eval_dataset_gpu_loader = torch.utils.data.DataLoader(eval, batch_size=len(eval), drop_last=True,
                                            shuffle=False, num_workers=1, persistent_workers=False)
train_dataset_gpu['images'], train_dataset_gpu['targets'] = [item.to(device="cuda", non_blocking=True) for item in next(iter(train_dataset_gpu_loader))]
eval_dataset_gpu['images'],  eval_dataset_gpu['targets']  = [item.to(device="cuda", non_blocking=True) for item in next(iter(eval_dataset_gpu_loader)) ]

# normalize images
train_cifar_std, train_cifar_mean = torch.std_mean(train_dataset_gpu['images'], dim=(0, 2, 3)) 
print(f"Mean: {[f'{x:.4f}' for x in train_cifar_mean.tolist()]}")
print(f"Std: {[f'{x:.4f}' for x in train_cifar_std.tolist()]}")
def batch_normalize_images(input_images, mean, std):
    return (input_images - mean.view(1, -1, 1, 1)) / std.view(1, -1, 1, 1)
batch_normalize_images = partial(batch_normalize_images, mean=train_cifar_mean, std=train_cifar_std)
train_dataset_gpu['images'] = batch_normalize_images(train_dataset_gpu['images'])
eval_dataset_gpu['images']  = batch_normalize_images(eval_dataset_gpu['images'])

data = {
        'train': train_dataset_gpu,
        'eval': eval_dataset_gpu
    }

# pad images for later random cropping
pad_amount = 4
data['train']['images'] = F.pad(data['train']['images'], (pad_amount,)*4, 'reflect')

Files already downloaded and verified
Mean: ['0.5071', '0.4865', '0.4409']
Std: ['0.2673', '0.2564', '0.2762']


## Get model

In [3]:
net = torchvision.models.resnet18()
net.to(device);  

## Train model

### Train model

In [4]:
from batch_transforms import get_batches

# training
def train(optimizer, task):
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    indices = range(increment * task, increment * (task + 1))
    for batch_idx, (inputs, targets) in enumerate(get_batches(data, "train", batch_size, indices=indices)):
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets) if task == 0 else criterion(outputs[:, task * increment:], targets - task * increment)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += len(targets)
        correct += predicted.eq(targets).sum().item()

    return train_loss/(batch_idx + 1), 100.*correct/total

def eval(from_task, to_task):
    net.eval()
    test_loss = 0
    correct = 0
    total = 0
    indices = range(from_task * increment, (to_task+1) * increment)
    with torch.no_grad():
        for batch_idx, (inputs, targets) in enumerate(get_batches(data, "eval", batch_size, indices=indices)):
            outputs = net(inputs)
            loss = criterion(outputs, targets)

            test_loss += loss.item()
            _, predicted = outputs.max(1)
            total += len(targets)
            correct += predicted.eq(targets).sum().item()
        return test_loss/(batch_idx + 1), 100.*correct/total

### Method to train the linear classifier on a specific task

In [5]:
def train_classifier(optimizer, task):
    net.train()
    train_loss = 0
    correct = 0
    total = 0
    indices = range(increment * task, increment * (task + 1))
    # freeze all layers but the classifier
    for n, p in net.named_parameters():
        if n != "fc.weight" and n != "fc.bias":
            p.requires_grad = False
    for batch_idx, (inputs, targets) in enumerate(get_batches(data, "train", batch_size, indices=indices)):
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, targets) if task == 0 else criterion(outputs[:, task * incement:], targets - task * increment)

        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += len(targets)
        correct += predicted.eq(targets).sum().item()

    return train_loss/(batch_idx + 1), 100.*correct/total

### Incremental training

#### Train ideal model on first task and get the accuracy 

In [6]:
criterion = nn.CrossEntropyLoss()
increment = 5
total_classes = 100 
feature_dim = 512
batch_size = 128

# optimizer and scheduler config
init_epochs = 200
init_lr = 0.1
init_weight_decay = 0.0005

rest_epochs = 80
rest_lr = 0.1
rest_weight_decay = 2e-4
rest_milestones = [40, 70]
rest_lr_decay = 0.1

In [7]:
def incremental_train(task, to_task, linear_probe=False):
    # update linear classifier
    new_fc = nn.Linear(feature_dim, increment * (task+1), device="cuda")
    nn.init.kaiming_uniform_(new_fc.weight, nonlinearity='linear')
    nn.init.constant_(new_fc.bias, 0)
    if linear_probe == False:
        if task != 0:
            old_fc = net.fc
            new_fc.weight.data[:task * increment] = old_fc.weight
            new_fc.bias.data[:task * increment] = old_fc.bias
        net.fc = new_fc

    # select optimize and scheduler
    if task == 0:
        epochs = init_epochs
        optimizer = optim.SGD(net.parameters(), momentum=0.9, lr=init_lr, weight_decay=init_weight_decay)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    else: 
        epochs = rest_epochs
        optimizer = optim.SGD(net.parameters(), momentum=0.9, lr=rest_lr, weight_decay=rest_weight_decay)
        scheduler = optim.lr_scheduler.MultiStepLR(optimizer=optimizer, milestones=rest_milestones, gamma=rest_lr_decay)

    pbar = tqdm(range(epochs), unit="epoch")
    for epoch in pbar:
        tloss, tacc = train(optimizer, task) if linear_probe == False else train_classifier(optimizer, task)
        eloss, eacc = eval(from_task=0, to_task=to_task)
        scheduler.step()
        # metrics 
        # if epoch == epochs - 1:
        #     tlosses.append(tloss)
        #     taccs.append(tacc)
        #     elosses.append(eloss)
        #     eaccs.append(eacc)
        pbar.set_postfix({"Train Loss": tloss, "Train Acc": tacc, "Eval Loss": eloss, "Eval Acc": eacc})

    return eloss, eacc

In [8]:
loss0, accuracy0 = incremental_train(task=0, linear_probe=False, to_task=0)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [01:40<00:00,  1.99epoch/s, Train Loss=0.00422, Train Acc=99.9, Eval Loss=0.908, Eval Acc=82.8]


In [9]:
print(loss0)
print(accuracy0)

0.9079679250717163
82.8125


#### Train encoder on second task and measure accuracy

In [10]:
loss1, accuracy1 = incremental_train(task=1, linear_probe=False, to_task=0)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 80/80 [00:39<00:00,  2.01epoch/s, Train Loss=0.0167, Train Acc=97.2, Eval Loss=4.33, Eval Acc=4.95]


In [11]:
print(loss1)
print(accuracy1)

4.334564685821533
4.947916666666667


#### Train linear classifier on first task and measure accuracy

In [12]:
loss_lp, accuracy_lp = incremental_train(task=0, linear_probe=True, to_task=0)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [00:21<00:00,  9.36epoch/s, Train Loss=0.822, Train Acc=68.5, Eval Loss=0.831, Eval Acc=68.5]


In [13]:
print(loss_lp)
print(accuracy_lp)

0.831021249294281
68.48958333333333


#### Compare the ideal accuracy with the degraded accuarcy with the linear probe accuracy

If the linear probe accuracy is close to the ideal accuracy, then the catasthrofical fogetting is caused beacuse a degradation on the linear classifier,
If not, it is caused because of a degradation on the feature representation.