# Catastrophic Forgetting

install dependencies:

In [None]:
!pip install avalanche-lib==0.3.1

tested with:
- python 3.10
- avalanche 0.3.1

In [1]:
import torch
import avalanche
import torchvision
import matplotlib.pyplot as plt

# Dataset
We start by loading CIFAR10. Unlike the previous exercises, now we split it into two datasets of 5 classes each.

In [2]:
from avalanche.benchmarks import SplitCIFAR10
from avalanche.benchmarks.generators import benchmark_with_validation_stream

# CIFAR10 split into two experiences, wrapped so that the benchmark also
# exposes a `valid_stream` (validation_size=0.3) next to the train stream.
benchmark = benchmark_with_validation_stream(
    SplitCIFAR10(n_experiences=2),
    validation_size=0.3,
)

Files already downloaded and verified
Files already downloaded and verified


In [10]:
# The train stream was built with n_experiences=2, so it unpacks into two experiences.
e0, e1 = benchmark.train_stream
# Display the class ids contained in each experience.
e0.classes_in_this_experience, e1.classes_in_this_experience

([3, 4, 5, 7, 9], [0, 1, 2, 6, 8])

In [11]:
# Display the task label attached to each of the two experiences.
e0.task_label, e1.task_label

(0, 0)

# Training - Naive Finetuning
We import the model directly from Avalanche

In [14]:
def train(exp, model, optimizer, criterion, epochs=1, batch_size=32):
    """Train ``model`` on the data of a single experience.

    Args:
        exp: Experience whose ``dataset`` yields ``(x, y, extra)`` triples;
            the third element is ignored here.
        model: Network to optimize; its parameters are updated in place.
        optimizer: Optimizer bound to ``model``'s parameters.
        criterion: Loss, called as ``criterion(prediction, target)``.
        epochs: Number of passes over the experience (default 1, the value
            that used to be hard-coded).
        batch_size: Mini-batch size (default 32, the value that used to be
            hard-coded).

    Note:
        ``DataLoader`` and ``tqdm`` are looked up in the notebook namespace at
        call time, so the cell importing them must run before the first call.
    """
    model.train()
    # Use the dataset in training mode.
    # NOTE(review): presumably this selects the training transforms - confirm in the Avalanche docs.
    train_data = exp.dataset.train()
    # Send batches to wherever the model lives instead of relying on a
    # notebook-level `device` variable that may be stale or not yet defined.
    model_device = next(model.parameters()).device

    for _ in range(epochs):
        # Reshuffle on every pass so that SGD does not depend on the storage
        # order of the samples.
        dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
        pbar = tqdm(dataloader)
        for x, y, _extra in pbar:
            x, y = x.to(model_device), y.to(model_device)
            optimizer.zero_grad()
            pred = model(x)
            loss = criterion(pred, y)
            loss.backward()
            optimizer.step()
            pbar.set_description(f"Loss: {loss.item():0.4f}")

In [15]:
from tqdm import tqdm
from avalanche.models import SlimResNet18
from torch.utils.data import DataLoader

device = 'cpu'  # do yourself a favor and use a gpu by setting device='cuda'

# Build the network and place it on the chosen device in a single step.
# NOTE(review): CIFAR10 has 10 classes, so nclasses=100 leaves unused outputs - confirm this is intended.
model = SlimResNet18(nclasses=100).to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

# Naive finetuning: visit the experiences one after the other,
# training only on the data of the current one.
for exp in benchmark.train_stream:
    train(exp, model, optimizer, criterion)

Loss: 1.1519: 100%|██████████████████████████████████████████████████████████████████| 547/547 [02:19<00:00,  3.93it/s]
Loss: 1.2352: 100%|██████████████████████████████████████████████████████████████████| 547/547 [02:20<00:00,  3.89it/s]


# Evaluation

In [18]:
def eval(exp, model):
    """Measure the classification accuracy of ``model`` on one experience.

    NOTE: this name shadows Python's built-in ``eval`` for the rest of the
    notebook; it is kept because other cells call it under this name.

    Args:
        exp: Experience whose ``dataset`` yields ``(x, y, extra)`` triples;
            the third element is ignored here.
        model: Network to evaluate; it is switched to eval mode.

    Returns:
        The fraction of correctly classified samples, or ``0.0`` when the
        experience contains no samples.
    """
    model = model.eval()
    valid_data = exp.dataset.eval()
    # Send batches to wherever the model lives instead of relying on a
    # notebook-level `device` variable.
    model_device = next(model.parameters()).device
    dataloader = DataLoader(valid_data, batch_size=32)
    pbar = tqdm(dataloader)
    correct, tot = 0, 0
    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time.
    with torch.no_grad():
        for x, y, _extra in pbar:
            x, y = x.to(model_device), y.to(model_device)
            logits = model(x)
            _, pred = torch.max(logits, 1)
            correct += (pred == y).sum().item()
            tot += x.shape[0]
            pbar.set_description(f"ACC: {correct / tot:.4f}")
    return correct / tot if tot else 0.0

In [19]:
# Evaluate the final model on the validation data of every experience,
# including the first one, to see how much of it is still remembered.
for exp in benchmark.valid_stream:
    eval(exp, model)

ACC: 0.0000: 100%|███████████████████████████████████████████████████████████████████| 235/235 [00:20<00:00, 11.38it/s]
ACC: 0.6792: 100%|███████████████████████████████████████████████████████████████████| 235/235 [00:20<00:00, 11.62it/s]


# Training - MultiTask

The previous baseline was not using task labels. What happens if you finetune the model but you use a multi-head classifier?

We are going to use Avalanche for the implementation. We are going to look at the implementation in a future lecture.

In [31]:
from avalanche.training import Naive
from avalanche.models import as_multitask

# a benchmark with task labels
benchmark = SplitCIFAR10(n_experiences=2, return_task_id=True)
benchmark = benchmark_with_validation_stream(benchmark, validation_size=0.3)

device = 'cpu'  # do yourself a favor and use a gpu by setting device='cuda'
model = SlimResNet18(nclasses=100)

# change the classifier to a multi-head classifier
model = as_multitask(model, "linear")
model = model.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

# naive is the naive finetuning, the same training method that we used above
strategy = Naive(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_mb_size=32, train_epochs=1,
    # Hand the chosen device to the strategy as well: it manages the placement
    # of the model and of the mini-batches itself, so without this argument
    # changing `device` above would not change where training runs.
    device=device,
)

# After training on each experience, evaluate on the whole validation stream
# so that the accuracy on earlier experiences can be followed over time.
for exp in benchmark.train_stream:
    strategy.train(exp)
    strategy.eval(benchmark.valid_stream)

Files already downloaded and verified
Files already downloaded and verified
-- >> Start of training phase << --
100%|████████████████████████████████████████████████████████████████████████████████| 547/547 [02:20<00:00,  3.89it/s]
Epoch 0 ended.
	Loss_Epoch/train_phase/train_stream/Task000 = 1.1345
	Top1_Acc_Epoch/train_phase/train_stream/Task000 = 0.5091
-- >> End of training phase << --
-- >> Start of eval phase << --
-- Starting eval on experience 0 (Task 0) from valid stream --
100%|████████████████████████████████████████████████████████████████████████████████| 235/235 [00:21<00:00, 11.19it/s]
> Eval on experience 0 (Task 0) from valid stream ended.
	Loss_Exp/eval_phase/valid_stream/Task000/Exp000 = 0.9699
	Top1_Acc_Exp/eval_phase/valid_stream/Task000/Exp000 = 0.6105
-- Starting eval on experience 1 (Task 1) from valid stream --
100%|████████████████████████████████████████████████████████████████████████████████| 235/235 [00:21<00:00, 10.81it/s]
> Eval on experience 1 (Task 1) 

What's happening here? Why do we have forgetting even though we are using task labels?

# Training - Cumulative
Cumulative training trains, at each step, on all the data seen so far:
- t=0 -> train on batch0
- t=1 -> train on batch0 U batch1, starting from the previous model

...

In [23]:
from avalanche.training import Cumulative

# Rebuild the benchmark WITHOUT task labels. The multi-head cell above rebinds
# `benchmark` to the task-labelled variant, so on a fresh "Restart & Run All"
# this single-head baseline would otherwise silently run on that benchmark.
benchmark = SplitCIFAR10(n_experiences=2)
benchmark = benchmark_with_validation_stream(benchmark, validation_size=0.3)

device = 'cpu'  # do yourself a favor and use a gpu by setting device='cuda'
model = SlimResNet18(nclasses=100)
model = model.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

strategy = Cumulative(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_mb_size=32, train_epochs=1,
    # Hand the chosen device to the strategy as well, so that changing
    # `device` above actually changes where training runs.
    device=device,
)

# After each training step, evaluate on the whole validation stream.
for exp in benchmark.train_stream:
    strategy.train(exp)
    strategy.eval(benchmark.valid_stream)

-- >> Start of training phase << --
100%|████████████████████████████████████████████████████████████████████████████████| 547/547 [02:18<00:00,  3.96it/s]
Epoch 0 ended.
	Loss_Epoch/train_phase/train_stream/Task000 = 1.2911
	Top1_Acc_Epoch/train_phase/train_stream/Task000 = 0.4885
-- >> End of training phase << --
-- >> Start of eval phase << --
-- Starting eval on experience 0 (Task 0) from valid stream --
100%|████████████████████████████████████████████████████████████████████████████████| 235/235 [00:19<00:00, 12.05it/s]
> Eval on experience 0 (Task 0) from valid stream ended.
	Loss_Exp/eval_phase/valid_stream/Task000/Exp000 = 1.0170
	Top1_Acc_Exp/eval_phase/valid_stream/Task000/Exp000 = 0.5819
-- Starting eval on experience 1 (Task 0) from valid stream --
100%|████████████████████████████████████████████████████████████████████████████████| 235/235 [00:19<00:00, 12.12it/s]
> Eval on experience 1 (Task 0) from valid stream ended.
	Loss_Exp/eval_phase/valid_stream/Task000/Exp001 = 

# Training - Joint

Just as a baseline, we can also train on all the data at once.

In [26]:
from avalanche.training import JointTraining

# Rebuild the benchmark WITHOUT task labels. The multi-head cell earlier in the
# notebook rebinds `benchmark` to the task-labelled variant, so this
# single-head baseline must not depend on whichever cell last assigned it.
benchmark = SplitCIFAR10(n_experiences=2)
benchmark = benchmark_with_validation_stream(benchmark, validation_size=0.3)

device = 'cpu'  # do yourself a favor and use a gpu by setting device='cuda'
model = SlimResNet18(nclasses=100)
model = model.to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = torch.nn.CrossEntropyLoss()

strategy = JointTraining(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_mb_size=32, train_epochs=1,
    # Hand the chosen device to the strategy as well, so that changing
    # `device` above actually changes where training runs.
    device=device,
)

# we train on the entire stream all at once
# internally, it's going to concatenate all the batches and train on the joint data
strategy.train(benchmark.train_stream)
strategy.eval(benchmark.valid_stream)

-- >> Start of training phase << --
 29%|██████████████████████▊                                                        | 316/1094 [02:02<03:15,  3.99it/s]

KeyboardInterrupt: 

# Exercises
- try to measure the forgetting by monitoring how the accuracy on each task is changing over time
- can you identify some of the causes of forgetting? Is it:
    - a high number of epochs?
    - a high learning rate?
    - the model architecture (width, depth, batch-norm, ...)?
    - the type of drift?
- try to repeat the experiments with a new type of drift. Each experience contains the same classes but with new instances. For example, you can split CIFAR10 in a class-balanced way or use PermutedMNIST, which permutes the images differently at every step (you need to use a feedforward net in this case). 
- what happens if you revisit the stream multiple times? You can try this by repeating the stream several times in the multi-head finetuning example