In [1]:
# Init/Load model
from transformers import BartForConditionalGeneration, BartTokenizer
import numpy as np
import torch
import os

# Prefer the GPU, but fall back to the CPU so the notebook still runs on hosts without CUDA
device = "cuda" if torch.cuda.is_available() else "cpu"

# Define a directory to save the models
SAVE_DIR = '../saved_models'
# exist_ok removes the check-then-create race and keeps the cell safely re-runnable
os.makedirs(SAVE_DIR, exist_ok=True)

# Number of epochs already trained; a value > 0 makes SimpleBART load SAVE_DIR/epoch_<start_epoch>
start_epoch = 0

class SimpleBART(torch.nn.Module):
    """Thin ``nn.Module`` wrapper bundling a BART seq2seq model with its tokenizer.

    The weights come from ``SAVE_DIR/epoch_<start_epoch>`` when the module-level
    ``start_epoch`` is positive, otherwise from the ``facebook/bart-base`` checkpoint.
    The tokenizer is always the ``facebook/bart-base`` one.
    """

    def __init__(self):
        super().__init__()

        # Decide where the weights come from before touching the loader
        if start_epoch > 0:
            loaded_name = f'epoch_{start_epoch}'
            weights_source = os.path.join(SAVE_DIR, loaded_name)
        else:
            loaded_name = 'facebook/bart-base'
            weights_source = loaded_name

        self.bart = BartForConditionalGeneration.from_pretrained(weights_source)
        print(f'Loaded {loaded_name}')

        self.tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

    def forward(self, input_ids, attention_mask):
        """Run BART on the encoder inputs and return its output object.

        Only ``input_ids`` and ``attention_mask`` are forwarded; no decoder inputs
        or labels are supplied by this wrapper.
        """
        bart_output = self.bart(input_ids=input_ids, attention_mask=attention_mask)
        return bart_output


# Build the wrapper and move it to the selected device; these three names are globals used by the later cells
model = SimpleBART().to(device)
# Adam over every parameter registered on the wrapper, with a fixed learning rate of 1e-5
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
# Cross-entropy criterion; trainFor applies it to flattened logits/labels of the positions it keeps
criterion = torch.nn.CrossEntropyLoss()

Loaded facebook/bart-base


In [2]:
# Load Raw Datasets
from torch.utils.data import Dataset, DataLoader
import csv

# Measured in samples: trainFor divides this by BATCH_SIZE to get the number of batches per optimizer step
ACCUMULATION_STEPS = 14
BATCH_SIZE = 14 # best performing batch size so far (in execution performance)
# Number of samples in the most recently built dataset; overwritten by every apply_concept call
DATA_SIZE = 0

class CustomDataset(Dataset):
    """Dataset of ``(text, tokens)`` pairs turned into fixed-length tensors.

    Args:
        data: indexable collection of ``(text, tokens)`` pairs, where ``tokens``
            is a list of integer target ids.
        tokenizer: callable tokenizer exposing ``pad_token_id``, ``cls_token_id``
            and ``eos_token_id``.
        max_length: length every input and target sequence is padded/truncated to.
    """

    def __init__(self, data, tokenizer, max_length=200):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pad_token_id = tokenizer.pad_token_id
        self.start_token_id = tokenizer.cls_token_id
        self.end_token_id = tokenizer.eos_token_id

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask, target_ids)`` for sample ``idx``.

        The target is ``start + tokens + end`` padded with the pad id to exactly
        ``max_length`` entries. Over-long token lists are truncated so the end
        token is always kept and every item has the same length (otherwise the
        default DataLoader collation cannot stack a batch).
        """
        text, tokens = self.data[idx]

        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=self.max_length)

        # Reserve two slots for the start and end markers before truncating the body
        body = list(tokens)[:max(self.max_length - 2, 0)]
        target = [self.start_token_id] + body + [self.end_token_id]
        # Only bites when max_length < 2; keeps the fixed-length guarantee anyway
        target = target[:self.max_length]
        target += [self.pad_token_id] * (self.max_length - len(target))

        return inputs["input_ids"].squeeze(0), inputs["attention_mask"].squeeze(0), torch.tensor(target, dtype=torch.long)


def load_data_from_csv(file_path):
    """Read ``(text, tokens)`` samples from a two-column CSV file.

    Column 0 is the text; column 1 is a comma-separated list of integer token
    ids (so it must be quoted in the file when it holds more than one id).

    Args:
        file_path: path of the CSV file to read.

    Returns:
        A list of ``(text, [int, ...])`` tuples in file order. Blank rows are
        skipped and an empty token field yields an empty list.

    Raises:
        IndexError: if a non-blank row has fewer than two columns.
        ValueError: if a token is not an integer.
    """
    data = []
    # The csv module asks for newline='' so it can handle line endings itself
    with open(file_path, 'r', newline='') as file:
        for row in csv.reader(file):
            if not row:
                # csv.reader yields [] for blank lines; they carry no sample
                continue
            text, raw_tokens = row[0], row[1]
            # Drop empty pieces so "" and a trailing comma do not reach int()
            tokens = [int(tok) for tok in raw_tokens.split(",") if tok.strip()]
            data.append((text, tokens))

    return data

def apply_concept(params, validation=False):
    """Build a DataLoader from ``../concept/egg.csv`` plus a slice of each requested concept file.

    Args:
        params: mapping of concept file name (without ``.csv``) to the fraction
            of that file to load. Fractions are clamped to the range [0, 1].
        validation: when True the slice is taken from the END of each file and
            the result is stored in the global ``validationLoader``; otherwise
            the slice is taken from the START and stored in the global
            ``dataloader``.

    Side effects:
        Replaces ``validationLoader`` or ``dataloader`` and sets the global
        ``DATA_SIZE`` to the number of samples just loaded.
    """
    global validationLoader
    global dataloader
    global DATA_SIZE

    # egg.csv is always part of the dataset, whatever params asks for
    merged_data = load_data_from_csv("../concept/egg.csv")

    print("Concepts loaded;")
    for file_name, percentage in params.items():
        data = load_data_from_csv(f"../concept/{file_name}.csv")
        # Clamp so fractions outside [0, 1] cannot produce a wrapped-around slice
        cutoff = min(max(int(len(data) * percentage), 0), len(data))

        if validation:
            # Not data[-cutoff:]: with cutoff == 0 that is data[0:], i.e. the WHOLE file
            loaded_data = data[len(data) - cutoff:]
        else:
            loaded_data = data[:cutoff]

        print(f"    - {file_name}: {len(loaded_data)}")
        merged_data.extend(loaded_data)

    DATA_SIZE = len(merged_data)
    dataset = CustomDataset(merged_data, model.tokenizer)
    if validation:
        validationLoader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
    else:
        dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
    print(f'  Total: {DATA_SIZE}')

In [3]:
# Setup Validator
def validate():
    """Score the global ``model`` on the global ``validationLoader`` and print the results.

    A target position is scored when it lies before the first EOS token of its
    sequence or is itself an EOS token; everything else is ignored.

    Returns:
        ``(avg_accuracy, validation_accuracy)`` where ``avg_accuracy`` is the mean
        over sequences of the fraction of scored positions predicted correctly,
        and ``validation_accuracy`` is the fraction of sequences whose scored
        positions are all correct.
    """
    eos_id = model.tokenizer.eos_token_id

    model.eval()

    # Running totals over the whole validation set
    fully_correct = 0
    sequences_seen = 0
    accuracy_sum = 0

    with torch.no_grad():
        for input_ids, attention_mask, targets in validationLoader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            targets = targets.to(device)

            logits = model(input_ids, attention_mask).logits

            # True from the first EOS onwards; combined below so EOS itself stays scored
            is_eos = targets == eos_id
            from_first_eos = is_eos.cumsum(dim=1).type(torch.bool)
            mask = ~from_first_eos | is_eos

            predicted = logits.max(2)[1]
            hits = predicted == targets

            # A sequence counts as correct when every scored position matches
            sequences_seen += targets.size(0)
            fully_correct += (hits | ~mask).all(dim=1).float().sum().item()

            # Per-sequence share of scored positions that match
            hits_per_sequence = (hits & mask).float().sum(dim=1)
            scored_per_sequence = mask.float().sum(dim=1)
            accuracy_sum += (hits_per_sequence / scored_per_sequence).sum().item()

    # Report both figures for the whole validation set
    validation_accuracy = fully_correct / sequences_seen
    print(f"  Total Seq Acc: {validation_accuracy*100:.3f}%")
    avg_accuracy = accuracy_sum / sequences_seen
    print(f"    Avg Seq Acc: {avg_accuracy*100:.3f}%")

    return avg_accuracy, validation_accuracy

In [4]:
# Setup trainer
def save_model():
    """Save the wrapped BART model to ``SAVE_DIR/epoch_<start_epoch>``."""
    # start_epoch is only read here, so it needs no ``global`` declaration
    checkpoint_name = f'epoch_{start_epoch}'
    model.bart.save_pretrained(os.path.join(SAVE_DIR, checkpoint_name))

def trainFor(num_epochs, target_loss=0, target_p50_loss=0, target_acc=1.0, target_seq_acc=1.0):
    """Train the global ``model`` on the global ``dataloader`` with gradient accumulation.

    Runs epochs ``start_epoch + 1`` .. ``start_epoch + num_epochs`` and advances
    the global ``start_epoch`` as it goes. After every epoch it prints loss
    statistics, appends the 0th-100th batch-loss percentiles to ``loss.csv``,
    runs ``validate()`` and stops early as soon as any target is met. A
    checkpoint is written every 10th epoch and once more when the loop ends.

    Gradients of ``ACCUMULATION_STEPS // BATCH_SIZE`` consecutive batches (at
    least 1) are averaged into one optimizer step; the final, possibly shorter,
    group of an epoch is always flushed.

    Args:
        num_epochs: maximum number of epochs to run in this call.
        target_loss: stop once an epoch's mean batch loss is <= this value.
        target_p50_loss: stop once an epoch's median batch loss is <= this value.
        target_acc: stop once the fraction of fully correct validation
            sequences (printed as "Total Seq Acc") is >= this value.
        target_seq_acc: stop once the mean per-sequence accuracy on the
            validation set (printed as "Avg Seq Acc") is >= this value.
    """
    global start_epoch

    EOS_TOKEN_ID = model.tokenizer.eos_token_id
    # Batches per optimizer step; clamped so ACCUMULATION_STEPS < BATCH_SIZE cannot cause a modulo by zero
    acc_batch = max(1, ACCUMULATION_STEPS // BATCH_SIZE)
    total_batches = len(dataloader)

    for epoch in range(start_epoch+1, start_epoch+num_epochs+1):
        start_epoch = epoch
        model.train()

        # Resetting the accumulated gradients
        optimizer.zero_grad()

        # Initialize counters for accuracy calculation
        total_correct_sequences = 0
        total_sequences = 0
        cumulative_loss = 0.0

        # Initialize list to store batch losses
        batch_losses = []

        for batch_idx, (input_ids, attention_mask, targets) in enumerate(dataloader):

            input_ids, attention_mask, targets = input_ids.to(device), attention_mask.to(device), targets.to(device)
            outputs = model(input_ids, attention_mask)
            logits = outputs.logits

            # Identify where the EOS token is in the target sequence
            is_eos = targets == EOS_TOKEN_ID
            eos_positions = is_eos.cumsum(dim=1).type(torch.bool)
            mask = ~eos_positions | is_eos

            # Apply mask to filter out tokens after the EOS token for loss computation
            active_loss = mask.view(-1).bool()
            active_logits = logits.view(-1, logits.size(-1))[active_loss]
            active_labels = targets.view(-1)[active_loss]
            loss = criterion(active_logits, active_labels)

            _, predicted = logits.max(2)
            correct_sequences = ((predicted == targets) | ~mask).all(dim=1).float().sum().item()
            total_sequences += targets.size(0)
            total_correct_sequences += correct_sequences

            # Accumulate the gradients, scaled so the group contributes its mean
            # gradient rather than the sum; the reported loss stays unscaled
            (loss / acc_batch).backward()
            loss_val = loss.item()

            cumulative_loss += loss_val
            batch_losses.append(loss_val)

            isLast = batch_idx == total_batches - 1

            # Step once acc_batch batches have been accumulated. Counting from
            # batch_idx + 1 avoids stepping after the very first batch, which
            # the 0-based test `batch_idx % acc_batch == 0` used to do.
            if isLast or (batch_idx + 1) % acc_batch == 0:
                optimizer.step()
                optimizer.zero_grad()

            print(f"\rEpoch: {epoch}, Batch: {batch_idx} of {total_batches}, loss: {loss_val:.6f}      ", end='')


        # Compute and print the accuracy for the entire epoch
        epoch_accuracy = total_correct_sequences / total_sequences
        cumulative_loss = cumulative_loss / len(batch_losses)
        p25_loss = np.percentile(batch_losses, 25)
        p50_loss = np.percentile(batch_losses, 50)
        p75_loss = np.percentile(batch_losses, 75)
        print(f"\rEpoch: {epoch}, Accuracy: {epoch_accuracy*100:.2f}%                                             ")
        print(f"  Loss: 25%: {p25_loss:.6f} 50%: {p50_loss:.6f} 75%: {p75_loss:.6f}\n   Avg: {cumulative_loss:.6f}")

        # Write loss values to loss.csv
        percentiles = [np.percentile(batch_losses, i) for i in range(101)]
        with open('loss.csv', mode='a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([epoch] + percentiles)

        if epoch % 10 == 0: # Save the model
            save_model()

        # validate() returns (mean per-sequence accuracy, fraction of fully correct sequences)
        seq_acc, total_acc = validate()

        if total_acc >= target_acc:
            break
        if cumulative_loss <= target_loss:
            break
        if p50_loss <= target_p50_loss:
            break

        if seq_acc >= target_seq_acc:
            break

    # Make sure last epoch is always saved
    save_model()

In [5]:
# Validation set: all of vocabulary + the last 10% of noise. Training set: all of vocabulary + the first 2% of noise.
ACCUMULATION_STEPS = BATCH_SIZE # pure memorisation so accumulation won't help
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "noise": 0.02})
# Up to 50 epochs; stops early once validate()'s "Avg Seq Acc" reaches 0.50
trainFor(50, target_seq_acc=0.50)
# (34mins)

Concepts loaded;
    - vocabulary: 6808
    - noise: 15614
  Total: 22423
Concepts loaded;
    - vocabulary: 6808
    - noise: 3122
  Total: 9931
Epoch: 1, Accuracy: 0.00%                                             
  Loss: 25%: 4.403065 50%: 4.849018 75%: 5.296900
   Avg: 4.866282
  Total Seq Acc: 0.004%
    Avg Seq Acc: 39.869%
Epoch: 2, Accuracy: 0.04%                                             
  Loss: 25%: 4.114165 50%: 4.543126 75%: 4.914825
   Avg: 4.488807
  Total Seq Acc: 0.000%
    Avg Seq Acc: 39.873%
Epoch: 3, Accuracy: 0.04%                                             
  Loss: 25%: 4.032996 50%: 4.430487 75%: 4.820580
   Avg: 4.396551
  Total Seq Acc: 0.076%
    Avg Seq Acc: 39.906%
Epoch: 4, Accuracy: 0.06%                                             
  Loss: 25%: 3.972137 50%: 4.378263 75%: 4.781254
   Avg: 4.345113
  Total Seq Acc: 0.120%
    Avg Seq Acc: 39.934%
Epoch: 5, Accuracy: 0.12%                                             
  Loss: 25%: 3.932351 50%: 4.367048

In [6]:
# Training set grows to vocabulary + the first 4% of noise; the validationLoader already in memory is reused.
ACCUMULATION_STEPS = BATCH_SIZE # pure memorisation so accumulation won't help
apply_concept({"vocabulary": 1.0, "noise": 0.04})
# Up to 50 more epochs; stops early once "Avg Seq Acc" reaches 0.60
trainFor(50, target_seq_acc=0.60)
# (5mins)

Concepts loaded;
    - vocabulary: 6808
    - noise: 6245
  Total: 13054
Epoch: 10, Accuracy: 13.81%                                             
  Loss: 25%: 2.656691 50%: 2.988907 75%: 3.271646
   Avg: 2.973416
  Total Seq Acc: 19.721%
    Avg Seq Acc: 68.222%


In [7]:
# Training set grows to vocabulary + the first 6% of noise; the validationLoader already in memory is reused.
ACCUMULATION_STEPS = 28 # 2 * 14 (BATCH_SIZE); multiple of 14 close to 32
apply_concept({"vocabulary": 1.0, "noise": 0.06})
# Up to 50 more epochs; stops early once "Avg Seq Acc" reaches 0.65
trainFor(50, target_seq_acc=0.65)
# (5mins)

Concepts loaded;
    - vocabulary: 6808
    - noise: 9368
  Total: 16177
Epoch: 11, Accuracy: 17.17%                                             
  Loss: 25%: 2.074187 50%: 2.373098 75%: 2.669538
   Avg: 2.376195
  Total Seq Acc: 27.387%
    Avg Seq Acc: 76.537%


Aiming for:
$$
\begin{align*}
  \frac{unique}{tokens} &= \frac{4174}{6808} = 61.31\%  & \text{sign pairs to text only used once} \\
  \frac{text}{tokens}   &= \frac{5342}{6808}  = 78.47\% & \text{sign pairs to text unique text} \\
  & & \text{unique meaning the text is only used for one tokenID}
\end{align*}
$$

i.e. there are six different signs which can be used for "present"  
which means we're actually aiming for $96\%$ effective accuracy $\frac{75\%}{78\%}$

But we don't want to over-fit either  
Hence why we slowly introduce new concepts while still memorising vocabulary

`target_seq_acc` includes the `EOS` token, which of course we want to be right
So our target should be $\frac{61.31\% + 100\%}{2} = 80.65\%$

In [None]:
# Training set grows to vocabulary + the first 8% of noise; the validationLoader already in memory is reused.
ACCUMULATION_STEPS = 70 # 5 * 14 (BATCH_SIZE); multiple of 14 close to 64
apply_concept({"vocabulary": 1.0, "noise": 0.08})
# Up to 50 more epochs; stops early once "Avg Seq Acc" reaches 0.80655
trainFor(50, target_seq_acc=0.80655)
# (mins)

In [None]:
# Validation breakdown: combined set, then vocabulary only, then the last 10% of noise only
# (egg.csv is always loaded as well). Each call replaces the global validationLoader,
# so it is left pointing at the noise-only set when this cell finishes.
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
validate()
apply_concept({"vocabulary": 1.0}, validation=True)
validate()
apply_concept({"noise": 0.1}, validation=True)
validate()

The pure memorisation is over, as the vocabulary has been sufficiently learnt  
While it's not perfect, it will improve further over the later training, as the vocab remains in the training set

In [None]:
# Restore the combined validation set, then train on vocabulary + the first 22.5% of noise.
ACCUMULATION_STEPS = 1022 # 73 * 14 (BATCH_SIZE); largest multiple of 14 not above 1024
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "noise": 0.225})
# Up to 50 more epochs; stops early once "Avg Seq Acc" reaches 0.6
trainFor(50, target_seq_acc=0.6)
# (mins)

In [None]:
# Breakdown of validation per concept: vocabulary only, then the last 10% of noise only.
# The global validationLoader is left pointing at the noise-only set afterwards.
apply_concept({"vocabulary": 1.0}, validation=True)
validate()
apply_concept({"noise": 0.1}, validation=True)
validate()

In [None]:
# Restore the combined validation set, then train on vocabulary + the first 45% of noise.
ACCUMULATION_STEPS = 1022 # 73 * 14 (BATCH_SIZE); largest multiple of 14 not above 1024
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "noise": 0.45})
# Up to 50 more epochs; stops early once "Avg Seq Acc" reaches 0.7
trainFor(50, target_seq_acc=0.7)
# (mins)

In [None]:
# Validate the model saved correctly
# NOTE(review): nothing in this cell reloads a checkpoint from SAVE_DIR, so this scores the
# in-memory model - confirm whether a fresh load was intended.
# Runs the combined set, then vocabulary only, then the last 10% of noise only.
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
validate()
apply_concept({"vocabulary": 1.0}, validation=True)
validate()
apply_concept({"noise": 0.1}, validation=True)
validate()

In [None]:
# Restore the combined validation set, then train on vocabulary + the first 90% of noise
# (the last 10% of noise stays reserved for validation).
ACCUMULATION_STEPS = 1022 # 73 * 14 (BATCH_SIZE); largest multiple of 14 not above 1024
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "noise": 0.9})
# Up to 50 more epochs; stops early once "Avg Seq Acc" reaches 0.75
trainFor(50, target_seq_acc=0.75)
# (mins)

In [None]:
# Validation breakdown: combined set, then vocabulary only, then the last 10% of noise only.
# The global validationLoader is left pointing at the noise-only set afterwards.
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
validate()
apply_concept({"vocabulary": 1.0}, validation=True)
validate()
apply_concept({"noise": 0.1}, validation=True)
validate()

In [None]:
# Same data as the previous training stage (vocabulary + first 90% of noise), with tighter stop targets.
ACCUMULATION_STEPS = 1022 # 73 * 14 (BATCH_SIZE); largest multiple of 14 not above 1024
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "noise": 0.9})
# Up to 50 more epochs; stops early once the median batch loss is <= 0.2 or "Avg Seq Acc" reaches 0.90
trainFor(50, target_p50_loss=0.2, target_seq_acc=0.90)
# (31mins)

In [None]:
# Validation breakdown: combined set, then vocabulary only, then the last 10% of noise only.
# The global validationLoader is left pointing at the noise-only set afterwards.
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
validate()
apply_concept({"vocabulary": 1.0}, validation=True)
validate()
apply_concept({"noise": 0.1}, validation=True)
validate()

In [None]:
# New concept: validate on vocabulary + the last 10% of word-masking; train on vocabulary + its first 45%.
ACCUMULATION_STEPS = 4088 # 292 * 14 (BATCH_SIZE); largest multiple of 14 not above 4096
apply_concept({"vocabulary": 1.0, "word-masking": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "word-masking": 0.45})
# Up to 20 more epochs; stops early once the median batch loss is <= 0.2 or "Avg Seq Acc" reaches 0.70
trainFor(20, target_p50_loss=0.2, target_seq_acc=0.70)
# (106mins)

In [None]:
# Train on vocabulary + the first 90% of word-masking; the last 10% stays reserved for validation.
ACCUMULATION_STEPS = 4088 # 292 * 14 (BATCH_SIZE); largest multiple of 14 not above 4096
apply_concept({"vocabulary": 1.0, "word-masking": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "word-masking": 0.9})
# Up to 20 more epochs; stops early once the median batch loss is <= 0.2 or "Avg Seq Acc" reaches 0.9
trainFor(20, target_p50_loss=0.2, target_seq_acc=0.9)
# (151mins)

In [None]:
# Explicit checkpoint of the in-memory model under SAVE_DIR/epoch_<start_epoch>
save_model()

Stopped early: because the new samples are so much longer than the noise samples, correct use of the `EOS` token no longer skews the `avg seq acc` as much. Hence we want to stop closer to our $78.47\%$ accuracy estimate from before.

Taking this estimated maximum possible accuracy into account, we have currently obtained an effective accuracy of $\frac{75.416\%}{78.47\%} = 96.11\%$

In [None]:
# New concept: validate on vocabulary + the last 10% of grammar; train on vocabulary + its first 50%.
ACCUMULATION_STEPS = 4088 # 292 * 14 (BATCH_SIZE); largest multiple of 14 not above 4096
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.50})
# Up to 20 more epochs; stops early once the median batch loss is <= 0.2 or "Avg Seq Acc" reaches 0.50
trainFor(20, target_p50_loss=0.2, target_seq_acc=0.50)
# (100mins)

In [None]:
# Train on vocabulary + the first 60% of grammar; validate on vocabulary + its last 10%.
ACCUMULATION_STEPS = 4088 # 292 * 14 (BATCH_SIZE); largest multiple of 14 not above 4096
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.6})
# Up to 20 more epochs; stops early once the median batch loss is <= 0.2 or "Avg Seq Acc" reaches 0.60
trainFor(20, target_p50_loss=0.2, target_seq_acc=0.60)
# (287mins)

In [None]:
# Train on vocabulary + the first 90% of grammar; validate on vocabulary + its last 10%.
ACCUMULATION_STEPS = 4088 # 292 * 14 (BATCH_SIZE); largest multiple of 14 not above 4096
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
# Up to 20 more epochs; stops early once the median batch loss is <= 0.2 or "Avg Seq Acc" reaches 0.75
trainFor(20, target_p50_loss=0.2, target_seq_acc=0.75)
# (414mins)

In [None]:
# Same data as the previous stage (vocabulary + first 90% of grammar) with a higher accuracy target.
ACCUMULATION_STEPS = 4088 # 292 * 14 (BATCH_SIZE); largest multiple of 14 not above 4096
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
# Up to 20 more epochs; stops early once the median batch loss is <= 0.2 or "Avg Seq Acc" reaches 0.8
trainFor(20, target_p50_loss=0.2, target_seq_acc=0.8)
# (207mins)

In [None]:
# Training set shrinks to vocabulary + the first 45% of grammar; validation set is unchanged.
ACCUMULATION_STEPS = 4088 # 292 * 14 (BATCH_SIZE); largest multiple of 14 not above 4096
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.45})
# Up to 20 more epochs; stops early once the median batch loss is <= 0.2 or "Avg Seq Acc" reaches 0.7
trainFor(20, target_p50_loss=0.2, target_seq_acc=0.7)
# (910mins)

In [None]:
# Back to vocabulary + the first 90% of grammar, with the accumulation target doubled.
ACCUMULATION_STEPS = 8190 # 585 * 14 (BATCH_SIZE); largest multiple of 14 not above 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
# Up to 5 more epochs; stops early once the median batch loss is <= 0.2 or "Avg Seq Acc" reaches 0.7
trainFor(5, target_p50_loss=0.2, target_seq_acc=0.7)
# (190mins)

In [None]:
# Same data (vocabulary + first 90% of grammar); accuracy target raised to 0.8.
ACCUMULATION_STEPS = 8190 # 585 * 14 (BATCH_SIZE); largest multiple of 14 not above 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
# Up to 5 more epochs; stops early once the median batch loss is <= 0.2 or "Avg Seq Acc" reaches 0.8
trainFor(5, target_p50_loss=0.2, target_seq_acc=0.8)
# (472mins)

In [None]:
# Same data (vocabulary + first 90% of grammar); the median-loss stop threshold is relaxed to 1.0.
ACCUMULATION_STEPS = 8190 # 585 * 14 (BATCH_SIZE); largest multiple of 14 not above 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
# Up to 8 more epochs; stops early once the median batch loss is <= 1.0 or "Avg Seq Acc" reaches 0.8
trainFor(8, target_p50_loss=1.0, target_seq_acc=0.8)
# (760mins)

In [None]:
# Continue on the same data (vocabulary + first 90% of grammar).
ACCUMULATION_STEPS = 8190 # 585 * 14 (BATCH_SIZE); largest multiple of 14 not above 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
# Up to 5 more epochs; stops early once the median batch loss is <= 1.0 or "Avg Seq Acc" reaches 0.8
trainFor(5, target_p50_loss=1.0, target_seq_acc=0.8)
# (471mins)

In [None]:
# Continue on the same data (vocabulary + first 90% of grammar) with the accuracy target lowered to 0.75.
ACCUMULATION_STEPS = 8190 # 585 * 14 (BATCH_SIZE); largest multiple of 14 not above 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
# Up to 5 more epochs; stops early once the median batch loss is <= 1.0 or "Avg Seq Acc" reaches 0.75
trainFor(5, target_p50_loss=1.0, target_seq_acc=0.75)
# (473mins)

In [None]:
# Continue on the same data (vocabulary + first 90% of grammar) with unchanged targets.
ACCUMULATION_STEPS = 8190 # 585 * 14 (BATCH_SIZE); largest multiple of 14 not above 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
# Up to 5 more epochs; stops early once the median batch loss is <= 1.0 or "Avg Seq Acc" reaches 0.75
trainFor(5, target_p50_loss=1.0, target_seq_acc=0.75)
# (380mins)

In [None]:
# Continue on the same data (vocabulary + first 90% of grammar) with the accuracy target back at 0.80.
ACCUMULATION_STEPS = 8190 # 585 * 14 (BATCH_SIZE); largest multiple of 14 not above 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
# Up to 5 more epochs; stops early once the median batch loss is <= 1.0 or "Avg Seq Acc" reaches 0.80
trainFor(5, target_p50_loss=1.0, target_seq_acc=0.80)
# (491mins)

In [None]:
# Continue on the same data (vocabulary + first 90% of grammar) with unchanged targets.
ACCUMULATION_STEPS = 8190 # 585 * 14 (BATCH_SIZE); largest multiple of 14 not above 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
# Up to 6 more epochs; stops early once the median batch loss is <= 1.0 or "Avg Seq Acc" reaches 0.80
trainFor(6, target_p50_loss=1.0, target_seq_acc=0.80)
# (380mins)

In [None]:
# Final stretch on the same data (vocabulary + first 90% of grammar).
ACCUMULATION_STEPS = 8190 # 585 * 14 (BATCH_SIZE); largest multiple of 14 not above 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
# 160-start_epoch caps the run at epoch 160 overall; stops early once the median batch loss
# is <= 1.0 or "Avg Seq Acc" reaches 0.90
trainFor(160-start_epoch, target_p50_loss=1.0, target_seq_acc=0.90)
# (190mins)