In [1]:
# Init/Load model
from transformers import BartForConditionalGeneration, BartTokenizer
import numpy as np
import torch
import os

# Prefer the GPU, but fall back to the CPU so the notebook still runs (slowly)
# on a machine without CUDA instead of failing on the first .to(device)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Define a directory to save the models
SAVE_DIR = '../saved_models'
# exist_ok avoids the check-then-create race of os.path.exists + os.makedirs
os.makedirs(SAVE_DIR, exist_ok=True)

# Epoch number of the checkpoint to resume from; 0 means start from the
# pretrained 'facebook/bart-base' weights. trainFor() advances this value.
start_epoch = 160

class SimpleBART(torch.nn.Module):
    """Thin ``torch.nn.Module`` wrapper around a BART model and its tokenizer.

    The weights are read from ``SAVE_DIR/epoch_<start_epoch>`` when the global
    ``start_epoch`` is positive, and from ``'facebook/bart-base'`` otherwise.
    The tokenizer is always the ``'facebook/bart-base'`` one.
    """

    def __init__(self):
        super().__init__()

        # Work out where the weights come from, then load exactly once
        if start_epoch > 0:
            label = f'epoch_{start_epoch}'
            source = os.path.join(SAVE_DIR, label)
        else:
            label = 'facebook/bart-base'
            source = label

        self.bart = BartForConditionalGeneration.from_pretrained(source)
        print(f'Loaded {label}')
        self.tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model; only the encoder inputs are passed (no decoder inputs or labels)."""
        result = self.bart(input_ids=input_ids, attention_mask=attention_mask)
        return result


# Build the wrapper (this loads the weights and tokenizer) and move it to `device`
model = SimpleBART().to(device)
# Adam over every parameter registered on the wrapper
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
# Plain cross-entropy; trainFor() chooses which token positions are passed to it
criterion = torch.nn.CrossEntropyLoss()

Loaded epoch_154


In [2]:
# Load Raw Datasets
from torch.utils.data import Dataset, DataLoader
import csv

# Target number of samples per optimizer step; trainFor() divides it by BATCH_SIZE
# to get the number of batches to accumulate. Later cells overwrite it per stage.
ACCUMULATION_STEPS = 14
BATCH_SIZE = 14 # best performing batch size so far (in execution performance)
# Size of the most recently built dataset; overwritten by every apply_concept() call
DATA_SIZE = 0

class CustomDataset(Dataset):
    """Pairs raw text with a fixed-length sequence of target token ids.

    Args:
        data: sequence of ``(text, tokens)`` pairs, where ``tokens`` is a list
            of integer target ids.
        tokenizer: callable tokenizer exposing ``pad_token_id``,
            ``cls_token_id`` and ``eos_token_id``.
        max_length: length every returned target is padded or truncated to
            (expected to be at least 2, to fit the start and end markers).

    Each item is ``(input_ids, attention_mask, targets)``. ``targets`` is
    ``[start] + tokens + [end]`` followed by padding up to ``max_length``.
    """

    def __init__(self, data, tokenizer, max_length=200):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pad_token_id = tokenizer.pad_token_id
        self.start_token_id = tokenizer.cls_token_id
        self.end_token_id = tokenizer.eos_token_id

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text, tokens = self.data[idx]

        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=self.max_length)

        # Truncate over-long targets so the start/end markers still fit. Without
        # this the target would come out longer than max_length, which breaks
        # batching (rows of unequal length) and no longer lines up with the
        # max_length-wide model input.
        body = list(tokens)[:max(self.max_length - 2, 0)]
        framed = [self.start_token_id] + body + [self.end_token_id]

        # Right-pad to the fixed length
        tokens_padded = framed + [self.pad_token_id] * (self.max_length - len(framed))

        return inputs["input_ids"].squeeze(0), inputs["attention_mask"].squeeze(0), torch.tensor(tokens_padded, dtype=torch.long)


def load_data_from_csv(file_path):
    """Read ``(text, token_ids)`` samples from a CSV file.

    Column 0 is the text; column 1 is a comma-separated list of integer token
    ids (which therefore has to be quoted when it holds more than one id).
    Blank lines are skipped.

    Returns:
        list of ``(str, list[int])`` tuples, one per non-empty row, in file order.

    Raises:
        IndexError: a non-empty row has fewer than two columns.
        ValueError: a token id cannot be parsed as an integer.
    """
    # newline='' hands line-ending handling to the csv module, as its
    # documentation requires, so newlines inside quoted fields survive intact
    with open(file_path, 'r', newline='') as file:
        return [
            (row[0], [int(tok) for tok in row[1].split(",")])
            for row in csv.reader(file)
            if row  # csv.reader yields [] for a blank line
        ]

def apply_concept(params, validation=False):
    """Build the training or validation DataLoader from a mix of concept files.

    The dataset always starts with every row of ``../concept/egg.csv`` and is
    extended with a slice of ``../concept/<name>.csv`` for each entry in
    ``params``.

    Args:
        params: mapping of concept file name (without ``.csv``) to the fraction
            of that file's rows to use.
        validation: when True the fraction is taken from the END of each file
            and the loader is stored in the global ``validationLoader``;
            otherwise it is taken from the START and stored in the global
            ``dataloader``.

    Side effects:
        Overwrites the global ``DATA_SIZE`` with the merged row count and prints
        a per-file summary.

    Note:
        Head and tail slices of one file only stay disjoint while the training
        and validation fractions for that file sum to at most 1.
    """
    global validationLoader
    global dataloader
    global DATA_SIZE

    merged_data = load_data_from_csv("../concept/egg.csv")

    print("Concepts loaded;")
    for file_name, percentage in params.items():
        data = load_data_from_csv(f"../concept/{file_name}.csv")
        cutoff = int(len(data) * percentage)

        if validation:
            # data[-0:] is the WHOLE list (-0 == 0), so a fraction that rounds
            # down to zero rows must be handled explicitly
            loaded_data = data[-cutoff:] if cutoff > 0 else []
        else:
            loaded_data = data[:cutoff]

        print(f"    - {file_name}: {len(loaded_data)}")
        merged_data.extend(loaded_data)

    DATA_SIZE = len(merged_data)
    dataset = CustomDataset(merged_data, model.tokenizer)
    if validation:
        validationLoader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
    else:
        dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
    print(f'  Total: {DATA_SIZE}')

In [3]:
# Setup Validator
def validate():
    """Score the global ``model`` on ``validationLoader`` and print two accuracies.

    Only positions up to and including the EOS token of each target count;
    everything after it is ignored.

    Returns:
        ``(avg_accuracy, validation_accuracy)``: the mean over sequences of the
        fraction of counted tokens predicted correctly, and the fraction of
        sequences whose counted tokens are all correct.
    """
    eos_id = model.tokenizer.eos_token_id

    model.eval()

    # Running totals over the whole validation set
    n_sequences = 0
    n_perfect = 0
    accuracy_sum = 0

    with torch.no_grad():
        for input_ids, attention_mask, targets in validationLoader:
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            targets = targets.to(device)
            logits = model(input_ids, attention_mask).logits

            # True from the first EOS onwards; the mask keeps everything before
            # that point plus the EOS positions themselves
            is_eos = targets == eos_id
            from_eos_on = is_eos.cumsum(dim=1).type(torch.bool)
            mask = ~from_eos_on | is_eos

            predicted = logits.max(2)[1]
            hits = predicted == targets

            # A sequence is perfect when every counted position is a hit
            n_sequences += targets.size(0)
            n_perfect += (hits | ~mask).all(dim=1).float().sum().item()

            # Per-sequence token accuracy inside the mask
            hits_per_sequence = (hits & mask).float().sum(dim=1)
            counted_per_sequence = mask.float().sum(dim=1)
            accuracy_sum += (hits_per_sequence / counted_per_sequence).sum().item()

    # Report both figures for the entire validation dataset
    validation_accuracy = n_perfect / n_sequences
    print(f"  Total Seq Acc: {validation_accuracy*100:.3f}%")
    avg_accuracy = accuracy_sum / n_sequences
    print(f"    Avg Seq Acc: {avg_accuracy*100:.3f}%")

    return avg_accuracy, validation_accuracy

In [4]:
# Setup trainer
def save_model():
    """Save the wrapped BART model to ``SAVE_DIR/epoch_<start_epoch>``.

    Only ``model.bart`` is written via ``save_pretrained``; the tokenizer is not.
    """
    checkpoint_name = 'epoch_{}'.format(start_epoch)
    model.bart.save_pretrained(os.path.join(SAVE_DIR, checkpoint_name))

def trainFor(num_epochs, target_loss=0, target_p50_loss=0, target_acc=1.0, target_seq_acc=1.0):
    """Train the global ``model`` for up to ``num_epochs`` epochs, validating after each.

    Epoch numbering continues from the global ``start_epoch``, which is advanced
    at the start of every epoch. A checkpoint is written with ``save_model()``
    after every epoch whose number is a multiple of 10, and once more when the
    loop ends. The globals ``ACCUMULATION_STEPS`` and ``BATCH_SIZE`` are read at
    call time to decide how many batches are accumulated per optimizer step.

    Args:
        num_epochs: maximum number of epochs to run.
        target_loss: stop once an epoch's mean batch loss is <= this value.
        target_p50_loss: stop once an epoch's median batch loss is <= this value.
        target_acc: stop once the fraction of validation sequences that are
            entirely correct (second value returned by ``validate()``) is >= this.
        target_seq_acc: stop once the mean per-sequence token accuracy on the
            validation set (first value returned by ``validate()``) is >= this.
    """
    global start_epoch

    EOS_TOKEN_ID = model.tokenizer.eos_token_id
    # Number of batches accumulated per optimizer step. Clamped to 1 so that
    # ACCUMULATION_STEPS < BATCH_SIZE cannot cause a modulo-by-zero below.
    acc_batch = max(1, int(ACCUMULATION_STEPS / BATCH_SIZE))
    total_batches = len(dataloader)

    for epoch in range(start_epoch+1, start_epoch+num_epochs+1):
        start_epoch = epoch
        model.train()

        # Resetting the accumulated gradients
        optimizer.zero_grad()

        # Initialize counters for accuracy calculation
        total_correct_sequences = 0
        total_sequences = 0
        cumulative_loss = 0.0

        # Initialize list to store batch losses
        batch_losses = []

        for batch_idx, (input_ids, attention_mask, targets) in enumerate(dataloader):

            input_ids, attention_mask, targets = input_ids.to(device), attention_mask.to(device), targets.to(device)
            outputs = model(input_ids, attention_mask)
            logits = outputs.logits

            # Identify where the EOS token is in the target sequence
            eos_positions = (targets == EOS_TOKEN_ID).cumsum(dim=1).type(torch.bool)
            mask = ~eos_positions | (targets == EOS_TOKEN_ID)

            # Apply mask to filter out tokens after the EOS token for loss computation
            active_loss = mask.view(-1).bool()
            active_logits = logits.view(-1, logits.size(-1))[active_loss]
            active_labels = targets.view(-1)[active_loss]
            loss = criterion(active_logits, active_labels)

            _, predicted = logits.max(2)
            correct_sequences = ((predicted == targets) | ~mask).all(dim=1).float().sum().item()
            total_sequences += targets.size(0)
            total_correct_sequences += correct_sequences

            # Accumulate the gradients.
            # NOTE: the loss is not divided by acc_batch, so gradients are summed
            # (not averaged) over the accumulated batches.
            loss.backward()
            loss_val = loss.item()

            cumulative_loss += loss_val
            batch_losses.append(loss_val)

            isLast = batch_idx == total_batches - 1

            # Step once acc_batch batches have been accumulated. batch_idx is
            # 0-based, hence the +1: testing batch_idx % acc_batch == 0 would step
            # right after the first batch of every epoch. The last batch always
            # steps so a partial window is not carried into the next epoch.
            if isLast or (batch_idx + 1) % acc_batch == 0:
                optimizer.step()
                optimizer.zero_grad()

            print(f"\rEpoch: {epoch}, Batch: {batch_idx} of {total_batches}, loss: {loss_val:.6f}      ", end='')


        # Compute and print the accuracy for the entire epoch
        epoch_accuracy = total_correct_sequences / total_sequences
        cumulative_loss = cumulative_loss / len(batch_losses)
        p25_loss = np.percentile(batch_losses, 25)
        p50_loss = np.percentile(batch_losses, 50)
        p75_loss = np.percentile(batch_losses, 75)
        print(f"\rEpoch: {epoch}, Accuracy: {epoch_accuracy*100:.2f}%                                             ")
        print(f"  Loss: 25%: {p25_loss:.6f} 50%: {p50_loss:.6f} 75%: {p75_loss:.6f}\n   Avg: {cumulative_loss:.6f}")

        if epoch % 10 == 0: # Save the model
            save_model()

        # validate() returns (mean per-sequence token accuracy, fraction of fully-correct sequences)
        seq_acc, total_acc = validate()

        if total_acc >= target_acc:
            break
        if cumulative_loss <= target_loss:
            break
        if p50_loss <= target_p50_loss:
            break

        if seq_acc >= target_seq_acc:
            break

    # Make sure last epoch is always saved
    save_model()

In [5]:
# Memorise the vocabulary alongside a thin slice of noise samples.
# Validation takes the last 10% of noise.csv, training the first 2%; both sets
# contain all of vocabulary.csv. Training stops once the mean per-sequence token
# accuracy on validation reaches 50%, and after 50 epochs at the latest.
ACCUMULATION_STEPS = BATCH_SIZE # pure memorisation so accumulation won't help
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "noise": 0.02})
trainFor(50, target_seq_acc=0.50)
# (31mins)

Concepts loaded;
    - vocabulary: 6808
    - noise: 15614
  Total: 22423
Concepts loaded;
    - vocabulary: 6808
    - noise: 3122
  Total: 9931
Epoch: 1, Accuracy: 0.01%
  Loss: 25%: 4.393126 50%: 4.887876 75%: 5.315798
   Avg: 4.884056
  Total Seq Acc: 0.000%
    Avg Seq Acc: 39.868%
Epoch: 2, Accuracy: 0.03%
  Loss: 25%: 4.117525 50%: 4.550774 75%: 4.877478
   Avg: 4.482987
  Total Seq Acc: 0.004%
    Avg Seq Acc: 39.875%
Epoch: 3, Accuracy: 0.03%
  Loss: 25%: 4.068293 50%: 4.452379 75%: 4.818588
   Avg: 4.398917
  Total Seq Acc: 0.062%
    Avg Seq Acc: 39.897%
Epoch: 4, Accuracy: 0.05%
  Loss: 25%: 4.008659 50%: 4.428133 75%: 4.759971
   Avg: 4.360003
  Total Seq Acc: 0.085%
    Avg Seq Acc: 39.924%
Epoch: 5, Accuracy: 0.16%
  Loss: 25%: 3.946199 50%: 4.349040 75%: 4.712277
   Avg: 4.299947
  Total Seq Acc: 0.214%
    Avg Seq Acc: 40.006%
Epoch: 6, Accuracy: 0.48%
  Loss: 25%: 3.817751 50%: 4.224310 75%: 4.577750
   Avg: 4.175182
  Total Seq Acc: 1.057%
    Avg Seq Acc: 40.588%
Ep

In [6]:
# Widen the training noise share to the first 4% of noise.csv. No validation set
# is built here, so validate() uses whichever validationLoader is already defined.
ACCUMULATION_STEPS = BATCH_SIZE # pure memorisation so accumulation won't help
apply_concept({"vocabulary": 1.0, "noise": 0.04})
trainFor(50, target_seq_acc=0.60)
# (4mins)

Concepts loaded;
    - vocabulary: 6808
    - noise: 6245
  Total: 13054
Epoch: 9, Accuracy: 10.68%
  Loss: 25%: 2.948524 50%: 3.301470 75%: 3.602107
   Avg: 3.285144
  Total Seq Acc: 16.104%
    Avg Seq Acc: 63.344%


In [7]:
# Training noise share raised to the first 6%; trainFor() now accumulates
# gradients over 28 / 14 = 2 batches per optimizer step.
ACCUMULATION_STEPS = 28 # 2 * 14 (BATCH_SIZE): the multiple of 14 closest to 32
apply_concept({"vocabulary": 1.0, "noise": 0.06})
trainFor(50, target_seq_acc=0.65)
# (5mins)

Concepts loaded;
    - vocabulary: 6808
    - noise: 9368
  Total: 16177
Epoch: 10, Accuracy: 14.79%
  Loss: 25%: 2.290249 50%: 2.601158 75%: 2.916466
   Avg: 2.600121
  Total Seq Acc: 25.349%
    Avg Seq Acc: 75.091%


Aiming for:
$$
\begin{align*}
  \frac{unique}{tokens} &= \frac{4174}{6808} = 61.31\%  & \text{sign pairs to text only used once} \\
  \frac{text}{tokens}   &= \frac{5342}{6808}  = 78.47\% & \text{sign pairs to text unique text} \\
  & & \text{unique meaning the text is only used for one tokenID}
\end{align*}
$$

i.e. there are six different signs which can be used for "present"  
which means we're actually aiming for $96\%$ effective accuracy $\frac{75\%}{78\%}$

But we don't want to over-fit either  
Hence why we slowly introduce new concepts while still memorising vocabulary

`target_seq_acc` includes the `EOS` token, which of course we want to be right
So our target should be $\frac{61.31\% + 100\%}{2} = 80.65\%$

In [8]:
# Training noise share raised to the first 8%. The stop threshold 0.80655 is
# (0.6131 + 1.0) / 2, compared against the mean per-sequence token accuracy.
ACCUMULATION_STEPS = 70 # 5 * 14 (BATCH_SIZE): the multiple of 14 closest to 64
apply_concept({"vocabulary": 1.0, "noise": 0.08})
trainFor(50, target_seq_acc=0.80655)
# (10mins)

Concepts loaded;
    - vocabulary: 6808
    - noise: 12491
  Total: 19300
Epoch: 11, Accuracy: 18.79%
  Loss: 25%: 1.740285 50%: 1.988600 75%: 2.250141
   Avg: 2.004923
  Total Seq Acc: 31.401%
    Avg Seq Acc: 79.550%
Epoch: 12, Accuracy: 23.83%
  Loss: 25%: 1.321281 50%: 1.537223 75%: 1.742366
   Avg: 1.547170
  Total Seq Acc: 35.058%
    Avg Seq Acc: 81.933%


In [9]:
# Validation breakdown: combined set, vocabulary only, then noise only.
# Every apply_concept(..., validation=True) call replaces the global
# validationLoader, so the noise-only split is the one left in place afterwards.
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
validate()
apply_concept({"vocabulary": 1.0}, validation=True)
validate()
apply_concept({"noise": 0.1}, validation=True)
validate()

Concepts loaded;
    - vocabulary: 6808
    - noise: 15614
  Total: 22423
  Total Seq Acc: 35.058%
    Avg Seq Acc: 81.933%
Concepts loaded;
    - vocabulary: 6808
  Total: 6809
  Total Seq Acc: 62.109%
    Avg Seq Acc: 87.325%
Concepts loaded;
    - noise: 15614
  Total: 15615
  Total Seq Acc: 23.260%
    Avg Seq Acc: 79.579%


(0.795788674411047, 0.23259686199167467)

The pure memorisation is over, as the vocabulary has been sufficiently learnt  
While it's not perfect, it will improve further over the later training, as the vocab remains in the training set

In [11]:
# Rebuild the combined validation set (the previous cell left a noise-only one),
# then train on the first 22.5% of noise.csv plus the full vocabulary.
ACCUMULATION_STEPS = 1022 # 73 * 14 (BATCH_SIZE): the multiple of 14 closest to 1024
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "noise": 0.225})
trainFor(50, target_seq_acc=0.6)
# (10mins)

Concepts loaded;
    - vocabulary: 6808
    - noise: 15614
  Total: 22423
Concepts loaded;
    - vocabulary: 6808
    - noise: 35132
  Total: 41941
Epoch: 13, Accuracy: 19.62%
  Loss: 25%: 1.276297 50%: 1.521877 75%: 1.806241
   Avg: 1.567434
  Total Seq Acc: 37.671%
    Avg Seq Acc: 83.350%


In [12]:
# Break down of validation per concept: vocabulary only, then noise only.
# The noise-only validationLoader stays in place after this cell.
apply_concept({"vocabulary": 1.0}, validation=True)
validate()
apply_concept({"noise": 0.1}, validation=True)
validate()

Concepts loaded;
    - vocabulary: 6808
  Total: 6809
  Total Seq Acc: 64.576%
    Avg Seq Acc: 88.098%
Concepts loaded;
    - noise: 15614
  Total: 15615
  Total Seq Acc: 25.937%
    Avg Seq Acc: 81.276%


(0.8127563179130933, 0.25936599423631124)

In [13]:
# Restore the combined validation set and double the training noise share to
# the first 45% of noise.csv; stop at 70% mean per-sequence token accuracy.
ACCUMULATION_STEPS = 1022 # 73 * 14 (BATCH_SIZE): the multiple of 14 closest to 1024
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "noise": 0.45})
trainFor(50, target_seq_acc=0.7)
# (16mins)

Concepts loaded;
    - vocabulary: 6808
    - noise: 15614
  Total: 22423
Concepts loaded;
    - vocabulary: 6808
    - noise: 70264
  Total: 77073
Epoch: 14, Accuracy: 21.77%
  Loss: 25%: 0.978782 50%: 1.179458 75%: 1.420422
   Avg: 1.226024
  Total Seq Acc: 42.332%
    Avg Seq Acc: 86.016%


In [14]:
# Validate the model saved correctly
# Runs the combined, vocabulary-only and noise-only validation sets in turn;
# the noise-only validationLoader is the one left in place afterwards.
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
validate()
apply_concept({"vocabulary": 1.0}, validation=True)
validate()
apply_concept({"noise": 0.1}, validation=True)
validate()

Concepts loaded;
    - vocabulary: 6808
    - noise: 15614
  Total: 22423
  Total Seq Acc: 42.332%
    Avg Seq Acc: 86.016%
Concepts loaded;
    - vocabulary: 6808
  Total: 6809
  Total Seq Acc: 67.190%
    Avg Seq Acc: 88.945%
Concepts loaded;
    - noise: 15614
  Total: 15615
  Total Seq Acc: 31.489%
    Avg Seq Acc: 84.735%


(0.8473481228669942, 0.3148895292987512)

In [15]:
# Train on the first 90% of noise.csv; validation uses the last 10%, so the two
# noise slices do not overlap. Stop at 75% mean per-sequence token accuracy.
ACCUMULATION_STEPS = 1022 # 73 * 14 (BATCH_SIZE): the multiple of 14 closest to 1024
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "noise": 0.9})
trainFor(50, target_seq_acc=0.75)
# (29mins)

Concepts loaded;
    - vocabulary: 6808
    - noise: 15614
  Total: 22423
Concepts loaded;
    - vocabulary: 6808
    - noise: 140528
  Total: 147337
Epoch: 15, Accuracy: 25.32%
  Loss: 25%: 0.735360 50%: 0.893032 75%: 1.094783
   Avg: 0.942284
  Total Seq Acc: 46.069%
    Avg Seq Acc: 87.619%


In [16]:
# Validation breakdown: combined set, vocabulary only, then noise only.
# The noise-only validationLoader is the one left in place afterwards.
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
validate()
apply_concept({"vocabulary": 1.0}, validation=True)
validate()
apply_concept({"noise": 0.1}, validation=True)
validate()

Concepts loaded;
    - vocabulary: 6808
    - noise: 15614
  Total: 22423
  Total Seq Acc: 46.069%
    Avg Seq Acc: 87.619%
Concepts loaded;
    - vocabulary: 6808
  Total: 6809
  Total Seq Acc: 69.863%
    Avg Seq Acc: 89.861%
Concepts loaded;
    - noise: 15614
  Total: 15615
  Total Seq Acc: 35.690%
    Avg Seq Acc: 86.637%


(0.8663697381680721, 0.3569004162664105)

In [17]:
# NOTE(review): no validation set is built here, so validate() uses whichever
# validationLoader is already defined. target_acc is compared against the
# fraction of fully-correct sequences, not the mean per-sequence token accuracy.
ACCUMULATION_STEPS = 1022 # 73 * 14 (BATCH_SIZE): the multiple of 14 closest to 1024
apply_concept({"vocabulary": 1.0, "noise": 0.9})
trainFor(50, target_acc=0.8)
# (583mins)

Concepts loaded;
    - vocabulary: 6808
    - noise: 140528
  Total: 147337
Epoch: 16, Accuracy: 29.25%
  Loss: 25%: 0.571872 50%: 0.698485 75%: 0.865685
   Avg: 0.749544
  Total Seq Acc: 36.574%
    Avg Seq Acc: 87.146%
Epoch: 17, Accuracy: 31.80%
  Loss: 25%: 0.486688 50%: 0.593760 75%: 0.741733
   Avg: 0.647466
  Total Seq Acc: 38.834%
    Avg Seq Acc: 87.917%
Epoch: 18, Accuracy: 33.24%
  Loss: 25%: 0.436313 50%: 0.527889 75%: 0.659536
   Avg: 0.579617
  Total Seq Acc: 39.065%
    Avg Seq Acc: 88.180%
Epoch: 19, Accuracy: 34.66%
  Loss: 25%: 0.394753 50%: 0.478254 75%: 0.594729
   Avg: 0.527243
  Total Seq Acc: 39.577%
    Avg Seq Acc: 88.354%
Epoch: 20, Accuracy: 35.63%
  Loss: 25%: 0.367282 50%: 0.446331 75%: 0.552213
   Avg: 0.491565
  Total Seq Acc: 39.801%
    Avg Seq Acc: 88.395%
Epoch: 21, Accuracy: 36.52%
  Loss: 25%: 0.344380 50%: 0.417024 75%: 0.515350
   Avg: 0.460924
  Total Seq Acc: 39.923%
    Avg Seq Acc: 88.536%
Epoch: 22, Accuracy: 37.15%
  Loss: 25%: 0.327414 50%:

In [18]:
# Write a checkpoint named after the current value of start_epoch
save_model()

I accidentally set `target_acc` (the fraction of sequences that are entirely correct) instead of `target_seq_acc` (the average per-sequence token accuracy), so this training stage had to be stopped manually

In [19]:
# Validation breakdown: combined set, vocabulary only, then noise only.
# The noise-only validationLoader is the one left in place afterwards.
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
validate()
apply_concept({"vocabulary": 1.0}, validation=True)
validate()
apply_concept({"noise": 0.1}, validation=True)
validate()

Concepts loaded;
    - vocabulary: 6808
    - noise: 15614
  Total: 22423
  Total Seq Acc: 52.339%
    Avg Seq Acc: 90.122%
Concepts loaded;
    - vocabulary: 6808
  Total: 6809
  Total Seq Acc: 78.132%
    Avg Seq Acc: 92.685%
Concepts loaded;
    - noise: 15614
  Total: 15615
  Total Seq Acc: 41.089%
    Avg Seq Acc: 89.000%


(0.8899978608990418, 0.41088696765930194)

In [5]:
# Same data mix as the earlier 90%-noise stage, with two stop conditions: median
# batch loss of an epoch <= 0.2, or mean per-sequence token accuracy >= 90%.
ACCUMULATION_STEPS = 1022 # 73 * 14 (BATCH_SIZE): the multiple of 14 closest to 1024
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "noise": 0.9})
trainFor(50, target_p50_loss=0.2, target_seq_acc=0.90)
# (31mins)

Concepts loaded;
    - vocabulary: 6808
    - noise: 15614
  Total: 22423
Concepts loaded;
    - vocabulary: 6808
    - noise: 140528
  Total: 147337
Epoch: 36, Accuracy: 41.15%10525, loss: 0.603985      
  Loss: 25%: 0.235659 50%: 0.282068 75%: 0.340233
   Avg: 0.309650
  Total Seq Acc: 52.727%
    Avg Seq Acc: 90.170%


In [6]:
# Validation breakdown: combined set, vocabulary only, then noise only.
# The noise-only validationLoader is the one left in place afterwards.
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
validate()
apply_concept({"vocabulary": 1.0}, validation=True)
validate()
apply_concept({"noise": 0.1}, validation=True)
validate()

Concepts loaded;
    - vocabulary: 6808
    - noise: 15614
  Total: 22423
  Total Seq Acc: 52.727%
    Avg Seq Acc: 90.170%
Concepts loaded;
    - vocabulary: 6808
  Total: 6809
  Total Seq Acc: 78.249%
    Avg Seq Acc: 92.715%
Concepts loaded;
    - noise: 15614
  Total: 15615
  Total Seq Acc: 41.595%
    Avg Seq Acc: 89.056%


(0.8905585711635232, 0.4159462055715658)

In [5]:
# Switch concept: the noise file is replaced by word-masking.csv (last 10% for
# validation, first 45% for training); the full vocabulary stays in both sets.
ACCUMULATION_STEPS = 4088 # 292 * 14 (BATCH_SIZE): a multiple of 14 just below 4096
apply_concept({"vocabulary": 1.0, "word-masking": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "word-masking": 0.45})
trainFor(20, target_p50_loss=0.2, target_seq_acc=0.70)
# (106mins)

Concepts loaded;
    - vocabulary: 6808
    - word-masking: 10000
  Total: 16809
Concepts loaded;
    - vocabulary: 6808
    - word-masking: 45000
  Total: 51809
Epoch: 37, Accuracy: 10.22%                                             
  Loss: 25%: 5.452652 50%: 5.740592 75%: 6.173782
   Avg: 5.867780
  Total Seq Acc: 32.179%
    Avg Seq Acc: 57.678%
Epoch: 38, Accuracy: 10.30%                                             
  Loss: 25%: 4.742584 50%: 4.972636 75%: 5.184268
   Avg: 4.951151
  Total Seq Acc: 32.340%
    Avg Seq Acc: 60.602%
Epoch: 39, Accuracy: 10.39%                                             
  Loss: 25%: 4.270635 50%: 4.504084 75%: 4.723184
   Avg: 4.476866
  Total Seq Acc: 32.435%
    Avg Seq Acc: 62.755%
Epoch: 40, Accuracy: 10.53%                                             
  Loss: 25%: 3.873336 50%: 4.118207 75%: 4.329885
   Avg: 4.091265
  Total Seq Acc: 32.506%
    Avg Seq Acc: 64.345%
Epoch: 41, Accuracy: 10.53%                                             
  Los

In [5]:
# Train on the first 90% of word-masking.csv (validation keeps the last 10%),
# for at most 20 epochs.
ACCUMULATION_STEPS = 4088 # 292 * 14 (BATCH_SIZE): a multiple of 14 just below 4096
apply_concept({"vocabulary": 1.0, "word-masking": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "word-masking": 0.9})
trainFor(20, target_p50_loss=0.2, target_seq_acc=0.9)
# (151mins)

Concepts loaded;
    - vocabulary: 6808
    - word-masking: 10000
  Total: 16809
Concepts loaded;
    - vocabulary: 6808
    - word-masking: 90000
  Total: 96809
Epoch: 46, Accuracy: 6.21%                                             
  Loss: 25%: 2.853651 50%: 3.078444 75%: 3.296024
   Avg: 3.069364
  Total Seq Acc: 32.804%
    Avg Seq Acc: 71.986%
Epoch: 47, Accuracy: 6.24%                                             
  Loss: 25%: 2.680615 50%: 2.886154 75%: 3.098930
   Avg: 2.881209
  Total Seq Acc: 32.643%
    Avg Seq Acc: 70.560%
Epoch: 48, Accuracy: 6.24%                                             
  Loss: 25%: 2.540300 50%: 2.741766 75%: 2.946689
   Avg: 2.741542
  Total Seq Acc: 32.715%
    Avg Seq Acc: 72.377%
Epoch: 49, Accuracy: 6.31%                                             
  Loss: 25%: 2.414057 50%: 2.623077 75%: 2.826812
   Avg: 2.617838
  Total Seq Acc: 32.727%
    Avg Seq Acc: 72.722%
Epoch: 50, Accuracy: 6.30%                                             
  Loss: 25

In [6]:
# Write a checkpoint named after the current value of start_epoch
save_model()

Stopped early: because the new samples are so much longer than the noise samples, correct use of the `EOS` token doesn't skew the `avg seq acc` as much. Hence we want to stop closer to our $78.47\%$ accuracy estimate from before

Taking this estimated maximum possible accuracy into account, we have currently obtained $\frac{75.416\%}{78.47\%} \approx 96.11\%$ of the achievable accuracy

In [5]:
# Switch concept to grammar.csv: last 10% for validation, first 50% for
# training, with the full vocabulary in both sets.
ACCUMULATION_STEPS = 4088 # 292 * 14 (BATCH_SIZE): a multiple of 14 just below 4096
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.50})
trainFor(20, target_p50_loss=0.2, target_seq_acc=0.50)
# (100mins)

Concepts loaded;
    - vocabulary: 6808
    - grammar: 10000
  Total: 16809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 50000
  Total: 56809
Epoch: 53, Accuracy: 8.17%                                             
  Loss: 25%: 5.661148 50%: 5.839899 75%: 6.294619
   Avg: 6.079971
  Total Seq Acc: 30.924%
    Avg Seq Acc: 48.485%
Epoch: 54, Accuracy: 7.87%                                             
  Loss: 25%: 5.338311 50%: 5.470820 75%: 5.596282
   Avg: 5.461576
  Total Seq Acc: 30.829%
    Avg Seq Acc: 48.938%
Epoch: 55, Accuracy: 7.57%                                             
  Loss: 25%: 5.084069 50%: 5.227675 75%: 5.357668
   Avg: 5.222457
  Total Seq Acc: 30.365%
    Avg Seq Acc: 49.104%
Epoch: 56, Accuracy: 6.74%                                             
  Loss: 25%: 4.944715 50%: 5.099912 75%: 5.235361
   Avg: 5.089562
  Total Seq Acc: 29.103%
    Avg Seq Acc: 48.949%
Epoch: 57, Accuracy: 5.98%                                             
  Loss: 25%: 4.84913

In [6]:
# Training share of grammar.csv raised to the first 60%; stop threshold raised
# to 60% mean per-sequence token accuracy.
ACCUMULATION_STEPS = 4088 # 292 * 14 (BATCH_SIZE): a multiple of 14 just below 4096
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.6})
trainFor(20, target_p50_loss=0.2, target_seq_acc=0.60)
# (287mins)

Concepts loaded;
    - vocabulary: 6808
    - grammar: 10000
  Total: 16809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 60000
  Total: 66809
Epoch: 61, Accuracy: 5.37%                                             
  Loss: 25%: 4.585829 50%: 4.748495 75%: 4.904201
   Avg: 4.740234
  Total Seq Acc: 28.277%
    Avg Seq Acc: 50.651%
Epoch: 62, Accuracy: 5.57%                                             
  Loss: 25%: 4.519236 50%: 4.690837 75%: 4.850698
   Avg: 4.681852
  Total Seq Acc: 28.419%
    Avg Seq Acc: 50.969%
Epoch: 63, Accuracy: 5.73%                                             
  Loss: 25%: 4.465399 50%: 4.640532 75%: 4.804271
   Avg: 4.631159
  Total Seq Acc: 28.610%
    Avg Seq Acc: 51.342%
Epoch: 64, Accuracy: 5.83%                                             
  Loss: 25%: 4.427047 50%: 4.595013 75%: 4.751634
   Avg: 4.584568
  Total Seq Acc: 28.741%
    Avg Seq Acc: 51.402%
Epoch: 65, Accuracy: 5.89%                                             
  Loss: 25%: 4.36758

In [7]:
# Training share of grammar.csv raised to the first 90% (validation keeps the
# last 10%); stop threshold raised to 75%.
ACCUMULATION_STEPS = 4088 # 292 * 14 (BATCH_SIZE): a multiple of 14 just below 4096
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
trainFor(20, target_p50_loss=0.2, target_seq_acc=0.75)
# (414mins)

Concepts loaded;
    - vocabulary: 6808
    - grammar: 10000
  Total: 16809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 90000
  Total: 96809
Epoch: 81, Accuracy: 4.57%                                             
  Loss: 25%: 3.732700 50%: 3.915807 75%: 4.088503
   Avg: 3.907259
  Total Seq Acc: 29.651%
    Avg Seq Acc: 56.869%
Epoch: 82, Accuracy: 4.59%                                             
  Loss: 25%: 3.674026 50%: 3.859215 75%: 4.042680
   Avg: 3.849874
  Total Seq Acc: 29.651%
    Avg Seq Acc: 57.284%
Epoch: 83, Accuracy: 4.59%                                             
  Loss: 25%: 3.613226 50%: 3.798069 75%: 3.976622
   Avg: 3.792179
  Total Seq Acc: 29.621%
    Avg Seq Acc: 57.726%
Epoch: 84, Accuracy: 4.54%                                             
  Loss: 25%: 3.542132 50%: 3.738827 75%: 3.919701
   Avg: 3.727699
  Total Seq Acc: 29.567%
    Avg Seq Acc: 58.218%
Epoch: 85, Accuracy: 4.53%                                             
  Loss: 25%: 3.48960

In [8]:
# Same data mix as the previous grammar stage, continued for up to 20 more
# epochs with the stop threshold raised to 80%.
ACCUMULATION_STEPS = 4088 # 292 * 14 (BATCH_SIZE): a multiple of 14 just below 4096
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
trainFor(20, target_p50_loss=0.2, target_seq_acc=0.8)
# (207mins)

Concepts loaded;
    - vocabulary: 6808
    - grammar: 10000
  Total: 16809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 90000
  Total: 96809
Epoch: 101, Accuracy: 4.76%                                             
  Loss: 25%: 2.582887 50%: 2.773144 75%: 2.960040
   Avg: 2.768915
  Total Seq Acc: 29.252%
    Avg Seq Acc: 65.546%
Epoch: 102, Accuracy: 4.79%                                             
  Loss: 25%: 2.531624 50%: 2.719196 75%: 2.908400
   Avg: 2.718570
  Total Seq Acc: 29.240%
    Avg Seq Acc: 66.019%
Epoch: 103, Accuracy: 4.79%                                             
  Loss: 25%: 2.490860 50%: 2.673386 75%: 2.860445
   Avg: 2.675337
  Total Seq Acc: 29.234%
    Avg Seq Acc: 65.966%
Epoch: 104, Accuracy: 4.84%                                             
  Loss: 25%: 2.450328 50%: 2.630445 75%: 2.810037
   Avg: 2.632108
  Total Seq Acc: 29.258%
    Avg Seq Acc: 66.305%
Epoch: 105, Accuracy: 4.80%                                             
  Loss: 25%: 2.

In [5]:
# Train on the first 45% of grammar.csv, validate on its last 10%; stop at 70%
# mean per-sequence token accuracy or a median batch loss <= 0.2.
ACCUMULATION_STEPS = 4088 # 292 * 14 (BATCH_SIZE): a multiple of 14 just below 4096
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.45})
trainFor(20, target_p50_loss=0.2, target_seq_acc=0.7)
# (910mins)

Concepts loaded;
    - vocabulary: 6808
    - grammar: 50000
  Total: 56809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 225000
  Total: 231809
Epoch: 111, Accuracy: 2.34%                                             
  Loss: 25%: 2.276829 50%: 2.457360 75%: 2.641550
   Avg: 2.461111
  Total Seq Acc: 9.407%
    Avg Seq Acc: 58.416%
Epoch: 112, Accuracy: 2.36%                                             
  Loss: 25%: 2.206423 50%: 2.379490 75%: 2.559779
   Avg: 2.384682
  Total Seq Acc: 9.449%
    Avg Seq Acc: 58.906%
Epoch: 113, Accuracy: 2.38%                                             
  Loss: 25%: 2.157212 50%: 2.328168 75%: 2.504146
   Avg: 2.331942
  Total Seq Acc: 9.481%
    Avg Seq Acc: 59.789%
Epoch: 114, Accuracy: 2.42%                                             
  Loss: 25%: 2.100640 50%: 2.272418 75%: 2.444943
   Avg: 2.277403
  Total Seq Acc: 9.484%
    Avg Seq Acc: 60.633%
Epoch: 115, Accuracy: 2.48%                                             
  Loss: 25%: 2.04

In [5]:
# Train on the first 90% of grammar.csv for at most 5 epochs, accumulating
# 8190 / 14 = 585 batches per optimizer step.
ACCUMULATION_STEPS = 8190 # 585 * 14 (BATCH_SIZE): the multiple of 14 closest to 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
trainFor(5, target_p50_loss=0.2, target_seq_acc=0.7)
# (190mins)

Concepts loaded;
    - vocabulary: 6808
    - grammar: 50000
  Total: 56809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 450000
  Total: 456809
Epoch: 131, Accuracy: 2.23%                                             
  Loss: 25%: 1.521034 50%: 1.679262 75%: 1.864767
   Avg: 1.712413
  Total Seq Acc: 10.402%
    Avg Seq Acc: 69.580%
Epoch: 132, Accuracy: 2.27%                                             
  Loss: 25%: 1.457995 50%: 1.602164 75%: 1.763342
   Avg: 1.622421
  Total Seq Acc: 10.562%
    Avg Seq Acc: 70.585%


In [5]:
# Same data mix, up to 5 more epochs, with the stop threshold raised to 80%.
ACCUMULATION_STEPS = 8190 # 585 * 14 (BATCH_SIZE): the multiple of 14 closest to 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
trainFor(5, target_p50_loss=0.2, target_seq_acc=0.8)
# (472mins)

Concepts loaded;
    - vocabulary: 6808
    - grammar: 50000
  Total: 56809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 450000
  Total: 456809
Epoch: 133, Accuracy: 2.28%                                             
  Loss: 25%: 1.490670 50%: 1.647786 75%: 1.826489
   Avg: 1.674148
  Total Seq Acc: 10.500%
    Avg Seq Acc: 70.198%
Epoch: 134, Accuracy: 2.30%                                             
  Loss: 25%: 1.435652 50%: 1.579442 75%: 1.736099
   Avg: 1.597088
  Total Seq Acc: 10.597%
    Avg Seq Acc: 70.851%
Epoch: 135, Accuracy: 2.37%                                             
  Loss: 25%: 1.417854 50%: 1.559237 75%: 1.719589
   Avg: 1.580035
  Total Seq Acc: 10.622%
    Avg Seq Acc: 71.207%
Epoch: 136, Accuracy: 2.40%                                             
  Loss: 25%: 1.399602 50%: 1.541654 75%: 1.697758
   Avg: 1.561598
  Total Seq Acc: 10.627%
    Avg Seq Acc: 71.203%
Epoch: 137, Accuracy: 2.44%                                             
  Loss: 25%: 

In [5]:
# Same data mix, up to 8 more epochs; the median-loss stop condition is relaxed
# from 0.2 to 1.0.
ACCUMULATION_STEPS = 8190 # 585 * 14 (BATCH_SIZE): the multiple of 14 closest to 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
trainFor(8, target_p50_loss=1.0, target_seq_acc=0.8)
# (760mins)

Concepts loaded;
    - vocabulary: 6808
    - grammar: 50000
  Total: 56809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 450000
  Total: 456809
Epoch: 138, Accuracy: 2.38%                                             
  Loss: 25%: 1.403795 50%: 1.553975 75%: 1.726176
   Avg: 1.579655
  Total Seq Acc: 10.694%
    Avg Seq Acc: 71.961%
Epoch: 139, Accuracy: 2.44%                                             
  Loss: 25%: 1.356656 50%: 1.497774 75%: 1.653503
   Avg: 1.517331
  Total Seq Acc: 10.750%
    Avg Seq Acc: 71.842%
Epoch: 140, Accuracy: 2.49%                                             
  Loss: 25%: 1.343703 50%: 1.479851 75%: 1.636741
   Avg: 1.502873
  Total Seq Acc: 10.766%
    Avg Seq Acc: 72.292%
Epoch: 141, Accuracy: 2.51%                                             
  Loss: 25%: 1.327164 50%: 1.465816 75%: 1.620993
   Avg: 1.487350
  Total Seq Acc: 10.778%
    Avg Seq Acc: 71.991%
Epoch: 142, Accuracy: 2.55%                                             
  Loss: 25%: 

In [5]:
# Same data mix, up to 5 more epochs; stops at a median batch loss <= 1.0 or
# 80% mean per-sequence token accuracy.
ACCUMULATION_STEPS = 8190 # 585 * 14 (BATCH_SIZE): the multiple of 14 closest to 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
trainFor(5, target_p50_loss=1.0, target_seq_acc=0.8)
# (471mins)

Concepts loaded;
    - vocabulary: 6808
    - grammar: 50000
  Total: 56809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 450000
  Total: 456809
Epoch: 146, Accuracy: 2.60%                                             
  Loss: 25%: 1.300828 50%: 1.450290 75%: 1.629443
   Avg: 1.490706
  Total Seq Acc: 10.868%
    Avg Seq Acc: 72.612%
Epoch: 147, Accuracy: 2.67%                                             
  Loss: 25%: 1.243615 50%: 1.377183 75%: 1.529420
   Avg: 1.400512
  Total Seq Acc: 11.039%
    Avg Seq Acc: 73.415%
Epoch: 148, Accuracy: 2.69%                                             
  Loss: 25%: 1.233071 50%: 1.366198 75%: 1.516507
   Avg: 1.389540
  Total Seq Acc: 11.111%
    Avg Seq Acc: 73.853%
Epoch: 149, Accuracy: 2.71%                                             
  Loss: 25%: 1.225038 50%: 1.355762 75%: 1.507573
   Avg: 1.380100
  Total Seq Acc: 11.090%
    Avg Seq Acc: 73.963%
Epoch: 150, Accuracy: 2.76%                                             
  Loss: 25%: 

In [6]:
# Training run with the sequence-accuracy target lowered to 0.75.
# NOTE(review): the output below again starts at epoch 146, so this run presumably
# restarted from the same checkpoint as another 146-150 run — verify before comparing logs.
# 8190 = 585 * 14 (multiple of 14 nearest 8192); rebinds the 14 from the dataset cell.
# How the training loop uses this value is not visible here — TODO confirm.
ACCUMULATION_STEPS = 8190 # *14 close to 8192
# validation=True call; output reports 6808 vocabulary and 50000 grammar items.
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
# Training call; output reports 6808 vocabulary and 450000 grammar items.
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
# First argument appears to be the epoch budget; the targets are presumably early-stop thresholds.
trainFor(5, target_p50_loss=1.0, target_seq_acc=0.75)
# (473mins) — presumably the wall-clock duration of this run

Concepts loaded;
    - vocabulary: 6808
    - grammar: 50000
  Total: 56809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 450000
  Total: 456809
Epoch: 146, Accuracy: 2.58%                                             
  Loss: 25%: 1.306898 50%: 1.460468 75%: 1.655614
   Avg: 1.516379
  Total Seq Acc: 11.018%
    Avg Seq Acc: 73.374%
Epoch: 147, Accuracy: 2.69%                                             
  Loss: 25%: 1.245178 50%: 1.380684 75%: 1.530738
   Avg: 1.402660
  Total Seq Acc: 10.984%
    Avg Seq Acc: 73.345%
Epoch: 148, Accuracy: 2.70%                                             
  Loss: 25%: 1.235114 50%: 1.368473 75%: 1.519865
   Avg: 1.392687
  Total Seq Acc: 11.002%
    Avg Seq Acc: 73.712%
Epoch: 149, Accuracy: 2.73%                                             
  Loss: 25%: 1.224935 50%: 1.357892 75%: 1.509347
   Avg: 1.380911
  Total Seq Acc: 11.109%
    Avg Seq Acc: 73.988%
Epoch: 150, Accuracy: 2.77%                                             
  Loss: 25%: 

In [5]:
# Training run with budget 5 and sequence-accuracy target 0.75.
# The output below shows only four epochs (146-149), ending at Avg Seq Acc 75.726%,
# which is consistent with an early stop once the 0.75 target was reached — presumably;
# confirm against trainFor.
# 8190 = 585 * 14 (multiple of 14 nearest 8192); rebinds the 14 from the dataset cell.
# TODO confirm how the training loop consumes this value.
ACCUMULATION_STEPS = 8190 # *14 close to 8192
# validation=True call; output reports 6808 vocabulary and 50000 grammar items.
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
# Training call; output reports 6808 vocabulary and 450000 grammar items.
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
# NOTE(review): epoch 146 here logs a noticeably higher loss (p50 2.17) than the other
# runs that start at 146 (p50 ~1.45) — worth checking what state this run started from.
trainFor(5, target_p50_loss=1.0, target_seq_acc=0.75)
# (380mins) — presumably the wall-clock duration of this run

Concepts loaded;
    - vocabulary: 6808
    - grammar: 50000
  Total: 56809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 450000
  Total: 456809
Epoch: 146, Accuracy: 1.98%                                             
  Loss: 25%: 1.880580 50%: 2.172355 75%: 2.669309
   Avg: 2.395109
  Total Seq Acc: 9.453%
    Avg Seq Acc: 65.815%
Epoch: 147, Accuracy: 2.14%                                             
  Loss: 25%: 1.417541 50%: 1.556936 75%: 1.704864
   Avg: 1.571578
  Total Seq Acc: 10.039%
    Avg Seq Acc: 71.823%
Epoch: 148, Accuracy: 2.34%                                             
  Loss: 25%: 1.283669 50%: 1.408078 75%: 1.546098
   Avg: 1.429610
  Total Seq Acc: 10.164%
    Avg Seq Acc: 73.131%
Epoch: 149, Accuracy: 2.45%                                             
  Loss: 25%: 1.220210 50%: 1.345649 75%: 1.486402
   Avg: 1.367570
  Total Seq Acc: 10.606%
    Avg Seq Acc: 75.726%


In [6]:
# Continuation run: the output below picks up at epoch 150 and runs through 154
# (the last epoch's log line is cut off). Sequence-accuracy target raised to 0.80.
# 8190 = 585 * 14 (multiple of 14 nearest 8192); rebinds the 14 from the dataset cell.
# TODO confirm how the training loop consumes this value.
ACCUMULATION_STEPS = 8190 # *14 close to 8192
# validation=True call; output reports 6808 vocabulary and 50000 grammar items.
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
# Training call; output reports 6808 vocabulary and 450000 grammar items.
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
# First argument appears to be the epoch budget; the targets are presumably early-stop
# thresholds on median loss and average sequence accuracy — confirm against trainFor.
trainFor(5, target_p50_loss=1.0, target_seq_acc=0.80)
# (491mins) — presumably the wall-clock duration of this run

Concepts loaded;
    - vocabulary: 6808
    - grammar: 50000
  Total: 56809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 450000
  Total: 456809
Epoch: 150, Accuracy: 2.52%                                             
  Loss: 25%: 1.173243 50%: 1.296289 75%: 1.433404
   Avg: 1.319219
  Total Seq Acc: 10.685%
    Avg Seq Acc: 76.539%
Epoch: 151, Accuracy: 2.57%                                             
  Loss: 25%: 1.140962 50%: 1.263296 75%: 1.400819
   Avg: 1.286998
  Total Seq Acc: 10.835%
    Avg Seq Acc: 77.294%
Epoch: 152, Accuracy: 2.63%                                             
  Loss: 25%: 1.108131 50%: 1.230870 75%: 1.367448
   Avg: 1.255032
  Total Seq Acc: 10.919%
    Avg Seq Acc: 78.192%
Epoch: 153, Accuracy: 2.71%                                             
  Loss: 25%: 1.084787 50%: 1.205648 75%: 1.342836
   Avg: 1.230732
  Total Seq Acc: 11.062%
    Avg Seq Acc: 78.694%
Epoch: 154, Accuracy: 2.74%                                             
  Loss: 25%: 

In [5]:
# Training run with budget 6 and sequence-accuracy target 0.80.
# The output below shows four epochs (155-158), ending at Avg Seq Acc 80.388%, which is
# consistent with an early stop on the 0.80 target — presumably; confirm against trainFor.
# 8190 = 585 * 14 (multiple of 14 nearest 8192); rebinds the 14 from the dataset cell.
# TODO confirm how the training loop consumes this value.
ACCUMULATION_STEPS = 8190 # *14 close to 8192
# validation=True call; output reports 6808 vocabulary and 50000 grammar items.
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
# Training call; output reports 6808 vocabulary and 450000 grammar items.
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
trainFor(6, target_p50_loss=1.0, target_seq_acc=0.80)
# (380mins) — presumably the wall-clock duration of this run

Concepts loaded;
    - vocabulary: 6808
    - grammar: 50000
  Total: 56809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 450000
  Total: 456809
Epoch: 155, Accuracy: 2.61%                                             
  Loss: 25%: 1.118059 50%: 1.267285 75%: 1.468731
   Avg: 1.357771
  Total Seq Acc: 10.947%
    Avg Seq Acc: 78.352%
Epoch: 156, Accuracy: 2.78%                                             
  Loss: 25%: 1.036225 50%: 1.155239 75%: 1.290535
   Avg: 1.180834
  Total Seq Acc: 11.178%
    Avg Seq Acc: 79.604%
Epoch: 157, Accuracy: 2.82%                                             
  Loss: 25%: 1.021374 50%: 1.140734 75%: 1.279699
   Avg: 1.168246
  Total Seq Acc: 11.236%
    Avg Seq Acc: 79.959%
Epoch: 158, Accuracy: 2.86%                                             
  Loss: 25%: 1.012528 50%: 1.128791 75%: 1.262950
   Avg: 1.155522
  Total Seq Acc: 11.326%
    Avg Seq Acc: 80.388%


In [6]:
# Final run: the budget passed to trainFor is the gap between 160 and start_epoch.
# 8190 = 585 * 14 (multiple of 14 nearest 8192); rebinds the 14 from the dataset cell.
# TODO confirm how the training loop consumes this value.
ACCUMULATION_STEPS = 8190 # *14 close to 8192
# validation=True call; output reports 6808 vocabulary and 50000 grammar items.
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
# Training call; output reports 6808 vocabulary and 450000 grammar items.
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
# start_epoch comes from the first cell. The output below (epochs 159-160) implies it was
# 158 when this ran; the first cell now sets it to 160, so a re-run passes 0 here.
# NOTE(review): a start_epoch above 160 would pass a negative budget — check how trainFor
# handles that. 160 is a magic number; consider naming it in the config cell.
trainFor(160-start_epoch, target_p50_loss=1.0, target_seq_acc=0.90)
# (190mins) — presumably the wall-clock duration of this run

Concepts loaded;
    - vocabulary: 6808
    - grammar: 50000
  Total: 56809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 450000
  Total: 456809
Epoch: 159, Accuracy: 2.88%                                             
  Loss: 25%: 0.999729 50%: 1.115270 75%: 1.247488
   Avg: 1.142749
  Total Seq Acc: 11.331%
    Avg Seq Acc: 80.665%
Epoch: 160, Accuracy: 2.92%                                             
  Loss: 25%: 0.983621 50%: 1.100303 75%: 1.237552
   Avg: 1.128711
  Total Seq Acc: 11.282%
    Avg Seq Acc: 80.531%
