In [1]:
# Init/Load model
from transformers import BartForConditionalGeneration, BartTokenizer
import numpy as np
import torch
import os

device = "cuda"

# Define a directory to save the models
SAVE_DIR = '../saved_models'
if not os.path.exists(SAVE_DIR):
    os.makedirs(SAVE_DIR)

start_epoch = 110

class SimpleBART(torch.nn.Module):
    def __init__(self):
        super(SimpleBART, self).__init__()
        if start_epoch > 0:
            self.bart = BartForConditionalGeneration.from_pretrained(os.path.join(SAVE_DIR, f'epoch_{start_epoch}'))
            print(f'Loaded epoch_{start_epoch}')
        else:
            self.bart = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
            print('Loaded facebook/bart-base')
        self.tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

    def forward(self, input_ids, attention_mask):
        return self.bart(input_ids=input_ids, attention_mask=attention_mask)


model = SimpleBART().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()

Loaded epoch_105


In [2]:
# Load Raw Datasets
from torch.utils.data import Dataset, DataLoader
import csv

ACCUMULATION_STEPS = 14
BATCH_SIZE = 14 # best performing batch size so far (in execution performance)
DATA_SIZE = 0

class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=200):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.pad_token_id = tokenizer.pad_token_id
        self.start_token_id = tokenizer.cls_token_id
        self.end_token_id = tokenizer.eos_token_id

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text, tokens = self.data[idx]

        inputs = self.tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=self.max_length)

        # Add start and end tokens and then pad
        tokens = [self.start_token_id] + tokens + [self.end_token_id]
        tokens_padded = [self.pad_token_id] * self.max_length
        tokens_padded[:len(tokens)] = tokens
        tokens_padded[len(tokens):] = [self.pad_token_id] * (self.max_length - len(tokens))

        return inputs["input_ids"].squeeze(0), inputs["attention_mask"].squeeze(0), torch.tensor(tokens_padded, dtype=torch.long)


def load_data_from_csv(file_path):
    with open(file_path, 'r') as file:
        reader = csv.reader(file)
        data = [(row[0], [int(tok) for tok in row[1].split(",")]) for row in reader]

    return data

def apply_concept(params, validation=False):
    global validationLoader
    global dataloader
    global DATA_SIZE

    merged_data = load_data_from_csv(f"../concept/egg.csv")

    print("Concepts loaded;")
    for file_name, percentage in params.items():
        data = load_data_from_csv(f"../concept/{file_name}.csv")
        cutoff = int(len(data) * percentage)

        if validation:
            loaded_data = data[-cutoff:]
        else:
            loaded_data = data[:cutoff]

        print(f"    - {file_name}: {len(loaded_data)}")
        merged_data.extend(loaded_data)

    DATA_SIZE = len(merged_data)
    dataset = CustomDataset(merged_data, model.tokenizer)
    if validation:
        validationLoader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
    else:
        dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
    print(f'  Total: {DATA_SIZE}')

In [3]:
# Setup Validator
def validate():
    EOS_TOKEN_ID = model.tokenizer.eos_token_id

    model.eval()

    # Initialize counters for accuracy calculation
    total_correct_sequences = 0
    total_sequences = 0

    # Initialize counters for average sequence accuracy within the mask
    total_accuracy = 0

    with torch.no_grad():
        for batch_idx, (input_ids, attention_mask, targets) in enumerate(validationLoader):
            input_ids, attention_mask, targets = input_ids.to(device), attention_mask.to(device), targets.to(device)
            outputs = model(input_ids, attention_mask)
            logits = outputs.logits

            # Identify where the EOS token is in the target sequence
            eos_positions = (targets == EOS_TOKEN_ID).cumsum(dim=1).type(torch.bool)
            mask = ~eos_positions | (targets == EOS_TOKEN_ID)

            _, predicted = logits.max(2)
            correct_sequences = ((predicted == targets) | ~mask).all(dim=1).float().sum().item()
            total_sequences += targets.size(0)
            total_correct_sequences += correct_sequences

            # Compute the accuracy for each sequence
            correct_tokens_per_sequence = ((predicted == targets) & mask).float().sum(dim=1)
            total_tokens_per_sequence = mask.float().sum(dim=1)
            total_accuracy += (correct_tokens_per_sequence / total_tokens_per_sequence).sum().item()

    # Compute and print the accuracy for the entire validation dataset
    validation_accuracy = total_correct_sequences / total_sequences
    print(f"  Total Seq Acc: {validation_accuracy*100:.3f}%")
    avg_accuracy = total_accuracy / total_sequences
    print(f"    Avg Seq Acc: {avg_accuracy*100:.3f}%")

    return avg_accuracy, validation_accuracy

In [4]:
# Setup trainer
def save_model():
    global start_epoch
    model_save_path = os.path.join(SAVE_DIR, f'epoch_{start_epoch}')
    model.bart.save_pretrained(model_save_path)

def trainFor(num_epochs, target_loss=0, target_p50_loss=0, target_acc=1.0, target_seq_acc=1.0):
    global start_epoch

    EOS_TOKEN_ID = model.tokenizer.eos_token_id
    acc_batch = int(ACCUMULATION_STEPS / BATCH_SIZE)
    total_batches = len(dataloader)

    for epoch in range(start_epoch+1, start_epoch+num_epochs+1):
        start_epoch = epoch
        model.train()

        # Resetting the accumulated gradients
        optimizer.zero_grad()

        # Initialize counters for accuracy calculation
        total_correct_sequences = 0
        total_sequences = 0
        cumulative_loss = 0.0

        # Initialize list to store batch losses
        batch_losses = []

        for batch_idx, (input_ids, attention_mask, targets) in enumerate(dataloader):

            input_ids, attention_mask, targets = input_ids.to(device), attention_mask.to(device), targets.to(device)
            outputs = model(input_ids, attention_mask)
            logits = outputs.logits

            # Identify where the EOS token is in the target sequence
            eos_positions = (targets == EOS_TOKEN_ID).cumsum(dim=1).type(torch.bool)
            mask = ~eos_positions | (targets == EOS_TOKEN_ID)

            # Apply mask to filter out tokens after the EOS token for loss computation
            active_loss = mask.view(-1).bool()
            active_logits = logits.view(-1, logits.size(-1))[active_loss]
            active_labels = targets.view(-1)[active_loss]
            loss = criterion(active_logits, active_labels)

            _, predicted = logits.max(2)
            correct_sequences = ((predicted == targets) | ~mask).all(dim=1).float().sum().item()
            total_sequences += targets.size(0)
            total_correct_sequences += correct_sequences

            # Accumulate the gradients
            loss.backward()
            loss_val = loss.item()

            cumulative_loss += loss_val
            batch_losses.append(loss_val)

            isLast = batch_idx == len(dataloader) - 1

            # Only perform an optimization step every ACCUMULATION_STEPS
            if isLast or batch_idx % acc_batch == 0:
                optimizer.step()
                optimizer.zero_grad()

            print(f"\rEpoch: {epoch}, Batch: {batch_idx} of {total_batches}, loss: {loss_val:.6f}      ", end='')


        # Compute and print the accuracy for the entire epoch
        epoch_accuracy = total_correct_sequences / total_sequences
        cumulative_loss = cumulative_loss / len(batch_losses)
        p25_loss = np.percentile(batch_losses, 25)
        p50_loss = np.percentile(batch_losses, 50)
        p75_loss = np.percentile(batch_losses, 75)
        print(f"\rEpoch: {epoch}, Accuracy: {epoch_accuracy*100:.2f}%                                             ")
        print(f"  Loss: 25%: {p25_loss:.6f} 50%: {p50_loss:.6f} 75%: {p75_loss:.6f}\n   Avg: {cumulative_loss:.6f}")

        # Write loss values to loss.csv
        percentiles = [np.percentile(batch_losses, i) for i in range(101)]
        with open('loss.csv', mode='a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([epoch] + percentiles)

        if epoch % 10 == 0: # Save the model
            save_model()

        seq_acc, total_acc = validate()

        if total_acc >= target_acc:
            break
        if cumulative_loss <= target_loss:
            break
        if p50_loss <= target_p50_loss:
            break

        if seq_acc >= target_seq_acc:
            break

    # Make sure last epoch is always saved
    save_model()

In [5]:
ACCUMULATION_STEPS = BATCH_SIZE # pure memorisation so accumulation won't help
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "noise": 0.02})
trainFor(50, target_seq_acc=0.50)
# (34mins)

Concepts loaded;
    - vocabulary: 6808
    - noise: 15614
  Total: 22423
Concepts loaded;
    - vocabulary: 6808
    - noise: 3122
  Total: 9931
Epoch: 1, Accuracy: 0.00%                                             
  Loss: 25%: 4.403065 50%: 4.849018 75%: 5.296900
   Avg: 4.866282
  Total Seq Acc: 0.004%
    Avg Seq Acc: 39.869%
Epoch: 2, Accuracy: 0.04%                                             
  Loss: 25%: 4.114165 50%: 4.543126 75%: 4.914825
   Avg: 4.488807
  Total Seq Acc: 0.000%
    Avg Seq Acc: 39.873%
Epoch: 3, Accuracy: 0.04%                                             
  Loss: 25%: 4.032996 50%: 4.430487 75%: 4.820580
   Avg: 4.396551
  Total Seq Acc: 0.076%
    Avg Seq Acc: 39.906%
Epoch: 4, Accuracy: 0.06%                                             
  Loss: 25%: 3.972137 50%: 4.378263 75%: 4.781254
   Avg: 4.345113
  Total Seq Acc: 0.120%
    Avg Seq Acc: 39.934%
Epoch: 5, Accuracy: 0.12%                                             
  Loss: 25%: 3.932351 50%: 4.367048

In [6]:
ACCUMULATION_STEPS = BATCH_SIZE # pure memorisation so accumulation won't help
apply_concept({"vocabulary": 1.0, "noise": 0.04})
trainFor(50, target_seq_acc=0.60)
# (5mins)

Concepts loaded;
    - vocabulary: 6808
    - noise: 6245
  Total: 13054
Epoch: 10, Accuracy: 13.81%                                             
  Loss: 25%: 2.656691 50%: 2.988907 75%: 3.271646
   Avg: 2.973416
  Total Seq Acc: 19.721%
    Avg Seq Acc: 68.222%


In [7]:
ACCUMULATION_STEPS = 28 # *14 close to 32
apply_concept({"vocabulary": 1.0, "noise": 0.06})
trainFor(50, target_seq_acc=0.65)
# (5mins)

Concepts loaded;
    - vocabulary: 6808
    - noise: 9368
  Total: 16177
Epoch: 11, Accuracy: 17.17%                                             
  Loss: 25%: 2.074187 50%: 2.373098 75%: 2.669538
   Avg: 2.376195
  Total Seq Acc: 27.387%
    Avg Seq Acc: 76.537%


Aiming for:
$$
\begin{align*}
  \frac{unique}{tokens} &= \frac{4174}{6808} = 61.31\%  & \text{sign pairs to text only used once} \\
  \frac{text}{tokens}   &= \frac{5342}{6808}  = 78.47\% & \text{sign pairs to text unique text} \\
  & & \text{unique meaning the text is only used for one tokenID}
\end{align*}
$$

i.e. there are six different signs which can be used for "present"  
which means we're actually aiming for $96\%$ effective accuracy $\frac{75\%}{78\%}$

But we don't want to over-fit either  
Hence why we slowly introduce new concepts while still memorising vocabulary

`target_seq_acc` includes the `EOS` token, which we of course we want to be right
So our target should be $\frac{61.31\% + 100\%}{2} = 80.65\%$

In [5]:
ACCUMULATION_STEPS = 70 # *14 close to 64
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "noise": 0.08})
trainFor(50, target_seq_acc=0.80655)
# (11mins)

Concepts loaded;
    - vocabulary: 6808
    - noise: 15614
  Total: 22423
Concepts loaded;
    - vocabulary: 6808
    - noise: 12491
  Total: 19300
Epoch: 12, Accuracy: 20.01%                                             
  Loss: 25%: 1.637285 50%: 1.861813 75%: 2.106627
   Avg: 1.887390
  Total Seq Acc: 32.244%
    Avg Seq Acc: 80.065%
Epoch: 13, Accuracy: 23.50%                                             
  Loss: 25%: 1.358196 50%: 1.585695 75%: 1.840750
   Avg: 1.606165
  Total Seq Acc: 34.451%
    Avg Seq Acc: 81.583%


The pure memorisation is over, as the vocabulary has been sufficiently learnt  
While it's not perfect, it will improve further over the later training, as the vocab remains in the training set

In [6]:
ACCUMULATION_STEPS = 1022 # *14 close to 1024
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "noise": 0.225})
trainFor(50, target_seq_acc=0.6)
# (10mins)

Concepts loaded;
    - vocabulary: 6808
    - noise: 15614
  Total: 22423
Concepts loaded;
    - vocabulary: 6808
    - noise: 35132
  Total: 41941
Epoch: 14, Accuracy: 18.78%                                             
  Loss: 25%: 1.329572 50%: 1.558345 75%: 1.814157
   Avg: 1.585541
  Total Seq Acc: 37.921%
    Avg Seq Acc: 83.464%


In [7]:
ACCUMULATION_STEPS = 1022 # *14 close to 1024
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "noise": 0.45})
trainFor(50, target_seq_acc=0.7)
# (16mins)

Concepts loaded;
    - vocabulary: 6808
    - noise: 15614
  Total: 22423
Concepts loaded;
    - vocabulary: 6808
    - noise: 70264
  Total: 77073
Epoch: 15, Accuracy: 20.15%                                             
  Loss: 25%: 1.079807 50%: 1.281758 75%: 1.525836
   Avg: 1.319893
  Total Seq Acc: 41.399%
    Avg Seq Acc: 85.418%


In [8]:
ACCUMULATION_STEPS = 1022 # *14 close to 1024
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "noise": 0.9})
trainFor(50, target_seq_acc=0.75)
# (30mins)

Concepts loaded;
    - vocabulary: 6808
    - noise: 15614
  Total: 22423
Concepts loaded;
    - vocabulary: 6808
    - noise: 140528
  Total: 147337
Epoch: 16, Accuracy: 23.13%                                             
  Loss: 25%: 0.838045 50%: 1.006275 75%: 1.225789
   Avg: 1.055101
  Total Seq Acc: 44.762%
    Avg Seq Acc: 87.021%


In [9]:
ACCUMULATION_STEPS = 1022 # *14 close to 1024
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "noise": 0.9})
trainFor(50, target_p50_loss=0.2, target_seq_acc=0.90)
# (533mins)

Concepts loaded;
    - vocabulary: 6808
    - noise: 15614
  Total: 22423
Concepts loaded;
    - vocabulary: 6808
    - noise: 140528
  Total: 147337
Epoch: 17, Accuracy: 26.92%                                             
  Loss: 25%: 0.662440 50%: 0.804522 75%: 0.992638
   Avg: 0.856268
  Total Seq Acc: 45.779%
    Avg Seq Acc: 87.521%
Epoch: 18, Accuracy: 29.49%                                             
  Loss: 25%: 0.562557 50%: 0.685241 75%: 0.851517
   Avg: 0.737888
  Total Seq Acc: 47.928%
    Avg Seq Acc: 88.370%
Epoch: 19, Accuracy: 31.52%                                             
  Loss: 25%: 0.491418 50%: 0.597656 75%: 0.744295
   Avg: 0.650201
  Total Seq Acc: 48.513%
    Avg Seq Acc: 88.522%
Epoch: 20, Accuracy: 32.73%                                             
  Loss: 25%: 0.447802 50%: 0.545643 75%: 0.681414
   Avg: 0.597653
  Total Seq Acc: 49.436%
    Avg Seq Acc: 88.904%
Epoch: 21, Accuracy: 34.06%                                             
  Loss: 25%: 0.41

In [10]:
apply_concept({"vocabulary": 1.0, "noise": 0.1}, validation=True)
validate()
apply_concept({"vocabulary": 1.0}, validation=True)
validate()
apply_concept({"noise": 0.1}, validation=True)
validate()

Concepts loaded;
    - vocabulary: 6808
    - noise: 15614
  Total: 22423
  Total Seq Acc: 52.165%
    Avg Seq Acc: 90.004%
Concepts loaded;
    - vocabulary: 6808
  Total: 6809
  Total Seq Acc: 77.486%
    Avg Seq Acc: 92.470%
Concepts loaded;
    - noise: 15614
  Total: 15615
  Total Seq Acc: 41.121%
    Avg Seq Acc: 88.925%


(0.88925186383812, 0.4112071725904579)

Now that the model has successfully applied a vocabulary swap we must start the grammatical transformation training.  
First we will start by just teaching it which words to ignore as they do not exist in Auslan

In [11]:
ACCUMULATION_STEPS = 4088 # *14 close to 4096
apply_concept({"vocabulary": 1.0, "word-masking": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "word-masking": 0.45})
trainFor(20, target_p50_loss=0.2, target_seq_acc=0.70)
# (57mins)

Concepts loaded;
    - vocabulary: 6808
    - word-masking: 10000
  Total: 16809
Concepts loaded;
    - vocabulary: 6808
    - word-masking: 45000
  Total: 51809
Epoch: 35, Accuracy: 9.80%                                             
  Loss: 25%: 5.300752 50%: 5.609476 75%: 5.946348
   Avg: 5.744106
  Total Seq Acc: 31.804%
    Avg Seq Acc: 58.800%
Epoch: 36, Accuracy: 9.81%                                             
  Loss: 25%: 4.243643 50%: 4.516359 75%: 4.793437
   Avg: 4.508092
  Total Seq Acc: 31.638%
    Avg Seq Acc: 64.020%
Epoch: 37, Accuracy: 9.89%                                             
  Loss: 25%: 3.599592 50%: 3.846125 75%: 4.072579
   Avg: 3.822467
  Total Seq Acc: 32.019%
    Avg Seq Acc: 64.472%
Epoch: 38, Accuracy: 10.03%                                             
  Loss: 25%: 3.236534 50%: 3.473460 75%: 3.700928
   Avg: 3.461110
  Total Seq Acc: 31.751%
    Avg Seq Acc: 65.270%
Epoch: 39, Accuracy: 10.03%                                             
  Loss: 

In [12]:
ACCUMULATION_STEPS = 4088 # *14 close to 4096
apply_concept({"vocabulary": 1.0, "word-masking": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "word-masking": 0.9})
trainFor(20, target_p50_loss=0.2, target_seq_acc=0.9)
# (mins)

Concepts loaded;
    - vocabulary: 6808
    - word-masking: 10000
  Total: 16809
Concepts loaded;
    - vocabulary: 6808
    - word-masking: 90000
  Total: 96809
Epoch: 40, Accuracy: 5.95%                                             
  Loss: 25%: 2.783838 50%: 3.019561 75%: 3.248209
   Avg: 3.012249
  Total Seq Acc: 31.828%
    Avg Seq Acc: 69.566%
Epoch: 41, Accuracy: 6.10%                                             
  Loss: 25%: 2.493118 50%: 2.705811 75%: 2.920518
   Avg: 2.705100
  Total Seq Acc: 32.019%
    Avg Seq Acc: 72.888%
Epoch: 42, Accuracy: 6.09%                                             
  Loss: 25%: 2.316473 50%: 2.525594 75%: 2.723895
   Avg: 2.516568
  Total Seq Acc: 32.310%
    Avg Seq Acc: 76.054%
Epoch: 43, Accuracy: 6.19%                                             
  Loss: 25%: 2.155381 50%: 2.360453 75%: 2.556274
   Avg: 2.357171
  Total Seq Acc: 32.013%
    Avg Seq Acc: 75.060%
Epoch: 44, Accuracy: 6.24%                                             
  Loss: 25

In [13]:
save_model()

Cancelled early because we don't want to put too much focus into just the word masking  
The model already has a strong grasp on the concept and can start learning word arranging

In [5]:
ACCUMULATION_STEPS = 8190 # *14 close to 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.5})
trainFor(10, target_p50_loss=0.2, target_seq_acc=0.5)
# (551mins)

Concepts loaded;
    - vocabulary: 6808
    - grammar: 50000
  Total: 56809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 250000
  Total: 256809
Epoch: 45, Accuracy: 1.74%                                             
  Loss: 25%: 5.043064 50%: 5.283244 75%: 5.601397
   Avg: 5.440462
  Total Seq Acc: 8.914%
    Avg Seq Acc: 30.513%
Epoch: 46, Accuracy: 1.53%                                             
  Loss: 25%: 4.573521 50%: 4.699400 75%: 4.836210
   Avg: 4.709103
  Total Seq Acc: 8.509%
    Avg Seq Acc: 31.355%
Epoch: 47, Accuracy: 1.31%                                             
  Loss: 25%: 4.356220 50%: 4.481114 75%: 4.614680
   Avg: 4.489703
  Total Seq Acc: 7.810%
    Avg Seq Acc: 32.108%
Epoch: 48, Accuracy: 1.12%                                             
  Loss: 25%: 4.191215 50%: 4.323497 75%: 4.463085
   Avg: 4.331535
  Total Seq Acc: 7.326%
    Avg Seq Acc: 32.913%
Epoch: 49, Accuracy: 1.14%                                             
  Loss: 25%: 4.053386 

In [5]:
ACCUMULATION_STEPS = 8190 # *14 close to 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.6})
trainFor(10, target_p50_loss=0.2, target_seq_acc=0.6)
# (649mins)

Concepts loaded;
    - vocabulary: 6808
    - grammar: 50000
  Total: 56809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 300000
  Total: 306809
Epoch: 55, Accuracy: 1.08%                                             
  Loss: 25%: 3.428736 50%: 3.605553 75%: 3.783802
   Avg: 3.614931
  Total Seq Acc: 7.166%
    Avg Seq Acc: 40.403%
Epoch: 56, Accuracy: 1.06%                                             
  Loss: 25%: 3.335988 50%: 3.500705 75%: 3.674894
   Avg: 3.511020
  Total Seq Acc: 7.180%
    Avg Seq Acc: 40.782%
Epoch: 57, Accuracy: 1.06%                                             
  Loss: 25%: 3.262367 50%: 3.426763 75%: 3.606675
   Avg: 3.441214
  Total Seq Acc: 7.090%
    Avg Seq Acc: 41.457%
Epoch: 58, Accuracy: 1.04%                                             
  Loss: 25%: 3.187698 50%: 3.354445 75%: 3.538030
   Avg: 3.369233
  Total Seq Acc: 6.971%
    Avg Seq Acc: 42.178%
Epoch: 59, Accuracy: 1.01%                                             
  Loss: 25%: 3.112078 

In [5]:
ACCUMULATION_STEPS = 8190 # *14 close to 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
trainFor(6, target_p50_loss=0.2, target_seq_acc=0.75)
# (568mins)

Concepts loaded;
    - vocabulary: 6808
    - grammar: 50000
  Total: 56809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 450000
  Total: 456809
Epoch: 65, Accuracy: 0.70%                                             
  Loss: 25%: 2.682333 50%: 2.871508 75%: 3.074587
   Avg: 2.884193
  Total Seq Acc: 6.744%
    Avg Seq Acc: 48.775%
Epoch: 66, Accuracy: 0.73%                                             
  Loss: 25%: 2.602419 50%: 2.783997 75%: 2.978259
   Avg: 2.797313
  Total Seq Acc: 6.784%
    Avg Seq Acc: 49.538%
Epoch: 67, Accuracy: 0.73%                                             
  Loss: 25%: 2.537791 50%: 2.718444 75%: 2.912007
   Avg: 2.731926
  Total Seq Acc: 6.883%
    Avg Seq Acc: 50.525%
Epoch: 68, Accuracy: 0.74%                                             
  Loss: 25%: 2.467539 50%: 2.650765 75%: 2.846792
   Avg: 2.663782
  Total Seq Acc: 7.027%
    Avg Seq Acc: 51.586%
Epoch: 69, Accuracy: 0.76%                                             
  Loss: 25%: 2.397745 

In [5]:
ACCUMULATION_STEPS = 8190 # *14 close to 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
trainFor(5, target_p50_loss=0.2, target_seq_acc=0.8)
# (475mins)

Concepts loaded;
    - vocabulary: 6808
    - grammar: 50000
  Total: 56809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 450000
  Total: 456809
Epoch: 71, Accuracy: 0.81%                                             
  Loss: 25%: 2.314888 50%: 2.502601 75%: 2.708355
   Avg: 2.521973
  Total Seq Acc: 7.245%
    Avg Seq Acc: 54.242%
Epoch: 72, Accuracy: 0.84%                                             
  Loss: 25%: 2.241149 50%: 2.418887 75%: 2.607680
   Avg: 2.430881
  Total Seq Acc: 7.272%
    Avg Seq Acc: 54.987%
Epoch: 73, Accuracy: 0.87%                                             
  Loss: 25%: 2.194081 50%: 2.370080 75%: 2.560852
   Avg: 2.383077
  Total Seq Acc: 7.323%
    Avg Seq Acc: 55.757%
Epoch: 74, Accuracy: 0.89%                                             
  Loss: 25%: 2.140822 50%: 2.313307 75%: 2.498226
   Avg: 2.325857
  Total Seq Acc: 7.409%
    Avg Seq Acc: 56.885%
Epoch: 75, Accuracy: 0.92%                                             
  Loss: 25%: 2.063219 

In [5]:
ACCUMULATION_STEPS = 8190 # *14 close to 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
trainFor(5, target_p50_loss=0.2, target_seq_acc=0.8)
# (474mins)

Concepts loaded;
    - vocabulary: 6808
    - grammar: 50000
  Total: 56809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 450000
  Total: 456809
Epoch: 76, Accuracy: 0.92%                                             
  Loss: 25%: 2.046665 50%: 2.215688 75%: 2.399971
   Avg: 2.231916
  Total Seq Acc: 7.506%
    Avg Seq Acc: 58.765%
Epoch: 77, Accuracy: 0.96%                                             
  Loss: 25%: 1.970358 50%: 2.127085 75%: 2.290505
   Avg: 2.136366
  Total Seq Acc: 7.543%
    Avg Seq Acc: 59.686%
Epoch: 78, Accuracy: 0.97%                                             
  Loss: 25%: 1.921430 50%: 2.077863 75%: 2.235225
   Avg: 2.084244
  Total Seq Acc: 7.601%
    Avg Seq Acc: 60.986%
Epoch: 79, Accuracy: 0.98%                                             
  Loss: 25%: 1.871995 50%: 2.023464 75%: 2.180216
   Avg: 2.030972
  Total Seq Acc: 7.596%
    Avg Seq Acc: 61.886%
Epoch: 80, Accuracy: 1.01%                                             
  Loss: 25%: 1.819543 

In [5]:
ACCUMULATION_STEPS = 8190 # *14 close to 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
trainFor(5, target_p50_loss=0.2, target_seq_acc=0.8)
# (474mins)

Concepts loaded;
    - vocabulary: 6808
    - grammar: 50000
  Total: 56809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 450000
  Total: 456809
Epoch: 81, Accuracy: 1.04%                                             
  Loss: 25%: 1.829469 50%: 1.987885 75%: 2.159871
   Avg: 2.007925
  Total Seq Acc: 7.758%
    Avg Seq Acc: 63.655%
Epoch: 82, Accuracy: 1.06%                                             
  Loss: 25%: 1.761185 50%: 1.906973 75%: 2.061434
   Avg: 1.916527
  Total Seq Acc: 7.766%
    Avg Seq Acc: 64.108%
Epoch: 83, Accuracy: 1.08%                                             
  Loss: 25%: 1.730055 50%: 1.877588 75%: 2.032675
   Avg: 1.886860
  Total Seq Acc: 7.779%
    Avg Seq Acc: 64.661%
Epoch: 84, Accuracy: 1.11%                                             
  Loss: 25%: 1.695751 50%: 1.840608 75%: 1.996137
   Avg: 1.850816
  Total Seq Acc: 7.888%
    Avg Seq Acc: 65.719%
Epoch: 85, Accuracy: 1.14%                                             
  Loss: 25%: 1.657749 

In [5]:
ACCUMULATION_STEPS = 8190 # *14 close to 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
trainFor(5, target_p50_loss=0.2, target_seq_acc=0.8)
# (473mins)

Concepts loaded;
    - vocabulary: 6808
    - grammar: 50000
  Total: 56809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 450000
  Total: 456809
Epoch: 86, Accuracy: 1.15%                                             
  Loss: 25%: 1.675644 50%: 1.827500 75%: 1.994921
   Avg: 1.849289
  Total Seq Acc: 8.000%
    Avg Seq Acc: 66.860%
Epoch: 87, Accuracy: 1.17%                                             
  Loss: 25%: 1.615581 50%: 1.757754 75%: 1.911210
   Avg: 1.769092
  Total Seq Acc: 7.993%
    Avg Seq Acc: 67.050%
Epoch: 88, Accuracy: 1.22%                                             
  Loss: 25%: 1.590176 50%: 1.732046 75%: 1.881348
   Avg: 1.743233
  Total Seq Acc: 8.044%
    Avg Seq Acc: 67.473%
Epoch: 89, Accuracy: 1.25%                                             
  Loss: 25%: 1.562519 50%: 1.704608 75%: 1.854645
   Avg: 1.715583
  Total Seq Acc: 8.192%
    Avg Seq Acc: 68.291%
Epoch: 90, Accuracy: 1.30%                                             
  Loss: 25%: 1.533324 

In [5]:
ACCUMULATION_STEPS = 8190 # *14 close to 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
trainFor(5, target_p50_loss=0.2, target_seq_acc=0.8)
# (474mins)

Concepts loaded;
    - vocabulary: 6808
    - grammar: 50000
  Total: 56809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 450000
  Total: 456809
Epoch: 91, Accuracy: 1.27%                                             
  Loss: 25%: 1.567690 50%: 1.723511 75%: 1.903988
   Avg: 1.762427
  Total Seq Acc: 8.333%
    Avg Seq Acc: 69.191%
Epoch: 92, Accuracy: 1.34%                                             
  Loss: 25%: 1.499688 50%: 1.637657 75%: 1.787199
   Avg: 1.651440
  Total Seq Acc: 8.353%
    Avg Seq Acc: 69.426%
Epoch: 93, Accuracy: 1.36%                                             
  Loss: 25%: 1.480775 50%: 1.618415 75%: 1.769462
   Avg: 1.632100
  Total Seq Acc: 8.456%
    Avg Seq Acc: 70.202%
Epoch: 94, Accuracy: 1.39%                                             
  Loss: 25%: 1.456432 50%: 1.596672 75%: 1.747928
   Avg: 1.610726
  Total Seq Acc: 8.504%
    Avg Seq Acc: 70.455%
Epoch: 95, Accuracy: 1.46%                                             
  Loss: 25%: 1.434069 

In [5]:
ACCUMULATION_STEPS = 8190 # *14 close to 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
trainFor(5, target_p50_loss=1.0, target_seq_acc=0.8)
# (474mins)

Concepts loaded;
    - vocabulary: 6808
    - grammar: 50000
  Total: 56809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 450000
  Total: 456809
Epoch: 96, Accuracy: 1.41%                                             
  Loss: 25%: 1.470657 50%: 1.624199 75%: 1.803982
   Avg: 1.663073
  Total Seq Acc: 8.668%
    Avg Seq Acc: 71.137%
Epoch: 97, Accuracy: 1.50%                                             
  Loss: 25%: 1.406769 50%: 1.543143 75%: 1.693135
   Avg: 1.558744
  Total Seq Acc: 8.726%
    Avg Seq Acc: 71.208%
Epoch: 98, Accuracy: 1.54%                                             
  Loss: 25%: 1.387624 50%: 1.525135 75%: 1.675027
   Avg: 1.540241
  Total Seq Acc: 8.842%
    Avg Seq Acc: 72.147%
Epoch: 99, Accuracy: 1.56%                                             
  Loss: 25%: 1.368311 50%: 1.505243 75%: 1.655955
   Avg: 1.521673
  Total Seq Acc: 8.900%
    Avg Seq Acc: 72.388%
Epoch: 100, Accuracy: 1.63%                                             
  Loss: 25%: 1.350048

In [5]:
ACCUMULATION_STEPS = 8190 # *14 close to 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
trainFor(5, target_p50_loss=1.0, target_seq_acc=0.8)
# (475mins)

Concepts loaded;
    - vocabulary: 6808
    - grammar: 50000
  Total: 56809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 450000
  Total: 456809
Epoch: 101, Accuracy: 1.55%                                             
  Loss: 25%: 1.400047 50%: 1.558946 75%: 1.752922
   Avg: 1.617956
  Total Seq Acc: 8.983%
    Avg Seq Acc: 72.945%
Epoch: 102, Accuracy: 1.65%                                             
  Loss: 25%: 1.326405 50%: 1.460945 75%: 1.609611
   Avg: 1.478250
  Total Seq Acc: 9.132%
    Avg Seq Acc: 73.535%
Epoch: 103, Accuracy: 1.68%                                             
  Loss: 25%: 1.311099 50%: 1.445052 75%: 1.595631
   Avg: 1.463833
  Total Seq Acc: 9.131%
    Avg Seq Acc: 73.605%
Epoch: 104, Accuracy: 1.70%                                             
  Loss: 25%: 1.295619 50%: 1.429284 75%: 1.577475
   Avg: 1.447973
  Total Seq Acc: 9.241%
    Avg Seq Acc: 74.299%
Epoch: 105, Accuracy: 1.75%                                             
  Loss: 25%: 1.27

In [5]:
ACCUMULATION_STEPS = 8190 # *14 close to 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9})
trainFor(5, target_p50_loss=1.0, target_seq_acc=0.8)
# (475mins)

Concepts loaded;
    - vocabulary: 6808
    - grammar: 50000
  Total: 56809
Concepts loaded;
    - vocabulary: 6808
    - grammar: 450000
  Total: 456809
Epoch: 106, Accuracy: 1.70%                                             
  Loss: 25%: 1.326098 50%: 1.478259 75%: 1.661936
   Avg: 1.524481
  Total Seq Acc: 9.233%
    Avg Seq Acc: 73.959%
Epoch: 107, Accuracy: 1.80%                                             
  Loss: 25%: 1.259377 50%: 1.391462 75%: 1.540108
   Avg: 1.411018
  Total Seq Acc: 9.300%
    Avg Seq Acc: 74.631%
Epoch: 108, Accuracy: 1.79%                                             
  Loss: 25%: 1.244987 50%: 1.377174 75%: 1.527523
   Avg: 1.397827
  Total Seq Acc: 9.400%
    Avg Seq Acc: 75.063%
Epoch: 109, Accuracy: 1.88%                                             
  Loss: 25%: 1.234408 50%: 1.364696 75%: 1.512952
   Avg: 1.385449
  Total Seq Acc: 9.425%
    Avg Seq Acc: 75.147%
Epoch: 110, Accuracy: 1.91%                                             
  Loss: 25%: 1.21

In [None]:
ACCUMULATION_STEPS = 8190 # *14 close to 8192
apply_concept({"vocabulary": 1.0, "grammar": 0.1, "word-pairs": 0.1}, validation=True)
apply_concept({"vocabulary": 1.0, "grammar": 0.9, "word-pairs": 0.9})
trainFor(5, target_p50_loss=1.0, target_seq_acc=0.8)
# (mins)