In [None]:
import os
import torch
import transformers
import pandas as pd
from transformers.trainer import *
from transformers import RobertaConfig
from transformers import RobertaForMaskedLM
from tokenizers import ByteLevelBPETokenizer
from transformers import RobertaTokenizerFast
from torch.utils.data import SequentialSampler
from torch.utils.data import Dataset, DataLoader
from transformers import Trainer, TrainingArguments
from transformers import AutoTokenizer, RobertaModel
from transformers import DataCollatorForLanguageModeling

In [None]:
# Mount Google Drive so the data, tokenizer and checkpoint paths under
# /content/drive/MyDrive used by the cells below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder on Drive for this model variant. train_baby_lm reads this global
# to build its per-epoch checkpoint and trainer output paths.
model_path = "/content/drive/MyDrive/VU Thesis/Code/baby_models/V20L4"

In [None]:
def load_data(data_directory_path, skip_empty=False):
  """Read every regular file in a directory into one flat list of lines.

  Files are visited in sorted filename order so the returned list is the same
  on every run and every filesystem (os.listdir makes no ordering promise).
  Each line is stripped of leading/trailing whitespace before being stored.

  Args:
    data_directory_path: Directory whose files are read as UTF-8 text.
      Sub-directories are ignored.
    skip_empty: When True, lines that are empty after stripping are dropped.
      Defaults to False, which keeps them as '' entries.

  Returns:
    A list of stripped lines from all files, file after file.
  """
  all_data = []
  # Sort so that the corpus order does not depend on the filesystem.
  for filename in sorted(os.listdir(data_directory_path)):
    file_path = os.path.join(data_directory_path, filename)
    if not os.path.isfile(file_path):
      continue  # only regular files are read
    print(f'Reading file: {filename}')
    with open(file_path, 'r', encoding='utf-8') as file:
      for line in file:
        text = line.strip()
        if skip_empty and not text:
          continue
        all_data.append(text)
  # The number of collected lines is the list length; no separate counter.
  print(f"total sentece in directory: {len(all_data)}")
  return all_data

In [None]:
# Drive folders holding the train / dev / test splits.
train_directory_path = '/content/drive/MyDrive/VU Thesis/Code/Data/babylm_10M'
dev_directory_path = '/content/drive/MyDrive/VU Thesis/Code/Data/babylm_dev'
test_directory_path = '/content/drive/MyDrive/VU Thesis/Code/Data/babylm_test'

# Read each split into memory, printing a separator rule after each one.
sequences = load_data(train_directory_path)
print('=' * 50)
devset = load_data(dev_directory_path)
print('=' * 50)
test_data = load_data(test_directory_path)
print('=' * 50)

# One line count per split, in train / dev / test order.
print(len(sequences), len(devset), len(test_data), sep='\n')

Reading file: wikipedia.train
Reading file: gutenberg.train
Reading file: open_subtitles.train
Reading file: simple_wikipedia.train
Reading file: qed.train
Reading file: aochildes.train
Reading file: bnc_spoken.train
Reading file: switchboard.train
Reading file: children_stories.train
Reading file: cbt.train
total sentece in directory: 1058740
Reading file: qed.dev
Reading file: bnc_spoken.dev
Reading file: cbt.dev
Reading file: children_stories.dev
Reading file: gutenberg.dev
Reading file: aochildes.dev
Reading file: open_subtitles.dev
Reading file: simple_wikipedia.dev
Reading file: wikipedia.dev
Reading file: switchboard.dev
total sentece in directory: 1026747
Reading file: children_stories.test
Reading file: cbt.test
Reading file: aochildes.test
Reading file: gutenberg.test
Reading file: bnc_spoken.test
Reading file: wikipedia.test
Reading file: simple_wikipedia.test
Reading file: open_subtitles.test
Reading file: switchboard.test
Reading file: qed.test
total sentece in directory: 

In [None]:
# Folder on Drive where the trained tokenizer files are stored and later loaded from.
tokenizer_folder = '/content/drive/MyDrive/VU Thesis/Code/baby_models/tokenizer_V20_folder'

# The folder itself is created on demand by the cell that trains the
# tokenizer, so nothing is created here.

In [None]:
def train_tokenizer(sequence_data, vocab_size, tokenizer_folder):
  """Train a byte-level BPE tokenizer and save its model files.

  Args:
    sequence_data: Iterable of text lines to train on.
    vocab_size: Target vocabulary size passed to the trainer.
    tokenizer_folder: Directory the tokenizer model files are written to;
      created if it does not exist yet.

  Returns:
    None. The result is the set of files written to `tokenizer_folder`.
  """
  # Initialize tokenizer
  tokenizer = ByteLevelBPETokenizer()

  # Train on the data handed in by the caller. The previous version read the
  # notebook-global `sequences` here and silently ignored `sequence_data`.
  tokenizer.train_from_iterator(sequence_data,
                                vocab_size=vocab_size,
                                min_frequency=2,
                                show_progress=True,
                                special_tokens=[
                                    "<s>",
                                    "<pad>",
                                    "</s>",
                                    "<unk>",
                                    "<mask>"]
                                )
  # Make sure the target directory exists so the save has somewhere to write.
  os.makedirs(tokenizer_folder, exist_ok=True)
  # Save tokenizer
  tokenizer.save_model(tokenizer_folder)

# Train the tokenizer only when no saved vocabulary is present. Testing for
# vocab.json instead of the bare folder means a run that was interrupted after
# the folder was created (but before anything was saved) is retried rather
# than skipped on every later run.
if not os.path.isfile(os.path.join(tokenizer_folder, 'vocab.json')):
  os.makedirs(tokenizer_folder, exist_ok=True)
  train_tokenizer(sequences, 20_480, tokenizer_folder)

In [None]:
class CustomDataset(Dataset):
  """Map-style dataset of pre-tokenized text lines.

  Every input string is encoded once, up front, with truncation to 512
  tokens. The token ids are kept in `examples` and the matching attention
  masks in `mask`; indexing returns the token ids only, as a tensor.
  """

  def __init__(self, data, tokenizer):
    max_length = 512
    self.examples = []
    self.mask = []
    for text in data:
      encoded = tokenizer.encode_plus(
          text,
          max_length=max_length,
          truncation=True,
          padding=True,
      )
      self.examples.append(encoded.input_ids)
      self.mask.append(encoded.attention_mask)

  def __len__(self):
    """Number of encoded examples."""
    return len(self.examples)

  def __getitem__(self, i):
    """Return the token ids of example `i` as a torch tensor."""
    token_ids = self.examples[i]
    return torch.tensor(token_ids)

In [None]:
def get_train_dataloader(self) -> DataLoader:
    """
    Build the training :class:`~torch.utils.data.DataLoader` with in-order traversal.

    Written to be assigned onto a trainer class: it reads ``self.train_dataset``,
    ``self.data_collator`` and the batch size / drop-last / worker settings from
    ``self.args``. Examples are always visited sequentially (a
    :class:`SequentialSampler` is used and shuffling is off).

    Raises:
        ValueError: if ``self.train_dataset`` is ``None``.
    """
    dataset = self.train_dataset
    if dataset is None:
        raise ValueError("Trainer: training requires a train_dataset.")

    args = self.args
    loader_options = dict(
        batch_size=args.train_batch_size,
        sampler=SequentialSampler(dataset),
        collate_fn=self.data_collator,
        drop_last=args.dataloader_drop_last,
        num_workers=args.dataloader_num_workers,
        shuffle=False,
    )
    return DataLoader(dataset, **loader_options)

In [None]:
def train_baby_lm(babaylm_config, tokenizer_path, train_data, dev_data, epochs_start, epochs_number, batch_size):
    """Train a RoBERTa masked-LM one epoch at a time, saving after each epoch.

    For every epoch index ``i`` in ``[epochs_start, epochs_start + epochs_number)``
    the function loads ``{model_path}/model_e{i}_v20_l4`` if it exists (otherwise
    it builds a new model from ``babaylm_config``), trains it for one pass with a
    new ``Trainer``, and saves the result as ``{model_path}/model_e{i+1}_v20_l4``.
    Because a new ``Trainer`` is created per iteration, optimizer and scheduler
    state are not carried over between epochs.

    Args:
        babaylm_config: Model config used only when no checkpoint is found.
        tokenizer_path: Folder to load the ``RobertaTokenizerFast`` from.
        train_data: Training dataset; must support ``len()``.
        dev_data: Evaluation dataset, evaluated at the end of each epoch.
        epochs_start: Index of the checkpoint to start from (0 = from scratch).
        epochs_number: How many single-epoch runs to perform.
        batch_size: Per-device batch size for both training and evaluation.

    Relies on the notebook globals ``model_path`` and ``get_train_dataloader``.
    """

    max_length = 512
    tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_path, max_len=max_length)

    # Define data collator
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    # Patches the Trainer class itself, so every Trainer created afterwards in
    # this session also iterates its training data sequentially.
    Trainer.get_train_dataloader = get_train_dataloader

    # Full batches in one pass over the data that was actually passed in.
    # The previous version measured the notebook-global `train_dataset` here,
    # which is wrong (or a NameError) whenever `train_data` is something else.
    steps_per_epoch = len(train_data) // batch_size

    for i in range(epochs_start, epochs_start + epochs_number):

        checkpoint_dir = f'{model_path}/model_e{i}_v20_l4'
        if os.path.exists(checkpoint_dir):
            model = RobertaForMaskedLM.from_pretrained(checkpoint_dir)
            print(f"pretrained model load (from Epoch {i})...")

        else:
            print("No checkpoint ... Create New one")
            model = RobertaForMaskedLM(config=babaylm_config)
            print('Model parameters: ',model.num_parameters())

        # Define training arguments
        training_args = TrainingArguments(
            output_dir=f'{model_path}/run_model_{i+1}_folder',
            overwrite_output_dir=True,
            eval_strategy = 'epoch',
            num_train_epochs=1,
            learning_rate=1e-4,
            weight_decay=0.01,
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            save_strategy="epoch",
            save_total_limit=1,
            max_steps=steps_per_epoch
        )

        # NOTE(review): recent transformers releases appear to deprecate the
        # `tokenizer` argument in favour of `processing_class` — confirm
        # against the installed version before changing it.
        trainer = Trainer(
            model=model,
            args=training_args,
            data_collator=data_collator,
            train_dataset=train_data,
            eval_dataset=dev_data,
            tokenizer=tokenizer)

        trainer.train()
        trainer.save_model(f'{model_path}/model_e{i+1}_v20_l4')

In [None]:
import pickle
import os

max_length = 512
tokenizer = RobertaTokenizerFast.from_pretrained(tokenizer_folder, max_len=max_length)

# Pickle caches of the tokenized datasets, so tokenization runs only once.
train_custom_data_path =  "/content/drive/MyDrive/VU Thesis/Code/baby_models/train_V20.pkl"
dev_custom_data_path = "/content/drive/MyDrive/VU Thesis/Code/baby_models/dev_V20.pkl"


def load_or_build_dataset(cache_path, get_texts, tokenizer):
  """Return the dataset cached at `cache_path`, building and caching it if absent.

  Args:
    cache_path: Pickle file to load from, or to write after building.
    get_texts: Zero-argument callable returning the raw text lines. It is only
      called when the cache is missing, so the raw data need not be loaded
      when a cache already exists.
    tokenizer: Tokenizer passed to CustomDataset when building.

  Returns:
    The loaded or newly built dataset.
  """
  if os.path.isfile(cache_path):
    # pickle.load can execute arbitrary code: only point this at cache files
    # written by this notebook.
    with open(cache_path, 'rb') as file:
      return pickle.load(file)
  dataset = CustomDataset(get_texts(), tokenizer)
  with open(cache_path, 'wb') as file:
    pickle.dump(dataset, file)
  return dataset


train_dataset = load_or_build_dataset(train_custom_data_path, lambda: sequences, tokenizer)
eval_dataset = load_or_build_dataset(dev_custom_data_path, lambda: devset, tokenizer)

In [None]:
# Epochs 1-5
# Architecture: 4 layers, hidden size 256, 8 attention heads, 3072-wide feed-forward.
config = RobertaConfig(
    # vocabulary and positions
    vocab_size=20480,
    type_vocab_size=1,
    max_position_embeddings=514,
    position_embedding_type="absolute",
    # transformer dimensions
    num_hidden_layers=4,
    hidden_size=256,
    num_attention_heads=8,
    intermediate_size=3072,
    hidden_act="gelu",
    # regularisation and initialisation
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    classifier_dropout=None,
    layer_norm_eps=1e-05,
    initializer_range=0.02,
)

# Run `epochs` single-epoch training passes, starting from checkpoint index `epochs_start`.
batch_size = 32
epochs_start = 0
epochs = 5

train_baby_lm(config, tokenizer_folder, train_dataset, eval_dataset, epochs_start, epochs, batch_size)

Model parameters:  12823552


  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfarnaz-banifatemi-vu[0m ([33mfarnaz-banifatemi-vu-vrije-universiteit-amsterdam[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch,Training Loss,Validation Loss
0,6.2416,5.002483


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
0,5.5978,4.494562


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
0,5.3734,4.294209


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
0,5.2816,4.192483


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
0,5.2345,4.136362


In [None]:
# Epochs 6-10
# Same architecture as before; train_baby_lm only uses it when no checkpoint is found.
config = RobertaConfig(
    # vocabulary and positions
    vocab_size=20480,
    type_vocab_size=1,
    max_position_embeddings=514,
    position_embedding_type="absolute",
    # transformer dimensions
    num_hidden_layers=4,
    hidden_size=256,
    num_attention_heads=8,
    intermediate_size=3072,
    hidden_act="gelu",
    # regularisation and initialisation
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    classifier_dropout=None,
    layer_norm_eps=1e-05,
    initializer_range=0.02,
)

# Continue from the checkpoint saved after epoch 5 for five more single-epoch passes.
batch_size = 32
epochs_start = 5
epochs = 5

train_baby_lm(config, tokenizer_folder, train_dataset, eval_dataset, epochs_start, epochs, batch_size)

pretrained model load (from Epoch 5)...


  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mfarnaz-banifatemi-vu[0m ([33mfarnaz-banifatemi-vu-vrije-universiteit-amsterdam[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Epoch,Training Loss,Validation Loss
0,5.214,4.115608


pretrained model load (from Epoch 6)...


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
0,5.2145,4.115007


pretrained model load (from Epoch 7)...


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
0,5.2329,4.122503


pretrained model load (from Epoch 8)...


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
0,5.2588,4.135951


pretrained model load (from Epoch 9)...


  trainer = Trainer(


Epoch,Training Loss,Validation Loss
0,5.2917,4.155323


## BLiMP Evaluation

In [None]:
# Copy the evaluation-pipeline checkout from Drive onto the local runtime disk.
!cp -r "/content/drive/MyDrive/VU Thesis/Code/BliMP_eval_code/evaluation-pipeline" "/content/"

In [None]:
# Copy the lm_eval folder from Drive next to the evaluation pipeline under /content.
!cp -r "/content/drive/MyDrive/VU Thesis/Code/BliMP_eval_code/lm_eval" "/content/"

In [None]:
# Stage the epoch-10 checkpoint (weights, config and tokenizer files) in a
# local folder for the evaluation script.
# `-p` plus an absolute path makes this cell safe to re-run and independent of
# the current working directory (a later cell changes it with %cd).
!mkdir -p /content/model_folder
!cp "/content/drive/MyDrive/VU Thesis/Code/baby_models/V20L4/model_e10_v20_l4/config.json" "/content/model_folder"
!cp "/content/drive/MyDrive/VU Thesis/Code/baby_models/V20L4/model_e10_v20_l4/merges.txt" "/content/model_folder"
!cp "/content/drive/MyDrive/VU Thesis/Code/baby_models/V20L4/model_e10_v20_l4/model.safetensors" "/content/model_folder"
!cp "/content/drive/MyDrive/VU Thesis/Code/baby_models/V20L4/model_e10_v20_l4/special_tokens_map.json" "/content/model_folder"
!cp "/content/drive/MyDrive/VU Thesis/Code/baby_models/V20L4/model_e10_v20_l4/tokenizer.json" "/content/model_folder"
!cp "/content/drive/MyDrive/VU Thesis/Code/baby_models/V20L4/model_e10_v20_l4/tokenizer_config.json" "/content/model_folder"
!cp "/content/drive/MyDrive/VU Thesis/Code/baby_models/V20L4/model_e10_v20_l4/training_args.bin" "/content/model_folder"
!cp "/content/drive/MyDrive/VU Thesis/Code/baby_models/V20L4/model_e10_v20_l4/vocab.json" "/content/model_folder"

In [None]:
#@title Setup script { display-mode: "form" }
#@markdown Run this cell to install the necessary packages (may take a few minutes).
%%shell
# Uninstalls any existing lm-eval, installs the local evaluation-pipeline
# checkout in editable mode with its "colab" extra, then installs pinned
# CUDA 11.3 builds of torch / torchvision / torchaudio.
# NOTE(review): the recorded run of this cell ended in CalledProcessError
# (exit status 1). The torch==1.11.0+cu113 pin may not be installable on the
# current runtime — confirm, then update or drop the pin.
# Remove previous installation if it exists
cd /content
# mkdir model_folder
pip uninstall -y lm-eval
# rm -rf evaluation-pipeline/

# Install evaluation-pipeline
# cp -r /content/drive/MyDrive/VU Thesis/Code/BliMP_eval_code/evaluation-pipeline /content/
cd evaluation-pipeline/
pip install -e ".[colab]"
# Install other necessary packages
pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113

# Unpack dataset
# unzip filter_data.zip

[0mObtaining file:///content/evaluation-pipeline
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting accelerate@ git+https://github.com/huggingface/accelerate@main (from lm_eval==0.2.0)
  Cloning https://github.com/huggingface/accelerate (to revision main) to /tmp/pip-install-ez2vrbuq/accelerate_8737c2bcd6e24725950259f404a703f7
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/accelerate /tmp/pip-install-ez2vrbuq/accelerate_8737c2bcd6e24725950259f404a703f7
  Resolved https://github.com/huggingface/accelerate to commit 34c1779828b3d0769992e6492e6de93d869f71b5
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting datasets>=2.0.0 (from lm_eval==0.2.0)
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting nltk==3.6 (from lm_eval==0.2.0)
  Downloading nltk-3.6-py3-none-any.whl.metadata (2.9 kB)
Co

CalledProcessError: Command '# Remove previous installation if it exists
cd /content
# mkdir model_folder
pip uninstall -y lm-eval
# rm -rf evaluation-pipeline/

# Install evaluation-pipeline
# cp -r /content/drive/MyDrive/VU Thesis/Code/BliMP_eval_code/evaluation-pipeline /content/
cd evaluation-pipeline/
pip install -e ".[colab]"
# Install other necessary packages
pip install torch==1.11.0+cu113 torchvision==0.12.0+cu113 torchaudio==0.11.0 --extra-index-url https://download.pytorch.org/whl/cu113

# Unpack dataset
# unzip filter_data.zip
' returned non-zero exit status 1.

In [None]:
# %pip targets the running kernel's environment; the version is pinned to the
# one this notebook was last run with, for reproducibility.
%pip install sacrebleu==2.5.1

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/51.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.1.1 sacrebleu-2.5.1


In [None]:
# %pip targets the running kernel's environment; the version is pinned to the
# one this notebook was last run with, for reproducibility.
%pip install datasets==3.5.0

Collecting datasets
  Using cached datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
# %pip targets the running kernel's environment; the version is pinned to the
# one this notebook was last run with, for reproducibility.
%pip install rouge_score==0.1.2

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=ca32c1984d9c1400570d2be8e056010eaca8349bf7075deb314d1c98827b9803
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
# %pip targets the running kernel's environment; the version is pinned to the
# one this notebook was last run with, for reproducibility.
# NOTE(review): the local evaluation-pipeline declares itself as lm_eval==0.2.0;
# confirm that installing the PyPI package alongside it is intended.
%pip install lm_eval==0.4.8

Collecting lm_eval
  Downloading lm_eval-0.4.8-py3-none-any.whl.metadata (50 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/50.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.5/50.5 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting evaluate (from lm_eval)
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting jsonlines (from lm_eval)
  Downloading jsonlines-4.0.0-py3-none-any.whl.metadata (1.6 kB)
Collecting pybind11>=2.6.2 (from lm_eval)
  Downloading pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Collecting pytablewriter (from lm_eval)
  Downloading pytablewriter-1.2.1-py3-none-any.whl.metadata (38 kB)
Collecting sqlitedict (from lm_eval)
  Downloading sqlitedict-2.1.0.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tqdm-multiprocess (from lm_eval)
  Downloading tqdm_multiprocess-0.0.11-py3-none-any.whl.metadata (5.7 kB)
Collecting word2

In [None]:
#@title Load model and evaluate (BLiMP) { display-mode: "form" }
model = "/content/model_folder/" #@param {"type": "string"}
model_type = "encoder" #@param ["decoder", "encoder", "encoder-decoder"]
# file_name = "examples3.csv" #@param {"type": "string"}
# model_names = ["opt-125m", "opt-350m", "opt-1.3b", "opt-2.7b"] #@param {"type": "raw"}

# Switch into the pipeline checkout and run its evaluation script on the
# "blimp" task; "$model" and "$model_type" are filled in from the variables
# set above.
%cd /content/evaluation-pipeline
%run /content/evaluation-pipeline/babylm_eval.py \
  "$model" \
  "$model_type" \
  -t "blimp"

/content/evaluation-pipeline


Generating train split: 0 examples [00:00, ? examples/s]


» Assigning unique IDs to 'blimp_from_file+null' docs


INFO:lm_eval.evaluator:
» Assigning unique IDs to 'blimp_from_file+null' docs


Map:   0%|          | 0/1956 [00:00<?, ? examples/s]


» Filtering invalid docs from 'blimp_from_file+null'


INFO:lm_eval.evaluator:
» Filtering invalid docs from 'blimp_from_file+null'


Filter:   0%|          | 0/1956 [00:00<?, ? examples/s]


» Constructing 'blimp_from_file+null' contexts and requests


INFO:lm_eval.evaluator:
» Constructing 'blimp_from_file+null' contexts and requests
100%|██████████| 1956/1956 [00:00<00:00, 7248.43it/s]


» Running all `loglikelihood` requests



INFO:lm_eval.evaluator:
» Running all `loglikelihood` requests
100%|██████████| 3912/3912 [00:22<00:00, 171.04it/s]


anaphor_agreement:	64.98%


Generating train split: 0 examples [00:00, ? examples/s]


» Assigning unique IDs to 'blimp_from_file+null' docs


INFO:lm_eval.evaluator:
» Assigning unique IDs to 'blimp_from_file+null' docs


Map:   0%|          | 0/8248 [00:00<?, ? examples/s]


» Filtering invalid docs from 'blimp_from_file+null'


INFO:lm_eval.evaluator:
» Filtering invalid docs from 'blimp_from_file+null'


Filter:   0%|          | 0/8248 [00:00<?, ? examples/s]


» Constructing 'blimp_from_file+null' contexts and requests


INFO:lm_eval.evaluator:
» Constructing 'blimp_from_file+null' contexts and requests
100%|██████████| 8248/8248 [00:01<00:00, 5390.74it/s]


» Running all `loglikelihood` requests



INFO:lm_eval.evaluator:
» Running all `loglikelihood` requests
100%|██████████| 16496/16496 [01:33<00:00, 176.45it/s]


argument_structure:	62.43%


Generating train split: 0 examples [00:00, ? examples/s]


» Assigning unique IDs to 'blimp_from_file+null' docs


INFO:lm_eval.evaluator:
» Assigning unique IDs to 'blimp_from_file+null' docs


Map:   0%|          | 0/6738 [00:00<?, ? examples/s]


» Filtering invalid docs from 'blimp_from_file+null'


INFO:lm_eval.evaluator:
» Filtering invalid docs from 'blimp_from_file+null'


Filter:   0%|          | 0/6738 [00:00<?, ? examples/s]


» Constructing 'blimp_from_file+null' contexts and requests


INFO:lm_eval.evaluator:
» Constructing 'blimp_from_file+null' contexts and requests
100%|██████████| 6738/6738 [00:01<00:00, 6511.04it/s]


» Running all `loglikelihood` requests



INFO:lm_eval.evaluator:
» Running all `loglikelihood` requests
100%|██████████| 13476/13476 [01:19<00:00, 170.07it/s]


binding:	61.59%


Generating train split: 0 examples [00:00, ? examples/s]


» Assigning unique IDs to 'blimp_from_file+null' docs


INFO:lm_eval.evaluator:
» Assigning unique IDs to 'blimp_from_file+null' docs


Map:   0%|          | 0/4526 [00:00<?, ? examples/s]


» Filtering invalid docs from 'blimp_from_file+null'


INFO:lm_eval.evaluator:
» Filtering invalid docs from 'blimp_from_file+null'


Filter:   0%|          | 0/4526 [00:00<?, ? examples/s]


» Constructing 'blimp_from_file+null' contexts and requests


INFO:lm_eval.evaluator:
» Constructing 'blimp_from_file+null' contexts and requests
100%|██████████| 4526/4526 [00:00<00:00, 7132.50it/s]


» Running all `loglikelihood` requests



INFO:lm_eval.evaluator:
» Running all `loglikelihood` requests
100%|██████████| 9052/9052 [00:55<00:00, 164.38it/s]


control_raising:	59.77%


Generating train split: 0 examples [00:00, ? examples/s]


» Assigning unique IDs to 'blimp_from_file+null' docs


INFO:lm_eval.evaluator:
» Assigning unique IDs to 'blimp_from_file+null' docs


Map:   0%|          | 0/7542 [00:00<?, ? examples/s]


» Filtering invalid docs from 'blimp_from_file+null'


INFO:lm_eval.evaluator:
» Filtering invalid docs from 'blimp_from_file+null'


Filter:   0%|          | 0/7542 [00:00<?, ? examples/s]


» Constructing 'blimp_from_file+null' contexts and requests


INFO:lm_eval.evaluator:
» Constructing 'blimp_from_file+null' contexts and requests
100%|██████████| 7542/7542 [00:01<00:00, 5360.35it/s]


» Running all `loglikelihood` requests



INFO:lm_eval.evaluator:
» Running all `loglikelihood` requests
100%|██████████| 15084/15084 [01:25<00:00, 176.25it/s]


determiner_noun_agreement:	77.29%


Generating train split: 0 examples [00:00, ? examples/s]


» Assigning unique IDs to 'blimp_from_file+null' docs


INFO:lm_eval.evaluator:
» Assigning unique IDs to 'blimp_from_file+null' docs


Map:   0%|          | 0/1732 [00:00<?, ? examples/s]


» Filtering invalid docs from 'blimp_from_file+null'


INFO:lm_eval.evaluator:
» Filtering invalid docs from 'blimp_from_file+null'


Filter:   0%|          | 0/1732 [00:00<?, ? examples/s]


» Constructing 'blimp_from_file+null' contexts and requests


INFO:lm_eval.evaluator:
» Constructing 'blimp_from_file+null' contexts and requests
100%|██████████| 1732/1732 [00:00<00:00, 6774.39it/s]


» Running all `loglikelihood` requests



INFO:lm_eval.evaluator:
» Running all `loglikelihood` requests
100%|██████████| 3464/3464 [00:24<00:00, 144.23it/s]


ellipsis:	52.66%


Generating train split: 0 examples [00:00, ? examples/s]


» Assigning unique IDs to 'blimp_from_file+null' docs


INFO:lm_eval.evaluator:
» Assigning unique IDs to 'blimp_from_file+null' docs


Map:   0%|          | 0/6426 [00:00<?, ? examples/s]


» Filtering invalid docs from 'blimp_from_file+null'


INFO:lm_eval.evaluator:
» Filtering invalid docs from 'blimp_from_file+null'


Filter:   0%|          | 0/6426 [00:00<?, ? examples/s]


» Constructing 'blimp_from_file+null' contexts and requests


INFO:lm_eval.evaluator:
» Constructing 'blimp_from_file+null' contexts and requests
100%|██████████| 6426/6426 [00:00<00:00, 7082.21it/s]


» Running all `loglikelihood` requests



INFO:lm_eval.evaluator:
» Running all `loglikelihood` requests
100%|██████████| 12852/12852 [01:20<00:00, 159.28it/s]


filler_gap:	60.26%


Generating train split: 0 examples [00:00, ? examples/s]


» Assigning unique IDs to 'blimp_from_file+null' docs


INFO:lm_eval.evaluator:
» Assigning unique IDs to 'blimp_from_file+null' docs


Map:   0%|          | 0/1965 [00:00<?, ? examples/s]


» Filtering invalid docs from 'blimp_from_file+null'


INFO:lm_eval.evaluator:
» Filtering invalid docs from 'blimp_from_file+null'


Filter:   0%|          | 0/1965 [00:00<?, ? examples/s]


» Constructing 'blimp_from_file+null' contexts and requests


INFO:lm_eval.evaluator:
» Constructing 'blimp_from_file+null' contexts and requests
100%|██████████| 1965/1965 [00:00<00:00, 7148.77it/s]


» Running all `loglikelihood` requests



INFO:lm_eval.evaluator:
» Running all `loglikelihood` requests
100%|██████████| 3930/3930 [00:21<00:00, 179.52it/s]


irregular_forms:	84.68%


Generating train split: 0 examples [00:00, ? examples/s]


» Assigning unique IDs to 'blimp_from_file+null' docs


INFO:lm_eval.evaluator:
» Assigning unique IDs to 'blimp_from_file+null' docs


Map:   0%|          | 0/2676 [00:00<?, ? examples/s]


» Filtering invalid docs from 'blimp_from_file+null'


INFO:lm_eval.evaluator:
» Filtering invalid docs from 'blimp_from_file+null'


Filter:   0%|          | 0/2676 [00:00<?, ? examples/s]


» Constructing 'blimp_from_file+null' contexts and requests


INFO:lm_eval.evaluator:
» Constructing 'blimp_from_file+null' contexts and requests
100%|██████████| 2676/2676 [00:00<00:00, 6841.59it/s]


» Running all `loglikelihood` requests



INFO:lm_eval.evaluator:
» Running all `loglikelihood` requests
100%|██████████| 5352/5352 [00:33<00:00, 161.49it/s]


island_effects:	44.51%


Generating train split: 0 examples [00:00, ? examples/s]


» Assigning unique IDs to 'blimp_from_file+null' docs


INFO:lm_eval.evaluator:
» Assigning unique IDs to 'blimp_from_file+null' docs


Map:   0%|          | 0/6586 [00:00<?, ? examples/s]


» Filtering invalid docs from 'blimp_from_file+null'


INFO:lm_eval.evaluator:
» Filtering invalid docs from 'blimp_from_file+null'


Filter:   0%|          | 0/6586 [00:00<?, ? examples/s]


» Constructing 'blimp_from_file+null' contexts and requests


INFO:lm_eval.evaluator:
» Constructing 'blimp_from_file+null' contexts and requests
100%|██████████| 6586/6586 [00:01<00:00, 5033.89it/s]


» Running all `loglikelihood` requests



INFO:lm_eval.evaluator:
» Running all `loglikelihood` requests
100%|██████████| 13172/13172 [01:17<00:00, 168.89it/s]


npi_licensing:	59.52%


Generating train split: 0 examples [00:00, ? examples/s]


» Assigning unique IDs to 'blimp_from_file+null' docs


INFO:lm_eval.evaluator:
» Assigning unique IDs to 'blimp_from_file+null' docs


Map:   0%|          | 0/3882 [00:00<?, ? examples/s]


» Filtering invalid docs from 'blimp_from_file+null'


INFO:lm_eval.evaluator:
» Filtering invalid docs from 'blimp_from_file+null'


Filter:   0%|          | 0/3882 [00:00<?, ? examples/s]


» Constructing 'blimp_from_file+null' contexts and requests


INFO:lm_eval.evaluator:
» Constructing 'blimp_from_file+null' contexts and requests
100%|██████████| 3882/3882 [00:00<00:00, 7181.37it/s]


» Running all `loglikelihood` requests



INFO:lm_eval.evaluator:
» Running all `loglikelihood` requests
100%|██████████| 7764/7764 [00:44<00:00, 173.79it/s]


quantifiers:	67.62%


Generating train split: 0 examples [00:00, ? examples/s]


» Assigning unique IDs to 'blimp_from_file+null' docs


INFO:lm_eval.evaluator:
» Assigning unique IDs to 'blimp_from_file+null' docs


Map:   0%|          | 0/5535 [00:00<?, ? examples/s]


» Filtering invalid docs from 'blimp_from_file+null'


INFO:lm_eval.evaluator:
» Filtering invalid docs from 'blimp_from_file+null'


Filter:   0%|          | 0/5535 [00:00<?, ? examples/s]


» Constructing 'blimp_from_file+null' contexts and requests


INFO:lm_eval.evaluator:
» Constructing 'blimp_from_file+null' contexts and requests
100%|██████████| 5535/5535 [00:01<00:00, 4525.97it/s]


» Running all `loglikelihood` requests



INFO:lm_eval.evaluator:
» Running all `loglikelihood` requests
100%|██████████| 11070/11070 [01:02<00:00, 176.41it/s]


subject_verb_agreement:	55.01%

Scores:
anaphor_agreement:	64.98%
argument_structure:	62.43%
binding:	61.59%
control_raising:	59.77%
determiner_noun_agreement:	77.29%
ellipsis:	52.66%
filler_gap:	60.26%
irregular_forms:	84.68%
island_effects:	44.51%
npi_licensing:	59.52%
quantifiers:	67.62%
subject_verb_agreement:	55.01%


In [None]:
# Copy the "zeroshot" folder from the local model folder back to the epoch-10
# checkpoint folder on Drive so it outlives this runtime.
!cp -r "/content/model_folder/zeroshot" "/content/drive/MyDrive/VU Thesis/Code/baby_models/V20L4/model_e10_v20_l4"