In [1]:
!pip install transformers tokenizers datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m49.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tokenizers
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m58.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.9.0-py3-none-any.whl (462 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m462.8/462.8 KB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.12.0-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloa

In [20]:
from google.colab import drive

# Mount Google Drive; the corpus is read from it and checkpoints are written to it.
drive.mount('/content/gdrive')

# Absolute project root under the mount point, so the path stays valid even
# after the notebook changes its working directory.
base_path = '/content/gdrive/MyDrive/NLP_Projects/lexical_resources'

# Language whose corpus/output folders are used (alternative: 'maltese').
lang = 'arabic'

# --- Training hyper-parameters -------------------------------------------
TRAIN_BATCH_SIZE = 8   # per-device batch size for training
VALID_BATCH_SIZE = 8   # per-device batch size for evaluation
SAVE_STEPS = 1000      # write a checkpoint every N optimisation steps
EVAL_STEPS = 500       # evaluate (and log) every N optimisation steps
SAVE_LIMIT = 2         # maximum number of checkpoints kept on disk
WARMUP_STEPS = 100     # learning-rate warm-up steps
EPOCHS = 5
LEARNING_RATE = 1e-4


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [21]:
#load_dataset("MLRS/korpus_malti")
#https://huggingface.co/MLRS/mBERTu
# https://huggingface.co/docs/datasets/v1.11.0/loading_datasets.html
from transformers import AutoTokenizer, AutoModelForMaskedLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset, DatasetDict
import pandas as pd
from sklearn.model_selection import train_test_split
#dataset = load_dataset("MLRS/korpus_malti", split=['train[:10%]', 'test[:10%]'])  # this is too slow.
#dataset = load_dataset("MLRS/korpus_malti")
# Checkpoint to continue masked-LM training from.
# Alternative tried previously: "distilbert-base-multilingual-cased".
model_name = "MLRS/mBERTu"

# Load the model first, then its matching tokenizer, from the same checkpoint.
model, tokenizer = (
    AutoModelForMaskedLM.from_pretrained(model_name),
    AutoTokenizer.from_pretrained(model_name),
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--MLRS--mBERTu/snapshots/05e74807b519aefb42e0583edb0bcb00a1f7d75c/config.json
Model config BertConfig {
  "_name_or_path": "MLRS/mBERTu",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.26.1",
  "type_vocab_size": 2,
  "use

In [22]:
# dataset
# train = dataset['train'].shard(num_shards=246, index=0)  # 69k - 70k
# validation = dataset['validation'].shard(num_shards=246, index=0)  # 12257

In [23]:
from dataclasses import dataclass
import torch
import numpy as np

class PreDataCollator:
    """Tokenize batches of raw sentences into fixed-length model inputs.

    Written to be passed to ``Dataset.map(..., batched=True)``: it reads the
    ``'sents'`` column of the incoming batch and returns one ``input_ids`` and
    one ``attention_mask`` entry per sentence. No labels are produced here.

    This is deliberately a plain class rather than a dataclass: it declares no
    fields, so a generated ``__eq__`` would treat every instance as equal and
    would make instances unhashable.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(sentence, padding='max_length', truncation=True,
            max_length=max_len)``; it must return a mapping that contains at
            least ``'input_ids'`` and ``'attention_mask'``.
        max_len: Length every sentence is padded or truncated to.
    """

    def __init__(self, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __call__(self, batch):
        """Tokenize every sentence in ``batch['sents']``.

        Args:
            batch: Mapping with a ``'sents'`` entry holding an iterable of strings.

        Returns:
            dict with keys ``'input_ids'`` and ``'attention_mask'``; each value
            is a list with one tensor per input sentence (empty lists for an
            empty batch).
        """
        encoded = [self.tokenize(sent) for sent in batch['sents']]
        return {
            'input_ids': [item['input_ids'] for item in encoded],
            'attention_mask': [item['attention_mask'] for item in encoded],
        }

    def tokenize(self, sentence):
        """Encode a single sentence, padded/truncated to ``self.max_len``.

        Args:
            sentence: The raw text to encode.

        Returns:
            dict mapping every key returned by the tokenizer to a
            ``torch.Tensor`` built from its value.
        """
        encoding = self.tokenizer(
            sentence,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
        )
        # Convert every field of the encoding to a PyTorch tensor.
        return {key: torch.as_tensor(val) for key, val in encoding.items()}


In [24]:
%cd 'gdrive/MyDrive/NLP_Projects/lexical_resources'

[Errno 2] No such file or directory: 'gdrive/MyDrive/NLP_Projects/lexical_resources'
/content/gdrive/.shortcut-targets-by-id/13KkCD2fkNEO2nduVuPwn1HhOMzKANE0K/lexical_resources


# Generated augmented tokens

New tokens

# Main Paths

In [25]:
#tokenizer_path = base_path + f'/Languages/{lang}/Tokenizer/{lang}_tokenizer'
# Corpus file for the selected language, relative to the current working
# directory (the project root after the `%cd` cell).
corpus = f'./Languages/{lang}/Corpora/{lang}_corpus.txt'
#pretrained_path = base_path + f'Languages/{lang}/Pretrained_model'

# Load tokenizer

Created after vocabulary augmentation

In [26]:
# List the contents of the current working directory.
!ls

 Languages	      'Possible papers Lexical Resources.gdoc'
 list_aug_tokens.txt   utils
 maltese_eval.ipynb    vocab_augmentation.ipynb
 maltese_train.ipynb   Zeroshot_paper.pdf
 MLM.ipynb


In [27]:
#corpus
# Read the corpus into a list with one entry per line (newlines are kept).
# Decode explicitly as UTF-8 instead of relying on the platform default.
with open(corpus, 'r', encoding='utf-8') as f:
  stored = f.readlines()

In [28]:
# 80/20 train/dev split; the fixed random_state makes the split identical on
# every run so results are reproducible.
train, dev = train_test_split(stored, test_size=0.2, random_state=42)

In [29]:
# Wrap each split in a single-column Dataset; the column is named "sents".
train_dataset, dev_dataset = (
    Dataset.from_dict({"sents": lines}) for lines in (train, dev)
)

# Bundle both splits under their split names.
whole_data = DatasetDict({'train': train_dataset, 'val': dev_dataset})



In [30]:
# Keep only shard 0 out of NUM_SHARDS for each split. The shards are taken from
# the full splits stored in `whole_data`, so re-running this cell always yields
# the same subset instead of shrinking the data further each time.
NUM_SHARDS = 6
train_dataset = whole_data['train'].shard(num_shards=NUM_SHARDS, index=0)
dev_dataset = whole_data['val'].shard(num_shards=NUM_SHARDS, index=0)

In [31]:
# dataset = LineByLineTextDataset(
#     tokenizer = tokenizer_path,
#     file_path = corpus,
#     block_size = 128
# )

# Every sentence is padded/truncated to this many tokens by the collator.
MAX_LEN = 128
collator = PreDataCollator(max_len=MAX_LEN, tokenizer=tokenizer)

# Tokenize both splits in batches of 4 rows across 4 worker processes, dropping
# the original columns so only the collator's outputs remain.
train_tokenized = train_dataset.map(
    collator,
    batched=True,
    batch_size=4,
    num_proc=4,
    remove_columns=train_dataset.column_names,
)
dev_tokenized = dev_dataset.map(
    collator,
    batched=True,
    batch_size=4,
    num_proc=4,
    remove_columns=dev_dataset.column_names,
)

     

#0:   0%|          | 0/584 [00:00<?, ?ba/s]

  

#1:   0%|          | 0/584 [00:00<?, ?ba/s]

#2:   0%|          | 0/584 [00:00<?, ?ba/s]

 

#3:   0%|          | 0/584 [00:00<?, ?ba/s]

      

#0:   0%|          | 0/146 [00:00<?, ?ba/s]

 

#1:   0%|          | 0/146 [00:00<?, ?ba/s]

#3:   0%|          | 0/146 [00:00<?, ?ba/s]

 

#2:   0%|          | 0/146 [00:00<?, ?ba/s]

# MLM

In [32]:
# Masking probability handed to the masked-LM collator.
percentage_mask = 0.15

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=percentage_mask,
    mlm=True,
)

# Pretraining

In [33]:
# Checkpoints and the final model are written here (relative to the working directory).
output_dir = f"./Languages/{lang}/pre_trained/"

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,  # replace the previous run's models on every run
    # NOTE(review): inputs are already padded to a fixed length before training,
    # so grouping by length probably has no effect here; confirm before relying on it.
    group_by_length=True,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    # Wire the configured evaluation batch size through explicitly; previously
    # VALID_BATCH_SIZE was defined but never used.
    per_device_eval_batch_size=VALID_BATCH_SIZE,
    gradient_accumulation_steps=2,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    warmup_steps=WARMUP_STEPS,
    fp16=False,
    evaluation_strategy="steps",
    eval_steps=EVAL_STEPS,
    logging_steps=EVAL_STEPS,  # log the training loss at the same cadence as evaluation
    save_steps=SAVE_STEPS,
    save_total_limit=SAVE_LIMIT,
)

# evaluation_strategy

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [34]:
# Build the Trainer for masked-LM pre-training; training itself is started
# in a later cell.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,  # may need to pass new tokenizer.
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=dev_tokenized,
    data_collator=data_collator,
)


In [35]:
# Run training, then save the resulting model to output_dir.
trainer.train()
trainer.save_model(output_dir)

***** Running training *****
  Num examples = 9334
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 2915
  Number of trainable parameters = 177974523
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss
500,3.4988,2.932173
1000,2.7549,2.605704
1500,2.4472,2.456597
2000,2.2838,2.285205
2500,2.1314,2.250356


***** Running Evaluation *****
  Num examples = 2334
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2334
  Batch size = 8
Saving model checkpoint to ./Languages/arabic/pre_trained/checkpoint-1000
Configuration saved in ./Languages/arabic/pre_trained/checkpoint-1000/config.json
Configuration saved in ./Languages/arabic/pre_trained/checkpoint-1000/generation_config.json
Model weights saved in ./Languages/arabic/pre_trained/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in ./Languages/arabic/pre_trained/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./Languages/arabic/pre_trained/checkpoint-1000/special_tokens_map.json
Deleting older checkpoint [Languages/arabic/pre_trained/checkpoint-600] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 2334
  Batch size = 8
***** Running Evaluation *****
  Num examples = 2334
  Batch size = 8
Saving model checkpoint to ./Languages/arabic/pre_trained/checkpoint-2000
Configur