For simple dynamic embedding

#### Initialization

Mount drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Install dependencies

In [2]:
!pip install peft
!pip install datasets
!pip install -U bitsandbytes
!pip install accelerate

Collecting peft
  Downloading peft-0.13.2-py3-none-any.whl.metadata (13 kB)
Downloading peft-0.13.2-py3-none-any.whl (320 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.7/320.7 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: peft
Successfully installed peft-0.13.2
Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2

Make imports

In [3]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import re
import torch

#### Step 1: Tokenizer and Model Setup

In [4]:
custom_tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]



Load mBERT in 4-bit precision using BitsAndBytesConfig

In [5]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

Load mBERT model for MLM task with 4-bit quantization

In [6]:
model = AutoModelForMaskedLM.from_pretrained('bert-base-multilingual-cased', quantization_config=bnb_config)

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Set up LoRA configuration to fine-tune specific layers

In [7]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["query", "key", "value"],
    lora_dropout=0.1,
    bias="none"
)

Apply LoRA to the quantized mBERT model

In [8]:
lora_model = get_peft_model(model, lora_config)

In [9]:
# Ensure LoRA is correctly applied and view trainable parameters
lora_model.print_trainable_parameters()

trainable params: 442,368 || all params: 178,416,891 || trainable%: 0.2479


#### Step 2: Dataset Preparation

In [10]:
# Load the SIDAC dataset
dataset = load_dataset("0xAIT/SIDAC", split='train')

README.md:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Resolving data files:   0%|          | 0/17 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/16 [00:00<?, ?files/s]

data-00000-of-00016.arrow:   0%|          | 0.00/499M [00:00<?, ?B/s]

data-00001-of-00016.arrow:   0%|          | 0.00/504M [00:00<?, ?B/s]

data-00002-of-00016.arrow:   0%|          | 0.00/497M [00:00<?, ?B/s]

data-00003-of-00016.arrow:   0%|          | 0.00/509M [00:00<?, ?B/s]

data-00004-of-00016.arrow:   0%|          | 0.00/502M [00:00<?, ?B/s]

data-00005-of-00016.arrow:   0%|          | 0.00/507M [00:00<?, ?B/s]

data-00006-of-00016.arrow:   0%|          | 0.00/499M [00:00<?, ?B/s]

data-00007-of-00016.arrow:   0%|          | 0.00/503M [00:00<?, ?B/s]

data-00008-of-00016.arrow:   0%|          | 0.00/503M [00:00<?, ?B/s]

data-00009-of-00016.arrow:   0%|          | 0.00/498M [00:00<?, ?B/s]

data-00010-of-00016.arrow:   0%|          | 0.00/506M [00:00<?, ?B/s]

data-00011-of-00016.arrow:   0%|          | 0.00/505M [00:00<?, ?B/s]

data-00012-of-00016.arrow:   0%|          | 0.00/573M [00:00<?, ?B/s]

data-00013-of-00016.arrow:   0%|          | 0.00/384M [00:00<?, ?B/s]

data-00014-of-00016.arrow:   0%|          | 0.00/233M [00:00<?, ?B/s]

data-00015-of-00016.arrow:   0%|          | 0.00/326M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [11]:
# Regular expression pattern to retain only Sinhala characters (Unicode range U+0D80 to U+0DFF)
sinhala_pattern = re.compile(r'[අ-ඬෆ-෴ ]+')  # Sinhala letters and common punctuation

# Function to clean the text
def clean_text(sentence):
    # Remove English letters
    sentence = re.sub(r'[a-zA-Z]', '', sentence['text'])

    # Remove punctuation and special characters (retain Sinhala letters)
    sentence = re.sub(r'[^\u0D80-\u0DFF\s]', '', sentence)  # Unicode range for Sinhala is \u0D80 to \u0DFF

    # Remove extra whitespace
    sentence = re.sub(r'\s+', ' ', sentence).strip()

    return {'text': sentence}

# Apply cleaning to the entire dataset
cleaned_dataset = dataset.shuffle(seed=42).select(range(60000)).map(clean_text)

Map:   0%|          | 0/60000 [00:00<?, ? examples/s]

In [12]:
print(cleaned_dataset[2])

{'text': 'මේ ක්රිස්තු වර්ෂ අවුරුද්දයි එක්තරා සුවිශාල මහා දේශයක පිහිටි තවත් එක සුවිශාල රාඡධානියක රඡකම වෙනුවෙන් මහා බල අරගලයක් නිර්මාණය වෙමින් පැවතිණි තරුණ කුමාරයෙක් රාඡධානියේ රඡකම වෙනුවෙන් එවකට සිටි රඡතුමා හා සටනට පිවිසියේය එම මහා දේශයේ බල කෑදර කුමන්ත්රණකාරී වෙනත් රාඡ්යන්ගෙන් ද උපකාර ලබාගත් තරුණ කුමරා යුද්ධයයේ වාසිය තමා වෙත ලඟාකරගනිද්දී අසරණ වූ රඡතුමා තම පැරණි මිත්රයෙකු වූ මුහුදෙන් එතෙර පිහිටි කුඩා දූපත් රාඡ්යයක රඡ කෙනෙකුගෙන් උපකාර පැතීය උපකාර පැතූ රාඡ්යය දූපත්වාසී කුඩා රාඡ්යයක් වූවද එම රාඡ්යයේ රඡතුමා මහා චක්රවර්තී අධිරාඡ්යයෙකුට නොදෙවනි වූවෙකු විය සතර දිශාවෙහි අධිරාඡ්යන්ගෙන් පවා ගරු බුහුමන් ලැබූ මෙම රඡතුමා තමාගෙන් උදව් ඉල්ලූ කෙනෙකුට කිසිදා පිටු නොපාන අන්දමේ සුරු විරු පාලකයෙක් විය හසුන්පත අතලැබූ මහරඡතුමා වහාම තම සෙනෙවියන්ගෙන් ශ්රේෂ්ඨතම සෙනෙවියන්ගෙන් කෙනෙක් කැඳවීය ඔහු නමින් ලංකාපුර දේවයි මිත්රවරුනි මෙය තවත් සුරංගනා කතාවක් නෙවේ මෙය ඉතිහාසයයේ වැලි තලාවෙන් යටවී ගිය අපේ ඡාතියේ ශ්රේෂ්ඨත්වය පෙන්නූ කාලයයි මේ පරාක්රමබාහු යුගයයි ඉන්දියාවේ පාණ්ඩ දේශයේ සිංහාසනයට එකල විශාල බල අරගලයක් ඇතිවිය සිහසුනට උ

In [13]:
# Split cleaned_dataset into training and testing datasets
split_dataset = cleaned_dataset.train_test_split(test_size=0.1, seed=42)

cleaned_training_dataset = split_dataset['train']
cleaned_testing_dataset = split_dataset['test']

Load tokenizer

In [14]:
# Load bert tokenizer (replace later)
existing_tokenizer = AutoTokenizer.from_pretrained('bert-base-multilingual-cased')

# Example preprocess function using the custom tokenizer
def preprocess_function(examples):
    tokenized_inputs = existing_tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)
    return tokenized_inputs

In [24]:
# Reverse tokenization function
def reverse_tokenization(tokenized_inputs):
    """
    Convert tokenized input IDs back into text.
    Assumes input is a dictionary with 'input_ids' key that holds the tokenized input.
    """
    # Extract token ids from tokenized inputs
    input_ids = tokenized_inputs['input_ids']

    # Decode the token ids back to text
    decoded_text = existing_tokenizer.batch_decode(input_ids, skip_special_tokens=True)

    return decoded_text

In [27]:
tokenized_test = existing_tokenizer(cleaned_testing_dataset[0]['text'], truncation=True, padding='max_length', max_length=128)
print(reverse_tokenization(tokenized_test))


tokenized_test = existing_tokenizer('चलो कहीं अभिनय करने चलते हैं', truncation=True, padding='max_length', max_length=128)
print(reverse_tokenization(tokenized_test))

['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'ක', '##ල', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'ක', '##්', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'ක', '', '', '', '', '', '', '', '']
['', 'च', '##ल', '##ो', 'क', '##ही', '##ं', 'अ', '##भ', '##िन', '##य', 'करने', 'च', '##ल', '##ते', 'हैं', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 

In [15]:
# Apply preprocessing to dataset
tokenized_dataset = cleaned_training_dataset.map(preprocess_function, batched=True, remove_columns=["text"])

Map:   0%|          | 0/54000 [00:00<?, ? examples/s]

In [16]:
print(tokenized_dataset[:2])

{'timestamp': ['2021/04/13 07:26:50', '2021/12/09 05:12:46'], 'url': ['http://si.molongtattoosupply.com/our-guarantee/', 'https://balawegaya.wordpress.com/2012/04/07/%E0%B6%BA%E0%B7%94%E0%B6%AF%E0%B7%8A%E0%B6%B0%E0%B6%BA-%E0%B6%B1%E0%B7%92%E0%B6%B8%E0%B7%8F-%E0%B7%80%E0%B7%94%E0%B7%80%E0%B6%AD%E0%B7%8A-%E0%B6%85%E0%B6%BB%E0%B6%B8%E0%B7%94%E0%B6%AF%E0%B6%BD/'], 'source': ['mC4', 'mC4'], 'input_ids': [[101, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 1389, 111408, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 1389, 111408, 100, 100, 100, 100, 100, 100

In [25]:
print(reverse_tokenization(tokenized_dataset[0]))

['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'ක', '##ර', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'ක', '##ර', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']


In [17]:
# Split tokenized_dataset into training and evaluation datasets
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)

tokenized_training_dataset = split_dataset['train']
tokenized_eval_dataset = split_dataset['test']

In [18]:
# Data Collator for dynamic masking in MLM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=existing_tokenizer,
    mlm=True,
    mlm_probability=0.15
)

#### Step 4: Training / Fine-tuning

In [19]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=500,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=500,
    logging_strategy="steps",
    report_to="none",
    fp16=True,
    save_total_limit=3,
)



In [20]:
# Fine-Tuning with Trainer
trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=tokenized_training_dataset,
    eval_dataset=tokenized_eval_dataset,
    data_collator=data_collator,
    tokenizer=existing_tokenizer
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [21]:
eval_results = trainer.evaluate(eval_dataset=tokenized_eval_dataset)
print(f"Evaluation Results before training: {eval_results}")



Evaluation Results before training: {'eval_model_preparation_time': 0.003, 'eval_runtime': 58.1384, 'eval_samples_per_second': 92.882, 'eval_steps_per_second': 5.814}


Start tuning

In [22]:
trainer.train()

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Load the test split of the SIDAC dataset
test_dataset = cleaned_testing_dataset

# Preprocess the test dataset
tokenized_test_dataset = test_dataset.map(preprocess_function, batched=True, remove_columns=["text"])

Evaluate on test data

In [None]:
eval_results = trainer.evaluate(eval_dataset=tokenized_test_dataset)
print(f"Evaluation Results: {eval_results}")

In [None]:
print(clean_text(dataset[0]))

In [None]:
# Evalate on a few samples
import random

# Extract five random sentences from the test split
sample_sentences = random.sample(test_dataset['text'], 5)

# Loop through each sentence, mask a word, and predict
for sentence in sample_sentences:
    # Tokenize the input sentence
    inputs = existing_tokenizer(sentence, return_tensors="pt", truncation=True, padding='max_length', max_length=128)

    # Mask a random word (here, the second word as an example)
    inputs.input_ids[0, 1] = existing_tokenizer.mask_token_id  # Mask the second word

    # Get logits from the model
    with torch.no_grad():
        outputs = lora_model(**inputs)
    logits = outputs.logits

    # Get predicted token at masked position
    mask_token_index = (inputs.input_ids == existing_tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
    predicted_token_ids = logits[0, mask_token_index].argmax(dim=-1)
    predicted_token = existing_tokenizer.decode(predicted_token_ids)

    # Print the result
    print(f"Original sentence: {sentence}")
    print(f"Predicted word for [MASK]: {predicted_token}\n")

In [None]:
import os
import torch

# Define the base model path
base_model_path = '/content/drive/MyDrive/LLM_Tasks/Task 1/Embedding Layer/Models/mlm_model_16-10-24'

# Function to get a unique file name
def get_unique_model_path(base_path):
    # Start with the original file name
    model_path = f"{base_path}.pth"
    counter = 1

    # Increment the file name if it already exists
    while os.path.exists(model_path):
        model_path = f"{base_path}-{counter}.pth"
        counter += 1

    return model_path

# Assuming 'lora_model' is your fine-tuned model
encoder_model = lora_model.bert.encoder  # Extract the encoder part

# Get a unique file path for saving
model_path = get_unique_model_path(base_model_path)

# Save the encoder model to the unique file path
torch.save(encoder_model.state_dict(), model_path)

print(f"Encoder segment saved as: {model_path}")