In [None]:
# Log in to the Hugging Face Hub from inside the notebook. Later cells load
# 'meta-llama/Llama-2-7b-hf' and call push_to_hub, which presumably rely on
# the credentials stored by this call.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
%%capture
# Install the libraries used by this notebook into the kernel's environment;
# the %%capture cell magic on the first line hides pip's output.
# NOTE(review): versions are unpinned, and `datasets` / `huggingface_hub` are
# imported by this notebook but not listed here -- presumably they arrive as
# transitive dependencies; confirm.
%pip install accelerate peft bitsandbytes transformers trl

In [None]:
import os
import torch
import json
from datasets import Dataset
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

In [None]:
# Load the stock Llama-2 tokenizer. It is the baseline in the comparison cells
# further down and the template from which the Sinhala tokenizer is trained.
old_llama_tokenizer = AutoTokenizer.from_pretrained('meta-llama/Llama-2-7b-hf')

In [None]:
# Load the three Sinhala text sources by their Hub repository ids. Each result
# is indexed with ['train'] later in the notebook.
dataset_sinhala_corpus = load_dataset('LexiconShiftInnovations/SinhalaCorpusLarge')
dataset_sinhala_wiki = load_dataset('LexiconShiftInnovations/SinhalaWikipediaArticles')
dataset_sinhala_dental_qna = load_dataset('LexiconShiftInnovations/SinhalaDentalQnA')

In [None]:
# Keep only the 'text' column of the dental Q&A train split and wrap it in a
# new single-column Dataset so it can be concatenated with the other corpora.
dataset_sinhala_dental_qna_text = dataset_sinhala_dental_qna['train']['text']
dataset_sinhala_dental_qna_dataset = Dataset.from_dict(
    {"text": dataset_sinhala_dental_qna_text}
)

In [None]:
# Train splits to merge into one corpus; the dental Q&A data is represented by
# its text-only copy built in the previous cell.
# NOTE(review): presumably the other two train splits also expose only a
# 'text' column, as concatenation needs compatible columns -- confirm.
datasets_to_concatenate = [dataset_sinhala_corpus['train'], dataset_sinhala_wiki['train'], dataset_sinhala_dental_qna_dataset]

In [None]:
# Single combined dataset used as the tokenizer training corpus.
Sinhala_Corpus_Train = concatenate_datasets(datasets_to_concatenate)

In [None]:
# Names of intermediate datasets that are no longer needed once
# Sinhala_Corpus_Train has been built.
variables_to_delete = ['dataset_sinhala_corpus', 'dataset_sinhala_wiki', 'dataset_sinhala_dental_qna', 'dataset_sinhala_dental_qna_dataset']

# Remove each name from the notebook namespace directly. The previous
# `%reset_selective -f {...}` line never turned the list into a pattern that
# matches these names, so it silently deleted nothing. pop(..., None) keeps the
# cell safe to re-run after the names are already gone.
# Note: this only drops the names; objects still referenced elsewhere (for
# example by `datasets_to_concatenate`) stay alive.
for variable in variables_to_delete:
    globals().pop(variable, None)


In [None]:
# Display the combined dataset's summary as the cell output.
Sinhala_Corpus_Train

In [None]:
def get_training_corpus(dataset=None, batch_size=1000):
    """Yield the corpus text in consecutive batches for tokenizer training.

    Args:
        dataset: Object supporting ``len()`` and row slicing that returns a
            mapping with a ``"text"`` entry (e.g. a ``datasets.Dataset``).
            Defaults to the notebook-level ``Sinhala_Corpus_Train``.
        batch_size: Number of rows per yielded batch (default 1000).

    Returns:
        A single-use generator; each item is the list of ``"text"`` values for
        one batch. The last batch may be shorter than ``batch_size``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if dataset is None:
        dataset = Sinhala_Corpus_Train
    # Slice rows first and then pick the column, so only the current batch is
    # read. The previous form, dataset['text'][i:j], requested the entire text
    # column for every batch (and once more just to compute the length).
    return (
        dataset[start : start + batch_size]["text"]
        for start in range(0, len(dataset), batch_size)
    )


# get_training_corpus() returns a generator, so this object can be consumed
# only once; re-run this cell before re-running the tokenizer training cell.
training_corpus = get_training_corpus()

In [None]:
# Train a new tokenizer from the stock Llama-2 tokenizer on the Sinhala text
# batches, requesting a vocabulary of 20,000 entries.
new_llama_tokenizer = old_llama_tokenizer.train_new_from_iterator(
    training_corpus,
    vocab_size=20000,
)

In [None]:
# Upload the newly trained tokenizer to the Hub repository named here.
# Presumably requires write access via the login done at the top -- confirm.
new_llama_tokenizer.push_to_hub('LexiconShiftInnovations/Llama2SinhalaTokenizer')

In [None]:
# Sample sentence (Sinhala with a little embedded English) used to compare the
# two tokenizers.
sinhala_text = "ඉස්සරහ දත් දෙක මැද හිඩස පිරෙව්වට පස්සේ mouth wash එකක් භාවිතා කිරීම නුසුදුසු ද?"

# Tokenize the same sentence with the stock and the Sinhala-trained tokenizer.
llama_2_existing_tokenizer_output = old_llama_tokenizer.tokenize(sinhala_text)
llama_2_sinhala_tokenizer_output = new_llama_tokenizer.tokenize(sinhala_text)

# Report, for each tokenizer, a heading, the token count, and the tokens.
print(
    "Output from the existing Llama-2 Tokenizer",
    f"Token count : {len(llama_2_existing_tokenizer_output)}",
    llama_2_existing_tokenizer_output,
    sep="\n",
)
print(
    "\nOutput from the Llama-2 Tokenizer trained on Sinhala Corpus",
    f"Token count : {len(llama_2_sinhala_tokenizer_output)}",
    llama_2_sinhala_tokenizer_output,
    sep="\n",
)

In [None]:
# Snapshot both vocabularies for the size and overlap checks in the next cells.
old_vocab = old_llama_tokenizer.vocab
new_vocab = new_llama_tokenizer.vocab

In [None]:
# Number of entries in the stock Llama-2 vocabulary.
len(old_vocab)

In [None]:
# Number of entries in the Sinhala-trained vocabulary.
len(new_vocab)

In [None]:
# Reduce each vocabulary to a set of its entries, then keep the entries that
# appear in both.
set_old_vocab, set_new_vocab = set(old_vocab), set(new_vocab)

intersection_set = set_old_vocab & set_new_vocab

In [None]:
# How many entries the two vocabularies share.
len(intersection_set)

In [None]:
# Entries of the Sinhala-trained vocabulary that are not in the shared set,
# i.e. the ones the stock tokenizer lacks.
tokens_to_add = set_new_vocab.difference(intersection_set)

In [None]:
# Extend the stock tokenizer in place with the missing entries; the cell output
# is the value returned by add_tokens.
# The tokens are passed in sorted order because iterating a set of strings has
# no stable order across kernel sessions, so list(tokens_to_add) would hand the
# new tokens over in a different order on each run. Presumably ids follow that
# order, which would make them irreproducible -- confirm against add_tokens docs.
# NOTE(review): this mutates old_llama_tokenizer, so re-running the earlier
# comparison cell afterwards no longer reflects the stock tokenizer.
old_llama_tokenizer.add_tokens(sorted(tokens_to_add))

In [None]:
# Vocabulary size of the stock tokenizer after the add_tokens call.
len(old_llama_tokenizer.vocab)

In [None]:
# Tokenize the sample sentence again with the extended stock tokenizer.
# NOTE(review): despite its name, this variable holds the result of
# tokenize() (it is passed to len() and printed below), not a tokenizer.
updated_llama_tokenizer = old_llama_tokenizer.tokenize(sinhala_text)

In [None]:
# Token count and tokens for the sample sentence after extending the stock
# tokenizer, for comparison with the earlier printout.
print(f"Token count : {len(updated_llama_tokenizer)}")
print(updated_llama_tokenizer)