In [94]:
import random

from datasets import load_dataset

# Seed Python's RNG up front so the random.sample() call made later in the
# notebook selects the same ids on every run.
random.seed(42)

# English-Vietnamese parallel corpus; the returned DatasetDict is displayed below.
dataset = load_dataset(
    "Angelectronic/IWSLT15_English_Vietnamese",
)
dataset

Found cached dataset parquet (C:/Users/lg/.cache/huggingface/datasets/Angelectronic___parquet/Angelectronic--IWSLT15_English_Vietnamese-d3d4a119d05830c0/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 133166
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 1268
    })
})

In [95]:
import torch
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# Run the encoder on the GPU when one is visible, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model_name = "BAAI/bge-small-en"
model_kwargs = dict(device=device)
encode_kwargs = dict(normalize_embeddings=True)

hf = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Smoke test: embed one sentence and display the length of the vector.
embedding = hf.embed_query("hi this is harrison")
len(embedding)

384

In [None]:
from langchain.vectorstores import FAISS
import json
from langchain.docstore.document import Document

# One Document per training pair: the English side is the text that gets
# embedded and the row id is kept as metadata so the pair can be found again.
# Iterating the split reads each row once; the previous form evaluated
# dataset['train'][i] twice for every i.
tm_docs = [
    Document(page_content=row['translation']['en'], metadata={"id": row['id']})
    for row in dataset['train']
]
db = FAISS.from_documents(tm_docs, hf)
# Uncomment to persist the index; a later cell reads it back with
# FAISS.load_local("tm_vectorstore", hf).
# db.save_local("tm_vectorstore")
# db.similarity_search("Nobody 's ever done it before , so I 'm going to go do it .")

In [96]:
from langchain.vectorstores import FAISS

# Number of neighbours fetched per query; the prompt-building functions
# further down read this same global.
k = 3

# Reload the index saved under "tm_vectorstore" and wrap it as a retriever.
vector_db = FAISS.load_local("tm_vectorstore", hf)
retriver = vector_db.as_retriever(search_kwargs=dict(k=k))

# Sample query to inspect what comes back.
relevant_docs = retriver.get_relevant_documents(
    "what did he say about ketanji brown jackson"
)
relevant_docs

[Document(page_content='He was interviewed once , and he said the following .', metadata={'id': '45660'}),
 Document(page_content='Asked him what this said .', metadata={'id': '59394'}),
 Document(page_content="Here 's what he had to say .", metadata={'id': '84341'})]

In [None]:
import copy

# Work on a copy so the `dataset` loaded earlier keeps its original text.
dataset_copy = copy.deepcopy(dataset)

# Ids of the train split in row order, and a random half of them that will
# receive retrieved demonstrations.
id_list = [data['id'] for data in dataset['train']]
rand_id = random.sample(id_list, len(dataset['train'])//2)

# Hash-based views of the two lists above. Membership tests and position
# lookups run once per mapped example, so scanning the lists each time
# (`in rand_id`, `id_list.index(...)`) was quadratic in the split size.
rand_id_set = set(rand_id)
id_to_index = {}
for position, row_id in enumerate(id_list):
    # Keys are str because the loaded retriever returns string ids in
    # Document.metadata; setdefault keeps the first position, like list.index.
    id_to_index.setdefault(str(row_id), position)


def _build_fuzzy_prompt(example, exclude_own_id):
    """Prefix the example's English text with retrieved translation pairs.

    Args:
        example: One dataset row with 'id' and 'translation' ({'en', 'vi'}).
        exclude_own_id: When True, drop retrieved documents whose id equals
            the example's own id.

    Returns:
        The same `example`, with example['translation']['en'] replaced by the
        demonstrations followed by the example's own "<English> : ..." line
        and an open "<Vietnamese> : " slot.

    Raises:
        KeyError: If a retrieved id is not present in the train split.
    """
    source_text = example['translation']['en']
    matches = retriver.get_relevant_documents(source_text)
    if exclude_own_id:
        # Compare as strings: metadata ids come back as str while the dataset
        # ids may be int, and a str/int comparison is never equal, which left
        # the example's own pair inside its prompt.
        own_id = str(example['id'])
        matches = [doc for doc in matches if str(doc.metadata['id']) != own_id]
    # Keep at most k - 1 demonstrations (the retriever is configured to return k).
    matches = matches[:k - 1]

    instructs = ""
    for doc in matches:
        tm_doc = dataset['train'][id_to_index[str(doc.metadata['id'])]]
        instructs += f"<English> : {tm_doc['translation']['en']}\n<Vietnamese> : {tm_doc['translation']['vi']}\n\n"

    example['translation']['en'] = instructs + "<English> : " + source_text + "\n" + "<Vietnamese> : "
    return example


def get_relevant_docs_train(example):
    """Turn a train row into a prompt; rows sampled into `rand_id` get demonstrations.

    Rows whose id is in the sampled half are prefixed with up to k - 1
    retrieved pairs (never their own pair); the remaining rows only get the
    bare "<English> : ...\\n<Vietnamese> : " wrapper.
    """
    if example['id'] in rand_id_set:
        return _build_fuzzy_prompt(example, exclude_own_id=True)

    example['translation']['en'] = "<English> : " + example['translation']['en'] + "\n" + "<Vietnamese> : "
    return example


def get_relevant_docs_test(example):
    """Turn a test row into a prompt prefixed with up to k - 1 retrieved pairs.

    No id-based exclusion is applied: the retrieved documents are looked up in
    the train split, so an id equal to a test row's id would presumably be a
    coincidence rather than the row itself (the earlier str/int comparison
    never excluded anything here either).
    """
    return _build_fuzzy_prompt(example, exclude_own_id=False)

# Rewrite the English field of every row into its prompt form, split by split.
for split_name, prompt_builder in (
    ("train", get_relevant_docs_train),
    ("test", get_relevant_docs_test),
):
    dataset_copy[split_name] = dataset_copy[split_name].map(prompt_builder)
# dataset_copy.save_to_disk("fuzzy_iwslt15")

# Show the first rewritten train prompt.
print(dataset_copy['train'][0]['translation']['en'])

In [1]:
import random

from datasets import load_dataset

random.seed(42)

# Sub-sampling sizes for the commented-out code below (currently unused).
# NOTE(review): sample_size_train is larger than the 131621 train rows this
# dataset reports, so random.sample() would raise ValueError if the
# sub-sampling were re-enabled as written.
sample_size_train, sample_size_test = 133166, 1268

# Prompt-formatted dataset; the returned DatasetDict is displayed below.
dataset = load_dataset(
    "Angelectronic/fuzzy_iwslt15_domain_specific",
)

# random_indices_train = random.sample(range(len(dataset['train'])), sample_size_train)
# dataset['train'] = dataset['train'].select(random_indices_train)

# random_indices_test = random.sample(range(len(dataset['test'])), sample_size_test)
# dataset['test'] = dataset['test'].select(random_indices_test)

dataset

Downloading readme:   0%|          | 0.00/615 [00:00<?, ?B/s]

Downloading and preparing dataset None/None to C:/Users/lg/.cache/huggingface/datasets/Angelectronic___parquet/Angelectronic--fuzzy_iwslt15_domain_specific-918d73d5c01a1810/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/48.3M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/738k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/131621 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1268 [00:00<?, ? examples/s]

Generating domain_specific_test split:   0%|          | 0/1542 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to C:/Users/lg/.cache/huggingface/datasets/Angelectronic___parquet/Angelectronic--fuzzy_iwslt15_domain_specific-918d73d5c01a1810/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 131621
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 1268
    })
    domain_specific_test: Dataset({
        features: ['id', 'translation'],
        num_rows: 1542
    })
})

In [None]:
from google.colab import userdata
from unsloth import FastLanguageModel
import torch

max_seq_length = 1024 # Choose any! We auto support RoPE Scaling internally!
# dtype must be None or a torch.dtype. The previous value, the string
# 'Float16', is neither. None requests auto detection, which also keeps the
# model precision in line with the fp16/bf16 flags that the training cell
# derives from torch.cuda.is_bf16_supported().
dtype = None # None for auto detection. torch.float16 for Tesla T4, V100, torch.bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Load the 4-bit base model together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gemma-7b-bnb-4bit", # Choose ANY! eg teknium/OpenHermes-2.5-Mistral-7B
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    token = userdata.get('HF_TOKEN'), # use one if using gated models like meta-llama/Llama-2-7b-hf
)

# Wrap the base model with LoRA adapters on the listed projection modules.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    use_gradient_checkpointing = True,
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [None]:
# End-of-sequence string of the loaded tokenizer; formatting_prompts_func
# appends it to every training text.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
def formatting_prompts_func(examples):
    """Build the "text" column for a batch of translation rows.

    Args:
        examples: Batched rows; examples['translation'] is a sequence of
            dicts holding an 'en' string (the prompt) and a 'vi' string
            (the target).

    Returns:
        {"text": [...]} with one string per row, formed as
        en + vi + EOS_TOKEN.
    """
    return {
        "text": [
            pair['en'] + pair['vi'] + EOS_TOKEN
            for pair in examples['translation']
        ]
    }
# Add the "text" column to the train split, then print the first formatted example.
train_with_text = dataset['train'].map(formatting_prompts_func, batched = True)
dataset['train'] = train_with_text
print(train_with_text[0]['text'])

In [None]:
import evaluate
from transformers.trainer_callback import TrainerCallback

metric = evaluate.load("sacrebleu")

class EvaluateAfterEpochCallback(TrainerCallback):
    """Print sacreBLEU on dataset['test'] at the end of every training epoch."""

    def on_epoch_end(self, args, state, control, model, tokenizer=None, **kwargs):
        """Generate a translation for every test prompt and print the metric.

        Args:
            args, state, control: Supplied by the trainer; not used here.
            model: Model being trained; used for generation.
            tokenizer: Tokenizer used to encode prompts and decode outputs.
                When it is not supplied, kwargs["processing_class"] is used
                instead (NOTE(review): newer transformers releases appear to
                pass the tokenizer under that name - confirm against the
                installed version).
            **kwargs: Remaining keyword arguments from the trainer.

        Raises:
            ValueError: If no tokenizer is available from either source.
        """
        if tokenizer is None:
            tokenizer = kwargs.get("processing_class")
        if tokenizer is None:
            raise ValueError("EvaluateAfterEpochCallback needs a tokenizer")

        references = []
        predictions = []
        for test_example in dataset['test']:
            prompt = test_example['translation']['en']
            # References are the plain target sentences. Predictions are
            # decoded with special tokens skipped, so appending EOS to the
            # reference only added a token the prediction can never match.
            # Each reference is wrapped in its own list (one reference per
            # prediction).
            references.append([test_example['translation']['vi']])

            # Put the inputs on whatever device the model lives on.
            inputs = tokenizer(
                prompt,
                return_tensors = "pt",
            ).to(model.device)
            prompt_length = inputs["input_ids"].shape[1]

            outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
            # The generated sequence starts with the prompt tokens; slicing
            # them off is exact, whereas splitting the decoded text on the
            # prompt string only works if decoding reproduces it verbatim.
            completion = tokenizer.decode(
                outputs[0][prompt_length:], skip_special_tokens = True
            ).strip()
            predictions.append(completion)

        results = metric.compute(predictions=predictions, references=references)
        print(results)

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments


# Mixed precision follows the GPU: bf16 when it is supported, fp16 otherwise.
use_bf16 = torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir = "Gemma-7b_en_vi",
    num_train_epochs = 1,
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    learning_rate = 2e-4,
    lr_scheduler_type = "linear",
    warmup_steps = 5,
    weight_decay = 0.01,
    optim = "adamw_8bit",
    fp16 = not use_bf16,
    bf16 = use_bf16,
    logging_steps = 1,
    report_to = "tensorboard",
    seed = 3407,
    push_to_hub = True,
)

# Supervised fine-tuning on the "text" column; the callback prints sacreBLEU
# on the test split after each epoch.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset['train'],
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    callbacks = [EvaluateAfterEpochCallback()],
    args = training_args,
)
trainer_stats = trainer.train()

In [None]:
import evaluate

metric = evaluate.load("sacrebleu")

# Hand-written (prediction, reference) pairs to check the metric call end to end.
sample_pairs = [
    ("Xin chào mọi người!", "Xin chào thế giới!"),
    ("Hoàng hôn đẹp quá!", "Hoàng hôn không đẹp!"),
    ("Tôi là một người đàn ông!", "Tôi là một người phụ nữ!"),
]
preds = [prediction for prediction, _ in sample_pairs]
labels = [reference for _, reference in sample_pairs]
metric.compute(predictions=preds, references=labels)

In [None]:
from unsloth import FastLanguageModel

max_seq_length = 2048
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True

# Load the fine-tuned checkpoint and switch it to inference mode.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Angelectronic/Gemma-7b_en_vi",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
FastLanguageModel.for_inference(model)
EOS_TOKEN = tokenizer.eos_token
BOS_TOKEN = tokenizer.bos_token

# References are the plain target sentences. Predictions are decoded with
# special tokens skipped, so appending EOS to the references only added a
# token that no prediction can match.
labels = [doc['translation']['vi'] for doc in dataset['test']]
predictions = []
for test_example in dataset['test']:
    prompt = test_example['translation']['en']

    # Put the inputs on whatever device the model was loaded to.
    inputs = tokenizer(
        prompt,
        return_tensors = "pt",
    ).to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
    # The generated sequence starts with the prompt tokens; slicing them off
    # is exact, whereas splitting the decoded text on the prompt string only
    # works if decoding reproduces the prompt verbatim.
    completion = tokenizer.decode(
        outputs[0][prompt_length:], skip_special_tokens = True
    ).strip()
    predictions.append(completion)

# One reference list per prediction.
results = metric.compute(
    predictions=predictions,
    references=[[label] for label in labels],
)
results