In [None]:
!pip install -q -U transformers datasets accelerate peft trl bitsandbytes wandb

In [None]:
source_lang, source_lang_iso = "Spanish", "spa"
target_lang, target_lang_iso = "Wayuu", "guc" # or pbb

In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer



Load Fine Tuned model

In [None]:
# Model
base_model = f"Broomva/llama-2-7b-chat-instruct-translate-{source_lang_iso}-{target_lang_iso}"

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Quantize model

In [None]:
# Quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

# LoRA configuration
peft_config = LoraConfig(
    lora_alpha=32,
    lora_dropout=0.05,
    r=16,
    bias="none",
    task_type="CAUSAL_LM",
)

# Load base moodel
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
)

# Cast the layernorm in fp32, make output embedding layer require grads, add the upcasting of the lmhead to fp32
model = prepare_model_for_kbit_training(model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [None]:
def translate(spanish_prompt, wayuu_reference):
  instruction = f"<s>[INST] Traduce de {source_lang} a {target_lang}: {spanish_prompt} [/INST]"
  pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=len(instruction)+len(wayuu_reference))
  result = pipe(instruction)
  return result[0]['generated_text'][len(instruction):]

In [None]:
import pandas as pd

dataset = pd.read_csv(f'../datasets/{source_lang_iso}_{target_lang_iso}/{source_lang_iso}_{target_lang_iso}_dataset.csv', sep = '|').sample(50)
source_sentences = dataset[source_lang_iso]
target_references = dataset[target_lang_iso]

In [None]:
model_outputs = [translate(source_sentence, target_reference) for source_sentence, target_reference in zip(source_sentences, target_references)]



In [13]:
results_df = pd.DataFrame({source_lang_iso: list(source_sentences), target_lang_iso: list(target_references), f'generated_{target_lang_iso}': model_outputs})
results_df.head(20)

Unnamed: 0,spa,guc,generated_guc
0,Pero ¿qué representa el que se lavara a Aarón?,Oʼoojinnüshi Aarón süpülapünaa sacerdooteinjac...,¿Kasa kayaawaseka nüshanaain Aarón? 5-7. 5. ¿...
1,que hace el,kasa naa'inraaka niakai,kasa naa'inraaka niakai sutuma tü nümakat jes...
2,¿Qué ayuda práctica podría usted dar a alguien...,¿Kasa paaʼinrajatka süpüla pükaaliinjüin na aa...,¿Kasa eeka süpüla paaʼinrüin nümüin wanee jim...
3,tom salio a comer,ajuittüshi tom ekaainchi,ekaashi tom sünain ekaa ma aka tia so'ukai ji...
4,14 Muchos hermanos viven en países donde los p...,14 Mainma wawalayuu eʼrakana müliaa otta choʼu...,14 Wainma wawalayuu kepiakana sainküin mmakat...
5,"Al ver que estaba resentido, Rutherford le adv...","Sutuma jashichin nia, nüküjüin Rutherford eein...",Nnojoishi niain anain Rutherford sümüin wayuu...
6,quieres comer,ekeeshi pia,ekeesü paa'in süpüla shiküinjatüin wayuu shia...
7,entren,joutaa,jaleraa maa'ulu yaa sulu'u ekirajüleeka wayuu...
8,Despertó su entusiasmo por lo que podrían logr...,"Je tü naaʼinreetkat, alanaʼaleejeeria sünain a...","Müleka jiküle je jülemerale, shiimüin sünain ..."
9,ella puede saltar alto,shiaka sütüjain awataa iipünaashaatasü,shiakalü eesü süpüla sujutuin süpaatiwa sümaa...


In [30]:
from nltk.translate.bleu_score import SmoothingFunction, corpus_bleu

smoothie = SmoothingFunction().method5
bleu_score_smooth = corpus_bleu(target_references, model_outputs, smoothing_function=smoothie)
print(f"Smooth BLEU Score: {bleu_score_smooth}")


0.07187542180783783

In [36]:
from nltk.translate.bleu_score import sentence_bleu

# Calculate BLEU for each sentence
individual_bleu_scores = [sentence_bleu([reference], candidate) for reference, candidate in zip(target_references, model_outputs)]
print(f"AVG Sentence BLEU Score: {sum(individual_bleu_scores)/len(individual_bleu_scores)}")

AVG Sentence BLEU Score: 0.13479943753792184
