In [1]:
import gc

def cleanup():
    """Release unreferenced Python objects and PyTorch's cached CUDA memory.

    Runs the garbage collector first so that tensors kept alive only by
    reference cycles are freed, then asks PyTorch to return its cached CUDA
    blocks.
    """
    # Imported locally: this cell sits above the notebook's `import torch`,
    # so relying on the global name fails on a fresh kernel.
    import torch

    gc.collect()
    torch.cuda.empty_cache()

In [2]:
import torch
from transformers import MBartForConditionalGeneration, MBart50Tokenizer
import numpy as np
import pandas as pd
import warnings

# Suppress everything emitted through the `warnings` module for the rest of the session.
warnings.filterwarnings("ignore")

def get_model(model_path="./mbart-large-51-ru-mans-v2-full-finetuneepoch_4/"):
    """Load a fine-tuned mBART checkpoint and a tokenizer patched with ``mans_XX``.

    Args:
        model_path: Directory holding both the tokenizer files and the model
            weights.

    Returns:
        A ``(model, tokenizer)`` tuple. The model is loaded 8-bit quantized
        with ``torch.bfloat16`` as the requested dtype; the tokenizer has
        ``src_lang="ru_RU"`` and ``tgt_lang="mans_XX"`` preset.
    """
    # Imported locally so the function does not depend on the notebook's
    # import cell, which only pulls in the model and tokenizer classes.
    from transformers import BitsAndBytesConfig

    tokenizer = MBart50Tokenizer.from_pretrained(model_path)

    # Register the extra language code on the last id of the loaded vocabulary
    # and rebuild the tokenizer's internal lookup tables accordingly.
    old_len = len(tokenizer)
    tokenizer.lang_code_to_id['mans_XX'] = old_len-1
    tokenizer.id_to_lang_code[old_len-1] = 'mans_XX'
    # <mask> is re-placed after the (now larger) set of language codes.
    tokenizer.fairseq_tokens_to_ids["<mask>"] = len(tokenizer.sp_model) + len(tokenizer.lang_code_to_id) + tokenizer.fairseq_offset
    tokenizer.fairseq_tokens_to_ids.update(tokenizer.lang_code_to_id)
    tokenizer.fairseq_ids_to_tokens = {v: k for k, v in tokenizer.fairseq_tokens_to_ids.items()}
    if 'mans_XX' not in tokenizer._additional_special_tokens:
        tokenizer._additional_special_tokens.append('mans_XX')

    tokenizer.src_lang = "ru_RU"
    tokenizer.tgt_lang = "mans_XX"

    # `load_in_8bit=True` passed directly to from_pretrained is deprecated;
    # the quantization request goes through `quantization_config` instead.
    # attn_implementation="flash_attention_2" is left out: per the original
    # note it works only on Ampere GPUs, see
    # https://huggingface.co/docs/transformers/perf_infer_gpu_one#combine-optimizations
    model = MBartForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    )
    print(model.device)

    return model, tokenizer

# Load the default checkpoint once; `model` and `tokenizer` become module-level
# names that the translation functions read directly.
model, tokenizer = get_model()

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
`low_cpu_mem_usage` was None, now set to True since model is quantized.


cuda:0


In [3]:
# Alternative optimisation, left disabled:
# model = model.to_bettertransformer()

# Rebinds `model` to its torch.compile wrapper, so re-running this cell wraps
# the already-wrapped object again.
model = torch.compile(model) # reference: https://huggingface.co/docs/transformers/perf_torch_compile#v100-batch-size-1
# Background article (in Russian): https://habr.com/ru/companies/wunderfund/articles/820721/

In [4]:
def translate(text, src='ru_RU', trg='mans_XX', max_length=200, num_beams=5, repetition_penalty=5.0, **kwargs):
    """Translate one text with the module-level ``model`` and ``tokenizer``.

    Args:
        text: Source text to translate.
        src: Source language code; assigned to ``tokenizer.src_lang``.
        trg: Target language code; its id is forced as the first generated token.
        max_length: Maximum length passed to ``model.generate``.
        num_beams: Beam width passed to ``model.generate``.
        repetition_penalty: Repetition penalty passed to ``model.generate``.
        **kwargs: Extra options forwarded to ``model.generate`` (previously
            accepted but silently ignored).

    Returns:
        The first decoded sequence, with special tokens stripped.

    Raises:
        KeyError: If ``trg`` is not a key of ``tokenizer.lang_code_to_id``.
    """
    tokenizer.src_lang = src
    # Truncate like translate_batch does, so both entry points treat
    # over-long inputs the same way.
    encoded = tokenizer(text, return_tensors="pt", truncation=True)

    # FlashAttention (torch.backends.cuda.sdp_kernel(enable_flash=True, ...))
    # is left disabled: per the original note it works only on Ampere GPUs.
    generated_tokens = model.generate(
        **encoded.to(model.device),
        forced_bos_token_id=tokenizer.lang_code_to_id[trg],
        max_length=max_length,
        num_beams=num_beams,
        repetition_penalty=repetition_penalty,
        **kwargs,
    )

    return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]


def translate_batch(texts, src='ru_RU', trg='mans_XX', max_length=200, num_beams=5, repetition_penalty=5.0,
                    batch_size=128, **kwargs):
    """Translate a sequence of texts in fixed-size batches.

    Uses the module-level ``model`` and ``tokenizer``.

    Args:
        texts: Sequence of source texts.
        src: Source language code; assigned to ``tokenizer.src_lang``.
        trg: Target language code; its id is forced as the first generated token.
        max_length: Maximum length passed to ``model.generate``.
        num_beams: Beam width passed to ``model.generate``.
        repetition_penalty: Repetition penalty passed to ``model.generate``.
        batch_size: Number of texts tokenized and generated per step.
        **kwargs: Extra options forwarded to ``model.generate`` (previously
            accepted but silently ignored).

    Returns:
        A list of decoded translations in the same order as ``texts``;
        an empty list when ``texts`` is empty.

    Raises:
        KeyError: If ``trg`` is not a key of ``tokenizer.lang_code_to_id``.
    """
    # Imported locally: the notebook imports tqdm only in a later cell, so the
    # global name is not guaranteed to exist when this function is called.
    from tqdm import tqdm

    tokenizer.src_lang = src
    # Materialise once so any sequence (e.g. a pandas Series) is sliced
    # positionally and handed to the tokenizer as a plain list.
    texts = list(texts)
    # Loop-invariant lookup, hoisted out of the batch loop.
    forced_bos_token_id = tokenizer.lang_code_to_id[trg]
    all_translations = []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i+batch_size]
        encoded = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True)

        # FlashAttention (torch.backends.cuda.sdp_kernel(enable_flash=True, ...))
        # is left disabled: per the original note it works only on Ampere GPUs.
        generated_tokens = model.generate(
            **{k: v.to(model.device) for k, v in encoded.items()},
            forced_bos_token_id=forced_bos_token_id,
            max_length=max_length,
            num_beams=num_beams,
            repetition_penalty=repetition_penalty,
            **kwargs,
        )

        batch_translations = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        all_translations.extend(batch_translations)

    return all_translations

def translate_dataframe(df, text_column, src='ru_RU', trg='mans_XX', **kwargs):
    """Translate one column of ``df`` and pair each source text with its output.

    Args:
        df: Frame containing the texts to translate.
        text_column: Name of the column holding the source texts.
        src: Source language code, passed to ``translate_batch``.
        trg: Target language code, passed to ``translate_batch``.
        **kwargs: Passed through to ``translate_batch``.

    Returns:
        A new DataFrame with columns ``original`` and ``translation_model``.
    """
    source_texts = df[text_column].tolist()
    model_outputs = translate_batch(source_texts, src=src, trg=trg, **kwargs)
    paired = {
        'original': source_texts,
        'translation_model': model_outputs,
    }
    return pd.DataFrame(paired)

In [5]:
import evaluate

# Metric objects from the `evaluate` library; get_simple_metrics reads these
# module-level names.
chrf = evaluate.load("chrf")
bleu = evaluate.load("bleu")

def get_simple_metrics(data: pd.DataFrame, preds_column: str,
                       original_column: str, output_file: str) -> None:
    """Score predictions against references with chrF and BLEU and save the result.

    Uses the module-level ``chrf`` and ``bleu`` metric objects.

    Args:
        data: Frame holding both the predictions and the references.
        preds_column: Column of ``data`` with the model outputs.
        original_column: Column of ``data`` with the reference translations.
        output_file: CSV path for the one-row result (columns ``chrf``, ``bleu``).
            Missing parent directories are created.

    Raises:
        KeyError: If either column is missing from ``data``.
    """
    # Local import keeps the function independent of the notebook's import cell.
    from pathlib import Path

    predictions = data[preds_column].values
    references = data[original_column].values

    chrf__ = chrf.compute(
        predictions=predictions,
        references=references,
        word_order=2,
    )["score"]
    bleu__ = bleu.compute(
        predictions=predictions, references=references
    )["bleu"]
    metrics = pd.DataFrame(
        {"chrf": [chrf__], "bleu": [bleu__]}
    )
    print(output_file, f"bleu {bleu__}", f"chrf {chrf__}")

    # Without this, a missing output directory makes to_csv raise and the
    # freshly computed scores are lost.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)
    metrics.to_csv(output_file, index=False)

2024-09-08 22:59:23.959738: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Downloading builder script: 100%|██████████| 9.01k/9.01k [00:00<00:00, 20.9MB/s]
Downloading builder script: 100%|██████████| 5.94k/5.94k [00:00<00:00, 15.4MB/s]
Downloading extra modules: 4.07kB [00:00, 4.91MB/s]                   
Downloading extra modules: 100%|██████████| 3.34k/3.34k [00:00<00:00, 11.5MB/s]


In [6]:
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

# Checkpoint directory names, newest epoch (4) first, down to epoch 0.
model_names = [
    f"mbart-large-51-ru-mans-v2-full-finetuneepoch_{epoch}"
    for epoch in range(4, -1, -1)
]

# Evaluation set; other cells read its "ru" (source) and "mans" (reference) columns.
test = pd.read_csv('./test.csv')
print(f"test size: {test.shape[0]}")

def run_model_test(model_name):
    """Translate the test set, save the predictions, and score them.

    NOTE: ``model_name`` is only used to label the output files. Translation
    always runs with the module-level ``model``; no checkpoint is loaded here.

    Args:
        model_name: Label embedded in the names of the two CSV files written
            under ``./metrics/``.
    """
    # Local import keeps the function independent of the notebook's import cell.
    import os

    translated_test = translate_dataframe(test, "ru")

    # Save results (the directory is created first so the write cannot fail
    # after the expensive translation step).
    os.makedirs('./metrics', exist_ok=True)
    translated_test.to_csv(f'./metrics/test_predicted_{model_name}.csv')

    # translate_dataframe returns only `original` and `translation_model`, so
    # the references are attached here. `.values` aligns by position, matching
    # the row order the translations were produced in.
    scored = translated_test.assign(mans=test["mans"].values)

    # Compute metrics
    get_simple_metrics(scored, preds_column="translation_model",
                       original_column="mans", output_file=f"./metrics/results_{model_name}.csv")


test size: 8821


In [7]:
# Smoke test: show row 3 of the test set next to the model's translation of
# that row's first column.

test.values[3], translate(test.values[3][0])

(array(['Девушка Куринька мастерит.', 'А̄ги Кӯринька ва̄ранты.'],
       dtype=object),
 'А̄ги Куринька ма̄щтыр.')

In [8]:
# Re-loads the same two metrics (they are also loaded in an earlier cell) and
# scores the single prediction/reference pair from the smoke test.
chrf = evaluate.load("chrf")
bleu = evaluate.load("bleu")
    
print('chrf', chrf.compute(predictions=['А̄ги Куринька ма̄щтыр.'], references=['А̄ги Кӯринька ва̄ранты.'], word_order=2))

print('bleu', bleu.compute(predictions=['А̄ги Куринька ма̄щтыр.'], references=['А̄ги Кӯринька ва̄ранты.']))


chrf {'score': 34.60185689354383, 'char_order': 6, 'word_order': 2, 'beta': 2}
bleu {'bleu': 0.0, 'precisions': [0.5, 0.0, 0.0, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 1.0, 'translation_length': 4, 'reference_length': 4}


In [16]:
# Full translation run, left disabled; predictions are read back from disk instead.
# run_model_test(model_names[0])

# NOTE(review): this reads a `translation` column, while translate_dataframe
# names its output column `translation_model` — confirm which code wrote this CSV.
translated_test = pd.read_csv('./metrics/test_predicted_mbart-large-51-ru-mans-v2-full-finetuneepoch_4.csv')
# Adds a column to the shared `test` frame in place; assignment aligns on the index.
test['model_translation'] = translated_test['translation']

# Compute chrF / BLEU against the "mans" reference column and write them to CSV.
get_simple_metrics(test, preds_column="model_translation",
                    original_column="mans", output_file=f"./metrics/results_mbart_v2_4epochs.csv")


./metrics/results_mbart_v2_4epochs.csv bleu 0.16135040463809533 chrf 42.84673235429147
