## Entrenando a Chimi

Primero, preparar el modelo base ejecutando el notebook `preparacion_modelo.ipynb`.

#### Luego de elegir el kernel de Python para este notebook (se recomienda un venv), instalar los paquetes necesarios en ese kernel.

In [16]:
from pynvml import *

def print_gpu_utilization(device_index=0):
    """Print how much memory is currently in use on one NVIDIA GPU.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, the
            device this helper has always inspected.
    """
    # Imported locally so the helper does not depend on the notebook-wide
    # star import having been executed first.
    import pynvml

    pynvml.nvmlInit()
    try:
        handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
        info = pynvml.nvmlDeviceGetMemoryInfo(handle)
        # `used` is reported in bytes; integer-divide down to whole MB.
        print(f"GPU memory occupied: {info.used//1024**2} MB.")
    finally:
        # Pair every nvmlInit() with nvmlShutdown() so repeated calls do not
        # leave the NVML library initialised, even if the query above fails.
        pynvml.nvmlShutdown()


def print_summary(result):
    """Print the runtime and throughput of a finished training run, then GPU memory use.

    Args:
        result: Object exposing a ``metrics`` mapping that contains the
            ``train_runtime`` and ``train_samples_per_second`` entries.
    """
    metrics = result.metrics
    print("Time: {:.2f}".format(metrics["train_runtime"]))
    print("Samples/second: {:.2f}".format(metrics["train_samples_per_second"]))
    print_gpu_utilization()

### Preparar el dataset

In [17]:
from datasets import load_dataset
from transformers import MT5Tokenizer, MT5ForConditionalGeneration, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer, pipeline 
from transformers.optimization import Adafactor, AdafactorSchedule
import evaluate

In [18]:
# Load the raw training CSV into a DatasetDict and display its splits.
chimi_dataset = load_dataset("csv", data_files="chimi_training_data_v1.csv")
chimi_dataset

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['frase', 'respuesta'],
        num_rows: 5000
    })
})

In [4]:
def modify_newline_char(row_data):
    """Swap every line break in the row's "frase" text for a pipe character.

    Args:
        row_data: Mapping with a string under the "frase" key.

    Returns:
        The same ``row_data`` object, updated in place.
    """
    phrase_lines = row_data["frase"].split("\n")
    row_data["frase"] = "|".join(phrase_lines)
    return row_data

# Apply the newline -> "|" normalization to every row of the dataset.
chimi_dataset = chimi_dataset.map(modify_newline_char)

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [19]:
# Fixed seed so the train/validation/test partition (and the CSVs written from
# it) is the same on every run.
SPLIT_SEED = 42

# Guard against re-running this cell: once the validation/test splits exist,
# splitting again would carve them out of the already-reduced train split.
if "validation" not in chimi_dataset and "test" not in chimi_dataset:
    # Hold out 10% of the rows as the test split.
    chimi_train_test = chimi_dataset["train"].train_test_split(test_size=0.1, seed=SPLIT_SEED)
    # Take a validation split with the same number of rows as the test split
    # out of the remaining training rows.
    chimi_train_validation = chimi_train_test["train"].train_test_split(
        test_size=chimi_train_test["test"].num_rows,
        seed=SPLIT_SEED,
    )

    chimi_dataset["train"] = chimi_train_validation["train"]
    chimi_dataset["validation"] = chimi_train_validation["test"]
    chimi_dataset["test"] = chimi_train_test["test"]
chimi_dataset

DatasetDict({
    train: Dataset({
        features: ['frase', 'respuesta'],
        num_rows: 4000
    })
    validation: Dataset({
        features: ['frase', 'respuesta'],
        num_rows: 500
    })
    test: Dataset({
        features: ['frase', 'respuesta'],
        num_rows: 500
    })
})

In [20]:
# Write each split to its own CSV file in the working directory.
chimi_dataset["train"].to_csv("train.csv")
chimi_dataset["validation"].to_csv("validation.csv")
chimi_dataset["test"].to_csv("test.csv")

Creating CSV from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

60919

In [6]:
# Local directory of the base model (prepared beforehand in preparacion_modelo.ipynb).
modelo_base_path = "models/spa-mt5"
# legacy=False selects the tokenizer's non-legacy behaviour.
tokenizer = MT5Tokenizer.from_pretrained(modelo_base_path, legacy=False)

In [7]:
max_input_length = 512
max_target_length = 100


def tokenize_function(row_data):
    """Tokenize phrases and their expected answers for seq2seq training.

    Args:
        row_data: Mapping with the source text under "frase" and the target
            text under "respuesta".

    Returns:
        The tokenizer output for "frase" (truncated to ``max_input_length``)
        with an added "labels" entry holding the token ids of "respuesta"
        (truncated to ``max_target_length``).
    """
    encoded_batch = tokenizer(
        row_data["frase"],
        max_length=max_input_length,
        truncation=True,
    )
    encoded_targets = tokenizer(
        row_data["respuesta"],
        max_length=max_target_length,
        truncation=True,
    )
    encoded_batch["labels"] = encoded_targets["input_ids"]
    return encoded_batch

In [8]:
# Tokenize every split; batched=True hands tokenize_function batches of rows per call.
tokenized_chimi_dataset = chimi_dataset.map(tokenize_function, batched=True)
tokenized_chimi_dataset

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['frase', 'respuesta', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4000
    })
    validation: Dataset({
        features: ['frase', 'respuesta', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 500
    })
    test: Dataset({
        features: ['frase', 'respuesta', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 500
    })
})

In [9]:
# Base seq2seq model that will be fine-tuned, loaded from the same path as the tokenizer.
base_model = MT5ForConditionalGeneration.from_pretrained(modelo_base_path)
# Batch collator for seq2seq training; it is given the model as well as the tokenizer.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=base_model)
# Text metrics used by compute_metrics: character error rate and exact match.
cer = evaluate.load("cer", module_type="metric")
exact_match = evaluate.load("exact_match", module_type="metric")

In [10]:
def custom_eval(predictions, references):
    """Score pipe-delimited predictions against references, field by field.

    Records are split on "|". The first field is the action and is scored on
    every pair with an exact (case-sensitive) comparison. For pairs whose
    reference action is "T" or "A" the remaining fields follow the layout
    ``action|alias|name|monto|entidad|nro_cuenta|moneda|nro_doc`` and are
    compared case-insensitively, one field at a time.

    A field counts as correct only when it is present in both the prediction
    and the reference; a missing field on either side is simply a miss.

    Args:
        predictions: Sequence of predicted record strings.
        references: Sequence of expected record strings, aligned with
            ``predictions``.

    Returns:
        dict with ``action_acc`` plus one ``<field>_acc`` entry per scored
        field. Each value is a fraction in [0, 1]; ``action_acc`` is -1 when
        ``predictions`` is empty, and the field accuracies are -1 when no
        reference has a "T" or "A" action.
    """
    # Position of every scored field inside a "T"/"A" record.
    field_positions = {
        "alias": 1,
        "nombre": 2,
        "monto": 3,
        "entidad": 4,
        "cuenta": 5,
        "moneda": 6,
        "doc": 7,
    }

    total_len = len(predictions)
    correct_action = 0
    detailed_len = 0  # number of references whose action is "T" or "A"
    correct_fields = dict.fromkeys(field_positions, 0)

    for prediction, reference in zip(predictions, references):
        pred_fields = prediction.split("|")
        ref_fields = reference.split("|")
        reference_action = ref_fields[0]

        if pred_fields[0] == reference_action:
            correct_action += 1

        if reference_action not in ("T", "A"):
            continue

        detailed_len += 1
        for field, position in field_positions.items():
            # Skip fields that either side does not have instead of indexing
            # past the end of a short record.
            if position >= len(pred_fields) or position >= len(ref_fields):
                continue
            if pred_fields[position].lower() == ref_fields[position].lower():
                correct_fields[field] += 1

    # -1 is the "not applicable" sentinel, used instead of dividing by zero.
    action_acc = correct_action / total_len if total_len else -1
    if detailed_len == 0:
        field_acc = dict.fromkeys(field_positions, -1)
    else:
        field_acc = {field: hits / detailed_len for field, hits in correct_fields.items()}

    return {
        "action_acc": action_acc,
        "alias_acc": field_acc["alias"],
        "nombre_acc": field_acc["nombre"],
        "monto_acc": field_acc["monto"],
        "moneda_acc": field_acc["moneda"],
        "cuenta_acc": field_acc["cuenta"],
        "entidad_acc": field_acc["entidad"],
        "doc_acc": field_acc["doc"],
    }

In [11]:
import pandas as pd
import numpy as np

def compute_metrics_with_csv_building(save_to_csv=False, csv_path=None):
    """Build the ``compute_metrics`` callback used by the trainers in this notebook.

    Args:
        save_to_csv: When True (and ``csv_path`` is given), every example whose
            prediction differs from its reference (ignoring case) is written
            to a CSV report.
        csv_path: Destination of that CSV report. Missing parent folders are
            created on demand.

    Returns:
        A function taking ``(predictions, labels, inputs)`` token-id arrays and
        returning a dict with "cer", "exact_match" and the per-field
        accuracies from ``custom_eval``, all rounded to 4 decimals.
    """
    # Local import: this cell may run before any cell that imports `os`.
    import os

    def _decode(token_ids):
        """Decode a batch of token ids to text, mapping -100 padding to the pad token."""
        # -100 is a padding marker that the tokenizer cannot decode.
        token_ids = np.where(token_ids != -100, token_ids, tokenizer.pad_token_id)
        return tokenizer.batch_decode(token_ids, skip_special_tokens=True)

    def compute_metrics(eval_pred):
        predictions, labels, inputs = eval_pred
        decoded_preds = _decode(predictions)
        decoded_labels = _decode(labels)
        decoded_inputs = _decode(inputs)

        result = {}

        # Character error rate over the decoded strings.
        result["cer"] = cer.compute(predictions=decoded_preds, references=decoded_labels)

        # Case-insensitive exact match.
        exact_match_res = exact_match.compute(
            predictions=decoded_preds, references=decoded_labels, ignore_case=True
        )
        result["exact_match"] = exact_match_res["exact_match"]

        # Per-field accuracies of the pipe-delimited answer.
        result.update(custom_eval(predictions=decoded_preds, references=decoded_labels))

        if result["exact_match"] < 1 and save_to_csv and csv_path is not None:
            non_matches = [
                {"frase": phrase, "reference_string": label, "predicted_string": pred}
                for phrase, pred, label in zip(decoded_inputs, decoded_preds, decoded_labels)
                if pred.lower() != label.lower()
            ]
            target_dir = os.path.dirname(csv_path)
            if target_dir:
                # Create the report folder so the write cannot fail on a missing directory.
                os.makedirs(target_dir, exist_ok=True)
            # Explicit columns keep the header stable even if the list is empty.
            non_matches_df = pd.DataFrame(
                non_matches, columns=["frase", "reference_string", "predicted_string"]
            )
            non_matches_df.to_csv(csv_path, index=False)

        return {k: round(v, 4) for k, v in result.items()}

    return compute_metrics

In [12]:
# Per-device batch size shared by training and evaluation.
batch_size = 8

# Evaluate, log and checkpoint once per epoch. Only one checkpoint is kept on
# disk, and the checkpoint selected by metric_for_best_model is reloaded when
# training ends.
training_args = Seq2SeqTrainingArguments(
    output_dir="models/chimi-mt5-base",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    include_inputs_for_metrics=True,  # compute_metrics unpacks (predictions, labels, inputs)
    predict_with_generate=True,  # evaluation predictions are generated sequences that get decoded to text
    warmup_steps=20,
    save_total_limit=1,
    num_train_epochs=1,
    metric_for_best_model="exact_match",  # key produced by compute_metrics
    load_best_model_at_end=True
)



In [13]:
# Adafactor with an explicit learning rate: the relative-step, parameter-scaling
# and warmup-init options are all switched off so that lr=1e-3 is used.
# NOTE(review): the trainer is built with optimizers=(optimizer, None), so it
# presumably still attaches its own LR schedule on top of this value — confirm.
optimizer = Adafactor(
    base_model.parameters(),
    lr=1e-3,
    clip_threshold=1.0,
    weight_decay=0.0,
    relative_step=False,
    scale_parameter=False,
    warmup_init=False,
)

In [14]:
# Cap generated sequences at the same length the target labels are truncated
# to during tokenization (max_target_length), instead of repeating the literal.
base_model.config.max_length = max_target_length

# Metrics callback without CSV reporting; the report is only built for the test run.
compute_metrics_func = compute_metrics_with_csv_building()
trainer = Seq2SeqTrainer(
    model=base_model,
    args=training_args,
    train_dataset=tokenized_chimi_dataset["train"],
    eval_dataset=tokenized_chimi_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    # Custom optimizer; passing None lets the trainer build the LR scheduler.
    optimizers=(optimizer, None),
    compute_metrics=compute_metrics_func
)

In [15]:
# Run fine-tuning, then report runtime, throughput and GPU memory use.
result = trainer.train()
print_summary(result)

  0%|          | 0/500 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 100}


{'loss': 2.3612, 'grad_norm': 2.210242748260498, 'learning_rate': 0.0, 'epoch': 1.0}




  0%|          | 0/63 [00:00<?, ?it/s]

Non-default generation parameters: {'max_length': 100}


{'eval_loss': 0.30494630336761475, 'eval_cer': 0.1314, 'eval_exact_match': 0.454, 'eval_action_acc': 0.89, 'eval_alias_acc': 0.8105, 'eval_nombre_acc': 0.818, 'eval_monto_acc': 0.793, 'eval_moneda_acc': 0.808, 'eval_cuenta_acc': 0.7406, 'eval_entidad_acc': 0.8803, 'eval_doc_acc': 0.6135, 'eval_runtime': 31.316, 'eval_samples_per_second': 15.966, 'eval_steps_per_second': 2.012, 'epoch': 1.0}


There were missing keys in the checkpoint model loaded: ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight'].


{'train_runtime': 129.9925, 'train_samples_per_second': 30.771, 'train_steps_per_second': 3.846, 'train_loss': 2.36116552734375, 'epoch': 1.0}
Time: 129.99
Samples/second: 30.77
GPU memory occupied: 3536 MB.


In [33]:
# 4800 98.65 | Doc 98  5000 - 98.85
# NOTE(review): the line above looks like scores jotted down for earlier runs — confirm what they refer to.
# The checkpoint folder below must exist locally; from_pretrained raises OSError otherwise.
trained_model_path = "models/chimi-mt5-base/checkpoint-12000-974"
trained_model = MT5ForConditionalGeneration.from_pretrained(trained_model_path).to("cuda")
# Load the tokenizer with the same legacy=False setting used for the training
# tokenizer so inference text is tokenized the same way.
trained_tokenizer = MT5Tokenizer.from_pretrained(trained_model_path, legacy=False)

OSError: Incorrect path_or_model_id: 'base_models/chimi-mt5-base/checkpoint-12000-974'. Please provide either the path to a local folder or the repo_id of a model on the Hub.

In [None]:
import os

# Folder (under the current working directory) that collects the mismatch reports.
script_dir = os.getcwd()
errors_csv_folder_path = os.path.join(script_dir, "errors_model_csvs")
# Create the report folder itself. Taking os.path.dirname() of it would only
# "create" the working directory and leave the folder missing.
os.makedirs(errors_csv_folder_path, exist_ok=True)
csv_name = "error_csv_check_12000.csv"
csv_path = os.path.join(errors_csv_folder_path, csv_name)

# Metrics callback that also writes every non-matching prediction to csv_path.
compute_metrics_func = compute_metrics_with_csv_building(save_to_csv=True, csv_path=csv_path)

# Evaluation-only trainer around the fine-tuned checkpoint, run on the held-out test split.
test_trainer = Seq2SeqTrainer(
    model=trained_model,
    args=training_args,
    eval_dataset=tokenized_chimi_dataset["test"],
    data_collator=data_collator,
    tokenizer=trained_tokenizer,
    compute_metrics=compute_metrics_func
)

test_trainer.evaluate()

In [None]:
# Text-to-text generation pipeline around the fine-tuned model, for quick manual checks.
el_purete = pipeline(
    "text2text-generation",
    model=trained_model,
    tokenizer=trained_tokenizer,
    max_new_tokens=75,  # upper bound on newly generated tokens per phrase
    num_beams=1,  # single beam, i.e. no beam search
    device="cuda:0"
)

In [None]:
# Sample phrases: transfers, contact creation, small talk and pasted account details.
sample_phrases = [
    "Sera que le podes pagar 1200000 a Fissch",
    "Transferie na 350000 a Kike Fanego",
    "Epaaaaa?",
    "Sera que le podes agregar a Javier Mereles a mis contactos. Su cuenta es del Banco Continental, y su numero de cedula es 2849212",
    "Podes pasarle 55000 a Enzo Flecha",
    "Le quiero pasar 52500 a mi socio Joaquincho",
    "Nro. Cta: 0515277944\nSolar Banco\nC.I: 2355066\nBlanca Nancy Martínez",
    "Vanessa Cristina Gamarra Cantero\nCuenta Numero 346936492\nVision Banco\nNúmero de Documento: 7,366,864",
    "Che, sera que le podes agregar a Javier Flecha con numero de cedula 91446060. Tiene su cuenta en el BNA.",
]

# The training phrases had their line breaks replaced with "|" (modify_newline_char),
# so apply the same normalization here to give the model input in the format it was trained on.
normalized_phrases = [modify_newline_char({"frase": phrase})["frase"] for phrase in sample_phrases]

result = el_purete(normalized_phrases)
result

In [None]:
# Single-phrase check: a free-form request to add a contact.
result = el_purete("Sera que le podes agregar a Joaquín Flecha con numero de cuenta 799822551. Su cuenta la tiene en el Banco de la Nación de Argentina.")
result

In [None]:
inputs = ["Transferile na quinientos cincuenta mil a Kike Fanego"]

inputs = trained_tokenizer(inputs, max_length=200, truncation=True, return_tensors="pt").to("cuda")
output = trained_model(**inputs)
type(output)

In [None]:
output.logits

In [None]:
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)[0]

print(decoded_output)