In [None]:
from transformers import AutoTokenizer
from custom_dataset import LedgerDataset
from transformers import DataCollatorForSeq2Seq
import torch
import numpy as np
import evaluate
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from transformers import pipeline
import numpy as np
from sklearn.metrics import precision_recall_fscore_support


In [None]:
# sacreBLEU scorer; compute_metrics calls metric.compute(...) on decoded text.
metric = evaluate.load("sacrebleu")
# Tokenizer for the "t5-base" checkpoint, shared by the dataset, the collator,
# the trainer and the metric decoding step.
tokenizer = AutoTokenizer.from_pretrained("t5-base")

In [None]:
def postprocess_text(preds, labels):
    """Tidy decoded strings into the layout passed to the BLEU metric.

    Args:
        preds: iterable of decoded prediction strings.
        labels: iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple: predictions with surrounding whitespace
        removed, and each label stripped and wrapped in its own one-element
        list (one reference list per prediction).
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Score generated sequences against their references during evaluation.

    Args:
        eval_preds: ``(predictions, label_ids)`` pair. ``predictions`` may be a
            tuple, in which case its first element is used as the token ids.

    Returns:
        dict with ``bleu`` (sacreBLEU score), weighted ``prec`` / ``recall`` /
        ``f1`` computed over the exact decoded strings, and ``gen_len`` (mean
        count of non-pad tokens per prediction), all rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the ignore index used to pad sequences; it is not a valid token
    # id, so replace it with the pad token before decoding. Predictions are
    # masked as well as labels because gathered generations can be padded with
    # the same value, which would otherwise break decoding and inflate gen_len.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # After this call `references` holds one single-item list per prediction,
    # which is the shape handed to the BLEU metric.
    decoded_preds, references = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=references)
    result = {"bleu": result["score"]}

    # The classification scorer needs a flat 1-D label array, so unwrap the
    # per-prediction reference lists before handing them over.
    flat_labels = [refs[0] for refs in references]
    prec, recall, f1, _ = precision_recall_fscore_support(
        np.array(flat_labels), np.array(decoded_preds), average='weighted'
    )
    result["prec"] = prec
    result["recall"] = recall
    result["f1"] = f1

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}




In [None]:

# Build the dataset from "data.json" using the shared tokenizer.
# NOTE(review): LedgerDataset is imported from the project-local
# custom_dataset module; its item schema is not visible in this notebook.
data = LedgerDataset("data.json",tokenizer)
# Sanity check: number of examples, then the first item.
print(len(data))
print(data[0])


In [None]:
# Seeded generator so both random splits are reproducible across runs.
gen = torch.Generator()
gen.manual_seed(0)

# First split: hold out 20% of the full dataset as the test set.
data_size = int(0.8 * len(data))
test_size = len(data) - data_size
other_data, test_dataset = torch.utils.data.random_split(data, [data_size, test_size], generator=gen)

# Second split: divide the remaining 80% into train (80%) and validation (20%).
# This must split `other_data`: splitting `data` again would draw a fresh
# permutation of the full dataset, so examples held out in `test_dataset`
# could end up in the training or validation set.
train_size = int(0.8 * len(other_data))
validation_size = len(other_data) - train_size
train_dataset, validate_dataset = torch.utils.data.random_split(
    other_data, [train_size, validation_size], generator=gen
)


In [None]:
# Report the size of each split, in train / validation / test order.
for split in (train_dataset, validate_dataset, test_dataset):
    print(len(split))

In [None]:
# Load the pretrained "t5-base" sequence-to-sequence model that will be fine-tuned.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
# Batch collator built from the shared tokenizer and the model, with padding enabled.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, padding=True)


In [None]:
# Training configuration; run artefacts are written under "output_base_3".
training_args = Seq2SeqTrainingArguments(
    # Output / checkpoint retention.
    output_dir="output_base_3",
    save_total_limit=5,
    # Optimisation.
    max_steps=10000,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    # Evaluation: step-based schedule, scored on generated sequences.
    evaluation_strategy="steps",
    per_device_eval_batch_size=16,
    predict_with_generate=True,
)

# Wire the model, data splits, collator and metric function together.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=validate_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Directory of the fine-tuned checkpoint to load for inference.
# NOTE(review): assumes a checkpoint was written at step 10000 (the configured
# max_steps) under the training output_dir — confirm it exists before running.
model_name = "output_base_3/checkpoint-10000/"


In [None]:

# Build the inference pipeline from the saved checkpoint; max_length=30 is
# passed at construction time as the pipeline-level length setting.
translator = pipeline("translation_source_to_target", model=model_name,max_length=30)

In [None]:
# Single-example smoke test of the pipeline, using the task prefix.
text = "translate source to target: ADP Processing Fees"
# NOTE(review): the name looks like a typo for "expected_output"; the value is
# assigned but never compared with the prediction in this cell.
excepted_output = "Fees, Dues & Subscriptions"
translator(text,max_length=30)

In [None]:
# Run the pipeline over every test example, keeping the prompt, the reference
# and the prediction together in one record per example.
prefix = "translate source to target: "
res = []
for item in test_dataset:
    pair = item["translation"]
    text = prefix + pair["source"]
    target_text = pair["target"]
    pred = translator(text)[0]['translation_text']
    res.append(dict(text=text, target=target_text, pred=pred))

In [None]:
# Split the result records into two aligned lists: references and predictions.
targets, predictions = [], []
for row in res:
    targets.append(row["target"])
    predictions.append(row["pred"])


In [None]:
# Exact-match accuracy over the test set. The counter is named `correct`
# rather than `sum` so the builtin sum() is not shadowed for later cells.
correct = 0
failed = 0
for target, prediction in zip(targets, predictions):
    if target == prediction:
        correct += 1
    else:
        failed += 1
        # Show every mismatch for manual inspection.
        print(f"target : {target}\n prediction : {prediction}")

# With no test examples the rates are undefined; report NaN instead of
# raising ZeroDivisionError.
total = len(targets)
print("accuracy : ")
print(correct / total if total else float("nan"))
print(failed / total if total else float("nan"))

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_fscore_support


In [None]:
# Weighted precision / recall / F1 over the test set, treating each distinct
# target string as a class label.
precision_recall_fscore_support(
    y_true=np.asarray(targets),
    y_pred=np.asarray(predictions),
    average="weighted",
)