In [1]:
import pandas as pd
! pip install transformers evaluate rouge_score
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import evaluate
from datasets import load_metric
import numpy as np
import nltk
nltk.download('punkt')
from sklearn import preprocessing
import re

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
def preprocess_data(data):
    """Tokenize the ``context`` and ``triplets`` columns for seq2seq training.

    Args:
        data: DataFrame-like object exposing ``context`` (model input text) and
            ``triplets`` (target text) columns.

    Returns:
        A ``(text_encodings, label_encodings)`` tuple of tokenizer outputs. In
        ``label_encodings['input_ids']`` every padding position is replaced by
        -100 so that the loss skips padding instead of training on it.
    """
    context = data['context'].to_list()  # the tokenizer expects a plain list of strings
    text_encodings = tokenizer(context, truncation=True, padding=True)

    triplets = data['triplets'].to_list()
    label_encodings = tokenizer(triplets, truncation=True, padding=True)

    # Padding the targets with the real pad id would make the model learn to
    # emit padding; -100 is the index ignored by the cross-entropy loss.
    pad_id = tokenizer.pad_token_id
    label_encodings['input_ids'] = [
        [token if token != pad_id else -100 for token in sequence]
        for sequence in label_encodings['input_ids']
    ]

    return text_encodings, label_encodings

class RebelDataset(Dataset):
    """Torch dataset pairing tokenized inputs with tokenized target sequences.

    ``encodings`` is a mapping from field name to a per-example sequence of
    values; ``labels`` is a mapping whose ``'input_ids'`` entry holds the
    target token ids per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is taken from the target side.
        return len(self.labels['input_ids'])

    def __getitem__(self, idx):
        # Copy every input field for this example into a tensor, then attach
        # the target ids under the 'labels' key.
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        item['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return item


In [3]:
# Hub identifier of the pretrained checkpoint that is fine-tuned below.
model_checkpoint = "Babelscape/rebel-large"

# Tokenizer and seq2seq model are both loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

In [None]:
seed = 1

# Read the CSV and split it 90/10 in one step; the full frame is never bound
# to a name, so there is nothing to delete afterwards.
train_data, val_data = train_test_split(
    pd.read_csv('rebel_format_v2.csv'),
    test_size=0.1,
    random_state=seed,
)

# Tokenize both splits first, then replace the DataFrames by torch datasets.
train_encodings, train_labels = preprocess_data(train_data)
val_encodings, val_labels = preprocess_data(val_data)

train_data = RebelDataset(train_encodings, train_labels)
val_data = RebelDataset(val_encodings, val_labels)

In [None]:
# Use the checkpoint name without its hub namespace as the run-name prefix.
model_name = model_checkpoint.rsplit("/", 1)[-1]

args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-faro-relations",
    # Evaluate, checkpoint and log once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    save_total_limit=1,
    # Optimisation settings.
    learning_rate=2.5e-5,
    weight_decay=0.1,
    num_train_epochs=7,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=True,
    # Model selection relies on the 'F1 relations' entry of the metrics dict.
    load_best_model_at_end=True,
    metric_for_best_model='F1 relations',
    # Evaluation metrics are computed on generated sequences.
    predict_with_generate=True,
    push_to_hub=False,
)

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
# Metric objects shared by the compute_* functions below:
# `f1_metric` is used by compute_f1, `metric` (ROUGE) by compute_rouge.
f1_metric = evaluate.load("f1")
metric = load_metric("rouge")

def compute_rouge(eval_pred):
    """Compute ROUGE F-measures (in percent) and the mean generated length.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element is the
            token-id array.

    Returns:
        Dict mapping each ROUGE key to its mid F-measure times 100, plus
        ``gen_len`` (mean number of non-pad prediction tokens); every value is
        rounded to 4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a filler value that cannot be decoded; map it to the pad id in
    # BOTH arrays (generated predictions can carry it too, not only labels).
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip()))
                     for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip()))
                      for label in decoded_labels]

    # Compute ROUGE scores
    result = metric.compute(predictions=decoded_preds, references=decoded_labels,
                            use_stemmer=True)

    # Extract ROUGE f1 scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}

    # Add mean generated length to metrics; because -100 was mapped to the pad
    # id above, filler positions are not counted as generated tokens.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id)
                       for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

def check_format(data):
    """Validate that an extraction produced exactly three parts.

    Returns ``data`` itself when it has length 3; otherwise returns a
    placeholder triple marking the extraction as failed.
    """
    if len(data) == 3:
        return data
    return ['wrong'] * 3

def _extract_triplets(decoded_texts):
    """Parse decoded sequences into a Subject/Object/Relation DataFrame.

    Every run of characters that follows a ``>`` and contains no ``<`` is
    captured. ``check_format`` swaps any sequence that does not yield exactly
    three captures for a placeholder row.
    """
    pattern = r'>([^<]+)'
    rows = [check_format(re.findall(pattern, text)) for text in decoded_texts]
    return pd.DataFrame(rows, columns=['Subject', 'Object', 'Relation'])


def _encode_column(column, fit_frame, predictions, labels, unseen_id=999999999):
    """Integer-encode ``column`` of both frames with classes learned on ``fit_frame``.

    Returns ``(classes, encoded_predictions, encoded_labels)``. Values that
    were not seen while fitting are mapped to ``unseen_id`` so they can never
    match a known class.
    """
    encoder = preprocessing.LabelEncoder()
    encoder.fit(fit_frame[column])
    lookup = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
    encoded_predictions = predictions[column].apply(lambda value: lookup.get(value, unseen_id)).to_list()
    encoded_labels = labels[column].apply(lambda value: lookup.get(value, unseen_id)).to_list()
    return encoder.classes_, encoded_predictions, encoded_labels


def compute_f1(eval_pred):
    """Compute macro F1 for the subject, object and relation of each triplet.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple whose first element is the
            token-id array.

    Returns:
        Dict with the keys ``'F1 Subject'``, ``'F1 Object'`` and
        ``'F1 relations'`` (macro averages) plus one ``'F1 <relation>'`` entry
        per relation class found in the labels. A metric whose computation
        fails is reported on stdout and left out of the dict; the remaining
        metrics are still returned.
    """
    metrics = {}

    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 is a filler value that cannot be decoded; map it to the pad id.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    # Special tokens are kept because the extraction regex relies on the
    # '<...>' markers to find the three parts of a triplet.
    predictions = _extract_triplets(tokenizer.batch_decode(predictions, skip_special_tokens=False))
    labels = _extract_triplets(tokenizer.batch_decode(labels, skip_special_tokens=False))

    # F1 can only be computed on integer labels, so we need to encode them.
    # Subject and object are free text: the encoding is learned on the
    # predictions, and a label the model never produced becomes "unseen".
    try:
        _, encoded_preds, encoded_refs = _encode_column('Subject', predictions, predictions, labels)
        metrics['F1 Subject'] = f1_metric.compute(
            predictions=encoded_preds, references=encoded_refs, average='macro')['f1']
    except Exception as error:
        print(f"Something went wrong when computing the f1 score for subject: {error!r}")

    try:
        _, encoded_preds, encoded_refs = _encode_column('Object', predictions, predictions, labels)
        metrics['F1 Object'] = f1_metric.compute(
            predictions=encoded_preds, references=encoded_refs, average='macro')['f1']
    except Exception as error:
        print(f"Something went wrong when computing F1 scores for object: {error!r}")

    # Relations: the encoding is learned on the labels instead.
    relation_classes = None
    try:
        relation_classes, encoded_preds, encoded_refs = _encode_column('Relation', labels, predictions, labels)
        metrics['F1 relations'] = f1_metric.compute(
            predictions=encoded_preds, references=encoded_refs, average='macro')['f1']
    except Exception as error:
        print(f"Something went wrong with computing f1 for relations: {error!r}")

    if relation_classes is not None:
        try:
            per_class = f1_metric.compute(
                predictions=encoded_preds, references=encoded_refs, average=None)['f1']
            # A single-class evaluation may yield a bare scalar instead of an array.
            scores = per_class if hasattr(per_class, '__len__') else [per_class]
            # Scores are ordered by class id. Every label class occurs in the
            # references and the unseen id sorts last, so zip pairs each class
            # with its own score and drops the trailing "unseen" entry.
            for class_name, score in zip(relation_classes, scores):
                metrics[f'F1 {class_name.strip()}'] = score
        except Exception as error:
            print(f"something went wrong when assigning relation scores: {error!r}")

    return metrics


In [None]:
# Wire model, arguments, datasets, collator and the F1 metric function into
# the trainer; everything is passed by keyword for readability.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    compute_metrics=compute_f1,
)

trainer.train()

In [None]:
# Write the trainer's current model to the local "finetuned_rebel" directory.
trainer.save_model("finetuned_rebel")

In [None]:
import shutil
# Zip the "finetuned_rebel" directory into "rebel_finetuned.zip" in the
# current working directory (make_archive appends the ".zip" suffix itself).
shutil.make_archive("rebel_finetuned", 'zip', "finetuned_rebel")

In [None]:
# Colab-only: mount Google Drive at /content/drive so the saved archive can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import shutil
# Extract the zipped model from Drive into /content/rebel_finetuned.
# NOTE(review): no visible cell copies rebel_finetuned.zip into MyDrive;
# presumably it is uploaded manually — confirm.
shutil.unpack_archive('/content/drive/MyDrive/rebel_finetuned.zip', '/content/rebel_finetuned', 'zip')

In [None]:
# The tokenizer comes from the base checkpoint, the weights from the unpacked
# "rebel_finetuned" directory (a relative path, resolved against the working directory).
tokenizer = AutoTokenizer.from_pretrained('Babelscape/rebel-large')
model = AutoModelForSeq2SeqLM.from_pretrained("rebel_finetuned")

In [None]:
# Hand-written sentences used as a qualitative smoke test of the fine-tuned model.
text = ["because the machine is old, it is unreliable", "many people have died in the storm", "now the preparation is complete, we can start again",
        "the restrictions made sure less people got infected", "I am running everyday because i want to run a marathon", "the elevator is fixed, so i can go up again",
        "There was a traffic jam, so i was late", "I broke my leg, so I can't run the marathon", "Since I failed the exam, I can't graduate",
        "I did some shopping, because i want to cook later", "I shouldn't have said that, I did not mean that", "I wanted to say that", "I am planning on doing that later",
        "I intend on doing that"]
encoding = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

# Generation with do_sample=True is stochastic; fixing the RNG state makes
# re-running this cell reproduce the same output.
torch.manual_seed(1)
outputs = model.generate(**encoding, do_sample=True)

# Decode WITH special tokens, as compute_f1 does, so that '<...>' markers
# separating the parts of a triplet are not stripped; only the sequence
# boundary and padding tokens are removed afterwards.
raw_output = tokenizer.batch_decode(outputs, skip_special_tokens=False)
decoded_output = [
    sample.replace("<s>", "").replace("<pad>", "").replace("</s>", "").strip()
    for sample in raw_output
]
print(decoded_output)