In [2]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer
import torch
from torch.utils.data import Dataset
from sklearn.model_selection import train_test_split
import evaluate

In [43]:
def preprocess_data(data, tok=None):
    """Tokenize the source texts and target triplets of a REBEL-format frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide a ``context`` column (model inputs) and a ``triplets``
        column (target sequences).
    tok : callable, optional
        Tokenizer to apply. Defaults to the notebook-level ``tokenizer`` so
        existing one-argument calls keep working.

    Returns
    -------
    tuple
        ``(text_encodings, label_encodings)`` as produced by the tokenizer.
        In ``label_encodings['input_ids']`` every padding position is replaced
        by ``-100`` so that the training loss ignores it.
    """
    if tok is None:
        tok = tokenizer  # notebook-level tokenizer loaded next to the model

    # The tokenizer expects plain Python lists, not pandas Series.
    context = data['context'].to_list()
    text_encodings = tok(context, truncation=True, padding=True)

    triplets = data['triplets'].to_list()
    label_encodings = tok(triplets, truncation=True, padding=True)

    # Padding the targets with the real pad id would make the model learn to
    # predict padding; -100 is the index the loss function skips.
    pad_id = tok.pad_token_id
    label_encodings['input_ids'] = [
        [token_id if token_id != pad_id else -100 for token_id in sequence]
        for sequence in label_encodings['input_ids']
    ]

    return text_encodings, label_encodings

class RebelDataset(Dataset):
    """Map-style dataset pairing tokenized inputs with tokenized targets.

    ``encodings`` is a mapping from field name to a per-example sequence, and
    ``labels`` is a mapping whose ``'input_ids'`` entry holds the target ids.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of examples is taken from the target sequences.
        return len(self.labels['input_ids'])

    def __getitem__(self, idx):
        # Convert every encoded field of example `idx` to a tensor.
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])

        # The target ids are exposed under the 'labels' key.
        sample['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return sample

In [32]:
# Hugging Face Hub id of the pretrained checkpoint that is fine-tuned below.
model_checkpoint = "Babelscape/rebel-large"
# Load the seq2seq model for that checkpoint (the logged config shows a
# BartForConditionalGeneration architecture).
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
# Load the matching tokenizer; preprocess_data reads this module-level name.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

loading configuration file config.json from cache at C:\Users\mike-/.cache\huggingface\hub\models--Babelscape--rebel-large\snapshots\d24237e8ab9c1ad2cbdf53fd54b0d7cda1da8018\config.json
Model config BartConfig {
  "_name_or_path": "Babelscape/rebel-large",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartForConditionalGeneration"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 0,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },

In [44]:
# Knobs for this quick experiment: split seed and how many rows of each split
# are actually tokenized.
seed = 1
TRAIN_SAMPLE_SIZE = 100
VAL_SAMPLE_SIZE = 10

rebel_df = pd.read_csv('Data/rebel/rebel_format.csv')
train_df, val_df = train_test_split(rebel_df, test_size=0.2, random_state=seed)
del rebel_df  # the full frame is not needed once it has been split

# Keep the DataFrames (train_df / val_df) and the torch datasets
# (train_data / val_data) under different names so each name has one meaning.
train_encodings, train_labels = preprocess_data(train_df.head(TRAIN_SAMPLE_SIZE))
train_data = RebelDataset(train_encodings, train_labels)
val_encodings, val_labels = preprocess_data(val_df.head(VAL_SAMPLE_SIZE))
val_data = RebelDataset(val_encodings, val_labels)

In [9]:
# Output directory is named after the checkpoint without its Hub namespace.
model_name = model_checkpoint.rsplit("/", 1)[-1]

args = Seq2SeqTrainingArguments(
    output_dir=f"{model_name}-finetuned-faro-relations",
    evaluation_strategy="epoch",
    num_train_epochs=7,
    learning_rate=2.5e-5,
    weight_decay=0.1,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    save_total_limit=3,
    predict_with_generate=True,  # Maybe switch to false?
    push_to_hub=False,
)

# Collator that batches the examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [49]:
_ROUGE_METRIC = None  # loaded lazily on first use and reused afterwards


def _get_rouge():
    """Return the ROUGE metric, loading it only once per kernel session."""
    global _ROUGE_METRIC
    if _ROUGE_METRIC is None:
        _ROUGE_METRIC = evaluate.load("rouge")
    return _ROUGE_METRIC


def _decode_ids(sequences, tok):
    """Decode batches of token ids to stripped strings, treating -100 as padding."""
    pad_id = tok.pad_token_id
    cleaned = [
        [int(token_id) if token_id != -100 else pad_id for token_id in sequence]
        for sequence in sequences
    ]
    decoded = tok.batch_decode(cleaned, skip_special_tokens=True)
    return [text.strip() for text in decoded]


def metrics_func(eval_arg, tok=None, metric=None):
    """Compute ROUGE between generated and reference sequences.

    Parameters
    ----------
    eval_arg : tuple-like
        ``(predictions, labels)`` as token-id batches. ``predictions`` may be
        a tuple whose first element holds the generated ids.
    tok : optional
        Tokenizer used for decoding. Defaults to the notebook-level
        ``tokenizer``.
    metric : optional
        Object exposing ``compute(predictions=..., references=...)``.
        Defaults to the cached ``evaluate`` ROUGE metric.

    Returns
    -------
    dict
        Whatever ``metric.compute`` returns for the decoded texts.
    """
    if tok is None:
        tok = tokenizer  # notebook-level tokenizer loaded next to the model
    if metric is None:
        metric = _get_rouge()

    preds, labels = eval_arg
    if isinstance(preds, tuple):
        preds = preds[0]

    # ROUGE compares text, so ids must be decoded first. -100 marks ignored
    # positions and is not a valid token id, so it is mapped to the pad id.
    # NOTE(review): skip_special_tokens=True drops any token registered as
    # special; confirm whether REBEL's <triplet>/<subj>/<obj> markers should
    # be kept for this metric.
    decoded_preds = _decode_ids(preds, tok)
    decoded_labels = _decode_ids(labels, tok)

    return metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
    )

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting absl-py
  Using cached absl_py-1.4.0-py3-none-any.whl (126 kB)
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py): started
  Building wheel for rouge-score (setup.py): finished with status 'done'
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24955 sha256=45fa49797212b75b8166f320d047dfbc759a3393dd2cddd8b7a33577286791d4
  Stored in directory: c:\users\mike-\appdata\local\pip\cache\wheels\24\55\6f\ebfc4cb176d1c9665da4e306e1705496206d08215c1acd9dde
Successfully built rouge-score
Installing collected packages: absl-py, rouge-score
Successfully installed absl-py-1.4.0 rouge-score-0.1.2


You should consider upgrading via the 'C:\Users\mike-\Documents\VU\Eurecom\venv\Scripts\python.exe -m pip install --upgrade pip' command.


In [None]:
# Assemble the trainer from the model, arguments, datasets and collator
# prepared in the cells above.
# NOTE(review): metrics_func is defined in this notebook but is not passed as
# compute_metrics here, so evaluation presumably reports only the loss —
# confirm this is intended.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
)

# Run fine-tuning.
trainer.train()