# BART Finetuning

By: Federico Dominguez Molina

#### Libraries

In [18]:
# !pip install transformers datasets evaluate rouge_score
import os

import evaluate
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Local imports
from text_parser import TextParser

In [19]:
# Pretrained checkpoint passed to `from_pretrained` for the tokenizer and model
MODEL_NAME = "facebook/bart-large-cnn"

# Project data directory; the complaint text files sit in its `text_files` subfolder
DATA_PATH = "C:/Users/fdmol/Desktop/MSCAPP/CAPP30255/NLP-Police-Complaints/data"
TEXT_FILES_PATH = f"{DATA_PATH}/text_files"


In [26]:
# Load the tokenizer and the seq2seq model once, both from the same checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# ROUGE metric object; `compute_metrics` calls `rouge.compute(...)` on it
rouge = evaluate.load("rouge")


### Create and format training data

In [27]:
# Build the project's text parser over the complaint text files, configured for
# the summarization task; the cells below use it to read and preprocess texts.
text_parser = TextParser(TEXT_FILES_PATH, nlp_task="summarization")

Initializing parsers for summarization


In [28]:
# Read the hand-written summaries. `read_excel` opens and closes the workbook
# itself, so no `ExcelFile` handle is left open afterwards.
training_data = pd.read_excel(
    DATA_PATH + "/manual_summaries.xlsx", sheet_name="manual_summaries"
)

complete_texts = []
processed_manual_summaries = []
complaint_files = []

# Pair every manual summary with the text of the complaint file it refers to
for manual_summary, complaint_file in zip(
    training_data["manual_summary"], training_data["complaint"]
):
    # Get complete, preprocessed text
    complete_text = text_parser.file_to_string(f"{TEXT_FILES_PATH}/{complaint_file}")

    # Preprocess manual summary
    processed_manual_summary = text_parser.process_given_text(manual_summary)

    complete_texts.append(complete_text)
    processed_manual_summaries.append(processed_manual_summary)
    complaint_files.append(complaint_file)


all_text_files = os.listdir(TEXT_FILES_PATH)

# Position of each file name within the directory listing (one dict lookup per
# file instead of a linear `list.index` scan)
file_index_by_name = {name: position for position, name in enumerate(all_text_files)}

# Fail early with a readable message instead of a length-mismatch error later
missing_files = [name for name in complaint_files if name not in file_index_by_name]
if missing_files:
    raise FileNotFoundError(
        f"Complaint files not found in {TEXT_FILES_PATH}: {missing_files}"
    )

# Look the indices up in `complaint_files` order so that row i of every column
# describes the same complaint, whatever order the directory listing is in
training_set_indices = [file_index_by_name[name] for name in complaint_files]

training_set = pd.DataFrame(
    {
        "complaint": complaint_files,
        "complete_text": complete_texts,
        "manual_summary": processed_manual_summaries,
        "file_index": training_set_indices,
    }
)


In [30]:
# Save the assembled training set in the project data directory
training_set.to_excel(f"{DATA_PATH}/training_set.xlsx")

Metric function used when fine-tuning BART with the `transformers` library: it decodes the predicted and reference token ids and scores them with ROUGE.

In [17]:
def compute_metrics(eval_pred):
    """Compute ROUGE scores and the mean generated length for an evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may also be a tuple, in which case its first
            element is taken as the token ids.

    Returns:
        dict: Every entry returned by ``rouge.compute`` plus ``"gen_len"``
        (mean number of non-padding tokens per prediction), each rounded to
        4 decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored positions and is not a decodable token id, so it is
    # swapped for the pad token in both arrays before decoding. Doing this for
    # the predictions as well also keeps those positions out of `gen_len`.
    pad_id = tokenizer.pad_token_id
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )

    # Length of each prediction, counted as its number of non-padding tokens
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

#### Tokenize data for model 

In [33]:
# Options shared by both tokenizer calls
shared_tokenizer_options = {"padding": True, "truncation": True}

# Tokenize the complete complaint texts (model inputs), max_length=1024
source_texts = training_set["complete_text"].tolist()
tokenized_inputs = tokenizer(source_texts, max_length=1024, **shared_tokenizer_options)

# Tokenize the preprocessed manual summaries (targets), max_length=128
target_texts = training_set["manual_summary"].tolist()
tokenized_labels = tokenizer(target_texts, max_length=128, **shared_tokenizer_options)


In [18]:
# Hyperparameters for the seq2seq fine-tuning run.
# Building these arguments needs the `accelerate` package installed
# (`pip install transformers[torch]` or `pip install accelerate -U`).
training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_billsum_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    # Mixed precision is only requested when a CUDA device is present;
    # asking for it unconditionally fails on CPU-only machines.
    fp16=torch.cuda.is_available(),
    push_to_hub=True,
)

# trainer = Seq2SeqTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_billsum["train"],
#     eval_dataset=tokenized_billsum["test"],
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics,
# )

# trainer.train()


ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`