# BART Finetuning

By: Federico Dominguez Molina

#### Libraries

In [17]:
# !pip install transformers datasets evaluate rouge_score
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

import os
import numpy as np
import pandas as pd
import evaluate

# Local imports
from text_parser import TextParser
from summarizer_model import Summarizer


In [2]:
MODEL_NAME = "facebook/bart-large-cnn"
TEXT_FILES_PATH = (
    "C:/Users/fdmol/Desktop/MSCAPP/CAPP30255/NLP-Police-Complaints/data/text_files"
)
DATA_PATH = "C:/Users/fdmol/Desktop/MSCAPP/CAPP30255/NLP-Police-Complaints/data"

In [3]:
# Add local parameters
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

rouge = evaluate.load("rouge")

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

### Create and format training data

In [4]:
text_parser = TextParser(TEXT_FILES_PATH, nlp_task="summarization")

Initializing parsers for summarization


In [5]:
training_data = pd.ExcelFile(DATA_PATH + "/manual_summaries.xlsx")
training_data = training_data.parse("manual_summaries")

complete_texts = []
processed_manual_summaries = []
complaint_files = []

# Get local
for index, row in training_data.iterrows():
    manual_summary = row["manual_summary"]
    complaint_file = row["complaint"]

    # Get complete, preprocessed text
    complete_text = text_parser.file_to_string(f"{TEXT_FILES_PATH}/{complaint_file}")

    # Preprocess manual summary
    processed_manual_summary = text_parser.process_given_text(manual_summary)

    complete_texts.append(complete_text)
    processed_manual_summaries.append(processed_manual_summary)
    complaint_files.append(complaint_file)


all_text_files = os.listdir(TEXT_FILES_PATH)

# Get indices of files in the training set
training_set_indices = []

for file in all_text_files:
    if file in complaint_files:
        training_set_indices.append(all_text_files.index(file))

training_set = pd.DataFrame(
    {
        "complaint": complaint_files,
        "complete_text": complete_texts,
        "manual_summary": processed_manual_summaries,
        "file_index": training_set_indices,
    }
)

In [6]:
training_set.to_excel(DATA_PATH + "/training_set.xlsx")

Functions to finetune BART using the `transformers` library.

In [7]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )

    prediction_lens = [
        np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions
    ]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}


#### Tokenize data for model 

In [8]:
tokenized_inputs = tokenizer(
    training_set["complete_text"].tolist(),
    padding=True,
    truncation=True,
    max_length=1024,
)
tokenized_labels = tokenizer(
    training_set["manual_summary"].tolist(),
    padding=True,
    truncation=True,
    max_length=128,
)

#### Custom Dataset

In [9]:
import torch
from torch.utils.data import Dataset


class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels["input_ids"][idx])
        return item

    def __len__(self):
        return len(self.labels["input_ids"])


dataset = CustomDataset(tokenized_inputs, tokenized_labels)


In [10]:
# training_args = Seq2SeqTrainingArguments(
#     output_dir="my_awesome_billsum_model",
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     weight_decay=0.01,
#     save_total_limit=3,
#     num_train_epochs=4,
#     predict_with_generate=True,
#     fp16=True,
#     push_to_hub=True,
# )

# odel = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

training_args = TrainingArguments(
    output_dir="./results",  # Output directory for model checkpoints
    num_train_epochs=3,  # Number of training epochs
    per_device_train_batch_size=4,  # Batch size per device during training
    per_device_eval_batch_size=4,  # Batch size for evaluation
    warmup_steps=500,  # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # Strength of weight decay
    logging_dir="./logs",  # Directory for storing logs
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    # eval_dataset=test_dataset,  # If you have a test dataset
)

trainer.train()

100%|██████████| 21/21 [20:51<00:00, 59.62s/it]

{'train_runtime': 1251.9204, 'train_samples_per_second': 0.06, 'train_steps_per_second': 0.017, 'train_loss': 3.3641651698521207, 'epoch': 3.0}





TrainOutput(global_step=21, training_loss=3.3641651698521207, metrics={'train_runtime': 1251.9204, 'train_samples_per_second': 0.06, 'train_steps_per_second': 0.017, 'train_loss': 3.3641651698521207, 'epoch': 3.0})

In [12]:
# Save model
model.save_pretrained(DATA_PATH + "/finetuned_bart_model")
tokenizer.save_pretrained(DATA_PATH + "/finetuned_bart_model")

('C:/Users/fdmol/Desktop/MSCAPP/CAPP30255/NLP-Police-Complaints/data/finetuned_bart_model\\tokenizer_config.json',
 'C:/Users/fdmol/Desktop/MSCAPP/CAPP30255/NLP-Police-Complaints/data/finetuned_bart_model\\special_tokens_map.json',
 'C:/Users/fdmol/Desktop/MSCAPP/CAPP30255/NLP-Police-Complaints/data/finetuned_bart_model\\vocab.json',
 'C:/Users/fdmol/Desktop/MSCAPP/CAPP30255/NLP-Police-Complaints/data/finetuned_bart_model\\merges.txt',
 'C:/Users/fdmol/Desktop/MSCAPP/CAPP30255/NLP-Police-Complaints/data/finetuned_bart_model\\added_tokens.json',
 'C:/Users/fdmol/Desktop/MSCAPP/CAPP30255/NLP-Police-Complaints/data/finetuned_bart_model\\tokenizer.json')

### Test the finetuned model

In [13]:
def generate_summary(report, model, tokenizer):
    inputs = tokenizer(
        report, return_tensors="pt", padding=True, truncation=True, max_length=1024
    )
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=1200,
        min_length=40,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


In [15]:
# Load finetuned model

finetuned_model = AutoModelForSeq2SeqLM.from_pretrained(
    DATA_PATH + "/finetuned_bart_model"
)
finetuned_tokenizer = AutoTokenizer.from_pretrained(DATA_PATH + "/finetuned_bart_model")


In [19]:
# Randomly select 5 complaints to generate summaries for

import os
import random

all_complaints = os.listdir(TEXT_FILES_PATH)
random_complaints = random.sample(all_complaints, 5)

for complaint in random_complaints:
    complaint_text = text_parser.file_to_string(f"{TEXT_FILES_PATH}/{complaint}")

    # Generate summary - Original model
    model = Summarizer(MODEL_NAME, complaint_text)
    summary = model.generate_summary(
        max_length=1200,
        min_length=40,
        length_penalty=2,
        no_repeat_ngram_size=2,
        num_beams=4,
        early_stopping=True,
    )

    print(f"Complaint: {complaint}")
    print(f"Summary: {summary}")
    print("\n")
    finetuned_summary = generate_summary(
        complaint_text, finetuned_model, finetuned_tokenizer
    )

    print(f"Finetuned Summary: {finetuned_summary}")
    print("\n")

Complaint: 1077591.txt
Summary: Police responded to a call of a man climbing over a fence, officers approached and instructed him to come outside of the fence. refused to follow the instructions and reached for a firearm. was tased, arrested and transported to the 010th district station. allegations: on october 14, 2015, intake aide denise stewart received a letter from complainant and registered a complaint.


Finetuned Summary: Citizens office of police accountability received a complaint on october 14, 2015, on behalf of a man who claims he was beaten and tased by chicago police officers on august 16, 2015. Officer roberto gomez, star 11353 and officer jose carrera, star 12997 were accused of punching, kicking, tasing, pointing a weapon at, and falsely charging him with possession of a weapon. Officer carrera was also accused of tasing the man, in violation of rule 8, of kicking him in the face, and of pointing a gun at him.


Complaint: 1088596.txt
Summary: The complainant, was dri