# BART Finetuning

By: Federico Dominguez Molina

#### Libraries

In [1]:
# !pip install transformers datasets evaluate rouge_score
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from sklearn.model_selection import train_test_split

import os
import numpy as np
import pandas as pd
import evaluate
import random
import torch
from torch.utils.data import Dataset

# Local imports
from text_parser import TextParser
from summarizer_model import Summarizer

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fdmol\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fdmol\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
MODEL_NAME = "facebook/bart-large-cnn"
TEXT_FILES_PATH = (
    "C:/Users/fdmol/Desktop/MSCAPP/CAPP30255/NLP-Police-Complaints/data/text_files"
)
DATA_PATH = "C:/Users/fdmol/Desktop/MSCAPP/CAPP30255/NLP-Police-Complaints/data"

RANDOM_STATE = 30255

In [3]:
# Add local parameters
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

rouge = evaluate.load("rouge")

model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

### Create and format training data

I load the Excel file with the gold standard summaries, and I then create a training set that also contains the original text. 

In [4]:
text_parser = TextParser(TEXT_FILES_PATH, nlp_task="summarization")

Initializing parsers for summarization


In [5]:
training_data = pd.ExcelFile(DATA_PATH + "/manual_summaries.xlsx")
training_data = training_data.parse("manual_summaries")

complete_texts = []
processed_manual_summaries = []
complaint_files = []

# Get local
for index, row in training_data.iterrows():
    manual_summary = row["manual_summary"]
    complaint_file = row["complaint"]

    # Get complete, preprocessed text
    complete_text = text_parser.file_to_string(f"{TEXT_FILES_PATH}/{complaint_file}")

    # Preprocess manual summary
    processed_manual_summary = text_parser.process_given_text(manual_summary)

    complete_texts.append(complete_text)
    processed_manual_summaries.append(processed_manual_summary)
    complaint_files.append(complaint_file)


all_text_files = os.listdir(TEXT_FILES_PATH)

# Get indices of files in the training set
training_set_indices = []

for file in all_text_files:
    if file in complaint_files:
        training_set_indices.append(all_text_files.index(file))

training_set = pd.DataFrame(
    {
        "complaint": complaint_files,
        "complete_text": complete_texts,
        "manual_summary": processed_manual_summaries,
        "file_index": training_set_indices,
    }
)

In [6]:
training_set.to_excel(DATA_PATH + "/training_set.xlsx")


train_df, test_df = train_test_split(
    training_set, test_size=0.2, random_state=RANDOM_STATE
)

##### Creating tokenizers for the model. 

In [8]:
tokenized_train_inputs = tokenizer(
    training_set["complete_text"].tolist(),
    padding=True,
    truncation=True,
    max_length=1024,
)
tokenized_train_labels = tokenizer(
    training_set["manual_summary"].tolist(),
    padding=True,
    truncation=True,
    max_length=256,
)

tokenized_test_inputs = tokenizer(
    test_df["complete_text"].tolist(),
    padding=True,
    truncation=True,
    max_length=1024,
)

tokenized_test_labels = tokenizer(
    test_df["manual_summary"].tolist(),
    padding=True,
    truncation=True,
    max_length=256,
)

#### Custom Dataset

In [9]:
class CustomDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels["input_ids"][idx])
        return item

    def __len__(self):
        return len(self.labels["input_ids"])


train_dataset = CustomDataset(tokenized_train_inputs, tokenized_train_labels)
test_dataset = CustomDataset(tokenized_test_inputs, tokenized_test_labels)

In [10]:
training_args = TrainingArguments(
    output_dir="./results",  # Output directory for model checkpoints
    num_train_epochs=3,  # Number of training epochs
    learning_rate=2e-5,  # Learning rate
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

trainer.train()

100%|██████████| 21/21 [22:22<00:00, 63.94s/it]

{'train_runtime': 1342.6816, 'train_samples_per_second': 0.056, 'train_steps_per_second': 0.016, 'train_loss': 5.153852190290179, 'epoch': 3.0}





TrainOutput(global_step=21, training_loss=5.153852190290179, metrics={'train_runtime': 1342.6816, 'train_samples_per_second': 0.056, 'train_steps_per_second': 0.016, 'train_loss': 5.153852190290179, 'epoch': 3.0})

In [11]:
# Save model
model.save_pretrained(DATA_PATH + "/finetuned_bart_model")
tokenizer.save_pretrained(DATA_PATH + "/finetuned_bart_model")

('C:/Users/fdmol/Desktop/MSCAPP/CAPP30255/NLP-Police-Complaints/data/finetuned_bart_model\\tokenizer_config.json',
 'C:/Users/fdmol/Desktop/MSCAPP/CAPP30255/NLP-Police-Complaints/data/finetuned_bart_model\\special_tokens_map.json',
 'C:/Users/fdmol/Desktop/MSCAPP/CAPP30255/NLP-Police-Complaints/data/finetuned_bart_model\\vocab.json',
 'C:/Users/fdmol/Desktop/MSCAPP/CAPP30255/NLP-Police-Complaints/data/finetuned_bart_model\\merges.txt',
 'C:/Users/fdmol/Desktop/MSCAPP/CAPP30255/NLP-Police-Complaints/data/finetuned_bart_model\\added_tokens.json',
 'C:/Users/fdmol/Desktop/MSCAPP/CAPP30255/NLP-Police-Complaints/data/finetuned_bart_model\\tokenizer.json')

### Test the finetuned model

In [12]:
def generate_summary(report, model, tokenizer):
    inputs = tokenizer(
        report, return_tensors="pt", padding=True, truncation=True, max_length=1024
    )
    summary_ids = model.generate(
        inputs["input_ids"],
        max_length=1200,
        min_length=40,
        length_penalty=2.0,
        no_repeat_ngram_size=2,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [13]:
# Load finetuned model
finetuned_model = AutoModelForSeq2SeqLM.from_pretrained(
    DATA_PATH + "/finetuned_bart_model"
)
finetuned_tokenizer = AutoTokenizer.from_pretrained(DATA_PATH + "/finetuned_bart_model")

Test model for five random complaints 

In [15]:
# Randomly select 5 complaints to generate summaries for
all_complaints = os.listdir(TEXT_FILES_PATH)
random_complaints = random.sample(all_complaints, 5)

for complaint in random_complaints:
    complaint_text = text_parser.file_to_string(f"{TEXT_FILES_PATH}/{complaint}")

    # Generate summary - Original model
    model = Summarizer(MODEL_NAME, complaint_text)
    summary = model.generate_summary(
        max_length=1200,
        min_length=40,
        length_penalty=2,
        no_repeat_ngram_size=2,
        num_beams=4,
        early_stopping=True,
    )

    print(f"Complaint: {complaint}")
    print(f"Summary: {summary}")
    print("\n")
    finetuned_summary = generate_summary(
        complaint_text, finetuned_model, finetuned_tokenizer
    )

    print(f"Finetuned Summary: {finetuned_summary}")
    print("\n")

Complaint: 2009-1027884.txt
Summary: The Chicago Police Department launched an investigation into the shooting death of a 16-year-old boy on July 2, 2009. The shooting occurred after a group of 3-4 male/blacks committed a home-invasion style armed robbery at xxxx s. spaulding. Police received an oemc call regarding a person with a gun, and were driving through the same alley, when they found themselves behind the fleeing suspects.


Finetuned Summary: The Chicago Police Department launched an investigation into the shooting death of a 16-year-old boy on July 2, 2009. Officer a fired eight shots at subject 1 eight (8) times, hitting him in the back of the neck and right flank. The victim was pronounced dead at the hospital.


Complaint: 1089601.txt
Summary: Officers were patrolling the vicinity of 7826 s. evans avenue p.m. on may 25, 2018, when several gunshots were heard. The officers observed a black suv (kia sorrento) traveling at a high rate of speed out of the east alley of langley

##### Create summaries for training set

We will use both the finetuned and the original BART model to create summaries for the training set.

In [17]:
finetuned_summaries = []
original_summaries = []

for index, row in training_set.iterrows():
    complaint = row["complete_text"]
    model = Summarizer(MODEL_NAME, complaint)
    original_summary = model.generate_summary(
        max_length=1200,
        min_length=40,
        length_penalty=2,
        no_repeat_ngram_size=2,
        num_beams=4,
        early_stopping=True,
    )

    finetuned_summary = generate_summary(
        complaint, finetuned_model, finetuned_tokenizer
    )

    finetuned_summaries.append(finetuned_summary)
    original_summaries.append(original_summary)

In [23]:
# Create dataframe with original and finetuned summaries
training_set["bart_original_summary"] = original_summaries
training_set["bart_finetuned_summary"] = finetuned_summaries

training_set.to_excel(DATA_PATH + "/training_set_with_summaries.xlsx", index=False)