# BART Finetuning

By: Federico Dominguez Molina

#### Libraries

In [1]:
# %pip install "transformers[torch]" datasets evaluate rouge_score "accelerate>=0.20.1"
import evaluate
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Local imports
from text_parser import TextParser


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\fdmol\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\fdmol\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Hugging Face checkpoint used for both the tokenizer and the seq2seq model.
MODEL_NAME = "facebook/bart-large-cnn"

# Root data folder of the project.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook anywhere else.
DATA_PATH = "C:/Users/fdmol/Desktop/MSCAPP/CAPP30255/NLP-Police-Complaints/data"

# Folder holding the complaint text files, located directly under DATA_PATH.
TEXT_FILES_PATH = DATA_PATH + "/text_files"

In [3]:
# Load the pretrained tokenizer and seq2seq model for the checkpoint named by
# MODEL_NAME. The model is loaded exactly once: a second from_pretrained call
# would only repeat the checkpoint load and rebind the same name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)

# ROUGE metric object, used by compute_metrics to score generated summaries.
rouge = evaluate.load("rouge")

### Create and format training data

In [4]:
# Build the project-local TextParser over the complaint text folder, configured
# for the "summarization" task. The same instance is used further down to read
# each complaint file and to preprocess the manual summaries.
text_parser = TextParser(TEXT_FILES_PATH, nlp_task="summarization")

Initializing parsers for summarization


In [10]:
# Read the "manual_summaries" sheet straight into a DataFrame. read_excel opens
# and closes the workbook itself, so no ExcelFile handle is left open, and
# `training_data` is only ever bound to a DataFrame.
training_data = pd.read_excel(
    DATA_PATH + "/manual_summaries.xlsx", sheet_name="manual_summaries"
)

complete_texts = []
processed_manual_summaries = []
complaint_files = []

# Walk the two columns that are actually needed, in row order. For each
# complaint, load its full text from disk and preprocess the hand-written summary.
for complaint_file, manual_summary in zip(
    training_data["complaint"], training_data["manual_summary"]
):
    # Complete text of the complaint, as returned by the parser
    complete_text = text_parser.file_to_string(f"{TEXT_FILES_PATH}/{complaint_file}")

    # Manual summary passed through the parser's text processing
    processed_manual_summary = text_parser.process_given_text(manual_summary)

    complete_texts.append(complete_text)
    processed_manual_summaries.append(processed_manual_summary)
    complaint_files.append(complaint_file)


# One row per complaint: file name, full document text, and target summary.
training_set = pd.DataFrame(
    {
        "complaint": complaint_files,
        "complete_text": complete_texts,
        "manual_summary": processed_manual_summaries,
    }
)


2013-1060190.txt
2016-1081599.txt
2017-1087778.txt
1085499.txt
2010-1032871.txt
2016-1081128.txt
2018-1088501.txt
1087317.txt
2020-0001176.txt
2016-1081113.txt
1084293.txt
2014-1072822.txt
2017-1087234.txt
2020-0002226.txt
2014-1068262.txt
1075325.txt
1075770.txt
2018-1089298.txt
1080064.txt
2019-1092229.txt
1076615.txt
2011-1047225.txt
2013-1063129.txt
1091501.txt
2015-1078263.txt


In [9]:
# Preview only the first rows instead of rendering the whole frame, so the
# saved cell output stays short.
training_set.head(10)

Unnamed: 0,complaint,complete_text,manual_summary
0,2013-1060190.txt,/ u # 13 investigation number: / u #13 involv...,"on february 16, 2013, chicago police officer ""..."
1,2016-1081599.txt,summary report of investigation' time of inci...,"on july 9, 2016, in alsip, illinois, officer e..."
2,2017-1087778.txt,date of incident: time of incident: location...,"on december 10, 2017, officer rodney pickett a..."
3,1085499.txt,"introductionon june 7th, 2017, chicago police...","on june 7, 2017, subject 1 was detained by chi..."
4,2010-1032871.txt,/u # 10-1 investigation number: u # 10-1/ in...,"on january 1, 2010, in chicago, officers a and..."
5,2016-1081128.txt,log #/ 1081128 1 summary report of investigati...,"in june 2016, officer faced allegations of phy..."
6,2018-1088501.txt,"= 1 p.m. oglesby avenue chicago, illinois 606...","on february 15, 2018, in chicago, police offic..."
7,1087317.txt,date of incident: time of incident: location...,"on october 26, 2017, copa investigated an inci..."
8,2020-0001176.txt,civilian office of poli ce accountability 1 ...,the copa investigation into allegations agains...
9,2016-1081113.txt,summary report of investigation' date/time/lo...,"on june 22, 2016, in chicago, officers , , and..."


Functions to finetune BART using the `transformers` library.

In [11]:
def compute_metrics(eval_pred):
    """Score generated summaries against the reference summaries with ROUGE.

    Args:
        eval_pred: A ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is a tuple, its first element is taken as the
            generated token ids.

    Returns:
        dict: Every entry returned by ``rouge.compute`` plus ``gen_len`` (the
        mean number of non-padding tokens per prediction), each rounded to
        four decimals.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id

    # -100 is a padding/ignore sentinel, not a real token id, so it has to be
    # swapped for the pad token before decoding. The labels always needed this;
    # generated batches can carry -100 as well (the Hugging Face summarization
    # example script applies the same replacement to predictions).
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )

    # Count tokens on the cleaned ids so -100 padding is not counted as
    # generated output.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}


In [12]:
# Hyper-parameters for Seq2SeqTrainer.
# NOTE(review): output_dir is named "my_awesome_billsum_model", which does not
# describe this project's data - confirm the intended directory / repo name.
# NOTE(review): push_to_hub=True presumably needs Hugging Face Hub credentials
# and uploads the checkpoint - confirm that is wanted for this model.
training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_billsum_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    # Generate full summaries during evaluation so compute_metrics can decode them.
    predict_with_generate=True,
    # fp16 mixed precision is only accepted on CUDA devices; enable it only when
    # a GPU is present so the arguments can still be built on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    push_to_hub=True,
)

# trainer = Seq2SeqTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_billsum["train"],
#     eval_dataset=tokenized_billsum["test"],
#     tokenizer=tokenizer,
#     data_collator=data_collator,
#     compute_metrics=compute_metrics,
# )

# trainer.train()

ImportError: Using the `Trainer` with `PyTorch` requires `accelerate>=0.20.1`: Please run `pip install transformers[torch]` or `pip install accelerate -U`