In [None]:
# Mount Google Drive (Colab only); the dataset and checkpoint paths used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install pytorch-lightning

In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from collections import defaultdict

import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5TokenizerFast as T5Tokenizer,
    get_linear_schedule_with_warmup
)

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, multilabel_confusion_matrix

import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc

# Render matplotlib figures inline, at retina resolution.
%matplotlib inline
%config InlineBackend.figure_format='retina'

# Single seed value handed to pl.seed_everything below.
RANDOM_SEED = 42

# Global seaborn / matplotlib look: white grid, custom six-colour palette, 12x8 figures.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00" , "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8

# Seed the random number generators through Lightning's helper.
pl.seed_everything(RANDOM_SEED)

from collections import Counter

# Register tqdm's progress_* methods on pandas objects.
tqdm.pandas()

INFO:lightning_fabric.utilities.seed:Seed set to 42


In [None]:
# Load the training CSV from the mounted Drive folder.
df = pd.read_csv('/content/drive/MyDrive/FNS_Dataset_2023/training/traindata2.csv')

In [None]:
# Work with the first 800 rows only.
df=df.iloc[:800]

In [None]:
# Show column dtypes and non-null counts of the subset.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          800 non-null    int64  
 1   report              800 non-null    object 
 2   summary_2           798 non-null    object 
 3   summary_3           799 non-null    object 
 4   summary_4           192 non-null    object 
 5   summary_1           796 non-null    object 
 6   summary_5           25 non-null     object 
 7   summary_6           0 non-null      object 
 8   summary_7           0 non-null      object 
 9   best_summary_index  800 non-null    int64  
 10  best_summary_score  800 non-null    float64
dtypes: float64(1), int64(2), object(8)
memory usage: 68.9+ KB


In [None]:
import pandas as pd

# Function to retrieve summary data based on best_summary_index
def get_summary(row):
    """Return the summary that the row's `best_summary_index` points at.

    Args:
        row: Mapping-like record (for example a DataFrame row) that holds a
            `best_summary_index` value and the matching `summary_<index>` entry.

    Returns:
        The value stored under `summary_<best_summary_index>`.

    Raises:
        KeyError: If the row has no `summary_<index>` entry for that index.
        ValueError: If `best_summary_index` cannot be converted to an integer.
    """
    # Cast to int before building the key: when the index arrives as a float
    # (e.g. 2.0), the f-string would otherwise produce 'summary_2.0', which is
    # not a column name.
    best_summary_index = int(row['best_summary_index'])
    return row[f'summary_{best_summary_index}']

# Apply the function row by row to create the 'target' column
df['target'] = df.apply(get_summary, axis=1)

In [None]:
# Only the report text and the selected reference summary are kept.
columns_to_keep = ['report', 'target']

# Every other column is removed from df in place.
unwanted_columns = df.columns.difference(columns_to_keep)
df.drop(columns=unwanted_columns, inplace=True)


In [None]:
# Rename 'report' to 'source' so the frame carries the 'source'/'target' pair read by the data module, then display it.
df.rename(columns = {'report':'source'}, inplace = True)
df

Unnamed: 0,source,target
0,24303.04 10 November 2015 12:43 PM proof...,24303.04 10 November 2015 12:43 PM proof...
1,Synergy Health plc \nAnnual Report and Accou...,2010 2011 2012 2013 2014\n286.4 287.3\n312.0\...
2,Shanks Group plc Annual Report and Accounts 2...,shanks. annual report 2008 3\nchairman’s stat...
3,Annual Report & Accounts\n2012 Contents\nWynn...,1 Wynnstay Group Plc Annual Report & Accounts...
4,HSBC Holdings plc \nAnnual Report and Account...,HSBC HOLDINGS PLC \nReport of the Directors: ...
...,...,...
795,2006\nWalker Greenbank PLC\nAnnual Report and...,Walker Greenbank PLC Annual Reports and Accou...
796,DS Smith Plc\nBeech House\nWhitebrook Park\n6...,Adjusted operating proﬁt\n£136.1m\n2010: £98....
797,Christie Group plc\nWhitefriars House\n6 Carm...,REVENUE BY DIVISION\nREVENUE BY SECTOR\n2011 ...
798,iSOFT Group plc\n2005 annual report and accou...,01\n75.5%\nTurnover growth of 75.5%\nto £262....


In [None]:
# Hold out 20% of the rows for evaluation. Passing random_state ties the split
# to RANDOM_SEED, so the same rows land in test_df regardless of how much global
# RNG state earlier cells have consumed.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=RANDOM_SEED)
train_df.shape, test_df.shape

((640, 2), (160, 2))

In [None]:
import sys
import transformers
import pandas as pd
import numpy as np
import glob
import math
import random
import re
import argparse
import nltk
from transformers import Trainer, TrainingArguments

In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration, AdamW, BartConfig

# Tokenizer from the facebook/bart-large checkpoint, created with add_prefix_space=True.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large', add_prefix_space=True)

# Pretrained BART-large conditional-generation model; wrapped by LitModel further down.
bart_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large")

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

In [None]:
# Extra marker tokens registered with the tokenizer as additional special tokens.
new_tokens = ['<F>', '<RLC>', '<A>', '<S>', '<P>', '<R>', '<RPC>']

special_tokens_dict = {'additional_special_tokens': new_tokens}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
# Resize the model's embedding table to the tokenizer's new vocabulary size.
bart_model.resize_token_embeddings(len(tokenizer))

Embedding(50272, 1024)

In [None]:
import glob
from nltk import tokenize
import nltk
import transformers
from torch.utils.data import DataLoader, TensorDataset, random_split, RandomSampler, Dataset
import pandas as pd
import numpy as np
import torch.nn.functional as F
import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import ModelCheckpoint

class LitModel(pl.LightningModule):
  ''' Lightning wrapper around a BART conditional-generation model.

      Holds the tokenizer, the wrapped model and the learning rate, optionally
      freezes the encoder and the embedding layers, and implements the
      training / validation steps with a pad-ignoring cross-entropy loss.
  '''

  # Instantiate the model
  def __init__(self, learning_rate, tokenizer, model):
    ''' Args: learning_rate - step size handed to Adam
              tokenizer     - tokenizer providing pad_token_id and decode()
              model         - the seq2seq model to fine-tune
    '''
    super().__init__()
    self.tokenizer = tokenizer
    self.model = model
    self.learning_rate = learning_rate

    self.hparams.freeze_encoder = True
    self.hparams.freeze_embeds = True
    self.hparams.eval_beams = 4

    if self.hparams.freeze_encoder:
      freeze_params(self.model.get_encoder())

    if self.hparams.freeze_embeds:
      self.freeze_embeds()

  def freeze_embeds(self):
    ''' freeze the shared, token and positional embedding parameters of the model; adapted from finetune.py '''
    freeze_params(self.model.model.shared)
    for d in [self.model.model.encoder, self.model.model.decoder]:
      freeze_params(d.embed_positions)
      freeze_params(d.embed_tokens)

  # Do a forward pass through the model
  def forward(self, input_ids, **kwargs):
    return self.model(input_ids, **kwargs)

  def configure_optimizers(self):
    ''' Plain Adam over all module parameters with the configured learning rate. '''
    optimizer = torch.optim.Adam(self.parameters(), lr = self.learning_rate)
    return optimizer

  def _compute_loss(self, batch):
    ''' Cross-entropy loss for one batch, shared by the training and validation steps.
        Args: batch - sequence of (source ids, source attention mask, target ids)
        Returns: scalar loss tensor; positions equal to the pad token id are ignored
    '''
    # Load the data into variables
    src_ids, src_mask = batch[0], batch[1]
    tgt_ids = batch[2]
    # Shift the decoder tokens right (but NOT the tgt_ids)
    decoder_input_ids = shift_tokens_right(tgt_ids, self.tokenizer.pad_token_id)

    # Run the model and get the logits
    outputs = self(src_ids, attention_mask=src_mask, decoder_input_ids=decoder_input_ids, use_cache=False)
    lm_logits = outputs[0]
    # Create the loss function
    ce_loss_fct = torch.nn.CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id)
    # Calculate the loss on the un-shifted tokens
    return ce_loss_fct(lm_logits.view(-1, lm_logits.shape[-1]), tgt_ids.view(-1))

  def training_step(self, batch, batch_idx):
    ''' Returns {'loss': loss} for the optimizer and logs it as 'train_loss'. '''
    loss = self._compute_loss(batch)
    self.log('train_loss', loss)

    return {'loss':loss}

  def validation_step(self, batch, batch_idx):
    ''' Returns {'loss': val_loss} and logs it as 'val_loss'. '''
    val_loss = self._compute_loss(batch)
    # Logging makes the metric available to callbacks that monitor by name.
    self.log('val_loss', val_loss, prog_bar=True)

    return {'loss': val_loss}

  # Method that generates text using the BartForConditionalGeneration's generate() method
  def generate_text(self, text, eval_beams, early_stopping = True, max_len = 1024):
    ''' Function to generate text
        Args: text - mapping with "input_ids" and "attention_mask" tensors
              eval_beams - number of beams for beam search
              early_stopping, max_len - forwarded to generate()
        Returns: list of decoded strings, one per generated sequence
    '''
    # NOTE(review): decoding starts from the pad token here, while the decoder
    # inputs built by shift_tokens_right for training start with the last
    # non-pad target token - confirm this mismatch is intended.
    generated_ids = self.model.generate(
        text["input_ids"],
        attention_mask=text["attention_mask"],
        use_cache=True,
        decoder_start_token_id = self.tokenizer.pad_token_id,
        num_beams= eval_beams,
        max_length = max_len,
        early_stopping = early_stopping
    )
    return [self.tokenizer.decode(w, skip_special_tokens=True, clean_up_tokenization_spaces=True) for w in generated_ids]

def freeze_params(model):
  ''' Function that takes a model as input (or part of a model) and freezes the layers for faster training
      adapted from finetune.py
      Args: model - any object exposing parameters() (e.g. a torch.nn.Module)
      Returns: None; every parameter is modified in place
  '''
  for param in model.parameters():
    # The attribute autograd reads is `requires_grad`; the previous spelling
    # `requires_grade` only attached an unused attribute and froze nothing.
    param.requires_grad = False


# Create a dataloading module as per the PyTorch Lightning Docs
class SummaryDataModule(pl.LightningDataModule):
  ''' Splits a source/target DataFrame 60/20/20, tokenizes each split and serves DataLoaders.

      Args: tokenizer  - tokenizer handed to encode_sentences
            df         - DataFrame with 'source' and 'target' columns
            batch_size - batch size used by all three DataLoaders
  '''
  def __init__(self, tokenizer, df, batch_size):
    super().__init__()
    self.tokenizer = tokenizer
    self.batch_size = batch_size
    self.data = df
    # Raw (un-tokenized) splits, kept separately so setup() never has to read
    # back from attributes it has already overwritten with tensors.
    self._raw_splits = None
    self._encoded = False

  def _split_once(self):
    ''' Shuffle the data and cut it 60/20/20; later calls reuse the first split. '''
    if self._raw_splits is not None:
      return
    shuffled = self.data.sample(frac=1)
    train_end = int(.6*len(self.data))
    validate_end = int(.8*len(self.data))
    # Positional slicing gives the same three pieces np.split produced, without
    # going through numpy's DataFrame handling.
    self._raw_splits = (
        shuffled.iloc[:train_end],
        shuffled.iloc[train_end:validate_end],
        shuffled.iloc[validate_end:],
    )
    self.train, self.validate, self.test = self._raw_splits

  # Loads and splits the data into training, validation and test sets with a 60/20/20 split
  def prepare_data(self):
    self._split_once()

  # encode the sentences using the tokenizer
  def setup(self, stage=None):
    # setup() may be invoked once per stage (fit, validate, test, ...); encoding
    # a second time used to fail because self.train was already a tensor dict.
    if self._encoded:
      return
    self._split_once()
    train_raw, validate_raw, test_raw = self._raw_splits
    self.train = encode_sentences(self.tokenizer, train_raw['source'], train_raw['target'])
    self.validate = encode_sentences(self.tokenizer, validate_raw['source'], validate_raw['target'])
    self.test = encode_sentences(self.tokenizer, test_raw['source'], test_raw['target'])
    self._encoded = True

  # Load the training, validation and test sets in Pytorch Dataset objects
  def train_dataloader(self):
    dataset = TensorDataset(self.train['input_ids'], self.train['attention_mask'], self.train['labels'])
    train_data = DataLoader(dataset, sampler = RandomSampler(dataset), batch_size = self.batch_size)
    return train_data

  def val_dataloader(self):
    dataset = TensorDataset(self.validate['input_ids'], self.validate['attention_mask'], self.validate['labels'])
    val_data = DataLoader(dataset, batch_size = self.batch_size)
    return val_data

  def test_dataloader(self):
    dataset = TensorDataset(self.test['input_ids'], self.test['attention_mask'], self.test['labels'])
    test_data = DataLoader(dataset, batch_size = self.batch_size)
    return test_data



def shift_tokens_right(input_ids, pad_token_id):
  """ Build decoder inputs: every row is moved one position to the right and the
      row's last non-pad token (usually <eos>) is placed in the first position.
      The input tensor itself is left untouched.
  """
  # Index of the last non-pad token in each row, kept as a column vector.
  non_pad_counts = input_ids.ne(pad_token_id).sum(dim=1)
  last_token_index = (non_pad_counts - 1).unsqueeze(-1)
  wrapped_tokens = input_ids.gather(1, last_token_index)
  # Wrapped token first, then everything except the final column.
  return torch.cat([wrapped_tokens, input_ids[:, :-1]], dim=1)

def encode_sentences(tokenizer, source_sentences, target_sentences, max_length=1024, min_length = 512, pad_to_max_length=True, return_tensors="pt"):
  ''' Function that tokenizes source and target sentences one by one and stacks the results
      Args: tokenizer - the BART tokenizer (called once per sentence)
            source_sentences / target_sentences - iterables of strings
            max_length - truncation/padding length for the sources
            min_length - truncation/padding length for the targets
            pad_to_max_length - pad every sentence to its length limit; when False no
                                padding is requested, so the per-sentence tensors can only
                                be concatenated if they already share one length
            return_tensors - tensor format requested from the tokenizer
      Returns: Dictionary with keys: input_ids, attention_mask, labels
  '''
  # `False` is the documented "no padding" value; `None` is not a valid strategy.
  padding = "max_length" if pad_to_max_length else False

  def _encode(sentence, length_limit):
    ''' Tokenize one sentence with the shared settings. '''
    return tokenizer(
          sentence,
          max_length=length_limit,
          padding=padding,
          truncation=True,
          return_tensors=return_tensors,
          add_prefix_space = True
      )

  source_encodings = [_encode(sentence, max_length) for sentence in source_sentences]
  input_ids = torch.cat([enc['input_ids'] for enc in source_encodings], dim = 0)
  attention_masks = torch.cat([enc['attention_mask'] for enc in source_encodings], dim = 0)

  # Targets are stored un-shifted; shifting for the decoder happens in the model steps.
  target_ids = torch.cat([_encode(sentence, min_length)['input_ids'] for sentence in target_sentences], dim = 0)

  batch = {
      "input_ids": input_ids,
      "attention_mask": attention_masks,
      "labels": target_ids,
  }

  return batch

In [None]:
# Data module over the training frame (batch size 1) and the Lightning wrapper around the BART model.
summary_data = SummaryDataModule(tokenizer, train_df, batch_size = 1)
model = LitModel(learning_rate = 2e-5, tokenizer = tokenizer, model = bart_model)

In [None]:
# Restore the fine-tuned weights. After this cell `model` is the bare BART
# network and `bart_model` is the LitModel wrapper (summarizeText relies on
# `bart_model.model`). Unwrapping first keeps the cell safe to re-run: without
# it, a second run would pass a LitModel in as the inner model.
model = bart_model.model if isinstance(bart_model, LitModel) else bart_model
bart_model = LitModel.load_from_checkpoint("/content/drive/MyDrive/FNS_Dataset_2023/training/fns_output.ckpt",learning_rate = 2e-5, tokenizer = tokenizer, model = model)

In [None]:
def summarizeText(text, max_input_length=512, max_summary_length=210, num_beams=2):
    """Generate a summary for `text` with the fine-tuned model.

    Uses the notebook-level `tokenizer` and `bart_model` (the LitModel wrapper).

    Args:
        text: The document to summarize.
        max_input_length: Token length the input is truncated/padded to.
        max_summary_length: `max_length` passed to `generate`.
        num_beams: Beam width passed to `generate`.

    Returns:
        The decoded generated sequences concatenated into one string.
    """
    text_encoding = tokenizer(
        text,
        max_length=max_input_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
        return_tensors='pt'
    )
    # The tokenizer returns CPU tensors; move them to wherever the model's
    # weights live so generation also works when the model is on a GPU.
    device = next(bart_model.model.parameters()).device
    generated_ids = bart_model.model.generate(
        input_ids=text_encoding['input_ids'].to(device),
        attention_mask=text_encoding['attention_mask'].to(device),
        max_length=max_summary_length,
        num_beams=num_beams,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True
    )

    preds = [
            tokenizer.decode(gen_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for gen_id in generated_ids
    ]
    return "".join(preds)

In [None]:
def merge(text):
  ''' Split `text` on newline characters and join the pieces back with newlines.
      The returned string is equal to the input; `text` must provide split().
  '''
  return '\n'.join(text.split('\n'))

In [None]:
from tqdm import tqdm

# Summarize every held-out report in row order, then attach the results as one
# column. Collecting into a list first avoids growing the frame cell by cell
# through .loc and avoids using `_` as a real index variable.
generated_summaries = [
    merge(summarizeText(report_text))
    for report_text in tqdm(test_df['source'], total=len(test_df))
]
test_df['summary_created'] = generated_summaries

100%|██████████| 160/160 [1:06:21<00:00, 24.89s/it]


In [None]:
# Inspect the generated summary of the held-out row at position 6.
test_df.iloc[6]['summary_created']

' 17384\t 06/05/2010\t Proof\t6\nHighlights \nOperational highlights \n> Strong performance in Business Services, with revenue up 23%, and \nEBITDA up 110%. \n > Continued expansion of Fleet Director direct sales team, with volumes \nfrom National Accounts trebling in 2009.\n> 12% growth in Fleet Director subscriber base to 98,000 vehicle \ntracking units.\n+44% increase in revenues from the year ended 31 December 2009 (2009: 57,000) to 98k vehicles.\n-1\n-2\n-3\n-4\n-5\n-6\n-7\n-8\n-9\n-10\n-11\n-12\n-13\n-14\n-15\n-16\n-17\n-18\n-19\n-20\n-21\n-22\n-24\n-25\n-26\n-27\n-28\n-29\n-30\n-32\n-'

In [None]:
# Preview the held-out rows together with the new 'summary_created' column.
test_df.head()

Unnamed: 0,source,target,summary_created
696,Annual report and accounts 2009\nGROW\nINTEGR...,Contents\n01 Financial highlights\n02 Chairm...,Annual report and accounts 2009\nGROW\nINTEGR...
667,Annual Report \nand Accounts 2016\nGrainger ...,Net rental income\n1\n£37.4m +15%\n(FY15: £32...,Grainger plc Annual Report and Accounts 2016\...
63,Annual Report & Financial Statements 2006\nGr...,Jessops plc Report & Accounts 2006\n03\nI am ...,Annual Report & Financial Statements 2006\nGr...
533,Resilience \nPerformance \nGrowth \nEmpresari...,Overview\nEmpresaria Group plc \nAnnual repor...,Empresaria Group plc\nEmpresaria is an intern...
66,Annual\nReport\n2015 Welcome to our \n201 5 A...,Financial and \nOperational Highlights.\nGro...,SuperGroup Plc (“SuperGroup” or the \n“Compan...


In [None]:
# Write the held-out rows and their generated summaries to a CSV on Drive.
test_df.to_csv('/content/drive/MyDrive/FNS_Dataset_2023/training/test_summaries.csv')

In [None]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24932 sha256=d200c8d167a0bad670ae23d5edaa9a50ef92e43bb2efb562ee9f20b650700f2e
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
import pandas as pd
from rouge_score import rouge_scorer
# Score each generated summary against its reference with ROUGE-1/2/L (stemming on).
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

# One result per test row, in row order; the reference goes first, the generated text second.
per_row_scores = [
    scorer.score(reference_text, generated_text)
    for reference_text, generated_text in zip(test_df['target'], test_df['summary_created'])
]

# Keep only the F-measure of each metric.
rouge_1_scores = [row_scores['rouge1'].fmeasure for row_scores in per_row_scores]
rouge_2_scores = [row_scores['rouge2'].fmeasure for row_scores in per_row_scores]
rouge_l_scores = [row_scores['rougeL'].fmeasure for row_scores in per_row_scores]

In [None]:
# Mean F-measure of each ROUGE variant over the held-out rows.
print('Average ROUGE-1 Score:',np.mean(rouge_1_scores))
print('Average ROUGE-2 Score:',np.mean(rouge_2_scores))
print('Average ROUGE-L Score:',np.mean(rouge_l_scores))

Average ROUGE-1 Score: 0.1911711851574247
Average ROUGE-2 Score: 0.10912989574107021
Average ROUGE-L Score: 0.14205016738325055


In [None]:
# Store the per-row scores on the frame (the lists are in the same row order as test_df).
test_df['rouge_1_score'] = rouge_1_scores
test_df['rouge_2_score'] = rouge_2_scores
test_df['rouge_l_score'] = rouge_l_scores

# Column means; these repeat the averages computed from the raw lists in the previous cell.
avg_rouge_1 = test_df['rouge_1_score'].mean()
avg_rouge_2 = test_df['rouge_2_score'].mean()
avg_rouge_l = test_df['rouge_l_score'].mean()
print("Average ROUGE-1 Score:", avg_rouge_1)
print("Average ROUGE-2 Score:", avg_rouge_2)
print("Average ROUGE-L Score:", avg_rouge_l)

Average ROUGE-1 Score: 0.1911711851574247
Average ROUGE-2 Score: 0.10912989574107021
Average ROUGE-L Score: 0.14205016738325055


In [None]:
# Preview the held-out rows with their per-row ROUGE columns.
test_df.head()

Unnamed: 0,source,target,summary_created,rouge_1_score,rouge_2_score,rouge_l_score
696,Annual report and accounts 2009\nGROW\nINTEGR...,Contents\n01 Financial highlights\n02 Chairm...,Annual report and accounts 2009\nGROW\nINTEGR...,0.390977,0.310606,0.323308
667,Annual Report \nand Accounts 2016\nGrainger ...,Net rental income\n1\n£37.4m +15%\n(FY15: £32...,Grainger plc Annual Report and Accounts 2016\...,0.228782,0.044444,0.103321
63,Annual Report & Financial Statements 2006\nGr...,Jessops plc Report & Accounts 2006\n03\nI am ...,Annual Report & Financial Statements 2006\nGr...,0.158895,0.048527,0.072539
533,Resilience \nPerformance \nGrowth \nEmpresari...,Overview\nEmpresaria Group plc \nAnnual repor...,Empresaria Group plc\nEmpresaria is an intern...,0.284404,0.02765,0.110092
66,Annual\nReport\n2015 Welcome to our \n201 5 A...,Financial and \nOperational Highlights.\nGro...,SuperGroup Plc (“SuperGroup” or the \n“Compan...,0.190789,0.042904,0.075658
