In [None]:
# Mount Google Drive into the Colab runtime at /content/drive; the notebook later changes
# into a project folder below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd drive/My\ Drive/Colab\ Notebooks/apex-codes/entity_sum

/content/drive/My Drive/Colab Notebooks/apex-codes/entity_sum


## ROUGE evaluation is computed between the generated summary of a cited article and its abstract (both can be accessed by their cited IDs in the respective folders)

In [None]:
# Installs the `rouge` package; the `Rouge` scorer is imported from it in the next cell.
!pip3 install -q rouge

In [None]:
import pandas as pd
import numpy as np
import os
from rouge import Rouge
import json
from collections import defaultdict
from pprint import pprint

## Evaluation against the ground truth summary

In [None]:
def _compute_ROUGE(generated_summary, human_summary):  
  rouge = Rouge()
  
  scores = rouge.get_scores(generated_summary, human_summary)[0]
  
  rouge_1_f = scores['rouge-1']['f']
  rouge_2_f = scores['rouge-2']['f']
  rouge_l_f = scores['rouge-l']['f']

  rouge_1_f = rouge_1_f * 100
  rouge_2_f = rouge_2_f * 100
  rouge_l_f = rouge_l_f * 100

  return rouge_1_f, rouge_2_f, rouge_l_f

## Iterate over the directories holding the generated and human summaries, accumulate the scores in a container, and run the evaluation

### Call the ROUGE evaluation function from the `main` function

In [None]:
def main():
  """Average ROUGE-1/2/L F-scores per model configuration and export them to Excel.

  For every (model, model type) pair the matching ``.jsonl`` file is read line by
  line; each record's ``abstractive_summary`` is scored with ``_compute_ROUGE``
  against either ``article_abstract`` or ``article_text`` (selected by
  ``evaluation_wrt_ground_truth``). Records the scorer cannot handle are skipped
  and counted. The per-configuration averages are printed and written to an
  ``.xlsx`` file under ``FINAL_RESULTS/ROUGE``.
  """
  lst_modelName = ["T5", "BART", "Pegasus"]
  lst_model_type = ["vanilla", "w_named_entities"]

  evaluation_wrt_ground_truth = False    # changes based on what target to evaluate the generated to

  dict_rouge_scores = defaultdict(list)

  # The output location does not depend on the model, so resolve it once, before the loops
  # (it is needed after them even if no configuration is processed).
  output_results_path = "FINAL_RESULTS/ROUGE"
  os.makedirs(output_results_path, exist_ok=True)
  if evaluation_wrt_ground_truth:
    output_file_name = "pubmed_rouge_scores_wrt_ground_truth_w_entity_at_inference.xlsx"
  else:
    output_file_name = "pubmed_rouge_scores_wrt_input_article_w_entity_at_inference.xlsx"

  for modelName in lst_modelName:
    for model_type in lst_model_type:
      #SUMMARY_PATH = f"pubmed-FINAL-SUMMARIES/{modelName}"   # path changes with the type of input at inference time (w or wo named entities)
      SUMMARY_PATH = f"pubmed-FINAL-SUMMARIES_w_named_entites_at_inference/{modelName}"   # path of summaries generated with named entities at inference
      input_filename_path = f"{SUMMARY_PATH}/pubmed-summaries-{model_type}.jsonl"

      rouge_1_f_sum, rouge_2_f_sum, rouge_l_f_sum = 0.0, 0.0, 0.0   # initialize all cumulative scores to zero
      total_no_summaries = 0
      no_skipped = 0
      last_error = None
      with open(input_filename_path) as fp:
        for line_no, line in enumerate(fp):
          if line_no % 2000 == 0 and line_no != 0:
            print("Iteration: ", line_no)
          dict_data = json.loads(line)

          generated_summary = dict_data["abstractive_summary"]    # generated summary

          if evaluation_wrt_ground_truth:
            evaluation_target = dict_data["article_abstract"]   # human-like summary as evaluation target
          else:
            evaluation_target = dict_data["article_text"]   # source input article----to do evaluation wrt the input article

          # Best-effort scoring: a record the scorer cannot handle is skipped, but it is
          # counted so the omission is visible. `Exception` (not a bare except) keeps
          # KeyboardInterrupt / SystemExit working.
          try:
            rouge_1_f, rouge_2_f, rouge_l_f = _compute_ROUGE(generated_summary, evaluation_target)
          except Exception as err:
            no_skipped += 1
            last_error = err
            continue

          rouge_1_f_sum += rouge_1_f
          rouge_2_f_sum += rouge_2_f
          rouge_l_f_sum += rouge_l_f
          total_no_summaries += 1

      if no_skipped:
        print(f"Skipped {no_skipped} summaries in {input_filename_path} (last error: {last_error!r})")

      if total_no_summaries:
        avg_rouge_1_f = rouge_1_f_sum / total_no_summaries
        avg_rouge_2_f = rouge_2_f_sum / total_no_summaries
        avg_rouge_l_f = rouge_l_f_sum / total_no_summaries
      else:
        # Nothing could be scored: report NaN instead of failing with ZeroDivisionError
        print(f"No summaries were scored for {input_filename_path}")
        avg_rouge_1_f = avg_rouge_2_f = avg_rouge_l_f = float("nan")

      dict_rouge_scores["training-config"].append(f"{modelName}-{model_type}")
      dict_rouge_scores["ROUGE-1"].append(avg_rouge_1_f)
      dict_rouge_scores["ROUGE-2"].append(avg_rouge_2_f)
      dict_rouge_scores["ROUGE-L"].append(avg_rouge_l_f)

      pprint(dict_rouge_scores)

  df_rouge_scores = pd.DataFrame(dict_rouge_scores)

  print(df_rouge_scores)

  df_rouge_scores.to_excel(f"{output_results_path}/{output_file_name}")

In [None]:
# Entry point for the ROUGE evaluation cell above.
# NOTE(review): `main` is defined by several cells in this notebook, so this call runs
# whichever definition was executed most recently.
if __name__ == "__main__":
  main()

Iteration:  2000
Iteration:  4000
defaultdict(<class 'list'>,
            {'ROUGE-1': [12.608594321314213],
             'ROUGE-2': [6.54928600399756],
             'ROUGE-L': [12.51319331410257],
             'training-config': ['T5-vanilla']})
Iteration:  2000
Iteration:  4000
defaultdict(<class 'list'>,
            {'ROUGE-1': [12.608594321314213, 12.373647770926544],
             'ROUGE-2': [6.54928600399756, 6.199850624086307],
             'ROUGE-L': [12.51319331410257, 12.251141968642578],
             'training-config': ['T5-vanilla', 'T5-w_named_entities']})
Iteration:  2000
Iteration:  4000
defaultdict(<class 'list'>,
            {'ROUGE-1': [12.608594321314213,
                         12.373647770926544,
                         18.2797653333462],
             'ROUGE-2': [6.54928600399756,
                         6.199850624086307,
                         8.595944995095323],
             'ROUGE-L': [12.51319331410257,
                         12.251141968642578,
         

## Entity-level Factual consistency

In [None]:
!pip3 install -q scispacy
!pip3 install -q https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.3.0/en_core_sci_sm-0.3.0.tar.gz
!pip3 install -q https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_ner_bc5cdr_md-0.4.0.tar.gz

!python3 -m spacy download en

[K     |████████████████████████████████| 44 kB 3.0 MB/s 
[K     |████████████████████████████████| 13.5 MB 62.2 MB/s 
[K     |████████████████████████████████| 6.3 MB 75.5 MB/s 
[K     |████████████████████████████████| 71 kB 9.5 MB/s 
[K     |████████████████████████████████| 188 kB 96.1 MB/s 
[K     |████████████████████████████████| 42 kB 1.5 MB/s 
[K     |████████████████████████████████| 628 kB 87.8 MB/s 
[K     |████████████████████████████████| 451 kB 98.5 MB/s 
[K     |████████████████████████████████| 10.1 MB 52.0 MB/s 
[K     |████████████████████████████████| 33.1 MB 127 kB/s 
[?25h  Building wheel for en-core-sci-sm (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 125.1 MB 22 kB/s 
[?25h  Building wheel for en-ner-bc5cdr-md (setup.py) ... [?25l[?25hdone
[38;5;3m⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.[0m
Collecting en-core-web-sm==3.0.0
  Downloading

In [None]:
# Installs `jsonlines`, which is imported in the next cell and used to write the JSONL output.
!pip3 install -q jsonlines

In [None]:
import spacy
import scispacy
import pandas as pd
import numpy as np
import pickle as pk
from pprint import pprint
import os

from spacy import displacy
import en_core_sci_sm
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker
from collections import (OrderedDict,Counter, defaultdict)


from tqdm import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings('ignore')

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from pprint import pprint
import json
import jsonlines
from ast import literal_eval



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def _get_named_entities(input_text):
  nlp = spacy.load("en_ner_bc5cdr_md")
  doc = nlp(input_text)
  entities = []
  for ent in doc.ents:
    entities.append(ent.text)
  str_entities = " | ".join(entities)   # a string representation of list of entities with the pipe symbol as a separator
  
  return str_entities

### Extract named entities from the generated summaries

In [None]:
def main():
  """Add the named entities of each generated summary to the summary records.

  For every (model, model type) pair the input ``.jsonl`` file is read line by
  line, ``_get_named_entities`` is run on ``abstractive_summary``, and the record
  (original fields plus ``abstractive_summary_named_entities``) is written to a
  file of the same name under ``pubmed_FINAL-SUMMARIES-w-named_entities/<model>``.

  The output file is opened once per configuration in write mode, so re-running
  the cell regenerates the file instead of appending duplicate records to it.
  """
  lst_modelName = ["Pegasus"]
  lst_model_type = ["vanilla", "w_named_entities"]

  for modelName in lst_modelName:
    for model_type in lst_model_type:
      SUMMARY_PATH = f"pubmed-FINAL-SUMMARIES/{modelName}"
      input_filename_path = f"{SUMMARY_PATH}/pubmed-summaries-{model_type}.jsonl"

      OUTPUT_RESULTS_PATH = f"pubmed_FINAL-SUMMARIES-w-named_entities/{modelName}"
      os.makedirs(OUTPUT_RESULTS_PATH, exist_ok=True)
      output_file_name = f"pubmed-summaries-{model_type}.jsonl"
      output_file_path = f"{OUTPUT_RESULTS_PATH}/{output_file_name}"

      # Both files are closed by the `with` statement, even if a record fails
      with open(input_filename_path) as fp, jsonlines.open(output_file_path, "w") as writer:
        for line_no, line in enumerate(fp):
          if line_no % 2000 == 0:
            print("Iteration: ", line_no)
          dict_data = json.loads(line)

          summary = dict_data["abstractive_summary"]
          summary_named_entities = _get_named_entities(summary)

          dict_1 = {"article_text" : dict_data['article_text'],
                    "article_abstract" : dict_data['article_abstract'],
                    "article_text_named_entities" : dict_data['article_text_named_entities'],
                    "article_abstract_named_entities" : dict_data['article_abstract_named_entities'],
                    "abstractive_summary" : summary,
                    "abstractive_summary_named_entities" : summary_named_entities
                    }

          writer.write(dict_1)


In [None]:
# Entry point for the named-entity extraction cell above.
# NOTE(review): `main` is defined by several cells in this notebook, so this call runs
# whichever definition was executed most recently.
if __name__ == "__main__":
  main()

Iteration:  0
Iteration:  2000
Iteration:  4000
Iteration:  0
Iteration:  2000
Iteration:  4000


## Entity-level Factual Consistency using F1 scores

In [None]:
def _split_entities(str_entities):
  """Parse a ``' | '``-separated entity string into a set of non-empty entity names.

  An empty input string yields an empty set (plain ``str.split`` would yield ``['']``,
  i.e. one bogus empty entity).
  """
  return {entity for entity in str_entities.split(' | ') if entity.strip()}


def _entity_precision_recall(target_entities, summary_entities):
  """Return ``(precision, recall)`` of the summary entities w.r.t. the target entities.

  Both arguments are sets, so repeated mentions count once in the numerator and in
  the denominators alike. Returns ``None`` when either set is empty, because one of
  the two ratios is then undefined.
  """
  if not target_entities or not summary_entities:
    return None
  no_common = len(target_entities & summary_entities)
  return no_common / len(summary_entities), no_common / len(target_entities)


def main():
  """Compute entity-level precision, recall and F1 per model configuration.

  For every (model, model type) pair the enriched ``.jsonl`` file is read; the
  entities of each generated summary are compared with the entities of either the
  abstract or the source article (selected by ``evaluation_wrt_ground_truth``).
  Precision and recall are averaged over the records (as percentages), F1 is the
  harmonic mean of the two averages, and the table is printed and written to an
  ``.xlsx`` file under ``FINAL_RESULTS/ENTITY_SPECIFITY``.

  Records in which the summary or the target has no entities are skipped and counted.
  """
  lst_modelName = ["T5", "BART", "Pegasus"]
  lst_model_type = ["vanilla", "w_named_entities"]

  evaluation_wrt_ground_truth = False    # changes based on what target to evaluate the generated to

  dict_entity_specificity = defaultdict(list)

  output_results_path = "FINAL_RESULTS/ENTITY_SPECIFITY"
  os.makedirs(output_results_path, exist_ok=True)
  if evaluation_wrt_ground_truth:
    output_file_name = "pubmed_entity_specifity_scores_wrt_ground_truth_w_entity_at_inference.xlsx"
    target_key = 'article_abstract_named_entities'    # ground truth named entities
  else:
    output_file_name = "pubmed_entity_specifity_scores_wrt_source_article_w_entity_at_inference.xlsx"
    target_key = 'article_text_named_entities'   # source article named entities

  for modelName in lst_modelName:
    for model_type in lst_model_type:
      input_filename_path = f"pubmed_FINAL-SUMMARIES-w-named_entities/{modelName}/pubmed-summaries-{model_type}.jsonl"

      precision_total, recall_total = 0.0, 0.0   # precision and recall for entity specificity
      total_no_summaries = 0
      no_skipped = 0
      with open(input_filename_path) as fp:
        for line_no, line in enumerate(fp):
          if line_no % 2000 == 0:
            print("Iteration: ", line_no)
          dict_data = json.loads(line)

          target_named_entities = _split_entities(dict_data[target_key])
          abstractive_summary_named_entities = _split_entities(dict_data['abstractive_summary_named_entities'])  # named entities in generated summary

          scores = _entity_precision_recall(target_named_entities, abstractive_summary_named_entities)
          if scores is None:
            no_skipped += 1
            continue

          precision, recall = scores
          precision_total += precision
          recall_total += recall
          total_no_summaries += 1

      if no_skipped:
        print(f"Skipped {no_skipped} records without entities in {input_filename_path}")

      if total_no_summaries:
        avg_precision = precision_total / total_no_summaries * 100
        avg_recall = recall_total / total_no_summaries * 100
      else:
        # Nothing could be scored: report NaN instead of failing with ZeroDivisionError
        avg_precision = avg_recall = float("nan")

      # `0 + 0` would divide by zero; NaN averages fall through and give a NaN F1
      if avg_precision + avg_recall == 0:
        f1_score = 0.0
      else:
        f1_score = (2 * avg_precision * avg_recall) / (avg_precision + avg_recall)

      dict_entity_specificity["training-config"].append(f"{modelName}-{model_type}")
      dict_entity_specificity["avg_precision"].append(avg_precision)
      dict_entity_specificity["avg_recall"].append(avg_recall)
      dict_entity_specificity["f1_score"].append(f1_score)

  df_entity_specifity = pd.DataFrame(dict_entity_specificity)

  print(df_entity_specifity)

  df_entity_specifity.to_excel(f"{output_results_path}/{output_file_name}")



In [None]:
# Entry point for the entity-level F1 cell above.
# NOTE(review): `main` is defined by several cells in this notebook, so this call runs
# whichever definition was executed most recently.
if __name__ == "__main__":
  main()

Iteration:  0
Iteration:  2000
Iteration:  4000
Iteration:  0
Iteration:  2000
Iteration:  4000
Iteration:  0
Iteration:  2000
Iteration:  4000
Iteration:  0
Iteration:  2000
Iteration:  4000
Iteration:  0
Iteration:  2000
Iteration:  4000
Iteration:  0
Iteration:  2000
Iteration:  4000
            training-config  avg_precision  avg_recall   f1_score
0                T5-vanilla      55.076166    7.975629  13.933531
1       T5-w_named_entities      54.014926    7.232001  12.756101
2              BART-vanilla      58.591788    5.622708  10.260753
3     BART-w_named_entities      60.422346    5.361163   9.848488
4           Pegasus-vanilla      33.821211    7.400850  12.144260
5  Pegasus-w_named_entities      46.756602    7.742743  13.285456


## MAUVE Evaluation

In [None]:
# Installs the `mauve-text` package; `mauve` is imported a few cells below.
!pip3 install -q mauve-text

[K     |████████████████████████████████| 8.5 MB 6.9 MB/s 
[?25h

In [None]:
!pip3 install -q torch>=1.10.0
!pip3 install -q git+https://github.com/huggingface/transformers

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
[K     |████████████████████████████████| 3.3 MB 7.2 MB/s 
[K     |████████████████████████████████| 895 kB 58.9 MB/s 
[K     |████████████████████████████████| 596 kB 75.6 MB/s 
[K     |████████████████████████████████| 61 kB 585 kB/s 
[?25h  Building wheel for transformers (PEP 517) ... [?25l[?25hdone


In [None]:
import torch
import transformers
import mauve

In [None]:
def main():
  """Compute a MAUVE score per model configuration and export the table to Excel.

  For every (model, model type) pair the first ``max_no_samples`` records of the
  enriched ``.jsonl`` file are read, and ``mauve.compute_mauve`` compares the
  reference abstracts (``p_text``) with the generated summaries (``q_text``).
  The scores are printed and written to ``pubmed_MAUVE_scores.xlsx``.
  """
  lst_modelName = ["T5", "BART", "Pegasus"]
  lst_model_type = ["vanilla", "w_named_entities"]

  max_no_samples = 100   # only this many leading records per file are scored

  dict_mauve_scores = defaultdict(list)

  for modelName in lst_modelName:
    for model_type in lst_model_type:
      input_filename_path = f"pubmed_FINAL-SUMMARIES-w-named_entities/{modelName}/pubmed-summaries-{model_type}.jsonl"

      lst_generated_summary = []
      lst_ground_truth_summary = []

      with open(input_filename_path) as fp:
        for line in fp:
          # Stop as soon as enough records are collected instead of parsing the whole file
          if len(lst_generated_summary) >= max_no_samples:
            break
          dict_data = json.loads(line)

          lst_generated_summary.append(dict_data["abstractive_summary"])    # generated summary
          lst_ground_truth_summary.append(dict_data["article_abstract"])   # human-like summary

      print(f"{modelName}-{model_type}: scoring {len(lst_generated_summary)} summaries")

      out = mauve.compute_mauve(p_text=lst_ground_truth_summary, q_text=lst_generated_summary, verbose=False)

      mauve_score = out.mauve

      dict_mauve_scores["training-config"].append(f"{modelName}-{model_type}")
      dict_mauve_scores["avg_mauve"].append(mauve_score)

  df_mauve_scores = pd.DataFrame(dict_mauve_scores)

  print(df_mauve_scores)

  df_mauve_scores.to_excel("pubmed_MAUVE_scores.xlsx")



In [None]:
# Entry point for the MAUVE evaluation cell above.
# NOTE(review): `main` is defined by several cells in this notebook, so this call runs
# whichever definition was executed most recently.
if __name__ == "__main__":
  main()