In [1]:
# Run in python console
import nltk; nltk.download('stopwords')

from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Mounted at /content/drive


In [2]:
%cd drive/My\ Drive/Colab\ Notebooks/apex-codes/citation_sum

/content/drive/My Drive/Colab Notebooks/apex-codes/citation_sum


In [3]:
import os
import pandas as pd
import pickle as pk
import numpy as np
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


## Compute Novelty

In [4]:
!python3 -m spacy download en

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 5.3 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [5]:
import sys
import re
import os
import csv
import string
import json
import spacy
import nltk
import gensim
import collections
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
warnings.filterwarnings('ignore')
from gensim.models import Phrases
from gensim.utils import simple_preprocess

#%matplotlib inline

stop_words = set(stopwords.words("english")) 
stopwords = str(stop_words)
stop = open('terrier-stop.txt')
stopString = stop.read()
common_terms = stopString.split()

## Utility functions

In [6]:
# Utility functions

def preprocess_data(data):
  data = data
  # Remove Emails
  data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]
  # Remove new line characters
  data = [re.sub('\s+', ' ', sent) for sent in data]
  # Remove distracting single quotes
  data = [re.sub("\'", "", sent) for sent in data]

  return data

def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

######################################################################################

# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts, bigram_mod):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts, bigram_mod, trigram_mod):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    #!python3 -m spacy download en
    nlp = spacy.load('en', disable=['parser', 'ner'])
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

#####################################################################################

def nlp_process(data_words, bigram_mod, trigram_mod):
  # Remove Stop Words
  data_words_nostops = remove_stopwords(data_words)

  # Form Bigrams
  data_words_bigrams = make_bigrams(data_words_nostops, bigram_mod)

  # Do lemmatization keeping only noun, adj, vb, adv
  data_lemmatized = lemmatization(data_words_bigrams)

  return data_lemmatized

### Unigrams generator function

In [7]:
# This function returns unigrams for 1) abstract or 2) the collection of topic-aware citation contexts---so it must be passed the raw abstrac
# or the raw citation contexts---this one generates top 50 unigrams
def _generate_unigrams(text):
  unigrams_list = []

  words = nltk.word_tokenize(text)
  fileList = [word.lower() for word in words if word.isalpha()]
  freeList = [t for t in fileList if t not in common_terms and t not in stop_words and t not in string.punctuation]
  freeString = [t for t in fileList if t not in common_terms and t not in stop_words and t not in string.punctuation]
  a = ' '.join(freeString)       #conversion into a string
  wordcount = {}
  for word in freeList:      
      if word not in stopwords:
          if word not in wordcount:
              wordcount[word] = 1
          else:
              wordcount[word] += 1
  word_counter = collections.Counter(wordcount)
  # Just grab the 50 most frequently occuring unigrams---number can change accordingly
  for word, count in word_counter.most_common(50):
      unigrams_list.append(word)

  return unigrams_list


### Bigrams and trigrams generator function

In [8]:
def _generate_bigrams_trigrams(text):
  #bigrams_list = []
  # tokenize the text into individual sentences that are stored in a list
  filtered_data = sent_tokenize(text)

  data = preprocess_data(filtered_data)

  #######################################################################################
  data_words = list(sent_to_words(data))


  # Build the bigram and trigram models
  bigram = gensim.models.Phrases(data_words, min_count=1, threshold=1) # higher threshold fewer phrases.
  trigram = gensim.models.Phrases(bigram[data_words], min_count=1, threshold=1)  

  # Faster way to get a sentence clubbed as a trigram/bigram
  bigram_mod = gensim.models.phrases.Phraser(bigram)
  trigram_mod = gensim.models.phrases.Phraser(trigram)

  data_lemmatized = nlp_process(data_words, bigram_mod, trigram_mod)

  sentence_stream = data_lemmatized

  bigrams_lemmatized = []
  trigrams_lemmatized = []

  bigram = Phrases(sentence_stream, min_count=3, delimiter=b'_')
  trigram  = Phrases(bigram[sentence_stream], min_count=3, delimiter=b'_')
  for sent in sentence_stream:
      bigrams_ = [b for b in bigram[sent] if b.count('_') == 1]
      trigrams_ = [t for t in trigram[bigram[sent]] 
                                            if t.count('_')==2]
      bigrams_lemmatized.append(bigrams_)
      trigrams_lemmatized.append(trigrams_)
  
  ngrams_lemmatized =  trigrams_lemmatized + bigrams_lemmatized

  return bigrams_lemmatized, trigrams_lemmatized
  #return ngrams_lemmatized

# Compute Novelty with reference to gound truth summaries

---



In [9]:
def _compute_novelty_I(gen_summary, human_summary):
  # generate unigrams for the summary and the abstract
  gen_summary_unigrams = _generate_unigrams(gen_summary)
  human_summary_unigrams = _generate_unigrams(human_summary)

  # generate bigrams and trigrams for the summary and the abstract
  gen_summary_bi_trigrams, gen_summary_tri_trigrams = _generate_bigrams_trigrams(gen_summary)
  human_summary_bi_trigrams, human_summary_tri_trigrams = _generate_bigrams_trigrams(human_summary)

  # Total n-grams for the summary and the abstract
  gen_summary_ngrams = gen_summary_unigrams + gen_summary_bi_trigrams + gen_summary_tri_trigrams
  human_summary_ngrams = human_summary_unigrams + human_summary_bi_trigrams + human_summary_tri_trigrams


  diff_list = [ngram for ngram in gen_summary_ngrams if ngram not in human_summary_ngrams]   # unigram that is in a summary but not in the abstract

  novelty_score = float(len(diff_list)) / float(len(gen_summary_ngrams))
  novelty_score = novelty_score * 100

  return novelty_score

  #print(f"{RESULTS_PATH} Novelty Score wrt abstracts: %.2f" % novelty_score)


## Evaluation against the citation contexts themselves

In [11]:
def main():
  # Read the citation contexts for each reference paper
  model_name = "TransFuse"  # changes with the model used
  w_citing_sentences_w_rp_abstract = True # Turn on or off based on whether to have RP abstract as input

  if w_citing_sentences_w_rp_abstract:
    summary_type = "SUMMARIES_FROM_CITATIONS_AND_RP_ABSTRACT"
  else:
    summary_type = "SUMMARIES_FROM_CITATIONS_ONLY"   # changes with the summary type

  GENERATED_SUMMARY_PATH = f"{model_name}_Results_SciSummNet/{summary_type}"
  HUMAN_SUMMARY_PATH = "ScisummNet/scisummnet_release1.1__20190413/top1000_complete"

  # First read all the generated summaries into a dict where the key is the paper_id---same for human summaries---the keys will be used to match the two
  dict_generated_summaries = {}
  dict_citation_contexts = {}

  # Read all the generated summaries
  for paper_id in os.listdir(GENERATED_SUMMARY_PATH):
    paper_id_wo_txt = str(paper_id.replace('.txt', ''))
    with open(os.path.join(GENERATED_SUMMARY_PATH, paper_id), 'r') as fp:
      summary = fp.read()
    fp.close()
    dict_generated_summaries[paper_id_wo_txt] = summary

  DATA_PATH = "ScisummNet/scisummnet_release1.1__20190413/top1000_complete"
  RESULTS_DIR = f"{model_name}_Results_SciSummNet"   # changes based on the model currently being used
  #w_citing_sentences_w_rp_abstract = True   # changes according to input configuration (if False, it is with citation contexts only)
  dict_citation_contexts = {}

  for iter, paper_id in enumerate(os.listdir(DATA_PATH)):
    if iter % 200 == 0:
      print("Iteration: {}".format(iter))
    citing_sentences = list()   # to store all incoming citing sentences
    for file in os.listdir(os.path.join(DATA_PATH, paper_id)):
      if file.endswith('.json'):
        with open(os.path.join(f"{DATA_PATH}/{paper_id}", file), 'r') as fp:
          data = json.load(fp)
        fp.close()
        citing_sentences = [obj['clean_text'] for obj in data]

    complete_citing_sentences_str = " ".join(citing_sentences)
    dict_citation_contexts[paper_id] = complete_citing_sentences_str

  ## Call to the the novelty computing method
  total_no_summaries = len(dict_generated_summaries)

  novelty_sum = 0.0
  for paper_id, gen_summary in dict_generated_summaries.items():
    citation_contexts = dict_citation_contexts[paper_id]

    # call to the novelty evaluation method
    try:
      novelty = _compute_novelty_I(gen_summary, citation_contexts)
    except:
      continue

    novelty_sum += novelty

  avg_novelty = novelty_sum/float(total_no_summaries)

  print(f"\nFinal Novelty for summaries using {model_name} and {summary_type} wrt citation contexts: %.2f" % avg_novelty)


In [12]:
if __name__ == "__main__":
  main()

Iteration: 0
Iteration: 200
Iteration: 400
Iteration: 600
Iteration: 800
Iteration: 1000

Final Novelty for summaries using TransFuse and SUMMARIES_FROM_CITATIONS_AND_RP_ABSTRACT wrt citation contexts: 45.79
