In [None]:
# Mount Google Drive inside the Colab VM at /content/drive so later cells can
# read the data and checkpoints stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd drive/My\ Drive/Colab\ Notebooks/apex-codes/citation_sum

/content/drive/My Drive/Colab Notebooks/apex-codes/citation_sum


In [None]:
# Install Hugging Face transformers directly from the GitHub repository
# (no tag or commit is pinned, so the installed version follows the default branch).
!pip3 install git+https://github.com/huggingface/transformers

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-ry0zvi6p
  Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-ry0zvi6p
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 5.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 7.8 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |█████████████████████████████

In [None]:
# Install PyTorch and sentencepiece.
# NOTE(review): sentencepiece is presumably required by the T5/Pegasus tokenizers — confirm.
!pip3 install torch
!pip3 install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 5.1 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


In [None]:
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
from transformers import (BartTokenizer, BartForConditionalGeneration, T5Tokenizer, T5ForConditionalGeneration, 
                          PegasusTokenizer, PegasusForConditionalGeneration, ProphetNetTokenizer, ProphetNetForConditionalGeneration)

import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
import pandas as pd
import numpy as np
import os
import json
from pprint import pprint

In [None]:
from torch import cuda

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
print(device)

cuda


## Load the fine-tuned baseline (fine-tuned on 50k examples from the arXiv dataset)
## Citation contexts with their respective reference article's abstract — this is just for one field of study

In [None]:
def _loadModel(model_name):
  if model_name == 'BART':
    model = BartForConditionalGeneration.from_pretrained("../citation_sum/BART-checkpoints/")
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
  elif model_name == 'T5':
    model = T5ForConditionalGeneration.from_pretrained("../citation_sum/T5-checkpoints/")
    tokenizer = T5Tokenizer.from_pretrained("t5-base")
  elif model_name == "Pegasus":
    model = PegasusForConditionalGeneration.from_pretrained("../citation_sum/Pegasus-checkpoints/")
    tokenizer = PegasusTokenizer.from_pretrained('google/pegasus-xsum')
  elif model_name == "Prophetnet":
    model = ProphetNetForConditionalGeneration.from_pretrained("../citation_sum/Prophetnet-checkpoints/")
    tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased-cnndm')

  return model, tokenizer

### Load model and tokenizer

In [None]:
# Choose which fine-tuned summarizer to run; _loadModel recognises
# "BART", "T5", "Pegasus" and "Prophetnet".
model_name = "Prophetnet"   # Specify the model_name to work with here

# Load the checkpoint and its tokenizer, then move the model onto the selected device.
model, tokenizer = _loadModel(model_name)
model = model.to(device)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

In [None]:
# Iterate through each SciSummNet folder and generate abstractive summaries from 1) citing sentences alone; 2) citing sentences+title
# 3) citing sentences + title + RP topics

def _generate_final_summary(ARTICLE_TO_SUMMARIZE, RESULTS_SUB_DIR, paper_id, MAX_LEN=250):
  """Summarize one input text with the loaded model and write the result to disk.

  Relies on the notebook-level `tokenizer`, `model` and `device`.

  Args:
    ARTICLE_TO_SUMMARIZE: The input text to summarize.
    RESULTS_SUB_DIR: Directory the summary file is written into (must exist).
    paper_id: Identifier used as the output file name (`<paper_id>.txt`).
    MAX_LEN: `max_length` passed to `model.generate` for the summary.

  Returns:
    None. The summary is written to `{RESULTS_SUB_DIR}/{paper_id}.txt`.
  """
  # `truncation=True` alone has no effect when the tokenizer has no predefined
  # maximum length (it only emits a warning and skips truncation), so cap the
  # input at the model's position limit when the config exposes one. When it
  # does not, max_length stays None and the tokenizer's own default applies.
  max_input_len = getattr(model.config, 'max_position_embeddings', None)
  inputs = tokenizer(ARTICLE_TO_SUMMARIZE, truncation=True, max_length=max_input_len,
                     padding='longest', return_tensors='pt').to(device)

  # Beam search; only the highest-scoring beam is returned and used as the summary.
  # The attention mask is forwarded so generate does not have to infer it.
  summary_ids = model.generate(inputs['input_ids'], attention_mask=inputs.get('attention_mask'),
                               num_beams=4, min_length=50, max_length=MAX_LEN,
                               early_stopping=True, num_return_sequences=1)
  abstractive_summary = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0]

  # Write the generated summary to the results directory; the `with` block
  # closes the file, so no explicit close() is needed.
  with open(f"{RESULTS_SUB_DIR}/{paper_id}.txt", 'w') as fp:
    fp.write(abstractive_summary)


## Iterate through the 1000 reference papers in the ScisummNet corpus and call the function defined above

In [None]:
DATA_PATH = "ScisummNet/scisummnet_release1.1__20190413/top1000_complete"
RESULTS_DIR = f"{model_name}_Results_SciSummNet"   # changes based on the model currently being used
w_citing_sentences_w_rp_abstract = True   # changes according to input configuration (if False, it is with citation contexts only)

# The input configuration is fixed for the whole run, so pick the output
# sub-directory (named after the input type) and create it once, up front,
# instead of on every iteration.
if w_citing_sentences_w_rp_abstract:
  TARGET_SUB_DIR = "SUMMARIES_FROM_CITATIONS_AND_RP_ABSTRACT"   # citation contexts + RP abstract
else:
  TARGET_SUB_DIR = "SUMMARIES_FROM_CITATIONS_ONLY"   # citation contexts only
RESULTS_SUB_DIR = f"{RESULTS_DIR}/{TARGET_SUB_DIR}"
os.makedirs(RESULTS_SUB_DIR, exist_ok=True)

for count, paper_id in enumerate(os.listdir(DATA_PATH)):
  if count % 100 == 0:
    print(count)   # coarse progress indicator

  paper_dir = os.path.join(DATA_PATH, paper_id)
  if not os.path.isdir(paper_dir):
    # Stray files in the corpus root would make os.listdir below fail.
    continue

  # Collect the citing sentences from every JSON file in the paper folder.
  # (Extending rather than reassigning keeps sentences from all JSON files,
  # not only the one listed last.)
  citing_sentences = list()
  for file in os.listdir(paper_dir):
    if file.endswith('.json'):
      with open(os.path.join(paper_dir, file), 'r') as fp:
        data = json.load(fp)
      citing_sentences.extend(obj['clean_text'] for obj in data)

  complete_citing_sentences_str = " ".join(citing_sentences)

  if w_citing_sentences_w_rp_abstract:
    # Read the RP abstract only when it is part of the input.
    # The file name is kept exactly as this notebook has always read it.
    with open(os.path.join(paper_dir, "referece_paper_abstract.txt"), 'r') as fp:
      rp_abstract = fp.read().strip()
    ARTICLE_TO_SUMMARIZE = rp_abstract + " " + complete_citing_sentences_str
  else:
    ARTICLE_TO_SUMMARIZE = complete_citing_sentences_str

  # Generate the abstractive summary and write it to RESULTS_SUB_DIR.
  _generate_final_summary(ARTICLE_TO_SUMMARIZE, RESULTS_SUB_DIR, paper_id)
  

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


0
100
200
300
400
500
600
700
800
900
1000


## Helper code: how to load a complete model from the file system and save it into a checkpoint directory

In [None]:
# Switch to transformers 3.5.0 for the helper cells below; this replaces the
# build installed at the top of the notebook.
# NOTE(review): a kernel restart is presumably needed before the new version is importable — confirm.
!pip3 install transformers==3.5.0

Collecting transformers==3.5.0
  Using cached transformers-3.5.0-py3-none-any.whl (1.3 MB)
Collecting tokenizers==0.9.3
  Using cached tokenizers-0.9.3-cp37-cp37m-manylinux1_x86_64.whl (2.9 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.10.3
    Uninstalling tokenizers-0.10.3:
      Successfully uninstalled tokenizers-0.10.3
  Attempting uninstall: transformers
    Found existing installation: transformers 4.13.0.dev0
    Uninstalling transformers-4.13.0.dev0:
      Successfully uninstalled transformers-4.13.0.dev0
Successfully installed tokenizers-0.9.3 transformers-3.5.0


In [None]:
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

from transformers import (T5Tokenizer, T5ForConditionalGeneration, 
                          BartTokenizer, BartForConditionalGeneration)

In [None]:
model_name = "pubmed-T5"

# Path of the fully serialized model object on disk.
model_path = f"../entity_sum/pubmed-pytorch_models/{model_name}.pt"

# NOTE: torch.load unpickles arbitrary Python objects — only load files from a trusted source.
# map_location='cpu' lets the file load on a CPU-only runtime even if it was saved from a GPU;
# the object is only re-exported with save_pretrained afterwards, so it does not need a GPU.
model = torch.load(model_path, map_location='cpu')

In [None]:
# Export the loaded model into a checkpoint directory that from_pretrained can read.
model.save_pretrained('../entity_sum/T5-checkpoints/')

In [None]:
# Reload the model from the T5 checkpoint directory and pair it with the
# pretrained t5-base tokenizer.
model = T5ForConditionalGeneration.from_pretrained("../entity_sum/T5-checkpoints/")
tokenizer = T5Tokenizer.from_pretrained("t5-base")

In [None]:
# Smoke-test the reloaded checkpoint on a single sentence.
# truncation=True makes the max_length cap explicit; without it the tokenizer
# emits a warning and falls back to the 'longest_first' strategy implicitly.
inputs = tokenizer(["Deep Learning and Machine Learning are everywhere"], max_length=1024,
                   truncation=True, return_tensors='pt')
summary_ids = model.generate(inputs['input_ids'], num_beams=4, min_length=50, max_length=100, early_stopping=True)
# Only the first generated sequence is used, so decode just that one.
abstractive_summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
