#Set up

In [2]:
# Special thanks to Christine Chen for coding this!
!pip install bert-serving-client
import pandas as pd
import numpy as np
import os
from pathlib import Path, PurePath
from transformers import *
from summarizer import Summarizer
import logging
import torch
from numpy import ndarray
from typing import List
from tqdm.notebook import tqdm
import spacy

In [3]:
# Set the data path. This requires a local folder (named "./Cord-2" in the code below) that contains the metadata CSV file.
# Returns the local data directory path as a string.
def setup_local_data():
    """Return the relative path of the local data directory as a string."""
    return "./Cord-2"
# Load a spaCy pipeline from a hard-coded, machine-specific model directory,
# with the "ner" and "tagger" components disabled.
# NOTE(review): `nlp` is not referenced again in the visible cells — confirm it is still needed.
nlp = spacy.load('/home/acorn/Downloads/en_core_sci_lg-0.2.4/en_core_sci_lg/en_core_sci_lg-0.2.4/', disable=["ner","tagger"])

In [4]:
#read the metadata file into df
def read_metadata_csv(input_dir):
    """Load ``metadata_v5.csv`` from *input_dir* and keep rows with a usable abstract.

    Missing abstracts are replaced by the paper title. Rows whose abstract is
    still missing afterwards, or whose abstract has at most one
    whitespace-separated token, are removed.

    :param input_dir: Directory containing ``metadata_v5.csv``; may be a string
        or any path-like object.
    :return: A DataFrame with a fresh 0..n-1 index and an extra
        ``number_tokens`` column holding the token count of each abstract.
    """
    # Path(...) accepts both str and path-like inputs; plain string
    # concatenation would raise TypeError for a pathlib.Path argument.
    metadata_path = Path(input_dir) / 'metadata_v5.csv'
    metadata = pd.read_csv(metadata_path,
                           dtype={'title': str,
                                  'abstract': str})
    # Fall back to the paper title when the abstract is null.
    metadata['abstract'] = metadata['abstract'].fillna(metadata['title'])
    # Rows with neither abstract nor title are still null here; drop them.
    # The explicit copy makes the column assignment below act on an
    # independent frame rather than on a possible view of the original.
    metadata = metadata.dropna(subset=['abstract'], axis=0).copy()
    metadata['number_tokens'] = metadata['abstract'].apply(lambda text: len(text.split()))
    # Keep only abstracts with more than one token.
    metadata = metadata[metadata['number_tokens'] > 1].reset_index(drop=True)
    return metadata

In [5]:
def create_custom_model_tokenizer(pretrain_model):
    """Load a model and its tokenizer through the Transformers ``Auto*`` classes.

    The model config is loaded first and has ``output_hidden_states`` switched
    on before it is handed to the model loader.

    :param pretrain_model: Model identifier or path passed unchanged to each
        ``from_pretrained`` call.
    :return: A ``(model, tokenizer)`` tuple.
    """
    config = AutoConfig.from_pretrained(pretrain_model)
    config.output_hidden_states = True

    model = AutoModel.from_pretrained(pretrain_model, config=config)
    tokenizer = AutoTokenizer.from_pretrained(pretrain_model)
    return model, tokenizer


In [19]:
# from spacy.lang.en import English

# class SentenceHandler(object):

#     def __init__(self, language = English):
#         self.nlp = language()
#         self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))

#     def process(self, body: str, min_length: int = 40, max_length: int = 600):
#         """
#         Processes the content sentences.
#         :param body: The raw string body to process
#         :param min_length: Minimum length that the sentences must be
#         :param max_length: Max length that the sentences must fall under
#         :return: Returns a list of sentences.
#         """
#         doc = self.nlp(body)
#         doc.is_parsed=True
#         return [c.string.strip() for c in doc.sents if max_length > len(c.string.strip()) > min_length]

#     def __call__(self, body: str, min_length: int = 40, max_length: int = 600):
#         return self.process(body, min_length, max_length)

In [6]:
def extract_summary(text, custom_model=None, custom_tokenizer=None):
    """Summarize *text* with a ``Summarizer`` built from the given model and tokenizer.

    :param text: The text to summarize.
    :param custom_model: Optional model forwarded to ``Summarizer``.
    :param custom_tokenizer: Optional tokenizer forwarded to ``Summarizer``.
    :return: Whatever the ``Summarizer`` instance returns when called on *text*.
    """
    summarizer = Summarizer(
        custom_model=custom_model,
        custom_tokenizer=custom_tokenizer,
    )
    summary = summarizer(text)
    return summary

In [7]:
# Instantiate the custom SciBERT model and tokenizer from a hard-coded local directory.
# These module-level objects are reused further down (in BertParent.MODELS and
# in the summarization loop).
sciBert, sciBert_tokenizer = create_custom_model_tokenizer('/home/acorn/Downloads/scibert_scivocab_uncased')

In [8]:
# Configure the root logger so that only WARNING and more severe messages are emitted.
logging.basicConfig(level=logging.WARNING)


class BertParent(object):

    """
    Base handler for BERT models.

    Holds a transformer model plus its tokenizer and turns text into
    embeddings, either one text at a time (``extract_embeddings``) or as a
    stacked matrix for a list of texts (``create_matrix`` / ``__call__``).
    """

    # Maps a model name to the (model, tokenizer) pair whose ``from_pretrained``
    # is called in ``__init__`` when no custom model/tokenizer is supplied.
    MODELS = {
        'bert-base-uncased': (BertModel, BertTokenizer),
        'bert-large-uncased': (BertModel, BertTokenizer),
        'xlnet-base-cased': (XLNetModel, XLNetTokenizer),
        'xlm-mlm-enfr-1024': (XLMModel, XLMTokenizer),
        'distilbert-base-uncased': (DistilBertModel, DistilBertTokenizer),
        'albert-base-v1': (AlbertModel, AlbertTokenizer),
        'albert-large-v1': (AlbertModel, AlbertTokenizer),
        ## added this to extract sciBert embeddings
        # NOTE(review): unlike the other entries, these two are the objects
        # loaded earlier in the notebook rather than classes; ``__init__`` only
        # calls ``.from_pretrained`` on them.
        'allenai/scibert_scivocab_uncased': (sciBert, sciBert_tokenizer)
    }

    def __init__(
        self,
        model: str,
        custom_model: PreTrainedModel=None,
        custom_tokenizer: PreTrainedTokenizer=None
    ):
        """
        :param model: Model is the string path for the bert weights. If given a keyword, the s3 path will be used
        :param custom_model: This is optional if a custom bert model is used
        :param custom_tokenizer: Place to use custom tokenizer
        :raises ValueError: If ``model`` is not a key of ``MODELS`` and the
            corresponding custom model or tokenizer was not supplied.
        """

        base_model, base_tokenizer = self.MODELS.get(model, (None, None))

        if custom_model:
            self.model = custom_model
        elif base_model is not None:
            self.model = base_model.from_pretrained(model, output_hidden_states=True)
        else:
            # Previously this path failed with an opaque AttributeError on None.
            raise ValueError(
                f"Unknown model {model!r}: pass custom_model or use one of {sorted(self.MODELS)}"
            )

        if custom_tokenizer:
            self.tokenizer = custom_tokenizer
        elif base_tokenizer is not None:
            self.tokenizer = base_tokenizer.from_pretrained(model)
        else:
            raise ValueError(
                f"Unknown model {model!r}: pass custom_tokenizer or use one of {sorted(self.MODELS)}"
            )

        # Inference only: put the model in evaluation mode.
        self.model.eval()

    def tokenize_input(self, text: str) -> torch.Tensor:
        """
        Tokenizes the text input.

        No special tokens are added. If the model config exposes an integer
        ``max_position_embeddings`` and the text produces more tokens than
        that, the token list is truncated to that length and a warning is
        logged.

        :param text: Text to tokenize
        :return: Returns a torch tensor holding one row of token ids
        """
        tokenized_text = self.tokenizer.tokenize(text)
        indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text)

        # Looked up defensively: not every model/config defines this limit.
        config = getattr(getattr(self, 'model', None), 'config', None)
        max_len = getattr(config, 'max_position_embeddings', None)
        if isinstance(max_len, int) and 0 < max_len < len(indexed_tokens):
            logging.warning(
                "Input has %d tokens; truncating to the model limit of %d.",
                len(indexed_tokens), max_len
            )
            indexed_tokens = indexed_tokens[:max_len]

        return torch.tensor([indexed_tokens])

    def extract_embeddings(
        self,
        text: str,
        hidden: int=-2,
        squeeze: bool=False,
        reduce_option: str ='mean'
    ) -> ndarray:

        """
        Extracts the embeddings for the given text

        The hidden-state reduction is only applied when ``hidden`` lies
        strictly between -12 and -1 (i.e. -11..-2). For any other value the
        second-to-last output of the model call is returned unchanged.

        :param text: The text to extract embeddings for.
        :param hidden: The hidden layer to use for a readout handler
        :param squeeze: If we should squeeze the outputs (required for some layers)
        :param reduce_option: How we should reduce the items ('max', 'median';
            anything else means 'mean').
        :return: A squeezed numpy array when ``squeeze`` is true, otherwise a
            torch tensor.
        """

        tokens_tensor = self.tokenize_input(text)

        # Inference only: skip autograd bookkeeping. Values are unchanged;
        # callers already detach the result before converting to numpy.
        with torch.no_grad():
            pooled, hidden_states = self.model(tokens_tensor)[-2:]

            if -1 > hidden > -12:

                if reduce_option == 'max':
                    pooled = hidden_states[hidden].max(dim=1)[0]

                elif reduce_option == 'median':
                    pooled = hidden_states[hidden].median(dim=1)[0]

                else:
                    pooled = hidden_states[hidden].mean(dim=1)

        if squeeze:
            return pooled.detach().numpy().squeeze()

        return pooled

    def create_matrix(
        self,
        content: List[str],
        hidden: int=-2,
        reduce_option: str = 'mean'
    ) -> ndarray:
        """
        Create matrix from the embeddings
        :param content: The list of sentences
        :param hidden: Which hidden layer to use
        :param reduce_option: The reduce option to run.
        :return: A numpy array matrix of the given content, one squeezed
            embedding per entry of ``content``.
        """

        return np.asarray([
            np.squeeze(self.extract_embeddings(t, hidden=hidden, reduce_option=reduce_option).data.numpy())
            for t in content
        ])

    def __call__(
        self,
        content: List[str],
        hidden: int= -2,
        reduce_option: str = 'mean'
    ) -> ndarray:
        """
        Shorthand for ``create_matrix`` with the same arguments.
        :param content: The list of sentences
        :param hidden: Which hidden layer to use
        :param reduce_option: The reduce option to run.
        :return: A numpy array matrix of the given content.
        """
        return self.create_matrix(content, hidden, reduce_option)

In [9]:
#BertParent.MODELS

In [10]:
def get_summary_embeddings(summary, model):
    """Embed *summary* with a newly constructed ``BertParent`` for *model*.

    :param summary: The text to embed.
    :param model: Model name handed to ``BertParent``.
    :return: The result of ``BertParent.extract_embeddings`` with its default options.
    """
    return BertParent(model=model).extract_embeddings(summary)

#Main code

In [12]:
# Resolve the local data directory.
# NOTE(review): `local_dir` is not referenced again in the visible cells, and the
# metadata below is read from a different hard-coded path without going through
# read_metadata_csv — confirm which source is intended.
local_dir = setup_local_data()
metadata = pd.read_csv("./CORD-19-research-challenge/metadata.csv")
# Print a summary of the loaded frame (columns, non-null counts, dtypes).
metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47298 entries, 0 to 47297
Data columns (total 18 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   cord_uid                     47298 non-null  object 
 1   sha                          34283 non-null  object 
 2   source_x                     47298 non-null  object 
 3   title                        47140 non-null  object 
 4   doi                          43956 non-null  object 
 5   pmcid                        28038 non-null  object 
 6   pubmed_id                    35409 non-null  float64
 7   license                      47298 non-null  object 
 8   abstract                     39048 non-null  object 
 9   publish_time                 47289 non-null  object 
 10  authors                      45189 non-null  object 
 11  journal                      42894 non-null  object 
 12  Microsoft Academic Paper ID  964 non-null    float64
 13  WHO #Covidence  

  interactivity=interactivity, compiler=compiler, result=result)


In [14]:
# Number of pieces the metadata frame is split into for processing.
CHUNKS_COUNT = 20
from bert_serving.client import BertClient
# Client used further down to encode each summary via bc.encode(...).
# NOTE(review): presumably needs a bert-serving server that is already running
# and reachable with the default connection settings — confirm.
bc = BertClient()

In [None]:
#extract summaries from the abstracts
#chunk processing due to long processing time and possible notebook runtime shutdowns
# Split row positions rather than the frame itself, then slice with iloc, so
# that each chunk is an ordinary DataFrame.
chunks = [
    metadata.iloc[positions]
    for positions in np.array_split(np.arange(len(metadata)), CHUNKS_COUNT)
]
# One DataFrame of results per chunk. Previously the per-chunk list was reset
# on every iteration and only the last chunk reached the output file.
chunk_dfs = []
for index, chunk in enumerate(chunks):
    vector_list = []
    rows = zip(chunk['cord_uid'], chunk['sha'], chunk['abstract'])
    for cord_uid, sha, abstract in tqdm(rows, total=len(chunk),
                                        desc=f"chunk {index + 1}/{len(chunks)}"):
        # Skip missing (non-string) abstracts and those of 10 characters or fewer.
        if not isinstance(abstract, str) or len(abstract) <= 10:
            continue
        summary = extract_summary(abstract,
                                  custom_model=sciBert,
                                  custom_tokenizer=sciBert_tokenizer)
        vector_list.append(
            {
                "cord_uid": cord_uid,
                "sha": sha,
                "scibert_emb": bc.encode([summary])[0],
                "summary": summary
            })
    # Chunks without any usable abstract contribute nothing.
    if vector_list:
        chunk_dfs.append(pd.DataFrame(data=vector_list))

# Combine the results of every chunk into a single frame before saving.
vector_df = pd.concat(chunk_dfs, ignore_index=True) if chunk_dfs else pd.DataFrame()
vector_df.to_json('abstract_summaries.json')