In [None]:
# Mount Google Drive at /content/drive so later cells can read and write files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd drive/My\ Drive/Colab\ Notebooks/apex-codes/citation_sum

/content/drive/My Drive/Colab Notebooks/apex-codes/citation_sum


In [None]:
# Fetch the NLTK 'punkt' tokenizer data and the 'stopwords' corpus.
# NOTE(review): no cell visible in this part of the notebook uses nltk — confirm it is still needed.
import nltk
nltk.download('punkt')  
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import gensim
import os
import pandas as pd
import numpy as np
import pickle as pk
from scipy.spatial.distance import cosine
from gensim.test.utils import common_texts
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

## SPECTER embedding API client (see https://github.com/allenai/paper-embedding-public-apis)

In [None]:
from typing import Dict, List
import json
import requests

# Endpoint every batch is POSTed to, and the default number of papers per batch.
URL = "https://model-apis.semanticscholar.org/specter/v1/invoke"
MAX_BATCH_SIZE = 16


def chunks(lst, chunk_size=MAX_BATCH_SIZE):
    """Yield consecutive slices of ``lst`` that each hold at most ``chunk_size`` items."""
    starts = range(0, len(lst), chunk_size)
    yield from (lst[begin:begin + chunk_size] for begin in starts)

def embed(papers, timeout=120):
    """Request an embedding for every paper from the SPECTER endpoint.

    Args:
        papers: list of JSON-serialisable records; the callers in this
            notebook pass dicts with the keys "paper_id", "title" and "abstract".
        timeout: seconds to wait for each HTTP request before giving up, so a
            stalled connection cannot block the notebook forever.

    Returns:
        dict mapping each returned "paper_id" to its embedding as a numpy array.

    Raises:
        RuntimeError: if the service answers with a non-200 status code.
        requests.exceptions.RequestException: on connection errors or timeouts.
    """
    embeddings_by_paper_id: Dict[str, np.ndarray] = {}

    for chunk in chunks(papers):
        # Passing ``json=`` lets requests serialise the batch to JSON itself.
        response = requests.post(URL, json=chunk, timeout=timeout)

        if response.status_code != 200:
            # Keep the original message but add the status and a short body
            # excerpt so failures can actually be diagnosed.
            raise RuntimeError(
                "Sorry, something went wrong, please try later! "
                f"(HTTP {response.status_code}: {response.text[:200]})"
            )

        for paper in response.json()["preds"]:
            embeddings_by_paper_id[paper["paper_id"]] = np.array(paper["embedding"])

    return embeddings_by_paper_id

# Working with SPECTER

In [None]:
# Idempotent replacement for `%mkdir`: re-running the cell no longer reports
# "File exists" when the directory is already there.
os.makedirs("SPECTER_FILES", exist_ok=True)

In [None]:
# Show the current working directory; the relative paths used by the cells below are resolved against it.
!pwd

/content/drive/My Drive/Colab Notebooks/apex-codes/citation_sum


## Read the papers of a given fos, generate a SPECTER embedding for each abstract, and write the embeddings to the file system

In [None]:
# Function to read an fos and generate SPECTER embedding for each abstract and write to a file system
def _generate_SPECTER_embed_abstract(fos):
  """Embed every abstract listed for ``fos`` and pickle the result.

  Reads ``../100_most_cited_w_titles/{fos}.csv`` (the columns ``paper_id``,
  ``paper_title`` and ``paper_abstract`` are used), sends one record per row
  to ``embed`` and writes the returned ``{paper_id: embedding}`` dict to
  ``SPECTER_FILES/RP_abstract_embeddings/{fos}.pk``.

  Args:
    fos: name used to select the input CSV and to name the output pickle.

  Returns:
    None. The embeddings are only written to disk.
  """
  df = pd.read_csv(f'../100_most_cited_w_titles/{fos}.csv')

  # One request record per CSV row; paper ids are sent as strings.
  rp_abstract_lst = [
      {"paper_id": str(paper_id), "title": title, "abstract": abstract}
      for paper_id, title, abstract in zip(df['paper_id'], df['paper_title'], df['paper_abstract'])
  ]

  all_embeddings = embed(rp_abstract_lst)   # Generate embeddings for the abstracts

  out_path = f"SPECTER_FILES/RP_abstract_embeddings/{fos}.pk"
  # The sub-directory is not created anywhere else, so make sure it exists.
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
  # The context manager closes (and flushes) the file even if pickling fails.
  with open(out_path, "wb") as fp:
    pk.dump(all_embeddings, fp)


In [None]:
# Embed the abstracts of every fos: the comparison cells further down load the
# abstract embeddings of all three, so generating a single fos here left the
# notebook unable to run from a clean SPECTER_FILES directory.
for fos in ["artificial_intelligence", "machine_learning", "data_mining"]:
  _generate_SPECTER_embed_abstract(fos)

## Generate SPECTER embeddings for the topic-aware/topic-focused citation contexts (TaCC) of each cited paper. The data structure mirrors the one built for the abstracts of each fos; the abstract text is simply replaced by the paper's concatenated TaCC

In [None]:
def _generate_SPECTER_embed_TaCC(fos):
  """Embed the concatenated topic-focused citation contexts of every cited paper of ``fos``.

  Every entry of
  ``100_highest_ranked_citing_papers/citation_contexts_TOPIC_FOCUSED/{fos}``
  is read as a CSV whose file name (minus ``.csv``) is the cited paper's id.
  The text cells of its ``topic_focused_citation_context`` column are cleaned
  of quote and bracket characters, joined with spaces and sent to ``embed`` in
  place of an abstract, together with the paper's title looked up in
  ``../100_most_cited_w_titles/{fos}.csv``. The resulting
  ``{paper_id: embedding}`` dict is pickled to
  ``SPECTER_FILES/TaCC_embeddings/{fos}.pk``.

  Args:
    fos: name used to select the input directory/CSV and to name the output pickle.

  Returns:
    None. The embeddings are only written to disk.

  Raises:
    ValueError: if a file name (minus ``.csv``) is not an integer paper id.
    IndexError: if a cited id has no row in the title CSV.
  """
  TOPIC_FOCUSED_CITATION_CONTEXT_PATH = f"100_highest_ranked_citing_papers/citation_contexts_TOPIC_FOCUSED/{fos}"

  df_title_w_paper_id = pd.read_csv(f'../100_most_cited_w_titles/{fos}.csv')

  TaCC_lst = []
  for cited_id in os.listdir(TOPIC_FOCUSED_CITATION_CONTEXT_PATH):
    df = pd.read_csv(os.path.join(TOPIC_FOCUSED_CITATION_CONTEXT_PATH, cited_id))

    # Skip files without rows or without the context column. This explicit
    # check replaces a bare `except:` that hid every other kind of error.
    if len(df) == 0 or 'topic_focused_citation_context' not in df.columns:
      continue

    # Grab topic-focused citation contexts for a cited article. Cells that are
    # not text (e.g. NaN for an empty cell) are skipped, as before, but without
    # a catch-all `except:`.
    cleaned_contexts = [
        citation_context.replace("'", '').replace(".,", '.').replace('[', '').replace(']', '')
        for citation_context in df['topic_focused_citation_context'].values
        if isinstance(citation_context, str)
    ]
    topic_citation_contexts_final = ' '.join(cleaned_contexts)

    cited_id_wo_csv = cited_id.replace('.csv', '')
    title_matches = df_title_w_paper_id.loc[
        df_title_w_paper_id['paper_id'] == int(cited_id_wo_csv), 'paper_title'
    ].values
    paper_title = title_matches[0]

    TaCC_lst.append({"paper_id": str(cited_id_wo_csv),
                     "title": paper_title,
                     "abstract": topic_citation_contexts_final})

  all_embeddings = embed(TaCC_lst)   # Generate embeddings for the concatenated citation contexts

  out_path = f"SPECTER_FILES/TaCC_embeddings/{fos}.pk"
  # The sub-directory is not created anywhere else, so make sure it exists.
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
  # The context manager closes (and flushes) the file even if pickling fails.
  with open(out_path, "wb") as fp:
    pk.dump(all_embeddings, fp)

In [None]:
# Embed the citation contexts of every fos: the comparison cells further down
# load the TaCC embeddings of all three, so generating a single fos here left
# the notebook unable to run from a clean SPECTER_FILES directory.
for fos in ["artificial_intelligence", "machine_learning", "data_mining"]:
  _generate_SPECTER_embed_TaCC(fos)

## Generate SPECTER embeddings for each class of summaries (i.e., the summaries produced by the different models, e.g., T5, BART, Pegasus). Note that, in addition to the fos, this function also accepts the summary_path, which selects the model whose summaries are embedded (T5, BART, Pegasus, and so on)

In [None]:
def _generate_SPECTER_embed_summary(fos, summary_path, summary_embed_path):
  """Embed every summary file found in ``summary_path`` and pickle the result.

  Each entry of ``summary_path`` is read as text; its file name (minus
  ``.csv``) is the cited paper's id, whose title is looked up in
  ``../100_most_cited_w_titles/{fos}.csv``. The summary text is sent to
  ``embed`` in place of an abstract and the resulting
  ``{paper_id: embedding}`` dict is pickled to
  ``SPECTER_FILES/{summary_embed_path}/{fos}.pk``.

  Args:
    fos: name used to select the title CSV and to name the output pickle.
    summary_path: directory holding one summary file per cited paper.
    summary_embed_path: sub-directory of ``SPECTER_FILES`` to write into.

  Returns:
    None. The embeddings are only written to disk.

  Raises:
    ValueError: if a file name (minus ``.csv``) is not an integer paper id.
    IndexError: if a cited id has no row in the title CSV.
  """
  df_title_w_paper_id = pd.read_csv(f'../100_most_cited_w_titles/{fos}.csv')
  summary_lst = []

  for cited_id in os.listdir(summary_path):
    # The `with` block already closes the file; no explicit close() is needed.
    with open(os.path.join(summary_path, cited_id), 'r') as fp:
      summary = fp.read()

    cited_id_wo_csv = cited_id.replace('.csv', '')

    title_matches = df_title_w_paper_id.loc[
        df_title_w_paper_id['paper_id'] == int(cited_id_wo_csv), 'paper_title'
    ].values
    paper_title = title_matches[0]

    summary_lst.append({"paper_id": str(cited_id_wo_csv),
                        "title": paper_title,
                        "abstract": summary})

  all_embeddings = embed(summary_lst)   # Generate embeddings for the summaries

  out_path = f"SPECTER_FILES/{summary_embed_path}/{fos}.pk"
  # The sub-directory is not created anywhere else, so make sure it exists.
  os.makedirs(os.path.dirname(out_path), exist_ok=True)
  # The context manager closes (and flushes) the file even if pickling fails.
  with open(out_path, "wb") as fp:
    pk.dump(all_embeddings, fp)


In [None]:
# Embed the summaries stored under `summary_type` for every fos and write them
# below SPECTER_FILES/`summary_embed_path`.
# NOTE(review): only the MAGSumm summaries are embedded here, while the
# comparison cells also load T5/BART/Pegasus/ProphetNet embeddings — presumably
# this cell was re-run with other values for the two settings below; confirm.
fos_lst = ["artificial_intelligence", "machine_learning", "data_mining"]
summary_type = "MAGSumm_Results_w_title_and_topic_augmentation"
summary_embed_path = "MAGSumm_Summary_w_title_w_topic_embeddings"

for fos in fos_lst:
  # Directory holding the summary files of this fos.
  SUMMARY_PATH = f"{summary_type}/{fos}"
  _generate_SPECTER_embed_summary(fos, SUMMARY_PATH, summary_embed_path)


In [None]:
fos = "artificial_intelligence"
# Read the pickled abstract embeddings back from the file system; the context
# manager closes the file handle, which the bare open() call never did.
# Despite its name, RP_ABSTRACT_EMBED_PATH holds the loaded object, not a path.
with open(f"SPECTER_FILES/RP_abstract_embeddings/{fos}.pk", "rb") as fp:
  RP_ABSTRACT_EMBED_PATH = pk.load(fp)

In [None]:
# Spot-check: display the shape of the embedding stored for one paper id.
RP_ABSTRACT_EMBED_PATH['2964299589'].shape

(768,)

## Semantic Equivalence between summaries and abstracts

In [None]:
def _compute_cosine_summary_w_abstract(fos, summary_embed_path):
  """Average the cosine distance between each summary and its paper's abstract.

  Loads ``SPECTER_FILES/RP_abstract_embeddings/{fos}.pk`` and
  ``SPECTER_FILES/{summary_embed_path}/{fos}.pk`` (both are used as dicts
  mapping paper id to embedding) and, for every paper id in the summary
  dict, applies ``scipy.spatial.distance.cosine`` to the summary embedding
  and the abstract embedding stored under the same id.

  NOTE: ``scipy.spatial.distance.cosine`` is a *distance*
  (1 - cosine similarity), so lower values mean the two texts are closer.
  NOTE(review): confirm that a distance, not a similarity, is the quantity
  meant to be reported as "semantic equivalence".

  Args:
    fos: name of the pickle files to load.
    summary_embed_path: sub-directory of ``SPECTER_FILES`` holding the
      summary embeddings.

  Returns:
    The mean cosine distance over all summaries, as a float.

  Raises:
    KeyError: if a summary's paper id has no abstract embedding.
    ZeroDivisionError: if the summary dict is empty.
  """
  # Read both embedding dicts from the file system; the context managers
  # close the file handles, which the bare open() calls never did.
  with open(f"SPECTER_FILES/RP_abstract_embeddings/{fos}.pk", "rb") as fp:
    DICT_RP_ABSTRACT_EMBED = pk.load(fp)
  with open(f"SPECTER_FILES/{summary_embed_path}/{fos}.pk", "rb") as fp:
    DICT_SUMMARY = pk.load(fp)

  total_cosine_sum = 0.0
  for paper_id, summary_embed in DICT_SUMMARY.items():
    total_cosine_sum += cosine(summary_embed, DICT_RP_ABSTRACT_EMBED[paper_id])

  return total_cosine_sum / float(len(DICT_SUMMARY))


In [None]:
fos_lst = ["artificial_intelligence", "machine_learning", "data_mining"]

summary_embed_path_list = ["T5_Summary_w_title_w_topic_embeddings", "BART_Summary_w_title_w_topic_embeddings", 
                           "Pegasus_Summary_w_title_w_topic_embeddings", "ProphetNet_Summary_w_title_w_topic_embeddings", 
                           "MAGSumm_Summary_w_title_w_topic_embeddings"]
for fos in fos_lst:
  print(f"Results for {fos} \n")
  for summary_embed_path in summary_embed_path_list:
    # The average is scaled by 100 before printing.
    avg_cosine = _compute_cosine_summary_w_abstract(fos, summary_embed_path) * 100
    # Format inside the f-string: the old f-string + "%" mix would break as
    # soon as a path name contained a "%" character.
    print(f"{summary_embed_path} cosine wrt RP abstracts: {avg_cosine:.2f}")
  print("\n")

## Semantic Equivalence between summaries and TaCC from which the summaries are generated

In [None]:
def _compute_cosine_summary_w_TaCC(fos, summary_embed_path):
  """Average the cosine distance between each summary and its paper's citation contexts.

  Loads ``SPECTER_FILES/TaCC_embeddings/{fos}.pk`` and
  ``SPECTER_FILES/{summary_embed_path}/{fos}.pk`` (both are used as dicts
  mapping paper id to embedding) and, for every paper id in the summary
  dict, applies ``scipy.spatial.distance.cosine`` to the summary embedding
  and the TaCC embedding stored under the same id.

  NOTE: ``scipy.spatial.distance.cosine`` is a *distance*
  (1 - cosine similarity), so lower values mean the two texts are closer.
  NOTE(review): confirm that a distance, not a similarity, is the quantity
  meant to be reported as "semantic equivalence".

  Args:
    fos: name of the pickle files to load.
    summary_embed_path: sub-directory of ``SPECTER_FILES`` holding the
      summary embeddings.

  Returns:
    The mean cosine distance over all summaries, as a float.

  Raises:
    KeyError: if a summary's paper id has no TaCC embedding.
    ZeroDivisionError: if the summary dict is empty.
  """
  # Read both embedding dicts from the file system; the context managers
  # close the file handles, which the bare open() calls never did.
  with open(f"SPECTER_FILES/TaCC_embeddings/{fos}.pk", "rb") as fp:
    DICT_TaCC_EMBED = pk.load(fp)
  with open(f"SPECTER_FILES/{summary_embed_path}/{fos}.pk", "rb") as fp:
    DICT_SUMMARY = pk.load(fp)

  total_cosine_sum = 0.0
  for paper_id, summary_embed in DICT_SUMMARY.items():
    total_cosine_sum += cosine(summary_embed, DICT_TaCC_EMBED[paper_id])

  return total_cosine_sum / float(len(DICT_SUMMARY))

In [None]:
fos_lst = ["artificial_intelligence", "machine_learning", "data_mining"]

summary_embed_path_list = ["T5_Summary_w_title_w_topic_embeddings", "BART_Summary_w_title_w_topic_embeddings", 
                           "Pegasus_Summary_w_title_w_topic_embeddings", "ProphetNet_Summary_w_title_w_topic_embeddings", 
                           "MAGSumm_Summary_w_title_w_topic_embeddings"]

for fos in fos_lst:
  print(f"Results for {fos} \n")
  for summary_embed_path in summary_embed_path_list:
    # The average is scaled by 100 before printing.
    avg_cosine = _compute_cosine_summary_w_TaCC(fos, summary_embed_path) * 100
    # Format inside the f-string: the old f-string + "%" mix would break as
    # soon as a path name contained a "%" character.
    print(f"{summary_embed_path} cosine wrt TaCC: {avg_cosine:.2f}")
  print("\n")