# Generating SPECTER Embeddings for DHQ Corpus

## Benjamin Charles Germain Lee

This Jupyter notebook contains the code for generating SPECTER paper embeddings for the DHQ corpus using title + abstract.

In [1]:
# need this import for the huggingface implementation of SPECTER to work properly
from transformers import AutoTokenizer, AutoModel

# for tensor manipulation
import torch

# for distance calculation with embeddings
from scipy.spatial.distance import pdist, squareform

# generic imports
from tqdm import tqdm
import numpy as np
import math
import csv
import sys

## First, we load in the DHQ corpus data from a TSV (with titles & abstracts).

This can be updated by swapping in a fresh TSV.


In [2]:
# Read the whole TSV in one pass: the first row holds the column headings and
# every following row describes one paper.
# newline="" lets the csv module handle line endings itself (this matters for
# newlines embedded in quoted fields), and the encoding is pinned so the result
# does not depend on the platform's default locale.
# NOTE(review): assumes the TSV is UTF-8 encoded -- confirm when swapping in a fresh file.
with open("2022-dhq-articles-with-abstracts.tsv", newline="", encoding="utf-8") as fd:
    rd = csv.reader(fd, delimiter="\t", quotechar='"')
    rows = list(rd)

# an empty file yields no headings and no papers
fields = rows[0] if rows else []
data = rows[1:]

# total number of rows read, header included
row_ct = len(rows)
        

In [3]:
# show the column headings, followed by how many papers were loaded
print(fields)
print(f"Number of papers: {len(data)}")

['Article ID', 'Pub. Year', 'Volume and Issue', 'Authors', 'Affiliations', 'Title', 'Abstract', '# of Cited Works']
Number of papers: 643


In [4]:
# here, we sort the data by paper ID (column 0, parsed as an integer)
# sorted() is stable, so rows that share an ID keep their order from the file,
# and no row is dropped whatever its ID value is (scanning a fixed range of
# candidate IDs would silently lose any ID outside that range).
sorted_data = sorted(data, key=lambda row: int(row[0]))

data = sorted_data

# for i in range(0, len(data)):
#     print(i, int(data[i][0]))

## Next, we produce the embeddings.

Here, we feed the title and abstract of each paper, joined by the tokenizer's separator token, into SPECTER (the `allenai/specter2_base` checkpoint). The code snippet here is modified from the SPECTER repo.

In [5]:
# checkpoint name shared by the tokenizer and the base model
specter_checkpoint = 'allenai/specter2_base'

# load the tokenizer and the base model from that checkpoint
tokenizer = AutoTokenizer.from_pretrained(specter_checkpoint)
model = AutoModel.from_pretrained(specter_checkpoint)

# set batch size
batch_size = 4

# define chunking function for batches
def chunk(file_list, n_chunks):
    """Split ``file_list`` into ``n_chunks`` consecutive slices, preserving order.

    Every chunk except the last has ``ceil(len(file_list) / n_chunks)`` items;
    the last chunk takes whatever is left, so trailing chunks can be shorter
    or even empty (e.g. 5 items in 4 chunks gives sizes 2, 2, 1, 0).

    Args:
        file_list: sliceable sequence of items to split.
        n_chunks: number of chunks to produce; must be at least 1 unless
            ``file_list`` is empty.

    Returns:
        A list of ``n_chunks`` slices of ``file_list``, or ``[]`` when there is
        nothing to split and ``n_chunks`` is not positive.

    Raises:
        ValueError: if ``n_chunks`` is not positive while ``file_list`` is
            non-empty.
    """
    if n_chunks < 1:
        # splitting nothing into no chunks is well defined; anything else is a caller error
        # (previously this surfaced as an opaque ZeroDivisionError or as nonsense slices)
        if len(file_list) == 0:
            return []
        raise ValueError(
            "n_chunks must be at least 1 to split %d items, got %d" % (len(file_list), n_chunks)
        )

    # size of every chunk but the last
    chunk_size = math.ceil(len(file_list) / n_chunks)

    chunks = [file_list[i*chunk_size:(i+1)*chunk_size] for i in range(0, n_chunks-1)]
    # the final chunk absorbs the remainder
    chunks.append(file_list[(n_chunks-1)*chunk_size:])

    return chunks

# format papers for input: keep the ID (column 0), title (column 5) and abstract (column 6)
papers = [
    {'ID': int(row[0]), 'title': row[5], 'abstract': row[6]}
    for row in data
]

# construct batches
# the chunk count is ceil(len(papers) / batch_size), so no batch is longer than batch_size
batches = chunk(papers, math.ceil(len(papers)/batch_size))

# maximum number of tokens kept per paper; longer title + abstract inputs are truncated
max_length = 300

embeddings_batches = []

# make sure dropout is disabled for inference
model.eval()

# generate embeddings
# torch.no_grad() stops autograd from recording a graph for each forward pass; without it,
# every stored embedding keeps its whole graph alive and memory grows with each iteration
with torch.no_grad():
    for i in tqdm(range(0, len(batches))):
        batch = batches[i]
        # concatenate title and abstract for every paper in the batch
        # (no hard-coded slice, so changing batch_size cannot silently drop papers)
        title_abs = [d['title'] + tokenizer.sep_token + (d.get('abstract') or '') for d in batch]
        # preprocess the input
        inputs = tokenizer(title_abs, padding=True, truncation=True, return_tensors="pt", max_length=max_length)
        result = model(**inputs)
        # take the first token of each sequence as that paper's embedding;
        # clone() copies the slice so the full hidden-state tensor can be freed
        embeddings_batches.append(result.last_hidden_state[:, 0, :].clone())


100%|██████████| 161/161 [00:36<00:00,  4.40it/s]


In [6]:
# next, we concatenate the embeddings (i.e., unravel batches)
# a single torch.cat over the whole list avoids re-copying the growing tensor once per batch
embeddings = torch.cat(embeddings_batches, 0)

# here, we assert that we have the right number of embeddings:
assert len(papers) == len(embeddings), \
    "expected %d embeddings, got %d" % (len(papers), len(embeddings))

# convert from tensor to numpy array
# (.cpu() changes nothing for a CPU tensor and keeps this working if the model ran on a GPU)
embeddings = embeddings.detach().cpu().numpy()

# # now, we add the embeddings to the paper metadata
# for i in range(0, len(papers)):
#     papers[i]['embedding'] = embeddings[i]


## Here, we compute each paper's nearest neighbors so that we can test which papers come out as similar.

In [7]:
# first, we generate pairwise distances (pdist uses Euclidean distance by default)
## returned in this form: https://stackoverflow.com/questions/13079563/how-does-condensed-distance-matrix-work-pdist
## i.e., a flattened array of the upper triangle of the distance matrix
condensed_distances = pdist(embeddings)
# we expand to squareform matrix
distances = squareform(condensed_distances)

n_nearest_neighbors = 10

# paper ID for every row of the distance matrix
# (paper ID is not guaranteed to equal row number + 1)
paper_ids = [int(row[0]) for row in data]

nearest_neighbors = []
for i in range(0, len(distances)):
    # full sort of the row; a stable sort keeps the order of tied distances deterministic
    # (could use argpartition if we need this fast, but the DHQ corpus is small enough that this is totally fine)
    ranked_indices = np.argsort(distances[i], kind="stable")

    # drop the paper itself by index rather than assuming it sorts first: another paper
    # with an identical embedding is also at distance 0 and could otherwise take its place
    ranked_indices = ranked_indices[ranked_indices != i][:n_nearest_neighbors]

    # slot 0 holds the paper's own ID, followed by its neighbors from nearest to farthest
    row_nearest_neighbors = [paper_ids[i]] + [paper_ids[j] for j in ranked_indices]

    nearest_neighbors.append(row_nearest_neighbors)

# here, we populate papers but also make sure to slice off 0th nearest neighbor (itself)
for i in range(0, len(papers)):
    papers[i]['recommendations'] = nearest_neighbors[i][1:]

## Lastly, we save the embeddings (as a NumPy array) and the nearest neighbors (as extra columns in a CSV copy of the article table).

In [8]:
# now, we save the embeddings
with open('SPECTER_embeddings.npy', 'wb') as f:
    np.save(f, embeddings)

# we also save an updated table where we list the paper IDs of the most similar papers

# first, we build the header: the input headings minus the last column,
# plus one column per recommendation
new_fields = ["Recommendation " + str(i+1) for i in range(0, n_nearest_neighbors)]
out_fields = fields[:-1] + new_fields

# next, we build the rows the same way: drop the last input column, then append the
# neighbor IDs, skipping slot 0 (the paper itself)
# slicing copies each list, so `fields` and `data` are left untouched and re-running
# this cell produces the same file instead of stripping and appending columns again
out_rows = [
    row[:-1] + nearest_neighbors[i][1:]
    for i, row in enumerate(data)
]

out_data = [out_fields] + out_rows

# newline='' is what the csv module expects for files it writes (otherwise extra blank
# lines can appear on some platforms); the encoding is pinned so the output does not
# depend on the platform's default locale
with open("2022-dhq-articles-with-abstracts-and-SPECTER-recommendations.csv", 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)
    csvwriter.writerows(out_data)
