In [20]:
import os

import pandas as pd
import torch
import torch.nn.functional as F
from tqdm.notebook import tqdm
from transformers import AutoModel, AutoTokenizer

In [2]:
cache_dir = 'cache'
if not os.path.exists(cache_dir):
    os.makedirs(cache_dir)

In [3]:
df = pd.read_parquet('../data/arxiv_metadata.parquet')
df.head()

Unnamed: 0,id,authors,title,abstract,categories,update_date
0,704.0001,"C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-...",Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,hep-ph,2008-11-26
1,704.0002,Ileana Streinu and Louis Theran,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...",math.CO cs.CG,2008-12-13
2,704.0003,Hongjun Pan,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,physics.gen-ph,2008-01-13
3,704.0004,David Callan,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,math.CO,2007-05-23
4,704.0005,Wael Abu-Shammala and Alberto Torchinsky,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,math.CA math.FA,2013-10-15


In [65]:
model_path = 'Alibaba-NLP/gte-large-en-v1.5'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch_dtype = torch.float16 if device == 'cuda' else torch.float32
tokenizer = AutoTokenizer.from_pretrained(model_path, cache_dir=cache_dir)
model = AutoModel.from_pretrained(model_path, trust_remote_code=True, cache_dir=cache_dir)

model = model.to(device).eval().half() if torch_dtype == torch.float16 else model.to(device).eval()



In [66]:
batch_size = 64
num_samples = 20_000

samples = df.sample(num_samples)
sample_abstracts = samples['abstract'].tolist()
sample_titles = samples['title'].tolist()

# sample_texts = [f'Title: {title.strip()} Abstract: {abstract.strip()}'.strip().replace('\n', ' ') for title, abstract in zip(sample_titles, sample_abstracts)]
sample_texts = [f'Title: {title.strip()}'.strip().replace('\n', ' ') for title, abstract in zip(sample_titles, sample_abstracts)]

In [67]:
text_embeddings = []

for i in tqdm(range(0, num_samples, batch_size)):
    batch = sample_texts[i:i + batch_size]
    inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=8192)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state[:, 0].detach().to('cpu').float()
    embeddings = F.normalize(embeddings, p=2, dim=1)
    text_embeddings.append(embeddings)

text_embeddings = torch.cat(text_embeddings).cpu().numpy()
print(text_embeddings.shape)

  0%|          | 0/313 [00:00<?, ?it/s]

(20000, 1024)


In [68]:
# take random sample of text (and corresponding embeddings) and calculate cosine similarity to all other text embeddings
# show the top 3 most similar text embeddings and their corresponding text and the 3 least similar text embeddings and their corresponding text
import matplotlib.pyplot as plt
import numpy as np

np.random.seed(42)
sample_idx = np.random.randint(0, num_samples)
sample_embedding = text_embeddings[sample_idx]
sample_text = sample_texts[sample_idx]
print(sample_text)

cosine_similarities = text_embeddings @ sample_embedding
sorted_indices = np.argsort(cosine_similarities)

print('Most similar')
for idx in sorted_indices[-4:-1]:
    print(sample_texts[idx], cosine_similarities[idx])

print('\nLeast similar')
for idx in sorted_indices[:3]:
    print(sample_texts[idx], cosine_similarities[idx])

Title: Information asymmetry in KL-regularized RL
Most similar
Title: Conditional Mutual Information for Disentangled Representations in   Reinforcement Learning 0.76912695
Title: Semi-Parametric Efficient Policy Learning with Continuous Actions 0.78060114
Title: An Empirical Study of Implicit Regularization in Deep Offline RL 0.7940179

Least similar
Title: Angular dependence of the high-frequency vortex response in   YBa$_2$Cu$_3$O$_{7-x}$ thin film with self-assembled BaZrO$_3$ nanorods 0.33945122
Title: What Moves the Heavens Above? 0.34980494
Title: Calypso Venus Scout 0.3519418


In [69]:
input_text = "Machine learning for computer vision"
tokenized_input = tokenizer(input_text, return_tensors='pt', padding=True, truncation=True, max_length=8192)
tokenized_input = {k: v.to(device) for k, v in tokenized_input.items()}
with torch.no_grad():
    output = model(**tokenized_input)
embedding = output.last_hidden_state[:, 0].detach().to('cpu').float()
embedding = F.normalize(embedding, p=2, dim=1).squeeze()

cosine_similarities = text_embeddings @ embedding.numpy()

sorted_indices = np.argsort(cosine_similarities)
print('Most similar')
for idx in sorted_indices[-4:-1]:
    print(sample_texts[idx], cosine_similarities[idx])

print('\nLeast similar')
for idx in sorted_indices[:3]:
    print(sample_texts[idx], cosine_similarities[idx])

Most similar
Title: Optimizing Region Selection for Weakly Supervised Object Detection 0.7090218
Title: Fusing image representations for classification using support vector   machines 0.71061456
Title: Convolutional Neural Networks as a Model of the Visual System: Past,   Present, and Future 0.7131296

Least similar
Title: Trapped by the drift 0.26702017
Title: It is the ambiguity. (But only three generations) 0.27463382
Title: How to drive our families mad 0.28433433


In [70]:
# create a interactive graph with all the text embeddings using TSNE and on hover show the text
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, HoverTool
from bokeh.plotting import figure, show
from sklearn.manifold import TSNE

print('Calculating TSNE embeddings')
tsne = TSNE(n_components=2, perplexity=50, n_iter=1500, random_state=42)
tsne_embeddings = tsne.fit_transform(text_embeddings)

Calculating TSNE embeddings


In [71]:
from bokeh.models import ColumnDataSource, HoverTool, PanTool, WheelZoomTool
from bokeh.palettes import Category10
from bokeh.plotting import figure, output_notebook, show

# Prepare the data source
source = ColumnDataSource(data=dict(
    x=tsne_embeddings[:, 0],
    y=tsne_embeddings[:, 1],
    text=sample_texts
))

# Define hover tool
hover = HoverTool(tooltips=[("Text", "@text")])

# Create the plot
plot_title = "Text Embeddings"
plot_width = 800
plot_height = 600
p = figure(tools=[hover, WheelZoomTool(), PanTool()], title=plot_title, width=plot_width, height=plot_height)
p.circle('x', 'y', size=8, source=source, color=Category10[10][0], alpha=0.8, legend_label="Text Embeddings")

# Customize plot appearance
p.title.text_font_size = "16pt"
p.xaxis.axis_label = "TSNE Dimension 1"
p.yaxis.axis_label = "TSNE Dimension 2"
p.xaxis.axis_label_text_font_size = "14pt"
p.yaxis.axis_label_text_font_size = "14pt"
p.xaxis.major_label_text_font_size = "12pt"
p.yaxis.major_label_text_font_size = "12pt"
p.legend.label_text_font_size = "12pt"
p.legend.location = "top_right"
p.legend.click_policy = "hide"

# Show the plot in the notebook
output_notebook()
show(p)

