In [1]:
import torch
import random
import json
import os

import numpy as np
import pandas as pd
import torch.nn.functional as F

from random import randrange
from tqdm.notebook import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, BertModel, BertForNextSentencePrediction, AutoTokenizer


In [2]:
# Kaggle credentials are read from Colab secrets and exported as environment
# variables. Store your own KAGGLE_KEY and KAGGLE_USERNAME secrets in Colab
# before running this cell.
# Dataset: https://www.kaggle.com/datasets/Cornell-University/arxiv

import os
from google.colab import userdata

for secret_name in ("KAGGLE_KEY", "KAGGLE_USERNAME"):
    os.environ[secret_name] = userdata.get(secret_name)

In [3]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [4]:
# Fetch the arXiv dataset archive (arxiv.zip) with the kaggle command-line tool.
!kaggle datasets download -d Cornell-University/arxiv

Downloading arxiv.zip to /content
 99% 1.25G/1.26G [00:13<00:00, 83.5MB/s]
100% 1.26G/1.26G [00:13<00:00, 101MB/s] 


In [5]:
! unzip "arxiv.zip"

Archive:  arxiv.zip
  inflating: arxiv-metadata-oai-snapshot.json  


In [2]:
# Notebook-wide settings.
FILE_PATH = './arxiv-metadata-oai-snapshot.json'  # file read by get_data()
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'        # checkpoint passed to from_pretrained()
MAX_SEQUENCE_LENGTH = 256                         # max_length handed to the tokenizer
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
def get_data(file_path=None):
    """Lazily yield the raw lines of a text file, one at a time.

    Args:
        file_path: Path of the file to read. When omitted, the notebook-level
            ``FILE_PATH`` is used, so a bare ``get_data()`` call behaves as it
            always has.

    Yields:
        str: Each line of the file in file order, trailing newline included.
    """
    path = FILE_PATH if file_path is None else file_path
    # Decode explicitly as UTF-8 (the encoding JSON text is specified to use)
    # instead of relying on the platform's default encoding.
    with open(path, encoding='utf-8') as f:
        yield from f

In [13]:
# Keep only papers whose update_date year is strictly greater than this value.
year_limit = 2023

# Column-oriented buffer: every list receives exactly one entry per kept paper.
dataframe = {
    'id': [],
    'title': [],
    'year': [],
    'abstract': []
}

data = get_data()
for paper in data:
    paper = json.loads(paper)
    try:
        # Read every field before touching `dataframe`, so a record with a
        # missing key can never leave the columns with different lengths.
        date = int(paper['update_date'].split('-')[0])
        title = paper['title']
        abstract = paper['abstract']
        paper_id = paper['id']
    except (KeyError, ValueError, AttributeError, TypeError):
        # Best effort: skip records lacking a field or a parseable update_date.
        continue

    if date > year_limit:
        dataframe['title'].append(title)
        dataframe['year'].append(date)
        dataframe['abstract'].append(abstract)
        dataframe['id'].append(paper_id)

In [14]:
# Build one table from the per-column lists held in `dataframe`.
df = pd.DataFrame(data=dataframe)

In [15]:
# First 500 rows of `df`, selected by position.
subset_df = df.iloc[0:500]

In [6]:
# Load the next-sentence-prediction model, move it to the selected device and
# switch it to evaluation mode. Both .to() and .eval() return the module, so
# the chained call still leaves the model bound to `model`.
model = (
    BertForNextSentencePrediction
    .from_pretrained(PRE_TRAINED_MODEL_NAME)
    .to(device)
    .eval()
)

# The tokenizer is loaded from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [10]:
def get_similarity_scores(query_string, reference_list, BATCH_SIZE=16):
  """Score each reference text against a query with BERT next-sentence prediction.

  Every reference is paired with ``query_string`` (query first, reference
  second). The pairs are tokenized and run through the notebook-level
  ``model`` in batches, and the softmax probability of logit column 0 is kept
  as the score for that pair.

  Args:
    query_string: Text used as the first segment of every pair.
    reference_list: Iterable of texts, each used as the second segment.
    BATCH_SIZE: Number of pairs per forward pass; must be at least 1.

  Returns:
    A list with one score per entry of ``reference_list``, in the same order.
    An empty ``reference_list`` gives an empty list.

  Raises:
    ValueError: If ``BATCH_SIZE`` is smaller than 1.
  """
  # A zero step makes range() fail with an unrelated message and a negative
  # step silently yields no batches at all, so reject both up front.
  if BATCH_SIZE < 1:
      raise ValueError(f"BATCH_SIZE must be a positive integer, got {BATCH_SIZE}")

  paired_texts = [(query_string, reference_row) for reference_row in reference_list]
  if not paired_texts:
      # Nothing to score: skip the progress bar and the model entirely.
      return []

  similarity_scores = []

  # Inference only: without no_grad() each forward pass records an autograd
  # graph and keeps intermediate activations, costing memory for no benefit.
  with torch.no_grad():
      for i in tqdm(range(0, len(paired_texts), BATCH_SIZE)):
          batch = paired_texts[i:i + BATCH_SIZE]
          encoded_sequences = tokenizer(
              [pair[0] for pair in batch],
              [pair[1] for pair in batch],
              padding='longest',
              truncation='longest_first',
              return_tensors='pt',
              max_length=MAX_SEQUENCE_LENGTH
          ).to(device)

          outputs = model(
              input_ids=encoded_sequences['input_ids'],
              attention_mask=encoded_sequences['attention_mask'],
              token_type_ids=encoded_sequences['token_type_ids']
          )

          # Column 0 of the next-sentence-prediction logits is the "second
          # segment continues the first" class; its probability is the score.
          probs = F.softmax(outputs.logits, dim=1)
          similarity_scores.extend(probs[:, 0].detach().cpu().numpy())

  return similarity_scores

# get_similarity_scores returns one score per entry of reference_list, in order,
# so the result can be assigned directly as a new DataFrame column.


In [None]:
# Free-text query that every abstract in `df` is scored against.
query_string = 'credit card access'

# One score per row, in row order, stored as a new column next to the metadata.
df['similarity_score'] = get_similarity_scores(
    query_string=query_string,
    reference_list=df['abstract'].to_list(),
)

  0%|          | 0/4332 [00:00<?, ?it/s]

In [12]:
# Show the papers ordered from highest to lowest similarity score.
df.sort_values('similarity_score', ascending=False)

Unnamed: 0,id,title,year,abstract,similarity_score
31,1005.0555,Cosmic Microwave Background Mini-review,2024,A compact overview of the status of CMB anis...,0.450201
193,1405.5801,Physics at the University of Lviv: the first t...,2024,A detailed bibliography related to physics a...,0.244206
160,1309.2605,Constructive mathematics with the knowledge pr...,2024,$K$ denotes both the predicate satisfied by ...,0.139824
108,1301.2469,Strong convergence for the modified Mann's ite...,2024,"In this paper, for an $\lambda$-strict pseud...",0.060501
44,1102.1087,Graph Theory,2024,This is a replacement paper. There are 6 cha...,0.060356
...,...,...,...,...,...
243,1503.07174,A calculation for polar Kerr effect in high te...,2024,A mechanism is proposed for the tantalizing ...,0.000043
268,1507.05606,Nernst and magneto-thermal conductivity in a l...,2024,Weyl semimetals (WSM) are topologically prot...,0.000036
355,1608.00403,Unitary boson-boson and boson-fermion mixtures...,2024,We give exact integral expressions of the th...,0.000030
96,1210.7306,Cosmology constrains gravitational four-fermio...,2024,"If torsion exists, it generates gravitationa...",0.000027


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# NOTE(review): get_top_k_most_similar, source_book, book_ids and books are not
# defined anywhere in the visible part of this notebook. This looks like a
# leftover from a different notebook; confirm before running.
similar_books = get_top_k_most_similar(source_book, book_ids, books, 10)