In [2]:
import torch
import random
import json
import os

import numpy as np
import pandas as pd
import torch.nn.functional as F

from random import randrange
# from tqdm.notebook import tqdm_no
from IPython.display import display, Markdown
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, BertModel, BertForNextSentencePrediction, AutoTokenizer


In [3]:
# Kaggle API credentials used to download the arXiv dataset:
# https://www.kaggle.com/datasets/Cornell-University/arxiv
# Place your own API token file at ./kaggle.json before running this cell.

with open('./kaggle.json') as f:
    file = json.load(f)

os.environ["KAGGLE_KEY"] = file['key']
# Take the username from the same token file as the key so both belong to the
# same account; the previously hard-coded value is kept only as a fallback for
# token files that carry no 'username' entry.
os.environ["KAGGLE_USERNAME"] = file.get('username', 'jeploretizo')

In [4]:
# Download the arXiv dataset archive with the Kaggle CLI.
# NOTE(review): presumably authenticates through the KAGGLE_USERNAME / KAGGLE_KEY
# environment variables set in the previous cell — confirm against the Kaggle CLI docs.
# The recorded output shows the CLI skips the download when a newer local
# arxiv.zip already exists (use --force to re-download).
!kaggle datasets download -d Cornell-University/arxiv

arxiv.zip: Skipping, found more recently modified local copy (use --force to force download)


In [6]:
# Extract the downloaded archive into the current working directory.
# NOTE(review): expected to produce the JSON file that FILE_PATH points at —
# confirm against the archive contents.
! unzip "arxiv.zip"

In [7]:
# Notebook-wide configuration.
FILE_PATH = './arxiv-metadata-oai-snapshot.json'  # metadata file read by get_data()
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'  # checkpoint used for both model and tokenizer
MAX_SEQUENCE_LENGTH = 256  # max_length handed to the tokenizer

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [8]:
def get_data(file_path=None):
    """Lazily yield the raw lines of a text file, one at a time.

    Args:
        file_path: Path of the file to read. When omitted, the notebook-level
            ``FILE_PATH`` is used, so a bare ``get_data()`` call keeps working
            as before.

    Yields:
        str: Each line of the file, including its trailing newline if present.
    """
    if file_path is None:
        file_path = FILE_PATH
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(file_path, encoding='utf-8') as f:
        yield from f

In [9]:
# Only papers whose update_date year is strictly greater than this are kept.
year_limit = 2023

# Column-oriented accumulator; turned into a DataFrame in a later cell.
dataframe = {
    'id': [],
    'title': [],
    'year': [],
    'abstract': []
}

data = get_data()
for paper in data:
    paper = json.loads(paper)
    try:
        # update_date is expected to start with the year, e.g. "2024-01-31".
        year = int(paper['update_date'].split('-')[0])
        if year <= year_limit:
            continue
        # Read every field before appending anything: a record with a missing
        # key must be skipped as a whole, otherwise the columns end up with
        # different lengths and the DataFrame cannot be built.
        record = {
            'id': paper['id'],
            'title': paper['title'],
            'year': year,
            'abstract': paper['abstract'],
        }
    except (KeyError, TypeError, ValueError, AttributeError):
        # Best effort: skip records that lack a field or have a malformed date.
        continue
    for column, value in record.items():
        dataframe[column].append(value)

In [13]:
# Keep only the first 500 rows so the scoring cells below stay quick.
# .copy() makes df an independent frame rather than a slice of the full one,
# so adding columns to it later does not raise SettingWithCopyWarning.
df = pd.DataFrame(dataframe).iloc[:500].copy()

In [14]:
# Tokenizer and next-sentence-prediction model are loaded from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Load the weights, move the model to the selected device and put it in eval mode.
model = BertForNextSentencePrediction.from_pretrained(PRE_TRAINED_MODEL_NAME)
model = model.to(device).eval()

In [16]:
def get_similarity_scores(query_string, reference_list, BATCH_SIZE=16):
  """Score each reference text against a query with the NSP model.

  Every (query_string, reference) pair is tokenized as a sentence pair and
  passed through the notebook-level ``model``. The score of a pair is the
  softmax probability at index 0 of the model's logits.
  NOTE(review): index 0 is presumably the "B follows A" class of the
  next-sentence-prediction head — confirm against the transformers docs.

  Args:
    query_string: Text used as the first segment of every pair.
    reference_list: Iterable of texts, each used as the second segment.
    BATCH_SIZE: Number of pairs per forward pass; must be positive.

  Returns:
    list: One score per entry of ``reference_list``, in the same order
    (empty when ``reference_list`` is empty).

  Raises:
    ValueError: If ``BATCH_SIZE`` is not a positive number.
  """
  if BATCH_SIZE <= 0:
    raise ValueError(f"BATCH_SIZE must be positive, got {BATCH_SIZE}")

  references = list(reference_list)
  similarity_scores = []

  # Pure inference: disable autograd so no graph is built or kept in memory.
  with torch.no_grad():
    for start in range(0, len(references), BATCH_SIZE):
      batch = references[start:start + BATCH_SIZE]
      encoded_sequences = tokenizer(
          [query_string] * len(batch),
          batch,
          padding='longest',
          truncation='longest_first',
          return_tensors='pt',
          max_length=MAX_SEQUENCE_LENGTH
      ).to(device)

      outputs = model(
          input_ids=encoded_sequences['input_ids'],
          attention_mask=encoded_sequences['attention_mask'],
          token_type_ids=encoded_sequences['token_type_ids']
      )

      # Normalise the two logits per pair and keep column 0 as the score.
      probs = F.softmax(outputs.logits, dim=1)
      similarity_scores.extend(probs[:, 0].detach().cpu().numpy())

  return similarity_scores

# get_similarity_scores returns a list with one score per entry of
# reference_list, in the same order, so it can be assigned directly as a DataFrame column.


In [24]:
# Score every abstract against the query and store the result as a new column.
query_string = "I don't know"
df['similarity_score'] = get_similarity_scores(
    reference_list=df['abstract'].tolist(),
    query_string=query_string,
)

In [25]:
# Highest-scoring rows first, renumbered from 0.
df = (
    df
    .sort_values(by='similarity_score', ascending=False)
    .reset_index(drop=True)
)

In [26]:
# Print the five best-scoring papers; min() guards against frames with fewer
# than five rows, where a fixed range(5) would raise IndexError.
for i in range(min(5, len(df))):
    row = df.iloc[i]
    print("TITLE:", row['title'])
    print('----')
    print("ABSTRACT:", row['abstract'])
    print('----')
    print("SIMILARITY SCORE:", row['similarity_score'])
    print('====')

TITLE: Are Particles Self-Organized Systems?
----
ABSTRACT:   Where did elementary particles come from? What mechanisms are responsible for
their occurrence and maintenance? Are they compound or truly elementary? Is
vacuum primordial soup where elementary particles are born? Are quantum
behavior and relativistic phenomena fundamental or emergent? This paper
describes a primitive active medium far from thermodynamic equilibrium, which
we associate with vacuum and in which a system of particles and fields arises,
similar to that described by the standard model. Phenomena usually attributed
to quantum or relativistic media emerge during vacuum self-organization. These
include discrete spectra of ground states, charges, oscillation periods, and
link flavors, spatial phase coherency, virtual states, tunneling, entanglement,
time-related uncertainty of states, and coexistent Planck-like and
Einstein-like time scales. The form of vacuum self-organization is a coherent
time-crystal network. He