In [2]:
import torch
import random
import json
import os

import numpy as np
import pandas as pd
import torch.nn.functional as F

from random import randrange
# from tqdm.notebook import tqdm_no
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, BertModel, BertForNextSentencePrediction, AutoTokenizer


In [3]:
# Kaggle API credentials: put your own `kaggle.json` API token next to this
# notebook before running the download cell.
# Dataset: https://www.kaggle.com/datasets/Cornell-University/arxiv

with open('./kaggle.json') as f:
    kaggle_credentials = json.load(f)

# The key and the username must belong to the same account. Read the username
# from the same token file when it is present (Kaggle-issued tokens normally
# carry a `username` field); otherwise fall back to the previously hard-coded
# account name.
os.environ["KAGGLE_KEY"] = kaggle_credentials['key']
os.environ["KAGGLE_USERNAME"] = kaggle_credentials.get('username', 'jeploretizo')

In [4]:
# Fetch the Cornell-University/arxiv dataset archive with the Kaggle CLI.
# The recorded output below shows the CLI skipping the download when a more
# recent local `arxiv.zip` already exists.
!kaggle datasets download -d Cornell-University/arxiv

arxiv.zip: Skipping, found more recently modified local copy (use --force to force download)


In [6]:
! unzip "arxiv.zip"

In [7]:
# --- Notebook configuration -------------------------------------------------
# Hugging Face checkpoint used for both the tokenizer and the model.
PRE_TRAINED_MODEL_NAME = "bert-base-cased"
# Upper bound (in tokens) passed to the tokenizer as `max_length`.
MAX_SEQUENCE_LENGTH = 256
# Path of the metadata file that is read line by line.
FILE_PATH = "./arxiv-metadata-oai-snapshot.json"

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [8]:
def get_data(file_path=None):
    """Lazily yield the raw lines of a text file, one at a time.

    Args:
        file_path: Path of the file to read. When omitted, the module-level
            ``FILE_PATH`` is used (the original behaviour).

    Yields:
        str: Each line exactly as read, including its trailing newline.
    """
    if file_path is None:
        file_path = FILE_PATH
    # Decode explicitly as UTF-8 so non-ASCII text is read the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(file_path, encoding='utf-8') as f:
        yield from f

In [9]:
# Only papers whose `update_date` year is strictly greater than this are kept.
year_limit = 2023

# Column-oriented accumulator: one list per column, all kept the same length.
dataframe = {
    'id': [],
    'title': [],
    'year': [],
    'abstract': []
}

data = get_data()
for paper in data:
    paper = json.loads(paper)
    try:
        # `update_date` is split on '-' and its first component parsed as the year.
        update_year = int(paper['update_date'].split('-')[0])
        if update_year <= year_limit:
            continue
        # Read every field BEFORE appending anything: if one is missing, the
        # whole record is skipped and the column lists stay aligned.
        record = {
            'id': paper['id'],
            'title': paper['title'],
            'year': update_year,
            'abstract': paper['abstract'],
        }
    except (KeyError, ValueError, AttributeError, TypeError):
        # Best effort: skip records with missing or malformed fields, without
        # also swallowing unrelated errors such as KeyboardInterrupt.
        continue

    for column, value in record.items():
        dataframe[column].append(value)

In [10]:
df = pd.DataFrame(dataframe)

# Keep only the first 500 papers so scoring stays fast.
# `.copy()` detaches the subset from the full frame, so adding columns to `df`
# later does not raise SettingWithCopyWarning.
df = df.head(500).copy()

In [11]:
# Tokenizer and model are both loaded from the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Load the next-sentence-prediction model, move it to the selected device,
# and switch it to evaluation mode.
model = (
    BertForNextSentencePrediction
    .from_pretrained(PRE_TRAINED_MODEL_NAME)
    .to(device)
    .eval()
)

In [12]:
def get_similarity_scores(query_string, reference_list, BATCH_SIZE=16):
    """Score each reference text against a query with the NSP model.

    The query is paired with every reference text, the pairs are tokenized and
    run through the module-level ``model`` in batches, and the softmax
    probability of logit column 0 is collected for each pair. (In the Hugging
    Face NSP head, column 0 is the "sentence B continues sentence A" class —
    confirm against the model documentation.)

    Args:
        query_string: Text used as the first sequence of every pair.
        reference_list: Iterable of texts used as the second sequence.
        BATCH_SIZE: Number of pairs sent through the model per forward pass.

    Returns:
        list: One NumPy float scalar per reference text, in input order.
        Empty when ``reference_list`` is empty.
    """
    # `tqdm` is not imported at module level, so import it here. If it is not
    # installed, fall back to a no-op wrapper: only the progress bar is lost.
    try:
        from tqdm.auto import tqdm
    except ImportError:
        def tqdm(iterable, *args, **kwargs):
            return iterable

    references = list(reference_list)
    similarity_scores = []

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for start in tqdm(range(0, len(references), BATCH_SIZE)):
            batch = references[start:start + BATCH_SIZE]
            encoded_sequences = tokenizer(
                [query_string] * len(batch),
                batch,
                padding='longest',
                truncation='longest_first',
                return_tensors='pt',
                max_length=MAX_SEQUENCE_LENGTH
            ).to(device)

            outputs = model(
                input_ids=encoded_sequences['input_ids'],
                attention_mask=encoded_sequences['attention_mask'],
                token_type_ids=encoded_sequences['token_type_ids']
            )

            probs = F.softmax(outputs.logits, dim=1)
            similarity_scores.extend(probs[:, 0].detach().cpu().numpy())

    return similarity_scores

# get_similarity_scores returns one score per reference text, in input order,
# so the result can be assigned directly as a new DataFrame column.


In [30]:
# Score every abstract in the frame against a free-text query and store the
# result as a new column.
query_string = 'graph theory'
abstracts = df['abstract'].to_list()
df['similarity_score'] = get_similarity_scores(query_string, abstracts)

  0%|          | 0/32 [00:00<?, ?it/s]

In [None]:
# Rank rows from highest to lowest score and renumber them from 0.
df = (
    df
    .sort_values(by='similarity_score', ascending=False)
    .reset_index(drop=True)
)

In [43]:
from IPython.display import display, Markdown

In [49]:
# Build one alert-style HTML card per row for the first (up to) 5 rows of `df`.
# `head(5)` returns fewer rows when the frame is shorter, so a small result
# set no longer raises IndexError.
cards = []
for _, row in df.head(5).iterrows():
    cards.append(f"""
    <div class="alert alert-block alert-success">
    <b>{row['title']}</b> <br> Abstract: {row['abstract']}
    </div>
    """)

# Join once at the end instead of growing the string inside the loop.
markdown_text = "".join(cards)

In [50]:
# Render the collected cards as rich Markdown/HTML output in the notebook.
results_view = Markdown(markdown_text)
display(results_view)


    <div class="alert alert-block alert-success">
    <b>On expanders from the action of GL(2,Z)</b> <br> Abstract:   Consider the undirected graph $G_n=(V_n, E_n)$ where $V_n = (Z/nZ)^2$ and
$E_n$ contains an edge from $(x,y)$ to $(x+1,y)$, $(x,y+1)$, $(x+y,y)$, and
$(x,y+x)$ for every $(x,y) \in V_n$. Gabber and Galil, following Margulis, gave
an elementary proof that ${G_n}$ forms an expander family. In this note, we
present a somewhat simpler proof of this fact, and demonstrate its utility by
isolating a key property of the linear transformations $(x,y) -> (x+y,x),
(x,y+x)$ that yields expansion.
  As an example, consider any invertible, integral matrix $S \in GL_2(Z)$ and
let $G^S_n = (V_n, E^S_n)$ where $E^S_n$ contains, for every $(x,y) \in V_n$,
an edge from $(x,y)$ to $(x+1,y)$, $(x,y+1)$, $S(x,y)$, and $S^T(x,y)$, where
$S^T$ denotes the transpose of $S$. Then {G_n^S} forms an expander family if
and only if a related infinite graph has positive Cheeger constant. This latter
property turns out to be elementary to analyze and can be used to show that
{G_n^S} are expanders precisely when the trace of S is non-zero and S is not
equal to its transpose. We also present some other generalizations.

    </div>
    
    <div class="alert alert-block alert-success">
    <b>On the number of outer connected dominating sets of graphs</b> <br> Abstract:   Let $G=(V,E)$ be a simple graph. A set $S\subseteq V(G)$ is called an
outer-connected dominating set (or ocd-set) of $G$, if $S$ is a dominating set
of $G$ and either $S=V(G)$ or $V\backslash S$ is a connected graph. In this
paper we introduce a polynomial which its coefficients are the number of
ocd-sets of $G$. We obtain some properties of this polynomial and its
coefficients. Also we compute this polynomial for some specific graphs.

    </div>
    
    <div class="alert alert-block alert-success">
    <b>Impartial achievement and avoidance games for generating finite groups</b> <br> Abstract:   We study two impartial games introduced by Anderson and Harary and further
developed by Barnes. Both games are played by two players who alternately
select previously unselected elements of a finite group. The first player who
builds a generating set from the jointly selected elements wins the first game.
The first player who cannot select an element without building a generating set
loses the second game. After the development of some general results, we
determine the nim-numbers of these games for abelian and dihedral groups. We
also present some conjectures based on computer calculations. Our main
computational and theoretical tool is the structure diagram of a game, which is
a type of identification digraph of the game digraph that is compatible with
the nim-numbers of the positions. Structure diagrams also provide simple yet
intuitive visualizations of these games that capture the complexity of the
positions.

    </div>
    
    <div class="alert alert-block alert-success">
    <b>Impartial avoidance and achievement games for generating symmetric and
  alternating groups</b> <br> Abstract:   We study two impartial games introduced by Anderson and Harary. Both games
are played by two players who alternately select previously-unselected elements
of a finite group. The first player who builds a generating set from the
jointly-selected elements wins the first game. The first player who cannot
select an element without building a generating set loses the second game. We
determine the nim-numbers, and therefore the outcomes, of these games for
symmetric and alternating groups.

    </div>
    
    <div class="alert alert-block alert-success">
    <b>A family of graphs that cannot occur as character degree graphs of
  solvable groups</b> <br> Abstract:   We investigate character degree graphs of solvable groups. In particular, we
provide general results that can be used to eliminate which degree graphs can
occur as solvable groups. Finally, we show a specific family of graphs cannot
occur as a character degree for any solvable group.

    </div>
    