In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import dask.bag as db
import pandas as pd
import numpy as np
import json
import nltk

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/arxiv/arxiv-metadata-oai-snapshot.json


In [2]:
import itertools

In [3]:
docs = db.read_text('../input/arxiv/arxiv-metadata-oai-snapshot.json').map(json.loads)
docs.count().compute()

2193429

In [87]:
docs_category_list=['cs.AI']
docs_sublist = (docs.filter(lambda x:any(ele in x['categories'] for ele in docs_category_list)==True))
docs_sublist.count().compute()

57139

In [88]:
get_latest_version = lambda x: x['versions'][-1]['created']

trim = lambda x: {'id': x['id'],
                  'authors': x['authors'],
                  'title': x['title'],
                  'category':x['categories'].split(' '),
                  'abstract':x['abstract'],}

#docs_df = (docs_sublist.map(trim).compute())
columns = ['id','category','abstract']
docs_df = (docs_sublist.filter(lambda x: int(get_latest_version(x).split(' ')[3]) > 2022)
           .map(trim).
           compute())
docs_df = pd.DataFrame(docs_df)
docs_df.head()

Unnamed: 0,id,authors,title,category,abstract
0,1212.1108,Joshua Belanich and Luis E. Ortiz,On the Convergence Properties of Optimal AdaBoost,"[cs.LG, cs.AI, stat.ML]",AdaBoost is one of the most popular ML algor...
1,1906.04893,"Mahyar Fazlyab, Alexander Robey, Hamed Hassani...",Efficient and Accurate Estimation of Lipschitz...,"[cs.LG, cs.AI, math.OC, stat.ML]",Tight estimation of the Lipschitz constant f...
2,1906.12314,Charlie Blake and Ian P. Gent,The Winnability of Klondike Solitaire and Many...,[cs.AI],Our ignorance of the winnability percentage ...
3,1909.0294,Mridul Agarwal and Vaneet Aggarwal,Reinforcement Learning for Joint Optimization ...,"[cs.LG, cs.AI, cs.GT, cs.IT, cs.MA, math.IT, s...",Finding optimal policies which maximize long...
4,2007.01498,"Yuqian Jiang, Sudarshanan Bharadwaj, Bo Wu, Ri...",Temporal-Logic-Based Reward Shaping for Contin...,"[cs.AI, cs.LG, stat.ML]","In continuing tasks, average-reward reinforc..."


In [42]:
stop_words = set(nltk.corpus.stopwords.words('english'))
lemmatizer = nltk.stem.WordNetLemmatizer()
def doc_clean(title, abstract):
  title_list = [*set(title.replace("\n", " ").replace("-", " ").replace("(", "").replace(")", "").replace(".", " ").replace(",", " ").lower().split())]
  abstract_list = [*set(abstract.replace("\n", " ").replace("-", " ").replace("(", "").replace(")", "").replace(".", " ").replace(",", " ").lower().split())]
  words = [*set(title_list + abstract_list)]
  words = [ele for ele in words if all(ch not in ele for ch in ('$', '^', '{', '}'))]
  words = [ele for ele in words if ele not in stop_words]
  res = []
  for ele in words:
    res.append(lemmatizer.lemmatize(ele))
  return(' '.join(res))

In [89]:
docs_df["tokens"] = docs_df.apply (lambda x: doc_clean(x["title"], x["abstract"]), axis=1)
docs_df.head()

Unnamed: 0,id,authors,title,category,abstract,tokens
0,1212.1108,Joshua Belanich and Luis E. Ortiz,On the Convergence Properties of Optimal AdaBoost,"[cs.LG, cs.AI, stat.ML]",AdaBoost is one of the most popular ML algor...,interesting high elegant among object analysis...
1,1906.04893,"Mahyar Fazlyab, Alexander Robey, Hamed Hassani...",Efficient and Accurate Estimation of Lipschitz...,"[cs.LG, cs.AI, math.OC, stat.ML]",Tight estimation of the Lipschitz constant f...,efficient gradient certification analysis allo...
2,1906.12314,Charlie Blake and Ian P. Gent,The Winnability of Klondike Solitaire and Many...,[cs.AI],Our ignorance of the winnability percentage ...,applied particular variant one called reductio...
3,1909.0294,Mridul Agarwal and Vaneet Aggarwal,Reinforcement Learning for Joint Optimization ...,"[cs.LG, cs.AI, cs.GT, cs.IT, cs.MA, math.IT, s...",Finding optimal policies which maximize long...,decision example free among applied notice one...
4,2007.01498,"Yuqian Jiang, Sudarshanan Bharadwaj, Bo Wu, Ri...",Temporal-Logic-Based Reward Shaping for Contin...,"[cs.AI, cs.LG, stat.ML]","In continuing tasks, average-reward reinforc...",without far reduction speed shaping usual prov...


In [90]:
indices = pd.Series(docs_df.index, index=docs_df['id']).drop_duplicates()

In [97]:
def list_similar(ids, cosine_sim, indices):
    idx = indices[ids]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = list(filter(lambda x: x[1] > 10/100, sim_scores))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:3]
    res_indices = [i[0] for i in sim_scores]
    return docs_df['id'].iloc[res_indices]

In [91]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(docs_df["tokens"])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

cosine_sim.shape 

(811, 811)

In [98]:
inv_index = {}

def construct_doc_relations_index(inv_index, ids):
    sim_docs = list(list_similar(ids, cosine_sim, indices))
    inv_index[ids] = sim_docs

docs_df.apply (lambda x: construct_doc_relations_index(inv_index, x["id"]), axis=1)
print(len(inv_index))

811


In [99]:
def write_links(ids):
    sim_docs = inv_index[ids]
    for i in range(len(sim_docs)):
        if sim_docs[i] != ids:
            f.write("{\"source\": \"%s\", \"target\": \"%s\"}," % (ids, sim_docs[i]))
            f.write("\n")

f= open("links.txt","w+")
docs_df.apply (lambda x: write_links(x["id"]), axis=1)
f.close()


In [96]:
def write_nodes(x):
    
    f.write("{\"id\": \"%s\", \"title\": \"%s\"}," % (x["id"], x["title"].replace("\n", "").replace("\\", "" ).replace("\"", "''" )))
    f.write("\n")

f= open("nodes.txt","w+")
docs_df.apply (lambda x: write_nodes(x), axis=1)
f.close()

In [None]:
def write_links(ids):
    sim_docs = inv_index[ids]
    for i in range(len(sim_docs)):
        if sim_docs[i] != ids:
            f.write("{\"source\": \"%s\", \"target\": \"%s\"}," % (ids, sim_docs[i]))

write_links("0704.2963")


['2301.06393', '2108.05623']