# arXiv Library

In [1]:
import arxiv

# Download the PDF of a single arXiv article, looked up by its identifier.
client = arxiv.Client()

article_id = "2305.16338"

# Reuse the client created above instead of constructing a second one.
search_result = client.results(arxiv.Search(id_list=[article_id]))

# `next()` without a default raises StopIteration when the iterator is empty,
# which would make the "not found" branch unreachable; pass a default instead.
article = next(search_result, None)

if article is not None:
    print(f'Starting download of article: "{article.title}" ({article_id})')
    pdf_path = article.download_pdf()
    print(f"Download finished! Result saved at:\n{pdf_path}")
else:
    print("Article not found.")

Starting download of article: "Think Before You Act: Decision Transformers with Working Memory" (2305.16338)
Download finished! Result saved at:
./2305.16338v3.Think_Before_You_Act__Decision_Transformers_with_Working_Memory.pdf


# Semantic Scholar Metadata

In [None]:
import json

import requests

In [None]:
import pandas as pd

# Load the sample of papers whose ids are later sent to Semantic Scholar.
# The `id` column is read as text: new-style arXiv ids (e.g. "2305.16338")
# look like decimals, so the default type inference would parse them as
# floats and drop trailing zeros, producing wrong "ARXIV:<id>" lookups.
df = pd.read_csv("../data/sample_2024.csv", dtype={"id": str})
df.head(3)

In [None]:
from tqdm import tqdm

# Comma-separated list of fields sent as the `fields` query parameter by
# `request_papers`; kept one per line so additions and removals diff cleanly.
request_fields = ",".join(
    [
        "externalIds",
        "referenceCount",
        "citationCount",
        "title",
        "url",
        "year",
        "authors",
        "fieldsOfStudy",
        "s2FieldsOfStudy",
        "publicationTypes",
        "publicationDate",
        "journal",
        "references",
    ]
)

def request_papers(ids, timeout=60):
    """Fetch metadata for a batch of papers from the Semantic Scholar batch endpoint.

    Args:
        ids: List of paper identifiers to send in the request body
            (``process_df`` passes strings of the form ``"ARXIV:<id>"``).
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    r = requests.post(
        "https://api.semanticscholar.org/graph/v1/paper/batch",
        params={"fields": request_fields},
        json={"ids": ids},
        timeout=timeout,
    )
    # Fail loudly on an error status; otherwise the error payload would be
    # returned as if it were paper data and end up mixed into the results.
    r.raise_for_status()
    return r.json()

def save_papers_data(papers_data, path="papers_metadata.json"):
    """Write the collected paper metadata to a JSON file.

    Args:
        papers_data: JSON-serialisable object to dump (the list built by
            ``process_df``).
        path: Destination file. Defaults to ``papers_metadata.json`` in the
            current working directory, which is the original behaviour; pass
            an explicit path to write next to the other data files.
    """
    # Explicit encoding so the file is written identically on every platform.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(papers_data, f, indent=4)

def process_df(df, batch_size=100):
    """Request metadata for every row of ``df`` in fixed-size batches.

    Args:
        df: DataFrame with an ``id`` column; each value is prefixed with
            ``"ARXIV:"`` before being sent to ``request_papers``.
        batch_size: Number of ids sent per request (defaults to 100, the
            value previously hard-coded).

    Returns:
        A single list with the results of all batches concatenated in row order.

    Raises:
        RuntimeError: If a batch response is not a list.
    """
    papers_data = []

    for start in tqdm(range(0, len(df), batch_size)):
        batch_ids = df["id"].iloc[start : start + batch_size].tolist()
        ids = [f"ARXIV:{arxiv_id}" for arxiv_id in batch_ids]
        results = request_papers(ids)
        # `list.extend` accepts any iterable: extending with a dict would
        # silently append its keys, so reject anything that is not a list.
        if not isinstance(results, list):
            raise RuntimeError(
                f"Unexpected response for batch starting at row {start}: {results!r}"
            )
        papers_data.extend(results)

    return papers_data

In [21]:
import json

import pandas as pd

# Load the previously saved Semantic Scholar metadata and the paper sample.
# NOTE(review): these paths use "../../data/" while the earlier cell reads
# "../data/sample_2024.csv" — confirm which one matches the notebook location.
with open("../../data/papers_metadata.json", encoding="utf-8") as f:
    papers_data = json.load(f)

# Read `id` as text: new-style arXiv ids look like decimals, so the default
# inference would parse them as floats and drop trailing zeros before the
# `astype(str)` below, and those rows would then never match in a merge on
# `arxiv_id`.
papers_df = pd.read_csv("../../data/sample_2024.csv", dtype={"id": str})
papers_df["arxiv_id"] = papers_df["id"].astype(str)

In [22]:
from collections import defaultdict

# Flatten the per-paper metadata into columns and left-join them onto
# `papers_df` by arXiv id. Authors and references are kept in side
# dictionaries keyed by arXiv id.
additional_columns = [
    "arxiv_id",
    "doi",
    "reference_count",
    "citation_count",
    "publication_type",
    "semantic_scholar_id",
]
additional_data = defaultdict(list)
paper_authors = dict()
paper_references = dict()

for paper in papers_data:
    # Entries can be None; there is nothing to extract from them.
    if paper is None:
        continue
    semantic_scholar_id = paper["paperId"]
    arxiv_id = paper["externalIds"]["ArXiv"]
    doi = paper["externalIds"].get("DOI", None)
    reference_count = paper["referenceCount"]
    citation_count = paper["citationCount"]
    publication_type = paper["publicationTypes"]
    # Keep only the first listed type. A truthiness check covers both None
    # and an empty list (indexing an empty list would raise IndexError).
    publication_type = publication_type[0] if publication_type else None
    paper_authors[arxiv_id] = paper["authors"]
    paper_references[arxiv_id] = paper["references"]

    additional_data["arxiv_id"].append(arxiv_id)
    additional_data["doi"].append(doi)
    additional_data["reference_count"].append(reference_count)
    additional_data["citation_count"].append(citation_count)
    additional_data["publication_type"].append(publication_type)
    additional_data["semantic_scholar_id"].append(semantic_scholar_id)

# Passing `columns` keeps the join key present even when no record was
# collected, so the merge below still works on an empty table.
additional_df = pd.DataFrame(additional_data, columns=additional_columns)

# NOTE: this reassigns `papers_df`; re-running the cell without re-running the
# load cell merges a second time and yields suffixed duplicate columns.
papers_df = pd.merge(papers_df, additional_df, how="left", on="arxiv_id")

In [None]:
# Display the merged frame: the sample rows plus the metadata columns joined on `arxiv_id`.
papers_df

In [None]:
# Inspect the first raw record loaded from papers_metadata.json.
# NOTE(review): entries can be None (the merge loop skips them), in which case
# this cell shows no output.
papers_data[0]