In [4]:
import json

import requests

In [5]:
import pandas as pd

df = pd.read_csv("../data/sample_2024.csv")
df.head(3)

Unnamed: 0,id,title,abstract,categories,update_date,title_words,abstract_words,mapped_categories,amount_categories,update_year,super_categories,super_category,amount_super_categories,of_interest
0,2308.06394,Detecting and Preventing Hallucinations in Lar...,Instruction tuned Large Vision Language Mode...,"['cs.CV', 'cs.LG']",2024-02-13,9,231,"['Computer Vision and Pattern Recognition', 'M...",2,2024,"['Computer Science', 'Computer Science']",Computer Science,1,True
1,2406.07294,OTO Planner: An Efficient Only Travelling Once...,Autonomous exploration in complex and clutte...,"['cs.RO', 'cs.CV']",2024-06-12,14,167,"['Robotics', 'Computer Vision and Pattern Reco...",2,2024,"['Computer Science', 'Computer Science']",Computer Science,1,True
2,2405.18293,CF-OPT: Counterfactual Explanations for Struct...,Optimization layers in deep neural networks ...,['cs.LG'],2024-06-04,6,162,['Machine Learning'],1,2024,['Computer Science'],Computer Science,1,True


In [6]:
from tqdm import tqdm

request_fields = "externalIds,referenceCount,citationCount,title,url,year,authors,fieldsOfStudy,s2FieldsOfStudy,publicationTypes,publicationDate,journal,references"

def request_papers(ids):
    r = requests.post(
        "https://api.semanticscholar.org/graph/v1/paper/batch",
        params={"fields": request_fields},
        json={"ids": ids},
    )
    return r.json()

def save_papers_data(papers_data):
    with open("papers_metadata.json", "w") as f:
        json.dump(papers_data, f, indent=4)

def process_df(df):
    papers_data = []

    for i in tqdm(range(0, len(df), 100)):
        ids = df["id"].iloc[i : i + 100].tolist()
        ids = [f"ARXIV:{id}" for id in ids]
        results = request_papers(ids)
        papers_data.extend(results)

    return papers_data

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:35<00:00,  7.15s/it]


In [21]:
import json

import pandas as pd

with open("../../data/papers_metadata.json") as f:
    papers_data = json.load(f)

papers_df = pd.read_csv("../../data/sample_2024.csv")
papers_df["arxiv_id"] = papers_df["id"].astype(str)

In [22]:
from collections import defaultdict

additional_data = defaultdict(list)
paper_authors = dict()
paper_references = dict()

for paper in papers_data:
    if paper is None:
        continue
    semantic_scholar_id = paper["paperId"]
    arxiv_id = paper["externalIds"]["ArXiv"]
    doi = paper["externalIds"].get("DOI", None)
    reference_count = paper["referenceCount"]
    citation_count = paper["citationCount"]
    publication_type = paper["publicationTypes"]
    publication_type = publication_type[0] if publication_type is not None else None
    paper_authors[arxiv_id] = paper["authors"]
    paper_references[arxiv_id] = paper["references"]

    additional_data["arxiv_id"].append(arxiv_id)
    additional_data["doi"].append(doi)
    additional_data["reference_count"].append(reference_count)
    additional_data["citation_count"].append(citation_count)
    additional_data["publication_type"].append(publication_type)
    additional_data["semantic_scholar_id"].append(semantic_scholar_id)

papers_df = pd.merge(papers_df, pd.DataFrame(additional_data), how="left", on="arxiv_id")

In [25]:
papers_df

Unnamed: 0,id,title,abstract,categories,update_date,title_words,abstract_words,mapped_categories,amount_categories,update_year,super_categories,super_category,amount_super_categories,of_interest,arxiv_id,doi,reference_count,citation_count,publication_type,semantic_scholar_id
0,2308.06394,Detecting and Preventing Hallucinations in Lar...,Instruction tuned Large Vision Language Mode...,"['cs.CV', 'cs.LG']",2024-02-13,9,231,"['Computer Vision and Pattern Recognition', 'M...",2,2024,"['Computer Science', 'Computer Science']",Computer Science,1,True,2308.06394,10.48550/arXiv.2308.06394,36.0,74.0,JournalArticle,658cd67a91da86cf451e6f1b015f762b56015172
1,2406.07294,OTO Planner: An Efficient Only Travelling Once...,Autonomous exploration in complex and clutte...,"['cs.RO', 'cs.CV']",2024-06-12,14,167,"['Robotics', 'Computer Vision and Pattern Reco...",2,2024,"['Computer Science', 'Computer Science']",Computer Science,1,True,2406.07294,10.48550/arXiv.2406.07294,30.0,0.0,JournalArticle,e2dcb0a115985bb1dfe539affc0f74f39eae7b56
2,2405.18293,CF-OPT: Counterfactual Explanations for Struct...,Optimization layers in deep neural networks ...,['cs.LG'],2024-06-04,6,162,['Machine Learning'],1,2024,['Computer Science'],Computer Science,1,True,2405.18293,10.48550/arXiv.2405.18293,35.0,0.0,JournalArticle,e0e350dd181bfb4928706ff4033ca45d40e3d15b
3,2406.17419,Leave No Document Behind: Benchmarking Long-Co...,Long-context modeling capabilities have garn...,"['cs.CL', 'cs.AI']",2024-06-26,11,164,"['Computation and Language', 'Artificial Intel...",2,2024,"['Computer Science', 'Computer Science']",Computer Science,1,True,2406.17419,10.48550/arXiv.2406.17419,30.0,3.0,JournalArticle,9061cc45c64846498f572c9ad2cb14f76324d665
4,2402.19014,Enhancing Visual Document Understanding with C...,"Recently, the advent of Large Visual-Languag...",['cs.CV'],2024-03-01,11,238,['Computer Vision and Pattern Recognition'],1,2024,['Computer Science'],Computer Science,1,True,2402.19014,10.48550/arXiv.2402.19014,42.0,4.0,JournalArticle,38d1dc7552115bd2ac0a76b9189928a62c04e994
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,2310.13585,POTLoc: Pseudo-Label Oriented Transformer for ...,This paper tackles the challenge of point-su...,['cs.CV'],2024-06-07,9,206,['Computer Vision and Pattern Recognition'],1,2024,['Computer Science'],Computer Science,1,True,2310.13585,10.48550/arXiv.2310.13585,71.0,0.0,JournalArticle,b6ff86a86eb96315194249b28045fa4475626a70
496,2402.02350,Interference-Aware Emergent Random Access Prot...,"In this article, we propose a multi-agent de...","['cs.NI', 'cs.LG']",2024-02-06,10,92,"['Networking and Internet Architecture', 'Mach...",2,2024,"['Computer Science', 'Computer Science']",Computer Science,1,True,2402.0235,,,,,
497,2404.14464,Tree of Reviews: A Tree-based Dynamic Iterativ...,Multi-hop question answering is a knowledge-...,"['cs.CL', 'cs.AI', 'cs.IR']",2024-04-24,13,246,"['Computation and Language', 'Artificial Intel...",3,2024,"['Computer Science', 'Computer Science', 'Comp...",Computer Science,1,True,2404.14464,10.48550/arXiv.2404.14464,26.0,1.0,JournalArticle,3be9c1db7465e386bdbd196546ab4b564e07c3a4
498,2405.16034,DiffuBox: Refining 3D Object Detection with Po...,Ensuring robust 3D object detection and loca...,['cs.CV'],2024-05-28,8,107,['Computer Vision and Pattern Recognition'],1,2024,['Computer Science'],Computer Science,1,True,2405.16034,10.48550/arXiv.2405.16034,55.0,1.0,JournalArticle,862e97c1aaf19b57f9775109b17ef308880391fd


In [5]:
papers_data[0]

{'paperId': '658cd67a91da86cf451e6f1b015f762b56015172',
 'externalIds': {'DBLP': 'conf/aaai/GunjalYB24',
  'ArXiv': '2308.06394',
  'DOI': '10.48550/arXiv.2308.06394',
  'CorpusId': 260887222},
 'url': 'https://www.semanticscholar.org/paper/658cd67a91da86cf451e6f1b015f762b56015172',
 'title': 'Detecting and Preventing Hallucinations in Large Vision Language Models',
 'year': 2023,
 'referenceCount': 36,
 'citationCount': 74,
 'fieldsOfStudy': ['Computer Science'],
 's2FieldsOfStudy': [{'category': 'Computer Science', 'source': 'external'},
  {'category': 'Computer Science', 'source': 's2-fos-model'}],
 'publicationTypes': ['JournalArticle', 'Conference'],
 'publicationDate': '2023-08-11',
 'journal': {'pages': '18135-18143'},
 'authors': [{'authorId': '1478299214', 'name': 'A. Gunjal'},
  {'authorId': '1602318431', 'name': 'Jihan Yin'},
  {'authorId': '2857272', 'name': 'Erhan Bas'}],
 'references': [{'paperId': '104b0bb1da562d53cbda87aec79ef6a2827d191a',
   'title': 'Llama 2: Open Fou