# In [1]:
import time
import xml.etree.ElementTree as ET

import pandas as pd
import requests

def get_arxiv_metadata(category, start_year, end_year, max_results=1000, timeout=30):
    """Fetch the raw Atom XML for one arXiv category over a range of years.

    Args:
        category: arXiv category code, e.g. ``'cs.DB'``.
        start_year: First submission year to include (inclusive).
        end_year: Last submission year to include (inclusive).
        max_results: Upper bound on the number of entries requested in this
            single call. Results beyond this bound are not fetched.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the caller indefinitely.

    Returns:
        The response body as text (an Atom XML document).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    base_url = 'http://export.arxiv.org/api/query'
    # The arXiv API manual documents submittedDate bounds as YYYYMMDDTTTT
    # (GMT, to the minute). Spelling out 0000 / 2359 keeps the whole of the
    # first and last day inside the range.
    query = (
        f"cat:{category} AND "
        f"submittedDate:[{start_year}01010000 TO {end_year}12312359]"
    )
    params = {'search_query': query, 'max_results': max_results}
    response = requests.get(base_url, params=params, timeout=timeout)
    # Fail loudly on HTTP errors instead of handing an error page to the parser.
    response.raise_for_status()
    return response.text

def extract_metadata(xml_data, year):
    """Parse an arXiv Atom feed into a list of per-paper metadata dicts.

    Args:
        xml_data: Atom XML document as a string.
        year: Value stored under the ``'year'`` key of every record. It is
            taken from the caller, not parsed out of the feed.

    Returns:
        A list with one dict per ``<entry>`` element, with the keys
        ``title``, ``authors`` (list of names), ``summary``, ``year``,
        ``entry_id``, ``updated``, ``published``, ``primary_category`` and
        ``links`` (the href of the link titled "pdf"). A field whose element
        is absent from an entry is ``None`` (``authors`` is then ``[]``)
        rather than aborting the whole parse.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_data`` is not well-formed XML.
    """
    ns = {
        'atom': 'http://www.w3.org/2005/Atom',
        'arxiv': 'http://arxiv.org/schemas/atom',
    }

    def text_of(parent, path):
        # Text of the first match, or None when the element is missing.
        node = parent.find(path, ns)
        return node.text if node is not None else None

    def attr_of(parent, path, attr):
        # Attribute of the first match, or None when element/attribute is missing.
        node = parent.find(path, ns)
        return node.get(attr) if node is not None else None

    metadata = []
    root = ET.fromstring(xml_data)
    for entry in root.findall('atom:entry', ns):
        # Prefer the dedicated <arxiv:primary_category> element; fall back to
        # the first Atom <category> for feeds that do not carry it.
        primary_category = attr_of(entry, 'arxiv:primary_category', 'term')
        if primary_category is None:
            primary_category = attr_of(
                entry,
                'atom:category[@scheme="http://arxiv.org/schemas/atom"]',
                'term',
            )

        metadata.append({
            'title': text_of(entry, 'atom:title'),
            'authors': [
                name.text
                for name in entry.findall('.//atom:author/atom:name', ns)
            ],
            'summary': text_of(entry, './/atom:summary'),
            'year': year,
            'entry_id': text_of(entry, 'atom:id'),
            'updated': text_of(entry, 'atom:updated'),
            'published': text_of(entry, 'atom:published'),
            'primary_category': primary_category,
            'links': attr_of(entry, 'atom:link[@title="pdf"]', 'href'),
        })
    return metadata

def store_metadata(metadata, output_file):
    """Write the metadata records to ``output_file`` as CSV.

    Args:
        metadata: Records to tabulate, passed straight to ``pd.DataFrame``.
        output_file: Destination handed to ``DataFrame.to_csv``.

    The DataFrame's row index is left out of the output.
    """
    pd.DataFrame(data=metadata).to_csv(path_or_buf=output_file, index=False)
    
# Driver: download metadata for every (category, year) pair and save it as CSV.
categories = ['cs.DB', 'cs.GR', 'cs.RO', 'cs.ET']
start_year = 2018
end_year = 2022

# arXiv asks API clients to leave roughly three seconds between requests.
request_delay_seconds = 3

all_metadata = []
# Accumulates the records from every request.
first_request = True
for category in categories:
    for year in range(start_year, end_year + 1):
        # Pause before every request except the very first one.
        if not first_request:
            time.sleep(request_delay_seconds)
        first_request = False

        # One request per single year; each call is capped by the fetcher's
        # result limit, so very large category/year pairs may be truncated.
        xml_data = get_arxiv_metadata(category, year, year)
        metadata = extract_metadata(xml_data, year)
        all_metadata.extend(metadata)

output_file = 'papers.csv'
store_metadata(all_metadata, output_file)