# Library Imports

We import all the necessary Python libraries used throughout the analysis.

In [3]:
import time
import requests
import feedparser
import json
from urllib.parse import quote_plus


The file aims to extract abstracts of papers from arXiv. In particular, we focus on papers with a literary theme.

In [5]:
KEYWORDS = [
    "literature",
    "literary",
    "philology",
    "book history",
    "textual criticism",
    "textual criticism",
    "book-history",
    "book_history",
    "digital humanities",
    "book studies",
    "manuscript",
    "codicology",
    "paleography",
    "literary theory",
    "comparative literature",
    "literary criticism",
]

TARGET_COUNT = 1000  
BATCH_SIZE = 100     
BASE_URL = "http://export.arxiv.org/api/query?"

# Utility
def make_query(keywords):
    parts = []
    for kw in keywords:
        parts.append('all:%s' % quote_plus('"%s"' % kw))
    return '+OR+'.join(parts)


def parse_entry(entry):
    arxiv_id = entry.get('id')
    title = entry.get('title', '').strip().replace('\n', ' ')
    abstract = entry.get('summary', '').strip().replace('\n', ' ')
    authors = [a.name for a in entry.get('authors', [])] if entry.get('authors') else []
    # categories: feedparser puts tags in 'term'
    cats = [t['term'] for t in entry.get('tags', [])] if entry.get('tags') else []
    primary_cat = cats[0] if cats else None
    # pdf link
    pdf_url = None
    for link in entry.get('links', []):
        if link.get('type') == 'application/pdf':
            pdf_url = link.get('href')
            break
    return {
        'id': arxiv_id,
        'title': title,
        'authors': authors,
        'primary_category': primary_cat,
        'categories': cats,
        'abstract': abstract,
        'pdf_url': pdf_url,
    }


def is_cs_paper(parsed_entry):
    pc = parsed_entry.get('primary_category')
    if not pc:
        return False
    return pc.startswith('cs')


def fetch_arxiv_abstracts(keywords, target_count=1000, batch_size=100, pause_seconds=3):
    collected = []
    seen_ids = set()
    query = make_query(keywords)
    start = 0
    total_results_estimate = None

    while len(collected) < target_count:
        url = f"{BASE_URL}search_query={query}&start={start}&max_results={batch_size}"
        print(f"arXiv request: start={start} max_results={batch_size}")
        r = requests.get(url, headers={'User-Agent': 'arXivAbstractFetcher/1.0 (+https://example.org)'})
        if r.status_code != 200:
            print(f"HTTP error {r.status_code} - stopping")
            break
        feed = feedparser.parse(r.text)

        if total_results_estimate is None:
            # feed.opensearch_totalresults usually exists
            try:
                total_results_estimate = int(feed['feed'].get('opensearch_totalresults', 0))
            except Exception:
                total_results_estimate = None

        entries = feed.entries
        if not entries:
            print("No more entries found: stopping")
            break

        new_found = 0
        for e in entries:
            parsed = parse_entry(e)
            # deduplication
            if parsed['id'] in seen_ids:
                continue
            seen_ids.add(parsed['id'])
            new_found += 1
            # filter out CS papers
            if is_cs_paper(parsed):
                continue
            collected.append(parsed)
            if len(collected) >= target_count:
                break

        print(f"Found {new_found} entries, collected so far: {len(collected)} / {target_count}")

        # move forward
        start += batch_size

        # if estimated total is reached, stop
        if total_results_estimate is not None and start >= total_results_estimate:
            print("Reached the estimated total number of results from the arXiv API.")
            break

        # pause to respect rate limits
        time.sleep(pause_seconds)

    return collected


if __name__ == '__main__':
    print("Starting arXiv fetch...")
    abstracts = fetch_arxiv_abstracts(KEYWORDS, target_count=TARGET_COUNT, batch_size=BATCH_SIZE)
    print(f"Downloaded {len(abstracts)} abstracts. Saving to file...")

    # Save JSON
    with open('abstracts.json', 'w', encoding='utf-8') as f:
        json.dump(abstracts, f, ensure_ascii=False, indent=2)

    import csv
    
    # Save abstracts in CSV
    with open('../Datasets/abstracts.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Index', 'Title', 'Authors', 'Categories', 'abstract']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    
        writer.writeheader()
        for i, a in enumerate(abstracts, 1):
            writer.writerow({
                'Title': a['title'],
                'Authors': ', '.join(a['authors']),
                'primary_category': ",".join(a['primary_category']),
                'Categories': ', '.join(a['categories']),
                'abstract': a['abstract'].strip()
            })

    print("Done. Files: abstracts.json  abstracts.txt")


Starting arXiv fetch...
arXiv request: start=0 max_results=100
Found 100 entries, collected so far: 9 / 1000
arXiv request: start=100 max_results=100
Found 100 entries, collected so far: 21 / 1000
arXiv request: start=200 max_results=100
Found 100 entries, collected so far: 47 / 1000
arXiv request: start=300 max_results=100
Found 100 entries, collected so far: 90 / 1000
arXiv request: start=400 max_results=100
Found 100 entries, collected so far: 118 / 1000
arXiv request: start=500 max_results=100
Found 100 entries, collected so far: 149 / 1000
arXiv request: start=600 max_results=100
Found 100 entries, collected so far: 182 / 1000
arXiv request: start=700 max_results=100
Found 100 entries, collected so far: 258 / 1000
arXiv request: start=800 max_results=100
Found 100 entries, collected so far: 328 / 1000
arXiv request: start=900 max_results=100
Found 100 entries, collected so far: 377 / 1000
arXiv request: start=1000 max_results=100
Found 100 entries, collected so far: 435 / 1000
arX

ValueError: dict contains fields not in fieldnames: 'primary_category'

In [None]:
import time
import requests
import feedparser
from urllib.parse import quote_plus
import csv

KEYWORDS = [
    "literature",
    "literary",
    "philology",
    "book history",
    "textual criticism",
    "book-history",
    "book_history",
    "digital humanities",
    "book studies",
    "manuscript",
    "codicology",
    "paleography",
    "literary theory",
    "comparative literature",
    "literary criticism",
]

TARGET_COUNT = 1000
BATCH_SIZE = 100
BASE_URL = "http://export.arxiv.org/api/query?"


# Utility
def make_query(keywords):
    parts = []
    for kw in keywords:
        parts.append('all:%s' % quote_plus('"%s"' % kw))
    return '+OR+'.join(parts)


def parse_entry(entry):
    arxiv_id = entry.get('id')
    title = entry.get('title', '').strip().replace('\n', ' ')
    abstract = entry.get('summary', '').strip().replace('\n', ' ')
    authors = [a.name for a in entry.get('authors', [])] if entry.get('authors') else []
    cats = [t['term'] for t in entry.get('tags', [])] if entry.get('tags') else []
    primary_cat = cats[0] if cats else None
    return {
        'id': arxiv_id,
        'title': title,
        'authors': authors,
        'primary_category': primary_cat,
        'categories': cats,
        'abstract': abstract,
    }


def is_cs_paper(parsed_entry):
    pc = parsed_entry.get('primary_category')
    return pc.startswith('cs') if pc else False


def fetch_arxiv_abstracts(keywords, target_count=1000, batch_size=100, pause_seconds=3):
    collected = []
    seen_ids = set()
    query = make_query(keywords)
    start = 0
    total_results_estimate = None

    while len(collected) < target_count:
        url = f"{BASE_URL}search_query={query}&start={start}&max_results={batch_size}"
        print(f"arXiv request: start={start} max_results={batch_size}")
        r = requests.get(url, headers={'User-Agent': 'arXivAbstractFetcher/1.0'})
        if r.status_code != 200:
            print(f"HTTP error {r.status_code} - stopping")
            break

        feed = feedparser.parse(r.text)
        if total_results_estimate is None:
            try:
                total_results_estimate = int(feed['feed'].get('opensearch_totalresults', 0))
            except Exception:
                total_results_estimate = None

        entries = feed.entries
        if not entries:
            print("No more entries found: stopping")
            break

        new_found = 0
        for e in entries:
            parsed = parse_entry(e)
            if parsed['id'] in seen_ids or is_cs_paper(parsed):
                continue
            seen_ids.add(parsed['id'])
            collected.append(parsed)
            new_found += 1
            if len(collected) >= target_count:
                break

        print(f"Found {new_found} new entries, collected so far: {len(collected)} / {target_count}")

        start += batch_size
        if total_results_estimate is not None and start >= total_results_estimate:
            print("Reached the estimated total number of results from the arXiv API.")
            break

        time.sleep(pause_seconds)

    return collected


if __name__ == '__main__':
    print("Starting arXiv fetch...")
    abstracts = fetch_arxiv_abstracts(KEYWORDS, target_count=TARGET_COUNT, batch_size=BATCH_SIZE)
    print(f"Downloaded {len(abstracts)} abstracts. Saving to CSV...")

    with open('abstracts.csv', 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Index', 'Title', 'Authors', 'Categories', 'Abstract']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for i, a in enumerate(abstracts, 1):
            writer.writerow({
                'Index': i,
                'Title': a['title'],
                'Authors': ', '.join(a['authors']),
                'Categories': ', '.join(a['categories']),
                'Abstract': a['abstract'].strip()
            })

    print("Done. File created: abstracts.csv")
