In [1]:
import feedparser
import requests
import os
import time
import pandas as pd
import re
# 🔹 Search Queries for Various Topics
TOPICS = [
    "machine learning",
    "deep learning",
    "reinforcement learning",
    "large language models",
    "chatbot",
    "natural language processing",
    "medical AI",
    "computer vision",
    "data science",
    "data mining",
    "databases",
    "big data",
    "cybersecurity",
    "bioinformatics",
]

# 🔹 Constants
# Output locations: PDFs go into SAVE_DIR, the metadata table into CSV_FILE
# (a bare file name, so it is resolved against the current working directory).
SAVE_DIR = r"D:\2CSI-Project\arxiv_papers"
CSV_FILE = "arxiv_metadata.csv"

# Crawl budget: MAX_PAPERS is the total across all topics (the fetch loop
# divides it evenly between them); RESULTS_PER_PAGE is the page size requested
# from the arXiv API on each call.
MAX_PAPERS = 10000
RESULTS_PER_PAGE = 200

# 🔹 Ensure Directories Exist
os.makedirs(SAVE_DIR, exist_ok=True)

# 🔹 Initialize Metadata Storage
# One [title, authors, abstract, category, pdf_url] row is appended per paper.
all_papers = []

# 🔹 Fetch Papers for Each Topic
def sanitize_filename(filename, max_length=50):
    """Turn a paper title into a string that is safe to use as a file name.

    Characters that are not allowed in Windows file names are removed,
    surrounding whitespace is stripped, and the result is cut to
    ``max_length`` characters.

    Args:
        filename: Raw text (here: a paper title or arXiv identifier).
        max_length: Maximum number of characters kept.

    Returns:
        The cleaned, possibly shortened string. It can be empty when the
        input consists only of removed characters or whitespace.

    Note:
        Truncation happens after stripping, so the result may end in a space,
        and distinct titles sharing the same first ``max_length`` characters
        map to the same name (a later download overwrites the earlier file).
    """
    filename = re.sub(r'[<>:"/\\|?*]', "", filename)
    filename = filename.strip()
    return filename[:max_length]


def _download_pdf(url, destination, timeout=10):
    """Stream ``url`` to ``destination`` and report whether it was saved.

    The body is first written to ``destination + ".part"`` and only renamed
    to ``destination`` once fully received, so an interrupted transfer never
    leaves a truncated file under the final name.

    Args:
        url: Address of the PDF.
        destination: Final path of the PDF on disk.
        timeout: Timeout in seconds passed to ``requests.get``.

    Returns:
        True if the server answered 200 and the file was written,
        False for any other status code.

    Raises:
        requests.RequestException: On network-level failures.
        OSError: If the file cannot be created, written or renamed.
    """
    partial_path = destination + ".part"
    # The context manager releases the streamed connection on every path;
    # with stream=True it would otherwise stay open until garbage collection.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            return False
        try:
            with open(partial_path, "wb") as file:
                for chunk in response.iter_content(1024):
                    file.write(chunk)
            os.replace(partial_path, destination)
        except (requests.RequestException, OSError):
            # Drop the incomplete file before propagating the error.
            try:
                os.remove(partial_path)
            except OSError:
                pass
            raise
    return True


for topic in TOPICS:
    print(f"Fetching papers for topic: {topic}")

    # The total budget is split evenly between topics. Computed inside the
    # loop so that an empty TOPICS list never triggers a division by zero.
    per_topic_limit = MAX_PAPERS // len(TOPICS)
    query = topic.replace(" ", "+")

    num_papers_fetched = 0
    # NOTE(review): paging starts at result 1024, so the first 1024 hits of
    # every topic are skipped — confirm this offset is intentional (it looks
    # like a leftover from resuming an earlier run).
    start_index = 1024

    while num_papers_fetched < per_topic_limit:
        api_url = f"http://export.arxiv.org/api/query?search_query={query}&start={start_index}&max_results={RESULTS_PER_PAGE}"

        print(f"Fetching {RESULTS_PER_PAGE} papers starting from {start_index}...")
        feed = feedparser.parse(api_url)

        if not feed.entries:
            print("No more results. Moving to next topic.")
            break

        for entry in feed.entries:
            # Use .get() so an entry lacking a field yields an empty value
            # instead of raising AttributeError and aborting the whole crawl.
            title = entry.get("title", "").replace("\n", " ").strip()
            authors = ", ".join(
                author["name"]
                for author in entry.get("authors", [])
                if author.get("name")
            )
            abstract = entry.get("summary", "").replace("\n", " ").strip()
            # Rewrite only the "/abs/" path segment; a plain "abs" -> "pdf"
            # replacement would also alter any other "abs" in the URL.
            pdf_url = entry.id.replace("/abs/", "/pdf/", 1)
            category = entry.arxiv_primary_category['term'] if 'arxiv_primary_category' in entry else "Unknown"

            # Save metadata
            all_papers.append([title, authors, abstract, category, pdf_url])

            # 🔹 Download PDF
            # Fall back to the last segment of the entry id when the title
            # sanitizes to nothing, so the file is never named just ".pdf".
            safe_title = sanitize_filename(title) or sanitize_filename(entry.id.rsplit("/", 1)[-1])
            pdf_filename = os.path.join(SAVE_DIR, f"{safe_title}.pdf")
            try:
                if _download_pdf(pdf_url, pdf_filename):
                    print(f"✅ Saved PDF: {pdf_filename}")
                else:
                    print(f"❌ Failed to download: {title}")
            except (requests.RequestException, OSError) as e:
                # File-system errors are reported like network errors: one bad
                # file name must not discard the metadata gathered so far.
                print(f"⚠️ Error downloading {title}: {e}")

            num_papers_fetched += 1
            if num_papers_fetched >= per_topic_limit:
                break

        start_index += RESULTS_PER_PAGE
        # arXiv asks API clients to wait about 3 seconds between calls.
        time.sleep(3)

# 🔹 Save Metadata to CSV
# Build the table in one step from the accumulated rows; the column order
# mirrors the order in which each row's fields were appended.
metadata_columns = ["Title", "Authors", "Abstract", "Category", "PDF_URL"]
df = pd.DataFrame(data=all_papers, columns=metadata_columns)

# Write the table without the positional index column, UTF-8 encoded.
df.to_csv(path_or_buf=CSV_FILE, encoding="utf-8", index=False)

# Final summary: how many rows were gathered and where the outputs live.
print(f"\n✅ Collected {len(all_papers)} papers. Metadata saved to {CSV_FILE}. PDFs saved in {SAVE_DIR}")


Fetching papers for topic: machine learning
Fetching 200 papers starting from 1024...
No more results. Moving to next topic.
Fetching papers for topic: deep learning
Fetching 200 papers starting from 1024...
✅ Saved PDF: D:\2CSI-Project\arxiv_papers\Proteomics Analysis of FLT3-ITD Mutation in Acute .pdf
✅ Saved PDF: D:\2CSI-Project\arxiv_papers\Deep Learning An Introduction for Applied Mathemat.pdf
✅ Saved PDF: D:\2CSI-Project\arxiv_papers\The Limits and Potentials of Deep Learning for Rob.pdf
✅ Saved PDF: D:\2CSI-Project\arxiv_papers\Probabilistic Deep Learning using Random Sum-Produ.pdf
✅ Saved PDF: D:\2CSI-Project\arxiv_papers\Reversed Active Learning based Atrous DenseNet for.pdf
✅ Saved PDF: D:\2CSI-Project\arxiv_papers\Disaster Monitoring using Unmanned Aerial Vehicles.pdf
✅ Saved PDF: D:\2CSI-Project\arxiv_papers\Macquarie University at BioASQ 6b Deep learning an.pdf
✅ Saved PDF: D:\2CSI-Project\arxiv_papers\Federated Deep Reinforcement Learning.pdf
✅ Saved PDF: D:\2CSI-Project\