In [19]:
import urllib.parse
import urllib.request
import requests
import xml.etree.ElementTree as ET
from io import BytesIO
from PyPDF2 import PdfReader
import os
import time
import re

In [20]:
# --- Search parameters -------------------------------------------------------
keyword = "machine learning"   # free-text query sent to arXiv
num_articles = 10              # maximum number of results requested

# --- URL-encoding options passed to urllib.parse.quote ------------------------
encodingmethod = "utf-8"
errortype = "strict"

# --- Output location ----------------------------------------------------------
# Extracted article text files are written into this directory.
output_dir = "journal_articles"
if os.path.exists(output_dir):
    print(f"Directory already exists: {output_dir}")
else:
    os.makedirs(output_dir)
    print(f"Created directory: {output_dir}")

Created directory: journal_articles


In [21]:
# Build the arXiv API query URL; the keyword is percent-encoded so that spaces
# and other special characters are safe to embed in the query string.
encoded_search_term = urllib.parse.quote(keyword, encoding=encodingmethod, errors=errortype)
url = f'http://export.arxiv.org/api/query?search_query=all:{encoded_search_term}&start=0&max_results={num_articles}'

print(f"Searching for '{keyword}' on arXiv...")
print(f"URL: {url}")

try:
    # The timeout stops the cell from hanging forever on a stalled connection,
    # and the `with` block guarantees the HTTP response is closed.
    with urllib.request.urlopen(url, timeout=30) as response:
        url_read = response.read().decode("utf-8")
    # Parse the Atom feed into an ElementTree root used by the following cells.
    parse_xml = ET.fromstring(url_read)
    print("Successfully retrieved search results!")
except Exception as e:
    # Report the problem, then re-raise so later cells do not run on stale data.
    print(f"Error retrieving data: {e}")
    raise

Searching for 'machine learning' on arXiv...
URL: http://export.arxiv.org/api/query?search_query=all:machine%20learning&start=0&max_results=10
Successfully retrieved search results!
Successfully retrieved search results!


In [22]:
# Atom namespace used by every element in the arXiv feed.
ns = {"ns": "http://www.w3.org/2005/Atom"}
entries = parse_xml.findall('ns:entry', ns)


def _element_text(parent, path, default=None):
    """Return the stripped text of the first child of `parent` matching `path`.

    Returns `default` when the element is absent or carries no text, so that
    callers never have to deal with a `None` `.text` attribute.
    """
    element = parent.find(path, ns)
    if element is None or element.text is None:
        return default
    text = element.text.strip()
    return text if text else default


# One dict per entry that exposes a PDF link: {'pdf_url': ..., 'metadata': {...}}.
articles_data = []
for entry in entries:
    link = entry.find('ns:link[@type="application/pdf"]', ns)
    if link is None or "href" not in link.attrib:
        # Entries without a PDF link cannot be downloaded; skip them.
        continue

    # Titles in the feed can be hard-wrapped across lines; collapse every run
    # of whitespace so the title is a single clean line.
    title_text = " ".join(_element_text(entry, 'ns:title', "Unknown Title").split())

    # Drop author elements with no usable text so a later ', '.join() is safe.
    author_names = [
        author.text.strip()
        for author in entry.findall('ns:author/ns:name', ns)
        if author.text and author.text.strip()
    ] or ["Unknown Author"]

    # Keep only the first 10 characters of the timestamp (the date portion).
    published_text = _element_text(entry, 'ns:published')
    published_date = published_text[:10] if published_text else "Unknown Date"

    summary_text = _element_text(entry, 'ns:summary', "No summary available")

    articles_data.append({
        'pdf_url': link.attrib['href'],
        'metadata': {
            'title': title_text,
            'authors': author_names,
            'published': published_date,
            'summary': summary_text,
        },
    })

print(f"Found {len(articles_data)} articles with PDF links")
for i, article in enumerate(articles_data, start=1):
    print(f"{i}. {article['metadata']['title'][:80]}...")

Found 10 articles with PDF links
1. Lecture Notes: Optimization for Machine Learning...
2. An Optimal Control View of Adversarial Machine Learning...
3. Minimax deviation strategies for machine learning and recognition with
  short l...
4. Machine Learning for Clinical Predictive Analytics...
5. Towards Modular Machine Learning Solution Development: Benefits and
  Trade-offs...
6. Introduction to Machine Learning: Class Notes 67577...
7. The Tribes of Machine Learning and the Realm of Computer Architecture...
8. A Machine Learning Tutorial for Operational Meteorology, Part I:
  Traditional M...
9. Position Paper: Towards Transparent Machine Learning...
10. Understanding Bias in Machine Learning...


In [23]:
# Download every article PDF, extract and clean its text, and write one .txt
# file per article into `output_dir`. A failure on one article is reported and
# counted, and the loop moves on to the next article instead of aborting.
successful_downloads = 0
failed_downloads = 0
total_articles = len(articles_data)

for i, article in enumerate(articles_data):
    print(f"Processing article {i+1}")
    print(f"Title: {article['metadata']['title'][:80]}...")

    try:
        pdf_response = requests.get(article['pdf_url'], timeout=30)
        pdf_response.raise_for_status()

        pdf = PdfReader(BytesIO(pdf_response.content))

        # Collect non-empty page texts and join once, rather than growing a
        # string page by page. A falsy extraction result is treated as empty.
        page_texts = []
        for page in pdf.pages:
            page_text = page.extract_text() or ""
            if page_text.strip():
                page_texts.append(page_text)
        pdf_text = " ".join(page_texts)

        # Replace form feeds / vertical tabs / carriage returns and remove
        # control characters BEFORE collapsing spaces; doing it afterwards
        # would reintroduce runs of multiple spaces.
        pdf_text = re.sub(r'[\f\v\r]', ' ', pdf_text)
        pdf_text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', pdf_text)
        pdf_text = re.sub(r' {2,}', ' ', pdf_text)
        pdf_text = re.sub(r'\n{3,}', '\n\n', pdf_text)
        # Force exactly one space between sentence-ending punctuation and a
        # following capital letter. Any whitespace in between, including
        # newlines, is replaced by that single space.
        pdf_text = re.sub(r'([.!?])\s*([A-Z])', r'\1 \2', pdf_text)
        pdf_text = pdf_text.strip()

        # Build a filesystem-friendly file name: drop punctuation, turn runs
        # of whitespace/hyphens into underscores, cap the title at 50 chars.
        safe_title = re.sub(r'[^\w\s-]', '', article['metadata']['title'])
        safe_title = re.sub(r'[-\s]+', '_', safe_title)[:50]
        filename = f"{i+1:02d}_{safe_title}.txt"
        filepath = os.path.join(output_dir, filename)

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write("="*80 + "\n")
            f.write(f"JOURNAL ARTICLE #{i+1}\n")
            f.write("="*80 + "\n\n")
            f.write(f"Title: {article['metadata']['title']}\n")
            f.write(f"Authors: {', '.join(article['metadata']['authors'])}\n")
            f.write(f"Published: {article['metadata']['published']}\n")
            f.write(f"Source: {article['pdf_url']}\n")
            f.write("\n" + "-"*80 + "\n")
            f.write("ABSTRACT/SUMMARY:\n")
            f.write("-"*80 + "\n")
            f.write(f"{article['metadata']['summary']}\n\n")
            f.write("-"*80 + "\n")
            f.write("FULL TEXT CONTENT:\n")
            f.write("-"*80 + "\n")
            f.write(pdf_text)
    except Exception as e:
        # Network errors, HTTP error statuses, unreadable PDFs and file-system
        # errors all land here. KeyboardInterrupt is not an Exception subclass,
        # so interrupting the kernel still stops the loop.
        failed_downloads += 1
        print(f"Failed to process article {i+1}: {e}")
    else:
        print(f"saved to: {filename}")
        successful_downloads += 1

    # Pause between requests to avoid hammering the server; there is nothing
    # to wait for after the final article.
    if i + 1 < total_articles:
        time.sleep(2)

print(f"Finished: {successful_downloads} saved, {failed_downloads} failed")

Processing article 1
Title: Lecture Notes: Optimization for Machine Learning...
saved to: 01_Lecture_Notes_Optimization_for_Machine_Learning.txt
saved to: 01_Lecture_Notes_Optimization_for_Machine_Learning.txt
Processing article 2
Title: An Optimal Control View of Adversarial Machine Learning...
Processing article 2
Title: An Optimal Control View of Adversarial Machine Learning...
saved to: 02_An_Optimal_Control_View_of_Adversarial_Machine_Lea.txt
saved to: 02_An_Optimal_Control_View_of_Adversarial_Machine_Lea.txt
Processing article 3
Title: Minimax deviation strategies for machine learning and recognition with
  short l...
Processing article 3
Title: Minimax deviation strategies for machine learning and recognition with
  short l...
saved to: 03_Minimax_deviation_strategies_for_machine_learning_.txt
saved to: 03_Minimax_deviation_strategies_for_machine_learning_.txt
Processing article 4
Title: Machine Learning for Clinical Predictive Analytics...
saved to: 04_Machine_Learning_for_Clin