In [1]:
import dspy
import os
from dotenv import load_dotenv
from bs4 import BeautifulSoup
import requests

load_dotenv()
lm = dspy.LM(model="gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))
dspy.configure(lm=lm)

def get_paper_summary(url: str) -> str:
    """Get a summary of a paper from a url."""
    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    return soup

print(get_paper_summary("https://arxiv.org/html/2511.09831v1"))

<!DOCTYPE html>

<html lang="en">
<head>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<title>Answering Students‚Äô Questions on Course Forums Using Multiple Chain-of-Thought Reasoning and Finetuning RAG-Enabled LLM</title>
<!--Generated on Thu Nov 13 00:23:31 2025 by LaTeXML (version 0.8.8) http://dlmf.nist.gov/LaTeXML/.-->
<meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>
<link href="/static/browse/0.3.4/css/arxiv-html-papers-20250916.css" rel="stylesheet" type="text/css"/>
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
<script src="https://cdnjs.cloudflare.com/ajax/libs/html2canvas/1.3.3/html2canvas.min.js"></script>
<script src="/static/browse/0.3.4/js/addons_new.js"></script>
<script src="/static/browse/0.3.4/js/feedbackOverlay.js"></script>
<base href="/html/2511.09831v1/"/></head>
<body>
<nav class="ltx_page_navbar">
<nav class="ltx_TOC">
<ol class="ltx_toclist">
<l

In [6]:
"""
ArXiv HTML to Markdown Parser
Step-by-step parsing of arXiv HTML papers
"""

from bs4 import BeautifulSoup
import re

def parse_arxiv_html(html_content):
    """
    Parse arXiv HTML and convert to markdown
    """
    soup = html_content
    markdown_lines = []
    # Step 1: Extract title
    title = soup.find('h1', class_='ltx_title_document')
    if title:
        markdown_lines.append(f"# {title.get_text(strip=True)}\n")
    
    # Step 2: Extract authors
    authors_div = soup.find('div', class_='ltx_authors')
    if authors_div:
        authors = authors_div.get_text(strip=True)
        # Clean up superscript numbers
        authors = re.sub(r'\d+', '', authors)
        markdown_lines.append(f"**Authors:** {authors}\n")
    
    # Step 3: Extract abstract
    abstract_div = soup.find('div', class_='ltx_abstract')
    if abstract_div:
        markdown_lines.append("## Abstract\n")
        # Get the abstract text, excluding the title
        abstract_title = abstract_div.find('h6')
        if abstract_title:
            abstract_title.extract()
        abstract_text = abstract_div.get_text(strip=True)
        markdown_lines.append(f"{abstract_text}\n")
    
    # Step 4: Extract main content sections
    article = soup.find('article', class_='ltx_document')
    if article:
        sections = article.find_all('section', class_='ltx_section')
        
        for section in sections:
            # Get section header
            section_title = section.find('h2', class_='ltx_title_section')
            if section_title:
                # Extract section number and title
                title_text = section_title.get_text(strip=True)
                markdown_lines.append(f"\n## {title_text}\n")
            
            # Get subsections
            subsections = section.find_all('section', class_='ltx_subsection')
            if subsections:
                for subsection in subsections:
                    subsection_title = subsection.find('h3', class_='ltx_title_subsection')
                    if subsection_title:
                        title_text = subsection_title.get_text(strip=True)
                        markdown_lines.append(f"\n### {title_text}\n")
                    
                    # Get paragraphs in subsection
                    paragraphs = subsection.find_all('div', class_='ltx_para')
                    for para in paragraphs:
                        # Skip if it contains subsection title
                        if para.find('h3') or para.find('h4'):
                            continue
                        para_text = para.get_text(strip=True)
                        if para_text:
                            markdown_lines.append(f"{para_text}\n")
            else:
                # Get paragraphs directly in section
                paragraphs = section.find_all('div', class_='ltx_para', recursive=False)
                for para in paragraphs:
                    # Skip if contains headers
                    if para.find(['h2', 'h3', 'h4']):
                        continue
                    para_text = para.get_text(strip=True)
                    if para_text:
                        markdown_lines.append(f"{para_text}\n")
            
            # Get tables in section
            tables = section.find_all('figure', class_='ltx_table')
            for table in tables:
                caption = table.find('figcaption')
                if caption:
                    markdown_lines.append(f"\n**{caption.get_text(strip=True)}**\n")
                
                # Simple table extraction (can be improved)
                table_elem = table.find('table')
                if table_elem:
                    markdown_lines.append("\n| Table content |\n|---|\n")
            
            # Get figures in section
            figures = section.find_all('figure', class_='ltx_figure')
            for figure in figures:
                caption = figure.find('figcaption')
                if caption:
                    markdown_lines.append(f"\n**{caption.get_text(strip=True)}**\n")
                img = figure.find('img')
                if img and img.get('src'):
                    markdown_lines.append(f"![Figure]({img['src']})\n")
    
    # Step 5: Extract bibliography
    bibliography = soup.find('section', class_='ltx_bibliography')
    if bibliography:
        markdown_lines.append("\n## References\n")
        bib_items = bibliography.find_all('li', class_='ltx_bibitem')
        for item in bib_items:
            ref_text = item.get_text(strip=True)
            # Clean up the reference
            ref_text = re.sub(r'^\[\d+\]', '', ref_text)
            markdown_lines.append(f"- {ref_text}\n")
    
    return '\n'.join(markdown_lines)


def clean_markdown(markdown_text):
    """
    Clean up the markdown text
    """
    # Remove multiple consecutive newlines
    markdown_text = re.sub(r'\n{3,}', '\n\n', markdown_text)
    
    # Clean up citations [1] format
    markdown_text = re.sub(r'\[(\d+)\]', r'[\1]', markdown_text)
    
    return markdown_text


if __name__ == "__main__":
    # Read the HTML file
    
    # Parse and convert to markdown
    markdown_output = parse_arxiv_html(get_paper_summary("https://arxiv.org/html/2511.09831v1"))
    markdown_output = clean_markdown(markdown_output)
    
    # # Save to file
    # output_path = '/'
    # with open(output_path, 'w', encoding='utf-8') as f:
    #     f.write(markdown_output)
    
    # print(f"Conversion complete! Markdown saved to: {output_path}")
    print(f"\nFirst 500 characters of output:\n{markdown_output[:500]}...")

FileExistsError: [Errno 17] File exists: '/'