In [1]:
!pip install requests beautifulsoup4 lxml -q

In [2]:
!pip install tqdm -q

In [3]:
import re
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin
from typing import List
from tqdm import tqdm

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# HTML scrawler


In [5]:
def extract_title(soup):
    h1_tag = soup.find("h1")
    title = h1_tag.get_text(strip=True) if h1_tag else None
    return title

def extract_abstract(soup):
    abstract = soup.find('div', class_='ltx_abstract')
    paragraph = abstract.find_all('p')
    abstract = ''
    for p in paragraph:
        abstract += p.get_text()
    clean_abstract = re.sub(r'\s+', ' ', abstract)
    return clean_abstract.strip()


def extract_intro(soup):
    # Attempt to find a section with an <h2> tag that includes "Introduction"
    intro = None
    for section in soup.find_all('section'):
        h2_tag = section.find('h2', {'class': 'ltx_title ltx_title_section'})
        if h2_tag and 'Introduction' in h2_tag.get_text():
            intro = section
            h2_tag.decompose()  # Remove the <h2> tag to clean the intro text
            break

    # Fallback if no <h2> with "Introduction" is found
    if not intro:
        intro = soup.find('section')

    if intro:
        # Clean and format the intro text
        clean_intro = re.sub(r'\s+', ' ', intro.get_text())
        return clean_intro.strip()

    # Return None if no suitable section was found
    return None

# Investigate Arxiv structure


In [6]:
def get_page_content(url):
    """Fetches content of a page."""
    response = requests.get(url)
    if response.status_code == 200:
        return response.content
    else:
        print(f"Failed to fetch {url}")
        return None

Find out which paper has html format

In [7]:
def find_view_html_links(page_content) -> List:
    """Finds all <a> tags with title="View HTML"."""
    soup = BeautifulSoup(page_content, 'html.parser')
    # Find all <a> tags with title="View HTML"
    view_html_links = soup.find_all("a", title="View HTML")

    html_links = []
    # Print each link's href and text
    for link in view_html_links:
        href = link.get("href")
        html_links.append(href)
    return html_links

In [8]:
len(find_view_html_links(get_page_content("https://arxiv.org/list/cs.CV/recent?skip=0&show=2000")))

513

Scrawl paper links on sub pages

In [29]:
def active_all_sub_pages(start_url):
    html_url_list = []

    # Fetch page content
    page_content = get_page_content(start_url)
    if page_content is None:
        return None

    soup = BeautifulSoup(page_content, 'html.parser')

    # # Pagination logic
    # paging = soup.select_one("div.paging")
    # paging = soup.select_one("div.paging")
    # if paging:
    #     # Find all pagination links in paging section
    #     pagination_links = paging.find_all("a", href=True)
    #     if pagination_links:
    #         for i in range(len(pagination_links)):
    #             sub_page_url = urljoin(base_url, pagination_links[i]["href"])
    #             print(f"Crawling sub page: {sub_page_url}")
    #             sub_page_content = get_page_content(sub_page_url)
    #             html_url_list.extend(find_view_html_links(page_content))
    #     else:
    #         print("Reached the last page, stopping crawl.")
    #         return None
    # else:
    #     return None

    active_href = soup.find('div', class_='morefewer').find_all('a', href=True)
    all_pages = active_href[len(active_href)-1]

    if 'all' in all_pages.text:
        all_pages_url = urljoin(base_url, all_pages["href"])
        all_pages_content = get_page_content(all_pages_url)
    else:
        all_pages_content = page_content

    html_url_list.extend(find_view_html_links(all_pages_content))

    return html_url_list


In [10]:
def main(base_url, start_url):
    paper_url_lst = active_all_sub_pages(start_url)
    if paper_url_lst is None:
        return None
    data = []
    missing_urls = []
    fail_urls = []

    for url in tqdm(paper_url_lst, desc="Processing URLs"):
        response = requests.get(url)
        if response.status_code == 200:
          soup = BeautifulSoup(response.text, 'lxml')
        else:
          print(f"Failed to fetch {url}")
          return None
        try:
          title = extract_title(soup)
          abstract = extract_abstract(soup)
          intro = extract_intro(soup)

          if not title or not abstract or not intro:
              missing_urls.append(url)
              continue
        except:
          fail_urls.append(url)
          continue

        # Append to data list as a dictionary
        data.append({
            "URL": url,
            "Title": title,
            "Abstract": abstract,
            "Introduction": intro
        })

    print(f"Failed to fetch {fail_urls}")
    print(f"Missing urls: {missing_urls}")

    return data

In [27]:
def save_to_csv(data, filename):
    print("\t Valid data: ", len(data))
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)

In [None]:
'''
<a href="/list/cs.AI/recent" id="cs.AI" aria-labelledby="main-cs cs.AI">Artificial Intelligence</a>;
<a href="/list/cs.CL/recent" id="cs.CL" aria-labelledby="main-cs cs.CL">Computation and Language</a>;
<a href="/list/cs.CC/recent" id="cs.CC" aria-labelledby="main-cs cs.CC">Computational Complexity</a>;
<a href="/list/cs.CE/recent" id="cs.CE" aria-labelledby="main-cs cs.CE">Computational Engineering, Finance, and Science</a>;
<a href="/list/cs.CG/recent" id="cs.CG" aria-labelledby="main-cs cs.CG">Computational Geometry</a>;
<a href="/list/cs.GT/recent" id="cs.GT" aria-labelledby="main-cs cs.GT">Computer Science and Game Theory</a>;
<a href="/list/cs.CV/recent" id="cs.CV" aria-labelledby="main-cs cs.CV">Computer Vision and Pattern Recognition</a>;
<a href="/list/cs.CY/recent" id="cs.CY" aria-labelledby="main-cs cs.CY">Computers and Society</a>;
<a href="/list/cs.CR/recent" id="cs.CR" aria-labelledby="main-cs cs.CR">Cryptography and Security</a>;
<a href="/list/cs.DS/recent" id="cs.DS" aria-labelledby="main-cs cs.DS">Data Structures and Algorithms</a>;
<a href="/list/cs.DB/recent" id="cs.DB" aria-labelledby="main-cs cs.DB">Databases</a>;
<a href="/list/cs.DL/recent" id="cs.DL" aria-labelledby="main-cs cs.DL">Digital Libraries</a>;
<a href="/list/cs.DM/recent" id="cs.DM" aria-labelledby="main-cs cs.DM">Discrete Mathematics</a>;
<a href="/list/cs.DC/recent" id="cs.DC" aria-labelledby="main-cs cs.DC">Distributed, Parallel, and Cluster Computing</a>;
<a href="/list/cs.ET/recent" id="cs.ET" aria-labelledby="main-cs cs.ET">Emerging Technologies</a>;
<a href="/list/cs.FL/recent" id="cs.FL" aria-labelledby="main-cs cs.FL">Formal Languages and Automata Theory</a>;
'''

In [33]:
html = """
<a href="/list/cs.GL/recent" id="cs.GL" aria-labelledby="main-cs cs.GL">General Literature</a>;
<a href="/list/cs.GR/recent" id="cs.GR" aria-labelledby="main-cs cs.GR">Graphics</a>;
<a href="/list/cs.AR/recent" id="cs.AR" aria-labelledby="main-cs cs.AR">Hardware Architecture</a>;
<a href="/list/cs.HC/recent" id="cs.HC" aria-labelledby="main-cs cs.HC">Human-Computer Interaction</a>;
<a href="/list/cs.IR/recent" id="cs.IR" aria-labelledby="main-cs cs.IR">Information Retrieval</a>;
<a href="/list/cs.IT/recent" id="cs.IT" aria-labelledby="main-cs cs.IT">Information Theory</a>;
<a href="/list/cs.LO/recent" id="cs.LO" aria-labelledby="main-cs cs.LO">Logic in Computer Science</a>;
<a href="/list/cs.LG/recent" id="cs.LG" aria-labelledby="main-cs cs.LG">Machine Learning</a>;
<a href="/list/cs.MS/recent" id="cs.MS" aria-labelledby="main-cs cs.MS">Mathematical Software</a>;
<a href="/list/cs.MA/recent" id="cs.MA" aria-labelledby="main-cs cs.MA">Multiagent Systems</a>;
<a href="/list/cs.MM/recent" id="cs.MM" aria-labelledby="main-cs cs.MM">Multimedia</a>;
<a href="/list/cs.NI/recent" id="cs.NI" aria-labelledby="main-cs cs.NI">Networking and Internet Architecture</a>;
<a href="/list/cs.NE/recent" id="cs.NE" aria-labelledby="main-cs cs.NE">Neural and Evolutionary Computing</a>;
<a href="/list/cs.NA/recent" id="cs.NA" aria-labelledby="main-cs cs.NA">Numerical Analysis</a>;
<a href="/list/cs.OS/recent" id="cs.OS" aria-labelledby="main-cs cs.OS">Operating Systems</a>;
<a href="/list/cs.OH/recent" id="cs.OH" aria-labelledby="main-cs cs.OH">Other Computer Science</a>;
<a href="/list/cs.PF/recent" id="cs.PF" aria-labelledby="main-cs cs.PF">Performance</a>;
<a href="/list/cs.PL/recent" id="cs.PL" aria-labelledby="main-cs cs.PL">Programming Languages</a>;
<a href="/list/cs.RO/recent" id="cs.RO" aria-labelledby="main-cs cs.RO">Robotics</a>;
<a href="/list/cs.SI/recent" id="cs.SI" aria-labelledby="main-cs cs.SI">Social and Information Networks</a>;
<a href="/list/cs.SE/recent" id="cs.SE" aria-labelledby="main-cs cs.SE">Software Engineering</a>;
<a href="/list/cs.SD/recent" id="cs.SD" aria-labelledby="main-cs cs.SD">Sound</a>;
<a href="/list/cs.SC/recent" id="cs.SC" aria-labelledby="main-cs cs.SC">Symbolic Computation</a>;
<a href="/list/cs.SY/recent" id="cs.SY" aria-labelledby="main-cs cs.SY">Systems and Control</a>
"""

soup = BeautifulSoup(html, 'html.parser')
category_dict = {}

for a in soup.find_all('a'):
    # Convert the text content to snake_case and use it as the key
    category_name = a.text.lower().replace(" ", "_").replace(",", "")
    category_dict[category_name] = a['href']


In [13]:
base_url = "https://arxiv.org"


In [34]:
for name, start_url in list(category_dict.items()):
    print(f"Crawling category: {name}")
    start_url = urljoin(base_url, start_url)
    try:
      content = main(base_url, start_url)
    except:
      print(f"  Invalid structure. Failed to crawl {name}")
      continue
    save_to_csv(content, f"/content/drive/MyDrive/TitleGenerator/DataSheet/{name}.csv")

Crawling category: general_literature
Invalid structure. Failed to crawl general_literature
Crawling category: graphics


Processing URLs: 100%|██████████| 18/18 [00:05<00:00,  3.34it/s]


Failed to fetch []
Missing urls: []
	 Valid data:  18
Crawling category: hardware_architecture


Processing URLs: 100%|██████████| 22/22 [00:07<00:00,  2.79it/s]


Failed to fetch ['https://arxiv.org/html/2411.00815v1', 'https://arxiv.org/html/2411.00734v1', 'https://arxiv.org/html/2411.00530v1', 'https://arxiv.org/html/2411.00408v1']
Missing urls: []
	 Valid data:  18
Crawling category: human-computer_interaction


Processing URLs: 100%|██████████| 86/86 [00:22<00:00,  3.88it/s]


Failed to fetch ['https://arxiv.org/html/2411.03243v1', 'https://arxiv.org/html/2411.02714v1', 'https://arxiv.org/html/2411.00007v1']
Missing urls: ['https://arxiv.org/html/2411.03287v1']
	 Valid data:  82
Crawling category: information_retrieval


Processing URLs: 100%|██████████| 64/64 [00:28<00:00,  2.23it/s]


Failed to fetch ['https://arxiv.org/html/2411.01843v1', 'https://arxiv.org/html/2411.00702v1', 'https://arxiv.org/html/2411.00188v1']
Missing urls: ['https://arxiv.org/html/2411.00780v1']
	 Valid data:  60
Crawling category: information_theory


Processing URLs: 100%|██████████| 61/61 [01:03<00:00,  1.04s/it]


Failed to fetch ['https://arxiv.org/html/2411.04306v1', 'https://arxiv.org/html/2411.03054v1', 'https://arxiv.org/html/2411.00790v1']
Missing urls: ['https://arxiv.org/html/2411.02004v1']
	 Valid data:  57
Crawling category: logic_in_computer_science


Processing URLs: 100%|██████████| 17/17 [00:14<00:00,  1.18it/s]


Failed to fetch ['https://arxiv.org/html/2411.00026v1']
Missing urls: []
	 Valid data:  16
Crawling category: machine_learning


Processing URLs: 100%|██████████| 846/846 [08:46<00:00,  1.61it/s]


Failed to fetch ['https://arxiv.org/html/2411.04453v1', 'https://arxiv.org/html/2411.04396v1', 'https://arxiv.org/html/2411.04315v1', 'https://arxiv.org/html/2411.04580v1', 'https://arxiv.org/html/2411.04389v1', 'https://arxiv.org/html/2411.04265v1', 'https://arxiv.org/html/2411.03877v1', 'https://arxiv.org/html/2411.03753v1', 'https://arxiv.org/html/2411.03387v1', 'https://arxiv.org/html/2411.03320v1', 'https://arxiv.org/html/2411.02708v1', 'https://arxiv.org/html/2411.02820v1', 'https://arxiv.org/html/2411.02645v1', 'https://arxiv.org/html/2411.02635v1', 'https://arxiv.org/html/2411.02557v1', 'https://arxiv.org/html/2411.02495v1', 'https://arxiv.org/html/2411.02343v1', 'https://arxiv.org/html/2411.00851v1', 'https://arxiv.org/html/2411.00796v1', 'https://arxiv.org/html/2411.02224v2', 'https://arxiv.org/html/2411.01982v1', 'https://arxiv.org/html/2411.00833v1', 'https://arxiv.org/html/2411.00336v1', 'https://arxiv.org/html/2411.00623v1', 'https://arxiv.org/html/2411.00530v1', 'https:/

Processing URLs: 100%|██████████| 2/2 [00:01<00:00,  1.34it/s]


Failed to fetch []
Missing urls: []
	 Valid data:  2
Crawling category: multiagent_systems


Processing URLs: 100%|██████████| 27/27 [00:12<00:00,  2.23it/s]


Failed to fetch ['https://arxiv.org/html/2411.02820v1']
Missing urls: ['https://arxiv.org/html/2411.03519v1']
	 Valid data:  25
Crawling category: multimedia


Processing URLs: 100%|██████████| 19/19 [00:05<00:00,  3.21it/s]


Failed to fetch []
Missing urls: []
	 Valid data:  19
Crawling category: networking_and_internet_architecture


Processing URLs: 100%|██████████| 59/59 [00:26<00:00,  2.19it/s]


Failed to fetch ['https://arxiv.org/html/2411.03507v1', 'https://arxiv.org/html/2411.03203v1', 'https://arxiv.org/html/2411.03017v1', 'https://arxiv.org/html/2411.01970v1', 'https://arxiv.org/html/2411.00681v1', 'https://arxiv.org/html/2411.00408v1']
Missing urls: []
	 Valid data:  53
Crawling category: neural_and_evolutionary_computing


Processing URLs: 100%|██████████| 27/27 [00:11<00:00,  2.34it/s]


Failed to fetch []
Missing urls: ['https://arxiv.org/html/2411.02406v1']
	 Valid data:  26
Crawling category: numerical_analysis


Processing URLs: 100%|██████████| 64/64 [01:14<00:00,  1.16s/it]


Failed to fetch []
Missing urls: ['https://arxiv.org/html/2411.04145v1']
	 Valid data:  63
Crawling category: operating_systems


Processing URLs: 100%|██████████| 2/2 [00:00<00:00,  6.59it/s]


Failed to fetch []
Missing urls: []
	 Valid data:  2
Crawling category: other_computer_science
Invalid structure. Failed to crawl other_computer_science
Crawling category: performance


Processing URLs: 100%|██████████| 6/6 [00:02<00:00,  2.33it/s]


Failed to fetch ['https://arxiv.org/html/2411.00815v1']
Missing urls: []
	 Valid data:  5
Crawling category: programming_languages


Processing URLs: 100%|██████████| 6/6 [00:02<00:00,  2.25it/s]


Failed to fetch ['https://arxiv.org/html/2411.00637v1']
Missing urls: []
	 Valid data:  5
Crawling category: robotics


Processing URLs: 100%|██████████| 169/169 [00:58<00:00,  2.90it/s]


Failed to fetch ['https://arxiv.org/html/2411.04374v1', 'https://arxiv.org/html/2411.04005v1', 'https://arxiv.org/html/2411.01943v1', 'https://arxiv.org/html/2411.01475v1', 'https://arxiv.org/html/2411.00785v1', 'https://arxiv.org/html/2411.00345v1', 'https://arxiv.org/html/2411.00007v1']
Missing urls: ['https://arxiv.org/html/2411.03287v1', 'https://arxiv.org/html/2411.00659v1']
	 Valid data:  160
Crawling category: social_and_information_networks


Processing URLs: 100%|██████████| 41/41 [00:21<00:00,  1.89it/s]


Failed to fetch ['https://arxiv.org/html/2411.01852v1', 'https://arxiv.org/html/2411.01330v1', 'https://arxiv.org/html/2411.02005v1', 'https://arxiv.org/html/2411.00376v1', 'https://arxiv.org/html/2411.00702v1']
Missing urls: []
	 Valid data:  36
Crawling category: software_engineering


Processing URLs: 100%|██████████| 38/38 [00:10<00:00,  3.55it/s]


Failed to fetch ['https://arxiv.org/html/2411.03455v1', 'https://arxiv.org/html/2411.01601v1']
Missing urls: []
	 Valid data:  36
Crawling category: sound


Processing URLs: 100%|██████████| 43/43 [00:10<00:00,  4.07it/s]


Failed to fetch ['https://arxiv.org/html/2411.04142v1']
Missing urls: []
	 Valid data:  42
Crawling category: symbolic_computation


Processing URLs: 100%|██████████| 2/2 [00:01<00:00,  1.56it/s]


Failed to fetch []
Missing urls: []
	 Valid data:  2
Crawling category: systems_and_control


Processing URLs: 100%|██████████| 93/93 [00:54<00:00,  1.70it/s]

Failed to fetch ['https://arxiv.org/html/2411.03834v1', 'https://arxiv.org/html/2411.03271v1', 'https://arxiv.org/html/2411.01668v1', 'https://arxiv.org/html/2411.00318v1']
Missing urls: ['https://arxiv.org/html/2411.03287v1', 'https://arxiv.org/html/2411.01380v1']
	 Valid data:  87



