<a href="https://colab.research.google.com/github/AhmedCoolProjects/ESI/blob/main/Text_Mining_Project_Scraping_ArXiv_Articles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BY AHMED BARGADY

In [1]:
# Scraping configuration: arXiv archive code -> total number of articles to
# cover for that archive. generate_arxiv_pages() uses each total as the upper
# bound of the `skip` offsets it generates. Key order matters: later cells
# rely on the insertion order of this mapping.
ARXIV_FIELDS = {
    "math": 955,      # Mathematics
    "stat": 229,      # Statistics
    "cs": 2248,       # Computer Science
    "eess": 352,      # Electrical Engineering and Systems Science
    "astro-ph": 376,  # Astrophysics
    "hep-ex": 69,     # High Energy Physics - Experiment
    "quant-ph": 266,  # Quantum Physics
    "q-bio": 76,      # Quantitative Biology
}

In [2]:
# Prefix of the paginated listing pages; a category code and query string are
# appended to it when the listing URLs are generated.
ARXIV_BASE_URL = "https://arxiv.org/list/"

# Prefix of a single article's abstract page; the article ID is appended.
ARXIV_ABS_URL = "https://arxiv.org/abs/"

In [3]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [4]:
def generate_arxiv_pages(fields, page_size=25):
    """Build the paginated arXiv "pastweek" listing URLs for each category.

    Args:
        fields: Mapping of arXiv category code (e.g. "cs") to the total
            number of articles to cover for that category.
        page_size: Number of articles requested per listing page. It is used
            both as the ``show`` query value and as the step between
            successive ``skip`` offsets. Defaults to 25, the value that was
            previously hard-coded.

    Returns:
        dict: category code -> list of listing-page URLs in ascending
        ``skip`` order. A category whose total is 0 maps to an empty list.

    Raises:
        ValueError: If ``page_size`` is not positive.
    """
    if page_size <= 0:
        raise ValueError(f"page_size must be positive, got {page_size!r}")

    return {
        category: [
            f"{ARXIV_BASE_URL}{category}/pastweek?skip={skip}&show={page_size}"
            for skip in range(0, total_articles, page_size)
        ]
        for category, total_articles in fields.items()
    }

In [5]:
# Build the listing-page URLs for every configured category
# (category code -> list of URLs).
arxiv_pages = generate_arxiv_pages(ARXIV_FIELDS)

In [6]:
# Report how many listing pages were generated per category, then overall.
page_counts = {category: len(urls) for category, urls in arxiv_pages.items()}
for category, length in page_counts.items():
    print(f"{category}:")
    print(length)
total_pages = sum(page_counts.values())
print("total pages: ", total_pages)

math:
39
stat:
10
cs:
90
eess:
15
astro-ph:
16
hep-ex:
3
quant-ph:
11
q-bio:
4
total pages:  188


In [7]:
def extract_ids(html_content):
    """Extract article IDs from the HTML of an arXiv listing page.

    For every ``<dt>`` found inside a ``<dl>``, the first ``<a>`` of the
    first ``<span>`` is inspected. If its ``title`` attribute equals
    "abstract" (case-insensitive), the last ``/``-separated segment of its
    ``href`` is taken as the article ID.

    Entries that lack the expected ``<span>``, ``<a>`` or ``href`` are
    skipped instead of raising, so a single malformed entry does not discard
    the IDs of the whole page.

    Args:
        html_content: Markup of a listing page, in any form BeautifulSoup
            accepts (the scraper passes the raw response body).

    Returns:
        list: Extracted IDs in document order; empty when none are found.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    extracted_ids = []
    for dl_tag in soup.find_all('dl'):
        for dt_tag in dl_tag.find_all('dt'):
            # find() returns None when the tag is absent; guard before
            # chaining another lookup on the result.
            span_tag = dt_tag.find('span')
            if span_tag is None:
                continue

            a_tag = span_tag.find('a')
            if a_tag is None:
                continue

            # Only the link whose title is exactly "Abstract" (any casing)
            # points at the article page.
            if a_tag.get('title', '').lower() != 'abstract':
                continue

            href_attr = a_tag.get('href')
            if not href_attr:
                continue

            # The ID is the final path segment of the href.
            extracted_ids.append(href_attr.split('/')[-1])

    return extracted_ids

In [8]:
def scrape_articles_ids(page_url, timeout=30):
    """Download one arXiv listing page and return the article IDs on it.

    Args:
        page_url: URL of the listing page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would freeze the whole scraping loop.

    Returns:
        The list produced by ``extract_ids`` on success, or ``None`` when the
        response status is not 200 or any exception occurs. Failures are
        printed, never raised.
    """
    try:
        response = requests.get(page_url, timeout=timeout)

        # Anything other than 200 is reported and treated as "no data".
        if response.status_code != 200:
            print(f"Failed to retrieve HTML content. Status code: {response.status_code}")
            return None

        return extract_ids(response.content)

    except Exception as e:
        # Best-effort scraping: report the problem and let the caller go on.
        print(f"An error occurred: {e}")
        return None

In [9]:
def get_articles_ids(pages_urls):
    """Collect the article IDs of every category from its listing pages.

    Args:
        pages_urls: Mapping of category code -> list of listing-page URLs.
            This argument is what gets iterated (previously the function
            ignored it and read the global ``arxiv_pages`` instead).

    Returns:
        dict: category code -> list of article IDs gathered from all pages
        of that category that could be scraped. Every category of the input
        is present in the result, with an empty list if nothing was
        retrieved.
    """
    articles_ids = {}
    for category, category_urls in pages_urls.items():
        ids = []
        for page_url in category_urls:
            page_ids = scrape_articles_ids(page_url)

            # scrape_articles_ids() signals failure with None; skip just that
            # page instead of aborting the remaining pages of the category.
            if page_ids is None:
                print(f"Skipping page with no IDs retrieved: {page_url}")
                continue

            ids.extend(page_ids)

        articles_ids[category] = ids
        print(f"{len(ids)} IDs for {category}")

    return articles_ids

In [10]:
# Scrape every listing page and gather the article IDs per category.
articles_ids = get_articles_ids(arxiv_pages)

955 IDs for math
229 IDs for stat
2248 IDs for cs
352 IDs for eess
376 IDs for astro-ph
69 IDs for hep-ex
266 IDs for quant-ph
76 IDs for q-bio


In [11]:
# Overall number of collected IDs across all categories.
total_ids = sum(len(category_ids) for category_ids in articles_ids.values())

print(f"total number of articles: {total_ids}")

total number of articles: 4571


In [12]:
def extract_summary(html_content):
    """Return the text held directly in the first <blockquote> of a page.

    Only the blockquote's own top-level text nodes are kept; text nested
    inside child tags is left out. The nodes are concatenated without a
    separator and surrounding whitespace is stripped.

    Args:
        html_content: Markup of an article page, in any form BeautifulSoup
            accepts.

    Returns:
        str: The stripped text of the first blockquote.

    Raises:
        IndexError: If the page contains no <blockquote> element.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Index 0 deliberately fails loudly when the page has no blockquote.
    first_blockquote = soup.find_all('blockquote')[0]

    # recursive=False limits the search to direct children, i.e. the text
    # that is not wrapped in a nested tag.
    direct_text_nodes = first_blockquote.find_all(string=True, recursive=False)

    return ''.join(direct_text_nodes).strip()

In [13]:
def scrape_article(id, timeout=30):
    """Download one arXiv abstract page and return its summary text.

    Args:
        id: arXiv article identifier; it is appended to ``ARXIV_ABS_URL`` to
            form the page URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection, which would freeze the whole scraping loop.

    Returns:
        The string produced by ``extract_summary`` on success, or ``None``
        when the response status is not 200 or any exception occurs
        (including a page from which no summary can be extracted). Failures
        are printed, never raised.
    """
    article_url = ARXIV_ABS_URL + id
    try:
        response = requests.get(article_url, timeout=timeout)

        # Anything other than 200 is reported and treated as "no data".
        if response.status_code != 200:
            print(f"Failed to retrieve HTML content. Status code: {response.status_code}")
            return None

        # Pull the abstract text out of the downloaded page.
        return extract_summary(response.content)

    except Exception as e:
        # Best-effort scraping: report the problem and let the caller go on.
        print(f"An error occurred: {e}")
        return None

In [14]:
def create_dfs(articles_ids):
    """Scrape every article and build one DataFrame per category.

    Args:
        articles_ids: Mapping of category code -> list of article IDs.

    Returns:
        list: One DataFrame per category, in the iteration order of
        ``articles_ids``. Each frame has an "id" column and a "content"
        column holding whatever ``scrape_article`` returned for that ID.
    """
    list_dfs = []
    for category, ids in articles_ids.items():
        # One record per article; the frame is built once from the full list.
        records = [
            {"id": article_id, "content": scrape_article(article_id)}
            for article_id in ids
        ]
        category_df = pd.DataFrame(records)
        list_dfs.append(category_df)
        print(f"done for {category} category with shape: {category_df.shape}")

    return list_dfs

In [15]:
# Scrape the abstract of every collected ID (one HTTP request per article)
# and build one DataFrame per category.
list_articles_dfs = create_dfs(articles_ids)

An error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
An error occurred: ('Connection aborted.', ConnectionResetError(104, 'Connection reset by peer'))
done for math category with shape: (955, 2)
done for stat category with shape: (229, 2)
done for cs category with shape: (2248, 2)
done for eess category with shape: (352, 2)
done for astro-ph category with shape: (376, 2)
done for hep-ex category with shape: (69, 2)
done for quant-ph category with shape: (266, 2)
done for q-bio category with shape: (76, 2)


In [16]:
# Preview the first rows of the first category's frame.
list_articles_dfs[0].head()

Unnamed: 0,id,content
0,2312.06659,We establish the convergence of the unified tw...
1,2312.06656,Using the notion of integral distance to analy...
2,2312.06651,"This paper is the first part of the series ""Sp..."
3,2312.0665,"This paper is the second part of the series ""S..."
4,2312.06649,This paper is the fourth and the last part of ...


In [17]:
# Preview the first rows of the fifth category's frame.
list_articles_dfs[4].head()

Unnamed: 0,id,content
0,2312.06599,Active regions (ARs) appear in the solar atmos...
1,2312.06586,This document details the first public data re...
2,2312.06559,The time evolution of primordial fluctuations ...
3,2312.06556,In core-collapse supernovae and neutron star m...
4,2312.06536,We present a comparison of the flux normalizat...


In [30]:
# Label every per-category frame with its category code.
# create_dfs() emits the frames in the iteration order of `articles_ids`, so
# pair the frames with those keys. Indexing into ARXIV_FIELDS by position
# would mislabel every later frame if a category were missing from
# `articles_ids`.
assert len(list_articles_dfs) == len(articles_ids), (
    "list_articles_dfs and articles_ids are out of sync; re-run the scraping cells"
)
for category, category_df in zip(articles_ids, list_articles_dfs):
    category_df['category'] = category


In [31]:
# Re-display the first frame to confirm the `category` column is present.
list_articles_dfs[0].head()

Unnamed: 0,id,content,category
0,2312.06659,We establish the convergence of the unified tw...,math
1,2312.06656,Using the notion of integral distance to analy...,math
2,2312.06651,"This paper is the first part of the series ""Sp...",math
3,2312.0665,"This paper is the second part of the series ""S...",math
4,2312.06649,This paper is the fourth and the last part of ...,math


In [34]:
# Confirm the `category` column on the second frame as well.
list_articles_dfs[1].head()

Unnamed: 0,id,content,category
0,2312.06605,Latent space models are powerful statistical t...,stat
1,2312.066,We propose a non-linear state-space model to e...,stat
2,2312.06591,Variational Autoencoders (VAEs) have been a pi...,stat
3,2312.06547,Partial Least-Squares (PLS) Regression is a wi...,stat
4,2312.06531,"Non-parametric machine learning models, such a...",stat


In [38]:
# Inspect column dtypes and non-null counts of the second frame.
list_articles_dfs[1].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 229 entries, 0 to 228
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        229 non-null    object
 1   content   229 non-null    object
 2   category  229 non-null    object
dtypes: object(3)
memory usage: 5.5+ KB


In [35]:
# Stack the per-category frames into a single dataset. ignore_index=True
# renumbers the rows consecutively instead of repeating each frame's own index.
final_df = pd.concat(list_articles_dfs, ignore_index=True)
print(final_df.shape)
final_df.head()

(4571, 3)


Unnamed: 0,id,content,category
0,2312.06659,We establish the convergence of the unified tw...,math
1,2312.06656,Using the notion of integral distance to analy...,math
2,2312.06651,"This paper is the first part of the series ""Sp...",math
3,2312.0665,"This paper is the second part of the series ""S...",math
4,2312.06649,This paper is the fourth and the last part of ...,math


In [36]:
# Check the last rows of the combined dataset.
final_df.tail()

Unnamed: 0,id,content,category
4566,2312.02187,The past decade has witnessed a dramatically g...,q-bio
4567,2312.02962,Horizontal gene transfer inference approaches ...,q-bio
4568,2312.02956,"Purpose: To develop Choroidalyzer, an open-sou...",q-bio
4569,2312.02953,Objective: This study aimed to explore the ass...,q-bio
4570,2312.02195,The integration of multi-omics data has emerge...,q-bio


In [37]:
# Persist the combined dataset; index=False leaves the row index out of the file.
final_df.to_csv('final_dataset.csv', index=False)