In [1]:
!pip install arxiv

import arxiv
import pandas as pd
import random

# Free-text search with no category or date restrictions.
# NOTE(review): 'all' is passed to arXiv as an ordinary search term, not as a
# wildcard, so this presumably returns papers matching the word "all" rather
# than every paper -- confirm this is the intended query.
query = 'all'
total_results = 1000

# Construct the default API client.
client = arxiv.Client()

search = arxiv.Search(query, max_results=total_results)
results = client.results(search)


def _paper_record(result):
    """Return the metadata fields kept for one arXiv search result as a dict."""
    return {
        # get_short_id() keeps the category prefix of old-style identifiers
        # (e.g. "quant-ph/0201082v1"); splitting entry_id on '/' dropped it
        # and left a bare, ambiguous number.
        'id': result.get_short_id(),
        'title': result.title,
        'abstract': result.summary,
        'published': result.published,
        'pdf_url': result.pdf_url,
    }


# Collect paper metadata, one record per result
papers = [_paper_record(result) for result in results]

# Convert to a DataFrame
papers_df = pd.DataFrame(papers)



In [2]:
!pip install pypdfium2

import requests
import pypdfium2 as pdfium
from io import BytesIO
from concurrent.futures import ProcessPoolExecutor, as_completed


# Draw a small, reproducible sample of papers to process.
random_papers_df = papers_df.sample(n=2, random_state=0)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only appended a stray "None" line.
random_papers_df.info()


# Download the PDF
def download_and_extract(pdf_url, timeout=60):
    """Download a PDF and return the text of all its pages, concatenated in order.

    Parameters
    ----------
    pdf_url : str
        URL of the PDF to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (passed to
        ``requests.get``). Defaults to 60.

    Returns
    -------
    str
        Text of every page joined together with no separator.

    Raises
    ------
    requests.exceptions.RequestException
        If the request times out, fails, or returns an HTTP error status.
    """
    response = requests.get(pdf_url, timeout=timeout)
    # Fail loudly on 4xx/5xx responses instead of handing an error page
    # (e.g. a rate-limit notice) to the PDF parser.
    response.raise_for_status()

    # Extract text using PyPDFium2
    pdf = pdfium.PdfDocument(BytesIO(response.content))
    try:
        page_texts = []
        for i in range(len(pdf)):
            page = pdf[i]
            textpage = page.get_textpage()
            try:
                page_texts.append(textpage.get_text_bounded())
            finally:
                # Release child handles before their parent document.
                textpage.close()
                page.close()
        return ''.join(page_texts)
    finally:
        pdf.close()

# Seconds to pause before each throttled download.
delay = 15


def limit_download(pdf_url):
    """Wait ``delay`` seconds, then download ``pdf_url`` and return its extracted text.

    Parameters
    ----------
    pdf_url : str
        URL of the PDF to fetch.

    Returns
    -------
    The value returned by ``download_and_extract(pdf_url)``.
    """
    # Imported locally because no live cell visible in this notebook imports
    # `time` (the only `import time` sits inside the disabled code string).
    import time

    time.sleep(delay)
    return download_and_extract(pdf_url)
    
# Fetch each sampled paper's PDF and store its extracted text in a new column.
# NOTE(review): this applies download_and_extract directly, so the `delay`
# pause implemented by limit_download is not used here -- confirm that is intended.
pdf_urls = random_papers_df['pdf_url']
random_papers_df['full_text'] = pdf_urls.apply(download_and_extract)

# The code below downloads the PDFs as fast as possible, in parallel.
# This violates arXiv's terms of service and will result in your IP being temporarily restricted,
# so it is kept only as a disabled reference inside a string literal and never runs.
"""
def download_and_extract_text(pdf_url):
    try:
        # Download the PDF
        response = requests.get(pdf_url)
        pdf_file = BytesIO(response.content)

        # Extract text using PyPDFium2
        text = ''
        pdf = pdfium.PdfDocument(pdf_file)
        for i in range(len(pdf)):
            page = pdf[i]
            text += page.get_textpage().get_text_bounded()

        return text
    
    except Exception as e:
        print(f"Error processing {pdf_url}: {e}")
        return None


# Use ThreadPoolExecutor for Parallel Processing
def process_papers_parallel(papers_df, max_workers=10):
    results = {}
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Submit tasks
        future_to_id = {
            executor.submit(download_and_extract_text, row['pdf_url']): row['id']
            for _, row in papers_df.iterrows()
        }
        
        # Collect results as they complete
        for future in as_completed(future_to_id):
            paper_id = future_to_id[future]
            try:
                results[paper_id] = future.result()
            except Exception as e:
                print(f"Error processing {paper_id}: {e}")
                results[paper_id] = None
    return results


import time
start = time.time()


# Process papers in parallel
text_results = process_papers_parallel(random_papers_df, max_workers=20)  # Adjust `max_workers` as needed

# Add full text to the DataFrame
random_papers_df['full_text'] = random_papers_df['id'].map(text_results)


end = time.time()
print(end - start)
"""



# Preview the id, abstract and extracted text of the first few sampled papers
preview_columns = ['id', 'abstract', 'full_text']
full_text_preview = random_papers_df[preview_columns].head()
print(full_text_preview)

<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, 993 to 859
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype              
---  ------     --------------  -----              
 0   id         2 non-null      object             
 1   title      2 non-null      object             
 2   abstract   2 non-null      object             
 3   published  2 non-null      datetime64[ns, UTC]
 4   pdf_url    2 non-null      object             
dtypes: datetime64[ns, UTC](1), object(4)
memory usage: 96.0+ bytes
None
               id                                           abstract  \
993  2102.07948v4  A Kempe swap in a proper coloring interchanges...   
859   0807.4753v1  For all p > 1, we demonstrate the existence of...   

                                             full_text  
993  arXiv:2102.07948v4 [math.CO] 10 Mar 2022\r\nIn...  
859  arXiv:0807.4753v1 [quant-ph] 30 Jul 2008\r\nCo...  


In [3]:
import re

# An "Abstract" label, matched case-insensitively: either the word plus the
# rest of its line, or a bare "Abstract" / "Abstract:".
_ABSTRACT_HEADING_RE = re.compile(
    r'\babstract\b.*?\n'
    r'|\babstract\b:?',
    re.IGNORECASE,
)

# A line break followed by something that looks like the first section
# heading: "Introduction", "1 Introduction", or a Roman-numeral "I.".
_INTRO_HEADING_RE = re.compile(r'\n\s*(?:\d*\s*Introduction|\bI\b\.|I\.)')


def remove_abstract(text):
    """Drop the front matter (title, authors, abstract) from a paper's text.

    Everything before the first Introduction-style heading is discarded. Only
    when no such heading is found is a single "Abstract" label removed and the
    heading search retried, which is the original fallback behaviour.

    Trimming at the Introduction first matters: removing the first "abstract"
    match up front could delete an ordinary use of the word (plus the rest of
    its line) from the body of a paper whose abstract carries no label, and
    could swallow the line break the Introduction search relies on.

    Parameters
    ----------
    text : str
        Full extracted text of a paper.

    Returns
    -------
    str
        The text from the first detected section heading onward, stripped of
        surrounding whitespace. If no heading is detected, the text with one
        "Abstract" label removed. ``''`` when ``text`` is not a string
        (e.g. ``None`` from a failed extraction).
    """
    if not isinstance(text, str):
        return ''

    intro = _INTRO_HEADING_RE.search(text)
    if intro is None:
        # No heading found on the raw text: strip one "Abstract" label and
        # look again, since the removal can expose a heading.
        text = _ABSTRACT_HEADING_RE.sub('', text, count=1)
        intro = _INTRO_HEADING_RE.search(text)

    if intro is not None:
        text = text[intro.start():]

    return text.strip()

In [4]:
# Replace each paper's extracted text with the version returned by remove_abstract
trimmed_text = random_papers_df["full_text"].apply(remove_abstract)
random_papers_df["full_text"] = trimmed_text


# Dump every paper's id and remaining text so problems can be spotted by eye
separator = "-" * 80
for _, paper in random_papers_df.iterrows():
    print(f"Paper ID: {paper['id']}")
    print(f"Full Text:\n{paper['full_text']}\n")
    print(separator)

Paper ID: 2102.07948v4
Full Text:
1 Introduction
Given a k-coloring ϕ of a graph G and colors α, β ∈ {1, . . . , k}, a Kempe swap Kempe
swap
recolors a component of
the subgraph induced by colors α and β, interchanging those colors on that component, which is called a
Kempe component Kempe
component . Two k-colorings, ϕ1 and ϕ2 of a graph G are (Kempe) k-equivalent
kequivalent
, if there exists a
sequence of k-colorings of G, beginning with ϕ1 and ending with ϕ2, such that each two successive colorings
differ by only a single Kempe swap. Problems regarding the k-equivalence of k-colorings often go by the
name reconfiguration. We form an auxilliary graph H which has as its vertices all k-colorings of G, and two
vertices of H are adjacent if the corresponding colorings differ by only a single Kempe swap. Previous work
has focused on deciding whether H is connected [13, 10, 2, 5, 6] and (when H is connected) on determining
or bounding the diameter of H [3, 4, 8, 9].
The k-state Pott’s mo

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

def tfidf_summarization(text, num_sentences = 5):
    """Build an extractive summary from the highest-scoring sentences of ``text``.

    Each sentence is scored by the mean of its TF-IDF row, with the vectorizer
    fitted on the sentences of ``text`` itself. The ``num_sentences`` best
    sentences are joined with single spaces, highest score first.

    Returns a fixed placeholder message when ``text`` is empty or whitespace
    only, and returns ``text`` unchanged when it contains no more than
    ``num_sentences`` sentences.
    """
    if not text or not text.strip():
        return "No content available for summarization."

    # Split into sentences; short inputs are returned as they are
    sentence_list = sent_tokenize(text)
    if len(sentence_list) <= num_sentences:
        return text

    # One TF-IDF row per sentence, reduced to a single mean score each
    tfidf_matrix = TfidfVectorizer().fit_transform(sentence_list)
    mean_scores = tfidf_matrix.mean(axis=1).A1

    # Indices of the best sentences, ordered from highest to lowest score
    top_indices = mean_scores.argsort()[-num_sentences:][::-1]
    return " ".join(sentence_list[idx] for idx in top_indices)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vinay\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
# Display options: show column contents untruncated, and at most 50 rows
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 50)

# Summarize every paper's text with five sentences and preview the result
random_papers_df["summary"] = random_papers_df["full_text"].apply(tfidf_summarization, num_sentences=5)
summary_preview = random_papers_df[["id", "abstract", "summary"]].head()
print(summary_preview)


               id  \
993  2102.07948v4   
859   0807.4753v1   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [7]:
!pip install rouge_score
from rouge_score import rouge_scorer

def evaluate_summary(reference_summary, generated_summary):
    """Score ``generated_summary`` against ``reference_summary`` with ROUGE.

    ROUGE-1, ROUGE-2 and ROUGE-L are computed with stemming enabled.
    Returns the mapping produced by ``RougeScorer.score``, keyed by metric name.
    """
    rouge_metrics = ['rouge1', 'rouge2', 'rougeL']
    # Stemming lets inflected forms of the same word count as matches
    scorer = rouge_scorer.RougeScorer(rouge_metrics, use_stemmer=True)
    return scorer.score(reference_summary, generated_summary)

# ROUGE evaluation: compare the first sampled paper's abstract (reference)
# with its generated summary
sample_paper = random_papers_df.iloc[0]
rouge_scores = evaluate_summary(
    reference_summary=sample_paper["abstract"],
    generated_summary=sample_paper["summary"],
)
print(rouge_scores)

{'rouge1': Score(precision=0.23778501628664495, recall=0.437125748502994, fmeasure=0.30801687763713076), 'rouge2': Score(precision=0.058823529411764705, recall=0.10843373493975904, fmeasure=0.07627118644067797), 'rougeL': Score(precision=0.1270358306188925, recall=0.23353293413173654, fmeasure=0.16455696202531644)}


In [None]:
# Write the sampled papers, including their text and summaries, to a CSV file
random_papers_df.to_csv(path_or_buf="results.csv")