In [None]:
!pip install ollama pandas requests fpdf

In [None]:
import ollama

In [None]:
# Pull the "llama3.2" model through the ollama client; summarise_paper() requests this same model name.
ollama.pull("llama3.2")

In [None]:
def summarise_paper(text):
    """Ask the llama3.2 model, through ``ollama.chat``, to summarise an abstract.

    Args:
        text: The abstract to summarise; it is interpolated verbatim into the prompt.

    Returns:
        The ``content`` field of the ``message`` in the chat reply.
    """
    prompt = f'Summarize the following abstract, return me just the summary and nothing else: {text}'
    user_message = {'role': 'user', 'content': prompt}
    # A single, non-streaming chat call: the complete reply comes back as one mapping.
    reply = ollama.chat(model='llama3.2', messages=[user_message])
    return reply['message']['content']

In [None]:
import requests
import xml.etree.ElementTree as ET
from datetime import datetime
import pandas as pd
# from docling.document_converter import DocumentConverter

def search_arxiv_to_dataframe_with_text_optimized(topic, start_date, end_date):
    """Search arXiv for a phrase and return the matching papers' metadata.

    Requests the 100 most recently submitted entries matching the exact phrase
    ``topic`` from the arXiv export API, then keeps the entries whose
    ``published`` date falls inside ``[start_date, end_date]`` (inclusive).
    Only metadata is collected; the paper text itself is not downloaded.

    Args:
        topic (str): Phrase to search for across all arXiv fields.
        start_date (str): First accepted publication day, formatted "YYYY-MM-DD".
        end_date (str): Last accepted publication day, formatted "YYYY-MM-DD".

    Returns:
        pd.DataFrame: One row per paper with the columns "Title", "Authors",
        "Abstract", "Published Date" and "Link". When nothing matches, or the
        API answers with a non-200 status, the frame is empty but still has
        these columns so that downstream column access keeps working.

    Raises:
        ValueError: If ``start_date`` or ``end_date`` is not "YYYY-MM-DD".
            The dates are checked before any network traffic happens.
    """
    atom = '{http://www.w3.org/2005/Atom}'
    columns = ["Title", "Authors", "Abstract", "Published Date", "Link"]

    # Convert date strings to datetime objects up front: a malformed date should
    # fail immediately instead of after a network round trip.
    start_date_dt = datetime.strptime(start_date, "%Y-%m-%d")
    end_date_dt = datetime.strptime(end_date, "%Y-%m-%d")

    url = "https://export.arxiv.org/api/query"
    params = {
        # Use literal quotes here. requests percent-encodes parameter values on
        # its own, so a pre-encoded "%22" would be encoded a second time and
        # reach arXiv as the text "%22" instead of a quotation mark.
        "search_query": f'all:"{topic}"',
        "start": 0,
        "max_results": 100,  # Fetch a reasonable number of results
        "sortBy": "submittedDate",
        "sortOrder": "descending",
    }

    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, params=params, timeout=30)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return pd.DataFrame(columns=columns)

    root = ET.fromstring(response.content)

    data = []
    for entry in root.findall(f'{atom}entry'):
        publication_date = entry.findtext(f'{atom}published', default='')
        if not publication_date:
            # An entry without a publication date cannot be range-checked.
            continue
        publication_date_dt = datetime.strptime(publication_date.split("T")[0], "%Y-%m-%d")

        # Check if the publication date is within the specified range
        if not (start_date_dt <= publication_date_dt <= end_date_dt):
            continue

        # Long titles arrive wrapped over several lines; collapse the whitespace
        # so the title stays on a single line in later exports.
        title = " ".join(entry.findtext(f'{atom}title', default='').split())
        authors = ', '.join(
            author.findtext(f'{atom}name', default='')
            for author in entry.findall(f'{atom}author')
        )
        abstract = entry.findtext(f'{atom}summary', default='').strip()
        # Convert the abstract-page URL to the PDF link. Only the "/abs/" path
        # segment is rewritten, never an "abs" substring elsewhere in the URL.
        link = entry.findtext(f'{atom}id', default='').replace("/abs/", "/pdf/", 1) + ".pdf"

        data.append({
            "Title": title,
            "Authors": authors,
            "Abstract": abstract,
            "Published Date": publication_date,
            "Link": link,
        })

    if not data:
        print("No papers found for the given date range.")

    return pd.DataFrame(data, columns=columns)



In [None]:
from tqdm import tqdm

def add_summary_column(df):
    """
    Apply the summarization function to the 'Abstract' column and create a new column 'Summary'.

    Args:
        df (pd.DataFrame): Papers to summarise. The frame is modified in place.

    Returns:
        pd.DataFrame: The same ``df`` object, now carrying a 'Summary' column.

    Raises:
        KeyError: If ``df`` has rows but no 'Abstract' column.
    """
    # A frame with no rows (e.g. a search that matched nothing) may not even
    # have an 'Abstract' column. Give it an empty 'Summary' column instead of
    # failing with a KeyError, so callers can treat "no papers" as a normal case.
    if len(df) == 0:
        df['Summary'] = []
        return df

    # tqdm.pandas() is what makes Series.progress_apply available.
    tqdm.pandas(desc="Summarizing Papers")
    df['Summary'] = df['Abstract'].progress_apply(summarise_paper)
    return df

# Assuming `df` is your DataFrame with the 'Abstract' column
# NOTE(review): `df` is not assigned by any visible top-level cell. Presumably it is meant to be the
# frame returned by search_arxiv_to_dataframe_with_text_optimized(...) — confirm; as written this
# cell raises NameError on a fresh kernel.
df_with_summary = add_summary_column(df)

# Display the updated DataFrame
print(df_with_summary)

In [None]:
from fpdf import FPDF
import os

class PDF(FPDF):
    """FPDF document with a fixed report title as header and a page number as footer."""

    def header(self):
        """Draw the centred, bold report title and leave a 10-unit gap below it."""
        self.set_font('Arial', style='B', size=12)
        self.cell(0, 10, 'Research Paper Summaries', border=False, ln=True, align='C')
        self.ln(10)

    def footer(self):
        """Draw the centred, italic page number after moving the cursor with ``set_y(-15)``."""
        page_label = f'Page {self.page_no()}'
        self.set_y(-15)
        self.set_font('Arial', style='I', size=8)
        self.cell(0, 10, page_label, align='C')

def generate_combined_markdown_and_pdf(df, output_dir="output", output_name="all_papers"):
    """
    Generate a single Markdown and PDF file containing all papers.
    
    Args:
        df (pd.DataFrame): DataFrame containing Title, Authors, Published Date, and Summary.
        output_dir (str): Directory to save the Markdown and PDF files.
        output_name (str): Base name for the output files (without extension).
    
    Returns:
        tuple: Paths to the combined Markdown and PDF files.
    """
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)
    
    markdown_file = os.path.join(output_dir, f"{output_name}.md")
    pdf_file = os.path.join(output_dir, f"{output_name}.pdf")
    
    # Generate Markdown
    with open(markdown_file, "w", encoding="utf-8") as md_file:
        md_file.write("# Research Paper Summaries\n\n")
        
        for _, row in df.iterrows():
            title = row['Title']
            authors = row['Authors']
            publication_date = row['Published Date']
            summary = row['Summary']
            
            # Add paper details to Markdown
            md_file.write(f"## {title}\n\n")
            md_file.write(f"**Authors**: {authors}\n\n")
            md_file.write(f"**Publication Date**: {publication_date}\n\n")
            md_file.write("### Summary\n")
            md_file.write(f"{summary}\n\n")
            md_file.write("---\n\n")
    
    # Generate PDF
    pdf = PDF()
    pdf.add_page()
    pdf.set_font('Arial', size=12)
    pdf.cell(0, 10, "Research Paper Summaries", ln=True, align='C')
    pdf.ln(10)
    
    for _, row in df.iterrows():
        title = row['Title']
        authors = row['Authors']
        publication_date = row['Published Date']
        summary = row['Summary']
        
        # Add paper details to PDF
        pdf.set_font('Arial', 'B', size=12)
        pdf.multi_cell(0, 10, f"Title: {title}\n")
        pdf.set_font('Arial', size=10)
        pdf.multi_cell(0, 10, f"Authors: {authors}\n")
        pdf.multi_cell(0, 10, f"Publication Date: {publication_date}\n")
        pdf.set_font('Arial', size=10)
        pdf.multi_cell(0, 10, "Summary:\n", align="L")
        pdf.set_font('Arial', size=10)
        pdf.multi_cell(0, 10, f"{summary}\n")
        pdf.ln(5)
        pdf.set_font('Arial', size=8)
        pdf.cell(0, 5, "-" * 100, ln=True)  # Add a horizontal line
        pdf.ln(5)
    
    pdf.output(pdf_file)
    
    return markdown_file, pdf_file


In [None]:
# Import necessary libraries
import gradio as gr
import pandas as pd
from datetime import datetime

# Define a function to fetch and summarize arXiv papers
def fetch_and_summarize(topic, start_date, end_date, export_type):
    """Fetch arXiv papers, summarise them, export them, and pick the file to download.

    Args:
        topic (str): Search phrase forwarded to the arXiv search.
        start_date (str): Start of the publication date range ("YYYY-MM-DD").
        end_date (str): End of the publication date range ("YYYY-MM-DD").
        export_type (str): Either 'PDF' or 'Markdown'; selects which of the two
            generated files is returned.

    Returns:
        tuple: ``(table, file_path)`` where ``table`` is a DataFrame with the
        columns 'Title', 'Authors' and 'Published Date', and ``file_path`` is
        the path of the generated file for the requested export type.

    Raises:
        ValueError: If ``export_type`` is neither 'PDF' nor 'Markdown'
            (for example ``None`` when no option was selected).
    """
    # Check the export type before the slow fetch-and-summarise work, so a
    # missing selection fails immediately with a readable message.
    if export_type not in ('PDF', 'Markdown'):
        raise ValueError("Export Type must be 'PDF' or 'Markdown'.")

    df = search_arxiv_to_dataframe_with_text_optimized(topic, start_date, end_date)
    df_with_summary = add_summary_column(df)
    markdown_path, pdf_path = generate_combined_markdown_and_pdf(df_with_summary)

    # Use the paths reported by the generator rather than repeating them here.
    file_path = pdf_path if export_type == 'PDF' else markdown_path

    # reindex() instead of [[...]]: an empty result without these columns still
    # produces an (empty) three-column table rather than a KeyError.
    table = df_with_summary.reindex(columns=['Title', 'Authors', 'Published Date'])
    return table, file_path

# Create Gradio interface
with gr.Blocks(theme=gr.themes.Glass()) as demo:
    # Input components: their values are passed, in this order, as the four
    # arguments of fetch_and_summarize (see `inputs=` below).
    topic = gr.Textbox(label="Topic")
    start_date = gr.Textbox(label="Start Date (YYYY-MM-DD) format")
    end_date = gr.Textbox(label="End Date (YYYY-MM-DD) format")
    # NOTE(review): no `value=` is given, so presumably nothing is pre-selected —
    # confirm what fetch_and_summarize receives when the user picks neither option.
    export_type = gr.Radio(choices=['PDF', 'Markdown'], label="Export Type")
    
    # Output components: the table receives the first value returned by
    # fetch_and_summarize and the file widget the second (see `outputs=` below).
    output_table = gr.Dataframe(label="Summarized Papers")
    output_file = gr.File(label="Download Summary")
    
    # Define the event listener: clicking the button runs the whole
    # fetch -> summarise -> export pipeline.
    fetch_button = gr.Button("Fetch and Summarize")
    fetch_button.click(fetch_and_summarize, inputs=[topic, start_date, end_date, export_type], outputs=[output_table, output_file])

# Launch the Gradio app
# NOTE(review): share=True requests a public URL (the recorded output shows a
# "Running on public URL" line) — confirm that exposing this app publicly is intended.
if __name__ == "__main__":
    demo.launch(show_error=True,debug=True,share=True)

* Running on local URL:  http://127.0.0.1:7863
* Running on public URL: https://97ac89f7fc0f4ef6fe.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
