<a href="https://colab.research.google.com/github/AbrarAlotaibi/WiDS-KFUPM-Workshop-2025/blob/main/arXiv_%2B_OpenAI_Paper_Filtering_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# arXiv + OpenAI Paper Filtering System


By: Abrar Alotaibi

This notebook demonstrates how to combine the arXiv API with OpenAI's language models to create an intelligent paper filtering system for research.

The system works in 3 main steps:
 1. Query arXiv API to get papers based on keyword search
 2. Use OpenAI to analyze each paper's relevance to your specific research interest
 3. Filter, rank, and visualize the most relevant papers

This approach is much more powerful than simple keyword filtering because it
uses AI to understand the semantic relevance of papers to your research.

# Install necessary packages and import libraries

In [None]:
# First, we need to install the required packages if they aren't already available

# Uncomment and run these lines if you need to install any packages
#!pip install openai requests pandas matplotlib tqdm
#!pip install ipywidgets  # For interactive UI
#!pip install wordcloud   # Optional: for creating word clouds of key concepts

# Now import all the libraries we'll need
import requests                      # For making HTTP requests to arXiv API
import xml.etree.ElementTree as ET   # For parsing XML responses from arXiv
import openai                        # For accessing OpenAI models
import json                          # For handling JSON data
import time                          # For adding delays between API calls
import pandas as pd                  # For data manipulation and analysis
from IPython.display import display, HTML, Markdown  # For rich output in Colab
import matplotlib.pyplot as plt      # For data visualization
from datetime import datetime        # For date formatting
import re                            # For regular expressions
import warnings                      # To suppress unnecessary warnings
warnings.filterwarnings('ignore')    # Keep output clean

# Import tqdm for progress bars
from tqdm.notebook import tqdm

print("Libraries imported successfully!")

# Set up your OpenAI API key

In [None]:
# This cell sets up your OpenAI API key, which is required to use their models.
# You need to replace the placeholder with your actual API key.
# To get your key, go to https://platform.openai.com/settings/profile/user

# Preferred: put your key in the OPENAI_API_KEY environment variable so it is
# never saved inside the notebook (notebooks are easy to share by accident).
# Fallback: replace the placeholder string below with your actual OpenAI API key.
import os

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or "your-openai-api-key-here"

# Set the API key for the OpenAI library
openai.api_key = OPENAI_API_KEY

# Verify API key is set (this doesn't check if it's valid, just that it's set)
if openai.api_key and openai.api_key != "your-openai-api-key-here":
    print("✅ API key configured.")
else:
    print("⚠️ Please replace the placeholder with your actual OpenAI API key.")

# Function to query the arXiv API

In [None]:
# The arXiv API allows us to programmatically search for academic papers.
# This function queries the API and parses the XML response into a list of paper objects.
#
# Key concepts:
# - arXiv API query structure
# - XML parsing with ElementTree
# - Handling namespaces in XML

def fetch_arxiv_papers(query, max_results=50):
    """
    Query the arXiv API for papers based on search terms

    Parameters:
    -----------
    query : str
        Search query (e.g., "machine learning"). It is URL-encoded before
        being sent, so characters such as '&', '#' or quotes are safe to use.
    max_results : int
        Maximum number of results to return (default: 50)

    Returns:
    --------
    list
        List of dictionaries, one per paper, with the keys 'title',
        'authors', 'summary', 'published', 'updated', 'link' and
        'categories'. An empty list is returned when no entry matches.

    Raises:
    -------
    Exception
        Any error raised while requesting or parsing the feed (network
        failure, timeout, non-2xx HTTP status, malformed XML) is printed
        and then re-raised unchanged.
    """
    # Local import so this cell does not depend on an extra top-level import.
    from urllib.parse import quote_plus

    # arXiv API uses the Atom XML format which requires namespace handling
    namespace = {'atom': 'http://www.w3.org/2005/Atom'}

    def _text(element, tag):
        """Return the text of child `tag` with whitespace collapsed ('' if absent)."""
        value = element.findtext(f'atom:{tag}', default='', namespaces=namespace)
        # split()/join() removes the line breaks and indentation that wrapped
        # titles and abstracts carry in the feed.
        return ' '.join((value or '').split())

    try:
        # Step 1: URL-encode the query. Spaces become '+'; an existing '+' or
        # ':' is kept as-is so pre-formatted queries behave as before.
        formatted_query = quote_plus(query, safe='+:')

        # Step 2: Construct the API URL
        arxiv_url = (
            "http://export.arxiv.org/api/query"
            f"?search_query=all:{formatted_query}&start=0&max_results={max_results}"
        )

        print(f"Fetching papers from arXiv with query: {query}")

        # Step 3: Make the HTTP request. The timeout stops the notebook from
        # hanging forever, and raise_for_status() surfaces HTTP errors instead
        # of trying to parse an error page as XML.
        response = requests.get(arxiv_url, timeout=30)
        response.raise_for_status()

        # Step 4: Parse the raw bytes so ElementTree applies the encoding
        # declared by the XML document itself.
        root = ET.fromstring(response.content)

        # Step 5: Extract all entries (papers) from the response
        entries = root.findall('atom:entry', namespace)

        if not entries:
            print("No papers found matching the query")
            return []

        # Step 6: Turn each entry into a plain dictionary
        papers = []
        for entry in entries:
            papers.append({
                'title': _text(entry, 'title'),
                # An entry may list several authors
                'authors': [
                    _text(author, 'name')
                    for author in entry.findall('atom:author', namespace)
                ],
                'summary': _text(entry, 'summary'),
                'published': _text(entry, 'published'),
                'updated': _text(entry, 'updated'),
                'link': _text(entry, 'id'),
                # Categories (subject areas) are stored in the 'term' attribute
                'categories': [
                    category.get('term')
                    for category in entry.findall('atom:category', namespace)
                ],
            })

        print(f"Successfully retrieved {len(papers)} papers.")
        return papers

    except Exception as e:
        print(f"Error fetching papers from arXiv: {e}")
        raise

# Test the function (optional)
# You can uncomment this to test just the arXiv query function
#test_papers = fetch_arxiv_papers("quantum computing", max_results=5)
#print(f"Example paper title: {test_papers[0]['title']}")

# Function to analyze paper relevance using OpenAI

In [None]:
# This function uses OpenAI's language models to analyze the relevance of each paper
# to the specific research interest provided by the user.
#
# Key concepts:
# - Prompt engineering for effective results
# - Using OpenAI's chat completions API
# - JSON response formatting
# - Error handling for API calls

def analyze_paper_relevance(paper, research_interest):
    """
    Analyze paper relevance using OpenAI's language models

    Parameters:
    -----------
    paper : dict
        Paper object; the keys 'title', 'summary' and 'categories' are read.
        The dictionary is updated in place.
    research_interest : str
        User's specific research interest

    Returns:
    --------
    dict
        The same paper object with three keys added:
        'relevanceScore' (number clamped to 0-100),
        'relevanceExplanation' (str) and 'keyMatchingConcepts' (list of str).
        If anything fails (missing paper key, API error, unparsable or
        unusable response) the error is printed and the keys are set to
        0, "Error occurred during analysis" and [] instead of raising.
    """
    try:
        # Step 1: Construct a detailed prompt for OpenAI
        # This prompt is crucial for getting good results - it needs to:
        # - Provide sufficient context from the paper
        # - Clearly state the research interest
        # - Specify exactly what information we want in return
        prompt = f"""
              Paper Title: {paper['title']}
              Paper Abstract: {paper['summary']}
              Paper Categories: {', '.join(paper['categories'])}

              Research Interest: {research_interest}

              Task: Evaluate the relevance of this paper to the research interest.
              1) Provide a relevance score from 0-100 where 0 is completely irrelevant and 100 is extremely relevant
              2) Explain in 2-3 sentences why this paper is or isn't relevant to the research interest
              3) Extract key concepts from the paper that match the research interest

              Format your response as JSON:
              {{
                "relevanceScore": [number between 0-100],
                "explanation": [explanation text],
                "keyMatchingConcepts": [array of key concepts]
              }}
"""

        # Step 2: Call the OpenAI API with our prompt
        # We use the chat completions API with specific parameters:
        # - model: Which model to use (gpt-4o-mini is powerful but you could use others)
        # - system message: Sets the context and role for the AI
        # - user message: Our detailed prompt
        # - temperature: Lower values make output more deterministic/consistent
        # - response_format: Request JSON formatting for easier parsing
        response = openai.chat.completions.create(
            model="gpt-4o-mini",  # You can change this to other models like "gpt-3.5-turbo" if needed
            messages=[
                {"role": "system", "content": "You are a research assistant specialized in analyzing scientific papers and determining their relevance to specific research interests. Be precise in your evaluations."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.1,  # Low temperature for more consistent outputs
            response_format={"type": "json_object"}  # Request JSON formatted response
        )

        # Step 3: Parse the response
        analysis = json.loads(response.choices[0].message.content)

        # Step 4: Normalize the model output before storing it.
        # The model may return the score as a string ("85") or outside 0-100;
        # later filtering and sorting compare scores numerically, so coerce
        # and clamp here. A score that cannot be converted falls through to
        # the error handler below.
        score = min(100.0, max(0.0, float(analysis['relevanceScore'])))
        if score.is_integer():
            score = int(score)  # keeps "85%" instead of "85.0%" when displayed

        # The concepts may come back as a single string rather than an array;
        # wrap it so that joining does not split it into characters.
        concepts = analysis.get('keyMatchingConcepts') or []
        if not isinstance(concepts, (list, tuple)):
            concepts = [concepts]

        # Only touch the paper once every value is known to be usable
        paper['relevanceScore'] = score
        paper['relevanceExplanation'] = str(analysis.get('explanation') or '')
        paper['keyMatchingConcepts'] = [str(concept) for concept in concepts]

        return paper

    except Exception as e:
        # Deliberate best-effort: one failing paper must not stop the batch
        print(f"Error analyzing paper: {paper['title']}")
        print(f"Error details: {e}")

        # If an error occurs, return the paper with default values
        paper['relevanceScore'] = 0
        paper['relevanceExplanation'] = "Error occurred during analysis"
        paper['keyMatchingConcepts'] = []
        return paper

# Test the function (optional)
# You can uncomment this to test just the OpenAI analysis function
# Note: This requires both the fetch_arxiv_papers function and a valid API key
#test_paper = fetch_arxiv_papers("reinforcement learning", max_results=1)[0]
#test_interest = "Applications of reinforcement learning in robotic control"
#analyzed_paper = analyze_paper_relevance(test_paper, test_interest)
#print(f"Relevance score: {analyzed_paper['relevanceScore']}")
#print(f"Explanation: {analyzed_paper['relevanceExplanation']}")

# Main function to search, analyze, and filter papers

In [None]:
# This function combines our previous functions to:
# 1. Search arXiv for papers
# 2. Analyze each paper with OpenAI
# 3. Filter papers based on relevance
# 4. Sort the results
#
# Key concepts:
# - Progress tracking with tqdm
# - Rate limiting to avoid API throttling
# - Sorting and filtering data

def find_relevant_papers(initial_query, research_interest, relevance_threshold=50, max_results=50):
    """
    Main function to search and filter papers

    Parameters:
    -----------
    initial_query : str
        Initial arXiv search query (broader terms)
    research_interest : str
        Detailed research interest for filtering (more specific)
    relevance_threshold : int
        Minimum relevance score (0-100) for papers to be included
    max_results : int
        Maximum number of papers to request from arXiv (default: 50).
        Each returned paper costs one OpenAI call, so lower this for a
        quicker or cheaper run.

    Returns:
    --------
    list
        Papers whose 'relevanceScore' is at least `relevance_threshold`,
        sorted from highest to lowest score. Empty when the arXiv search
        returns nothing.

    Raises:
    -------
    Exception
        Any error raised by the search or filtering steps is printed and
        then re-raised unchanged.
    """
    try:
        # Step a: Initial search - fetch papers from arXiv
        papers = fetch_arxiv_papers(initial_query, max_results=max_results)
        print(f"Found {len(papers)} papers from initial search.")

        if not papers:
            return []

        # Step b: Analyze the papers one by one with OpenAI.
        # The progress bar advances once per paper.
        print("Analyzing papers with OpenAI (this may take a few minutes)...")
        analyzed_papers = []
        for position, paper in enumerate(tqdm(papers)):
            if position:
                # Small pause between consecutive calls to avoid rate limits
                # (no pause is needed before the first call)
                time.sleep(0.5)
            analyzed_papers.append(analyze_paper_relevance(paper, research_interest))

        # Step c: Keep only papers that reach the threshold
        filtered_papers = [
            paper for paper in analyzed_papers
            if paper['relevanceScore'] >= relevance_threshold
        ]

        # Step d: Sort papers by relevance score (highest first)
        sorted_papers = sorted(
            filtered_papers,
            key=lambda paper: paper['relevanceScore'],
            reverse=True,
        )

        print(f"Filtering complete. Found {len(sorted_papers)} papers with relevance score ≥ {relevance_threshold}.")
        return sorted_papers

    except Exception as e:
        print(f"Error in find_relevant_papers: {e}")
        raise

# Run the full search pipeline end to end (optional)
# NOTE: this call is live, not commented out. Running the cell queries arXiv
# and analyzes every returned paper with OpenAI, which uses API credits.
# Comment the lines below out if you only want to define the functions.

results = find_relevant_papers(
    "reinforcement learning robotics",
    research_interest="Applications of reinforcement learning in robotic manipulation with sparse rewards",
    relevance_threshold=70,
)
print(f"Found {len(results)} highly relevant papers")


# Functions to visualize results

In [None]:
# These functions create visualizations and formatted displays of our results.
# Good visualizations are essential for interpreting the results of our analysis.
#
# Key concepts:
# - Data visualization with matplotlib
# - Formatted HTML output in notebooks
# - Optional word cloud for concept visualization
# - DataFrame creation for further analysis

# First function: Create data visualizations
def visualize_results(papers):
    """
    Create visualizations of the paper results

    Draws, in order: a histogram of relevance scores, a horizontal bar
    chart of the (up to) 10 highest-scoring papers, and - when the optional
    `wordcloud` package is installed and there are concepts to show - a
    word cloud of the key matching concepts.

    Parameters:
    -----------
    papers : list
        List of paper objects; the keys 'title', 'relevanceScore' and
        'keyMatchingConcepts' are read. If the list is empty a
        "No relevant papers found." note is displayed and nothing is drawn.
    """
    if not papers:
        display(Markdown("## No relevant papers found."))
        return

    # Create a DataFrame for easier data manipulation
    df = pd.DataFrame(papers)

    # Visualization 1: Histogram of relevance scores
    plt.figure(figsize=(10, 6))
    plt.hist(df['relevanceScore'], bins=10, alpha=0.7, color='skyblue')
    plt.title('Distribution of Relevance Scores')
    plt.xlabel('Relevance Score')
    plt.ylabel('Number of Papers')
    plt.grid(True, alpha=0.3)
    plt.show()

    # Visualization 2: Top papers bar chart (up to 10, sorted by score)
    top_papers = df.sort_values('relevanceScore', ascending=False).head(10)

    # Only add an ellipsis to titles that were actually shortened
    max_title_length = 50
    labels = [
        title if len(title) <= max_title_length else title[:max_title_length] + '...'
        for title in (str(value) for value in top_papers['title'])
    ]

    # Plot against numeric positions: using the labels themselves as the axis
    # would merge papers whose shortened titles happen to be identical.
    positions = list(range(len(labels)))
    plt.figure(figsize=(12, 6))
    plt.barh(positions, top_papers['relevanceScore'], color='skyblue')
    plt.yticks(positions, labels)
    plt.gca().invert_yaxis()  # put the most relevant paper at the top
    plt.xlabel('Relevance Score')
    plt.title('Top 10 Papers by Relevance')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Visualization 3 (Optional): Word cloud of key concepts
    try:
        from wordcloud import WordCloud
    except ImportError:
        print("WordCloud not installed. Run: !pip install wordcloud")
        return

    # Flatten all key concepts into a single list, skipping rows that do not
    # hold a list of concepts
    all_concepts = [
        str(concept)
        for concepts in df['keyMatchingConcepts']
        if isinstance(concepts, (list, tuple))
        for concept in concepts
    ]

    # Create a string of all concepts (word cloud needs text input)
    text = ' '.join(all_concepts)
    if not text.strip():
        print("No key concepts available for a word cloud.")
        return

    wordcloud = WordCloud(width=800, height=400, background_color='white',
                          max_words=100, contour_width=3, contour_color='steelblue')
    try:
        wordcloud.generate(text)
    except ValueError:
        # Raised when no usable word is left (e.g. only stop words)
        print("No key concepts available for a word cloud.")
        return

    # Display the word cloud
    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Key Concepts Word Cloud')
    plt.show()

# Second function: Display formatted results as an HTML table
def display_results(papers):
    """
    Format and display the results in a nice HTML table

    All text taken from the papers (titles, authors, links, explanations,
    concepts) is HTML-escaped before being placed in the table, because it
    originates from arXiv and from the language model and may contain
    characters such as '<' or '&'.

    Parameters:
    -----------
    papers : list
        List of paper objects; the keys 'title', 'authors', 'published',
        'link', 'relevanceScore', 'relevanceExplanation' and
        'keyMatchingConcepts' are read.

    Returns:
    --------
    DataFrame
        Pandas DataFrame containing the paper data (empty when `papers`
        is empty)
    """
    # Local import so this cell does not depend on an extra top-level import.
    from html import escape

    if not papers:
        display(Markdown("## No relevant papers found."))
        return pd.DataFrame()

    display(Markdown(f"## Found {len(papers)} relevant papers"))

    # Styling and table header for better display in Colab
    table_parts = ["""
    <style>
        .paper-table {width: 100%; border-collapse: collapse; margin-bottom: 20px;}
        .paper-table th {background-color: #4CAF50; color: white; text-align: left; padding: 12px;}
        .paper-table td {padding: 12px; border-bottom: 1px solid #ddd;}
        .paper-title {font-weight: bold; font-size: 1.1em;}
        .relevance-high {color: green; font-weight: bold;}
        .relevance-medium {color: orange; font-weight: bold;}
        .relevance-low {color: red; font-weight: bold;}
        .authors {font-style: italic;}
        .paper-row:hover {background-color: #f5f5f5;}
    </style>
    <table class="paper-table">
        <tr>
            <th>Title &amp; Relevance</th>
            <th>Details</th>
        </tr>
    """]

    for number, paper in enumerate(papers, start=1):
        score = paper['relevanceScore']

        # Determine relevance class for color-coding
        if score >= 80:
            relevance_class = "relevance-high"  # Green for highly relevant
        elif score >= 60:
            relevance_class = "relevance-medium"  # Orange for moderately relevant
        else:
            relevance_class = "relevance-low"  # Red for marginally relevant

        # Format the publication date; if it is not in the expected format,
        # show the raw value rather than failing the whole table
        try:
            pub_date = datetime.strptime(paper['published'], '%Y-%m-%dT%H:%M:%SZ').strftime('%b %d, %Y')
        except (TypeError, ValueError):
            pub_date = str(paper['published'])

        # Escape every externally sourced value before it enters the markup
        title = escape(str(paper['title']))
        authors = escape(', '.join(str(author) for author in paper['authors']))
        link = escape(str(paper['link']))
        explanation = escape(str(paper['relevanceExplanation']))
        concepts = escape(', '.join(str(concept) for concept in paper['keyMatchingConcepts']))

        # Create HTML row for this paper
        table_parts.append(f"""
        <tr class="paper-row">
            <td>
                <div class="paper-title">{number}. {title}</div>
                <div class="{relevance_class}">Relevance: {escape(str(score))}%</div>
            </td>
            <td>
                <div class="authors">Authors: {authors}</div>
                <div>Published: {escape(pub_date)}</div>
                <div>Link: <a href="{link}" target="_blank">{link}</a></div>
                <div><strong>Relevance Explanation:</strong> {explanation}</div>
                <div><strong>Key Concepts:</strong> {concepts}</div>
            </td>
        </tr>
        """)

    table_parts.append("</table>")

    # Display the HTML table in the notebook
    display(HTML(''.join(table_parts)))

    # Also create a DataFrame for further analysis or export
    return pd.DataFrame(papers)

# Interactive UI for paper search

In [None]:
# This function creates an interactive user interface using ipywidgets.
# It's perfect for teaching because students can modify parameters and see results immediately.
#
# Key concepts:
# - Interactive widgets in Jupyter/Colab
# - Event handling for button clicks
# - Dynamic output updates
# - Data export

def run_interactive_search():
    """
    Build and show a small ipywidgets form for the paper search

    The form has a query text box, a research-interest text area, a
    relevance-threshold slider and a "Search Papers" button. Clicking the
    button runs find_relevant_papers with the current values, shows the
    table and charts in an output area below the form, and writes the
    results to 'arxiv_filtered_papers.csv' when at least one paper is found.
    """
    # Widgets are imported here so the rest of the notebook works without them
    from ipywidgets import widgets
    from IPython.display import display

    # Lets long descriptions be shown in full instead of being cut off
    full_label = {'description_width': 'initial'}

    # --- Inputs ---------------------------------------------------------
    # Broad keywords for the initial arXiv query
    query_widget = widgets.Text(
        description='Initial Query:',
        value='machine learning reinforcement learning',
        tooltip='Enter keywords for the initial arXiv search (broader terms)',
        style=dict(full_label),
        layout={'width': '50%'},
    )

    # Detailed description of the research interest
    interest_widget = widgets.Textarea(
        description='Research Interest:',
        value='Applications of reinforcement learning in robotic control systems with sparse rewards',
        tooltip='Enter your specific research interest in detail (more specific)',
        style=dict(full_label),
        layout={'width': '70%', 'height': '80px'},
    )

    # Minimum score a paper needs to be kept
    threshold_widget = widgets.IntSlider(
        description='Relevance Threshold:',
        value=70,
        min=0,
        max=100,
        step=5,
        tooltip='Set minimum relevance score (0-100) for including papers',
        style=dict(full_label),
        layout={'width': '50%'},
    )

    # --- Output area and trigger ------------------------------------------
    results_widget = widgets.Output()

    button = widgets.Button(
        description='Search Papers',
        icon='search',
        button_style='success',  # Green button
        tooltip='Click to start the search process',
    )

    def handle_search_click(_clicked):
        """Run one search with the current form values and render the outcome."""
        with results_widget:
            # Start from a clean output area on every click
            results_widget.clear_output()
            print("Searching for papers...")
            try:
                found = find_relevant_papers(
                    query_widget.value,
                    interest_widget.value,
                    threshold_widget.value
                )

                # Table first, then the charts
                frame = display_results(found)
                visualize_results(found)

                # Export only when there is something to export
                if found:
                    frame.to_csv('arxiv_filtered_papers.csv')
                    print("Results saved to 'arxiv_filtered_papers.csv'")
            except Exception as e:
                print(f"Error during search: {e}")

    # Wire the button to the handler
    button.on_click(handle_search_click)

    # --- Render the form, top to bottom -------------------------------------
    form_parts = (
        widgets.HTML("<h2>arXiv + OpenAI Paper Filtering</h2>"),
        widgets.HTML("<p>Enter a broad initial query for arXiv, then specify your detailed research interest for AI filtering.</p>"),
        query_widget,
        interest_widget,
        threshold_widget,
        button,
        results_widget,
    )
    for part in form_parts:
        display(part)

# This function creates an interactive UI for students to experiment with
# Uncomment the line below to run the interactive UI
# run_interactive_search()

# Simple example usage

In [None]:

# This cell shows a simple example of using our system with predefined parameters.
# It's useful for demonstrating the complete pipeline without interactive widgets.
#
# This lets you:
# - Run a predefined example
# - Show the complete workflow in one step

def main():
    """
    Run the whole pipeline once with fixed example parameters

    Prints the parameters, searches and filters the papers, shows the table
    and charts, and saves the results to 'arxiv_filtered_papers.csv' when at
    least one paper was found.

    Returns:
    --------
    list
        The filtered papers, for further analysis if needed
    """
    # Example search parameters - feel free to modify these for your demonstration
    relevance_threshold = 70
    initial_query = "machine learning reinforcement learning"
    research_interest = "Applications of reinforcement learning in robotic control systems with sparse rewards"

    # Echo the settings (the last line ends with a blank line, as before)
    print(
        f"Searching for papers related to: {research_interest}",
        f"Initial arXiv query: {initial_query}",
        f"Relevance threshold: {relevance_threshold}%\n",
        sep="\n",
    )

    # Search, analyze and filter
    papers = find_relevant_papers(initial_query, research_interest, relevance_threshold)

    # Show the table, then the charts
    results_frame = display_results(papers)
    visualize_results(papers)

    # Export only when there is something to export
    if papers:
        results_frame.to_csv('arxiv_filtered_papers.csv')
        print("Results saved to 'arxiv_filtered_papers.csv'")

    return papers

# Uncomment one of the following lines to run either:
# The simple example with predefined parameters:
# papers = main()

# Or the interactive version:
#run_interactive_search()