In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install arxiv pandas matplotlib

In [None]:
import arxiv
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

In [None]:
# arXiv query: (Bayesian / UQ terms in title or abstract) AND (an application term in any field).
# Adjacent string literals are concatenated into a single query string.
SEARCH_QUERY = (
    '('
    '(ti:Bayesian OR abs:Bayesian'
    ' OR ti:"Uncertainty Quantification" OR abs:"Uncertainty Quantification"'
    ' OR ti:UQ OR abs:UQ)'
    ' AND '
    '(all:Modular OR all:"Multi-Fidelity" OR all:"Real-Time" OR all:"Digital Twin")'
    ')'
)
# Upper bound on the number of papers requested; lower it for a quick look.
MAX_RESULTS = 10_000

## Utility Functions

In [None]:
def clean_theme_name(theme_key):
    """Return a display label for a theme key, e.g. 'T_Digital_Twins' -> 'Digital Twins'.

    Everything up to and including the first underscore is dropped (this is how
    the 'T_' / 'M_' prefixes are removed), the remaining underscores become
    spaces, and the result is title-cased. A key without any underscore is
    simply title-cased.
    """
    _prefix, separator, remainder = theme_key.partition('_')
    # No underscore at all: there is no prefix to strip, so keep the whole key.
    label = remainder if separator else theme_key
    return label.replace('_', ' ').title()

## Data Retrieval Functions

In [None]:
def fetch_arxiv_papers(query, max_results):
    """
    Query arXiv and return the matching papers as a pandas DataFrame.

    Results are requested newest-first (by submission date) and capped at
    `max_results`. Each row holds: title, abstract, year, primary_category,
    categories (only those starting with 'cs.' or 'stat.'), authors, and url.

    Any exception raised while iterating over the results is caught and
    printed; rows collected before the failure are still returned, so the
    frame may be partial or empty.
    Note: Requires the 'arxiv' Python package (pip install arxiv).
    """
    print(f"Searching arXiv for: '{query}'")

    newest_first = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,
        sort_order=arxiv.SortOrder.Descending,
    )
    api_client = arxiv.Client()

    records = []
    try:
        # Append row by row so that anything fetched before an error is kept.
        for paper in api_client.results(newest_first):
            record = dict(
                title=paper.title.replace('\n', ' ').strip(),
                abstract=paper.summary.replace('\n', ' ').strip(),
                year=paper.published.year,
                primary_category=paper.primary_category,
                # Keep only the CS / statistics categories.
                categories=[
                    cat for cat in paper.categories
                    if cat.startswith(('cs.', 'stat.'))
                ],
                authors=[author.name for author in paper.authors],
                url=paper.entry_id,
            )
            records.append(record)
        print(f"Successfully fetched {len(records)} papers.")
    except Exception as e:
        print(f"An error occurred while fetching data from arXiv: {e}")

    return pd.DataFrame(records)


## Analysis Functions

In [None]:
def temporal_analysis(df):
    """Print the number of papers per publication year and show it as a bar chart.

    Reads the 'year' column of `df`; the chart title reports the total number
    of rows in `df`.
    """
    print("\n--- 1. TEMPORAL ANALYSIS (Publication Trends) ---")

    # Yearly tallies, ordered chronologically rather than by frequency.
    papers_per_year = df['year'].value_counts().sort_index()
    print("Papers Published Per Year:")
    print(papers_per_year)

    # Bar chart of the yearly tallies, drawn on the figure created here.
    plt.figure(figsize=(10, 5))
    ax = papers_per_year.plot(kind='bar', color='#1f77b4')
    ax.set_title(f'Temporal Trend of Relevant Papers (N={len(df)})')
    ax.set_xlabel('Publication Year')
    ax.set_ylabel('Number of Papers')
    plt.xticks(rotation=45, ha='right')
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

def bibliometric_analysis(df):
    """Print the ten most frequent arXiv categories (author statistics removed per user request).

    The 'categories' column is expected to hold one list of category codes per
    paper; the lists are flattened before counting, so a paper contributes once
    to each of its categories.
    """
    print("\n--- 2. BIBLIOMETRIC ANALYSIS (Key Domains) ---")

    # One entry per (paper, category) pair, then tally and keep the ten largest.
    category_counts = df['categories'].explode().value_counts()
    print("\nTop 10 Most Frequent Categories (Research Domains):")
    print(category_counts.head(10))

def _build_theme_pattern(keywords):
    """Build a regex that matches any of `keywords` at the start of a word.

    Longer keywords only need to begin at a word boundary, so plural or derived
    forms ('digital twins', 'emulators', 'modularity') still match. Keywords of
    three characters or fewer are treated as acronyms and must match as whole
    words (with an optional plural 's'); otherwise 'rom' would fire on 'from'
    or 'prominent'.
    """
    import re  # local import so this block does not depend on the import cell

    alternatives = []
    for keyword in keywords:
        # Escape regex metacharacters, but keep spaces literal: an escaped space
        # is unnecessary and is not accepted by every regex engine.
        escaped = re.escape(keyword).replace('\\ ', ' ')
        if len(keyword) <= 3:
            escaped += r's?\b'
        alternatives.append(escaped)
    return r'\b(?:' + '|'.join(alternatives) + ')'


def thematic_analysis(df, show_plot=True):
    """Flag each paper with the themes whose keywords appear in its title or abstract.

    Prints the per-theme paper counts, optionally shows them as a horizontal bar
    chart, and lists up to five papers (in input order) for the most frequent
    theme.

    Parameters
    ----------
    df : DataFrame with 'title', 'abstract', 'url', 'year' and
        'primary_category' columns. It is not modified.
    show_plot : when False the bar chart is skipped, so the function can run
        without a display. Defaults to True (the original behaviour).

    Returns
    -------
    A copy of `df` with one boolean column per theme ('T_*' / 'M_*').
    """
    # Define keywords for the themes based on your review plan
    themes = {
        'T_Digital_Twins': ['digital twin', 'physical system', 'virtual representation'],
        'T_Real_Time_Systems': ['real-time', 'online execution', 'dynamic modeling', 'low latency'],
        'M_Modular_Frameworks': ['modular', 'decentralized', 'decoupled', 'component-based'],
        'M_Multi_Source_Data': ['multi-fidelity', 'multi-source', 'heterogeneous data', 'information fusion'],
        'M_Uncertainty_Quantification': ['uncertainty quantification', 'uq', 'bayesian inference', 'credible interval'],
        'M_Surrogate_Modeling': ['emulator', 'surrogate model', 'reduced order model', 'rom']
    }

    print("\n--- 3. THEMATIC & METHODOLOGICAL ANALYSIS ---")

    df_temp = df.copy()

    # Title + abstract, lower-cased once for every theme. Missing values become
    # empty strings so a missing abstract does not blank out the title as well.
    text_to_search = (
        df_temp['title'].fillna('') + ' ' + df_temp['abstract'].fillna('')
    ).str.lower()

    # One boolean column per theme: True when any of its keywords is present.
    for theme, keywords in themes.items():
        pattern = _build_theme_pattern(keywords)
        df_temp[theme] = text_to_search.str.contains(pattern, na=False)

    # Summarize the thematic counts
    theme_summary = df_temp[list(themes)].sum().sort_values(ascending=False)
    print("Keyword-Based Thematic Counts:")
    print(theme_summary)

    # --- Plotting the thematic summary (Infographic 1) ---
    if show_plot:
        # Map the internal theme keys to clean display names for the plot
        clean_summary = theme_summary.rename(clean_theme_name)

        plt.figure(figsize=(10, 6))
        clean_summary.plot(kind='barh', color='#ff7f0e')  # Horizontal bar chart for better readability
        plt.title('Relative Dominance of Thematic Keywords in Paper Abstracts')
        plt.xlabel('Number of Papers Mentioning Keyword(s)')
        plt.ylabel('Thematic Category')
        plt.gca().invert_yaxis()  # Put the highest count at the top
        plt.grid(axis='x', linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.show()
    # ----------------------------------------------------

    # List the first five papers flagged with the most frequent theme (for quick review)
    dominant_theme = theme_summary.index[0]
    print(f"\nTop 5 papers for the most dominant theme ('{clean_theme_name(dominant_theme)}'):")
    top_papers = df_temp[df_temp[dominant_theme]].head(5)

    for index, row in top_papers.iterrows():
        print(f"  - Title: {row['title']}")
        print(f"    URL: {row['url']}")
        print(f"    Year: {row['year']}, Primary Category: {row['primary_category']}")
        print("-" * 20)

    return df_temp  # Return the dataframe with theme flags for co-occurrence analysis

def temporal_thematic_evolution(df_with_themes):
    """
    Analyzes and visualizes how the thematic focus of the papers has changed over time
    using a multi-line chart (Infographic 3).

    Expects a 'year' column plus boolean theme columns whose names start with
    'T_' or 'M_'. Prints the yearly count per theme and plots one line per
    theme. Returns nothing; if there are no rows or no theme columns a message
    is printed and no figure is drawn.
    """
    print("\n--- 4. TEMPORAL THEMATIC EVOLUTION (Trend Infographic) ---")

    # Identify themes for grouping
    themes = [col for col in df_with_themes.columns if col.startswith(('T_', 'M_'))]

    if df_with_themes.empty or not themes:
        print("Not enough data or themes to plot the thematic evolution.")
        return

    # Group by year and sum the boolean theme flags
    temporal_theme_counts = df_with_themes.groupby('year')[themes].sum()

    # Rename columns for the plot
    temporal_theme_counts.columns = [clean_theme_name(t) for t in temporal_theme_counts.columns]

    print("Temporal Theme Counts by Year:")
    print(temporal_theme_counts)

    # Start the plot at the first year with more than 10 theme matches in total
    # (summed over all themes); fall back to the earliest year if none qualifies.
    start_year = temporal_theme_counts[temporal_theme_counts.sum(axis=1) > 10].index.min()
    if pd.isna(start_year):
        start_year = temporal_theme_counts.index.min()

    # Plotting the Line Chart (Infographic 3).
    # DataFrame.plot opens its own figure unless it is given an axes, so create
    # the axes explicitly and draw onto it; otherwise an extra blank figure appears.
    fig, ax = plt.subplots(figsize=(12, 7))
    temporal_theme_counts.loc[start_year:].plot(ax=ax, kind='line', marker='o', linewidth=2)

    ax.set_title('Evolution of Thematic Focus Over Time (Yearly Paper Counts)')
    ax.set_xlabel(f'Publication Year (Showing data from {int(start_year)} onwards)')
    ax.set_ylabel('Number of Papers Mentioning Theme')
    ax.legend(title='Thematic Category', loc='upper left', bbox_to_anchor=(1.05, 1))
    ax.grid(True, linestyle='--', alpha=0.6)
    fig.tight_layout(rect=[0, 0, 0.85, 1])  # Leave room on the right for the legend
    plt.show()


def category_thematic_co_occurrence(df_with_themes, min_cat_count=50):
    """
    Analyzes and visualizes the co-occurrence between major arXiv categories
    and the predefined thematic keywords using a heatmap (Infographic 4).

    Parameters
    ----------
    df_with_themes : DataFrame with a list-valued 'categories' column and
        boolean theme columns whose names start with 'T_' or 'M_'.
    min_cat_count : minimum number of papers a category needs in order to be
        included in the matrix and heatmap (default 50).

    Prints the category-by-theme count matrix and shows it as a heatmap.
    Returns nothing; if no category passes the threshold, or there are no theme
    columns, a message is printed and no figure is drawn.
    """
    print("\n--- 5. DOMAIN-THEME CO-OCCURRENCE ANALYSIS (Clustering Infographic) ---")

    # Explode categories to one row per paper-category pair
    df_exploded = df_with_themes.explode('categories')

    # Identify themes for grouping
    themes = [col for col in df_exploded.columns if col.startswith(('T_', 'M_'))]

    # Keep only categories with at least `min_cat_count` papers, for clarity in the plot
    category_sizes = df_exploded['categories'].value_counts()
    valid_categories = category_sizes[category_sizes >= min_cat_count].index
    df_filtered = df_exploded[df_exploded['categories'].isin(valid_categories)]

    if df_filtered.empty or not themes:
        print("Not enough data or themes to generate co-occurrence matrix after filtering.")
        return

    # Calculate the co-occurrence matrix (Count of papers in Category X that also match Theme Y)
    # The sum() operation counts the True (1) values for each theme within each category group.
    co_occurrence_matrix = df_filtered.groupby('categories')[themes].sum()

    print(f"Co-occurrence Matrix (Filtered Categories with >={min_cat_count} papers):")
    print(co_occurrence_matrix)

    # --- Plotting the Heatmap (Infographic 4) ---
    # Generate clean labels for the x-axis
    clean_themes = [clean_theme_name(t) for t in themes]
    # Row labels must come from the matrix itself: groupby orders rows by
    # category label, whereas `valid_categories` is ordered by frequency.
    row_labels = list(co_occurrence_matrix.index)

    plt.figure(figsize=(12, 8))
    plt.imshow(co_occurrence_matrix, aspect='auto', cmap='viridis')

    # Setup labels
    plt.colorbar(label='Number of Co-occurring Papers')
    plt.xticks(range(len(themes)), clean_themes, rotation=45, ha='right')
    plt.yticks(range(len(row_labels)), row_labels)

    plt.title('Heatmap of Thematic Keyword Co-occurrence by Research Domain')
    plt.xlabel('Thematic Keyword (Methodology/Application)')
    plt.ylabel('arXiv Research Domain Category')
    plt.tight_layout()
    plt.show()

## Main Execution Function

In [None]:
def main():
    """Run the whole pipeline: fetch the papers, run every analysis in order, print the caveat.

    Stops early with a message when the fetch returns no rows.
    """
    print("--- arXiv Literature Review Analysis ---")
    print(f"Attempting to fetch up to {MAX_RESULTS} papers.")

    # Step 1: download the paper metadata.
    papers = fetch_arxiv_papers(SEARCH_QUERY, MAX_RESULTS)
    if papers.empty:
        print("\nAnalysis failed: No data was fetched. Please check your network connection or the search query.")
        return

    # Step 2: analyses that only need the raw metadata.
    for run_analysis in (temporal_analysis, bibliometric_analysis):
        run_analysis(papers)

    # Step 3: keyword flagging; the returned frame carries the theme columns.
    themed_papers = thematic_analysis(papers)

    # Steps 4-5: analyses that build on the theme columns.
    for run_themed_analysis in (temporal_thematic_evolution, category_thematic_co_occurrence):
        run_themed_analysis(themed_papers)

    # Step 6: closing caveat about what this data source cannot provide.
    limitation_note = (
        "\n--- NOTE ON GEOGRAPHICAL ANALYSIS ---",
        "The arXiv API does not provide author affiliation or country information,",
        "so a 'Papers by Country' infographic cannot be generated using this data source.",
        "The Category-Theme Co-occurrence map serves as a proxy for Domain Clustering.",
    )
    for note_line in limitation_note:
        print(note_line)


if __name__ == "__main__":
    # Required packages: arxiv, pandas, matplotlib
    # (pip install arxiv pandas matplotlib)
    main()