# ArXiv Pipeline - MongoDB Analysis (Simple Version)

This notebook explores and visualizes the paper metadata stored in MongoDB.

Key analyses include:
- Paper publication trends over time
- Author analytics and rankings
- Category distribution analysis
- Database health and collection statistics

In [None]:
# Import required libraries
import os
import re
import json
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pymongo import MongoClient
from collections import Counter
from dotenv import load_dotenv

# Load environment variables from .env file (if present) so that the
# os.getenv() lookups for MONGO_URI / MONGO_DB in the configuration cell
# can pick up values defined there
load_dotenv()

# Set Matplotlib config
%matplotlib inline
plt.style.use('ggplot')

In [None]:
# MongoDB Configuration (each value can be overridden via environment / .env)
MONGO_URI = os.getenv("MONGO_URI", "mongodb://localhost:27017/")
MONGO_DB = os.getenv("MONGO_DB", "arxiv_papers")

# A connection string may embed "user:password@". Mask the password before
# printing so credentials never end up in the saved notebook output.
_redacted_uri = re.sub(r"(?<=://)([^:/@]+):[^@/]*@", r"\1:***@", MONGO_URI)

print(f"MongoDB URI: {_redacted_uri}")
print(f"MongoDB Database: {MONGO_DB}")

# Connect to MongoDB
try:
    client = MongoClient(MONGO_URI)
    db = client[MONGO_DB]
    # server_info() forces a round trip, so an unreachable server fails here
    # rather than in a later cell
    server_info = client.server_info()
    print(f"✅ Connected to MongoDB (version: {server_info.get('version')})")

    # Report how many documents each collection currently holds
    print("\nCollection statistics:")
    collections = db.list_collection_names()
    for collection in collections:
        count = db[collection].count_documents({})
        print(f"- {collection}: {count:,} documents")
except Exception as e:
    # Show a readable message, then re-raise so "Run All" stops at this cell
    print(f"❌ Failed to connect to MongoDB: {e}")
    raise

## 1. Paper Publication Trends Over Time

Let's analyze how the publication volume has changed over time.

In [None]:
# Convert to DataFrame for time series analysis
papers_df = pd.DataFrame(list(db.papers.find({}, {'_id': 1, 'id': 1, 'title': 1, 'authors': 1, 'categories': 1, 'update_date': 1, 'published': 1})))

# An empty collection (or one where no document carries `published`) produces a
# frame without that column; add it so the date handling below cannot KeyError.
if 'published' not in papers_df.columns:
    papers_df['published'] = pd.NaT

# Convert date strings to datetime objects; unparseable values become NaT
papers_df['published_date'] = pd.to_datetime(papers_df['published'], errors='coerce')
papers_df['year'] = papers_df['published_date'].dt.year
papers_df['month'] = papers_df['published_date'].dt.month
papers_df['year_month'] = papers_df['published_date'].dt.strftime('%Y-%m')

# Get papers per year. `year` is float whenever any date is NaT, so drop the
# missing values and cast to int to get tick labels like "2020", not "2020.0".
papers_per_year = papers_df['year'].dropna().astype(int).value_counts().sort_index()

if papers_per_year.empty:
    # Plotting an empty Series raises, so report the situation instead
    print("No papers with a parseable 'published' date were found; skipping the yearly plot.")
else:
    # Plot papers per year
    fig, ax = plt.subplots(figsize=(12, 6))
    papers_per_year.plot(kind='bar', color='steelblue', ax=ax)
    ax.set_title('Number of Papers Published per Year', fontsize=16)
    ax.set_xlabel('Year', fontsize=14)
    ax.set_ylabel('Number of Papers', fontsize=14)
    plt.xticks(rotation=45)
    ax.grid(axis='y', linestyle='--', alpha=0.7)

    # Add data labels on top of each bar. The gap is given in points rather
    # than data units so it looks the same for counts of 5 and of 50,000.
    for i, v in enumerate(papers_per_year):
        ax.annotate(f"{v:,}", xy=(i, v), xytext=(0, 3), textcoords='offset points',
                    ha='center', va='bottom', fontweight='bold')

    plt.tight_layout()
    plt.show()

## 2. Author Analytics

Let's identify the most prolific authors, ranked by the number of papers they appear on.

In [None]:
# Function to extract all authors from papers
def extract_authors(papers_df):
    """Collect every author occurrence and a per-author paper tally.

    Args:
        papers_df: DataFrame of papers. The ``authors`` column, when present,
            is expected to hold a list (or tuple) of author names per row.

    Returns:
        A ``(all_authors, author_paper_count)`` tuple:
        ``all_authors`` is a flat list with one entry per author listing, in
        row order; ``author_paper_count`` is a ``Counter`` mapping each author
        to the number of listings.

    Notes:
        Rows whose ``authors`` value is missing (``None`` / ``NaN``) or is not
        a list/tuple are skipped. ``NaN`` is truthy, so a plain truthiness
        check would let it through and fail on ``extend``.
        A bare non-empty string is kept whole as a single entry instead of
        being iterated character by character; no delimiter is assumed.
    """
    all_authors = []
    author_paper_count = Counter()

    # A frame built from an empty result set has no columns at all
    if 'authors' not in papers_df.columns:
        return all_authors, author_paper_count

    # Walk the column directly; building a Series per row is not needed
    for paper_authors in papers_df['authors']:
        if isinstance(paper_authors, str):
            name = paper_authors.strip()
            if not name:
                continue
            paper_authors = [name]
        elif not isinstance(paper_authors, (list, tuple)):
            continue  # None, NaN, or some other non-sequence value

        # Add to all authors list
        all_authors.extend(paper_authors)
        # Count one listing per author entry
        author_paper_count.update(paper_authors)

    return all_authors, author_paper_count

# Collect the flat author list and the per-author tally
all_authors, author_paper_count = extract_authors(papers_df)

# Keep the 20 authors with the highest tallies
top_authors = pd.DataFrame(
    author_paper_count.most_common(20),
    columns=['Author', 'Paper Count'],
)

# Horizontal bar chart of the top authors
plt.figure(figsize=(14, 8))
bars = plt.barh(top_authors['Author'], top_authors['Paper Count'], color='steelblue')
plt.gca().invert_yaxis()  # put the highest count at the top of the chart
plt.ylabel('Author', fontsize=14)
plt.xlabel('Number of Papers', fontsize=14)
plt.title('Top 20 Most Prolific Authors', fontsize=16)

# Write each bar's value just past its right-hand end
for rect in bars:
    bar_len = rect.get_width()
    y_center = rect.get_y() + rect.get_height() / 2
    plt.text(bar_len + 0.5, y_center, f"{bar_len:,}",
             ha='left', va='center', fontweight='bold')

plt.tight_layout()
plt.show()

## 3. Category Distribution Analysis

Let's analyze the distribution of categories across papers.

In [None]:
# Function to extract all categories from papers
def extract_categories(papers_df):
    """Collect every category occurrence and a per-category paper tally.

    Args:
        papers_df: DataFrame of papers. The ``categories`` column, when
            present, may hold a list (or tuple) of category ids per row, or a
            single whitespace-separated string of ids.

    Returns:
        A ``(all_categories, category_paper_count)`` tuple:
        ``all_categories`` is a flat list with one entry per category listing,
        in row order; ``category_paper_count`` is a ``Counter`` mapping each
        category to the number of listings.

    Notes:
        Rows whose ``categories`` value is missing (``None`` / ``NaN``) or is
        not a list/tuple/string are skipped. ``NaN`` is truthy, so a plain
        truthiness check would let it through and fail on ``extend``.
        A string value is split on whitespace instead of being iterated
        character by character.
    """
    all_categories = []
    category_paper_count = Counter()

    # A frame built from an empty result set has no columns at all
    if 'categories' not in papers_df.columns:
        return all_categories, category_paper_count

    # Walk the column directly; building a Series per row is not needed
    for paper_cats in papers_df['categories']:
        if isinstance(paper_cats, str):
            # e.g. "cs.LG stat.ML" -> ["cs.LG", "stat.ML"]; "" -> []
            paper_cats = paper_cats.split()
        elif not isinstance(paper_cats, (list, tuple)):
            continue  # None, NaN, or some other non-sequence value

        # Add to all categories list
        all_categories.extend(paper_cats)
        # Count one listing per category entry
        category_paper_count.update(paper_cats)

    return all_categories, category_paper_count

# Collect the flat category list and the per-category tally
all_categories, category_paper_count = extract_categories(papers_df)

# Keep the 20 categories with the highest tallies
top_categories = pd.DataFrame(
    category_paper_count.most_common(20),
    columns=['Category', 'Paper Count'],
)

# Horizontal bar chart of the top categories
plt.figure(figsize=(14, 10))
bars = plt.barh(top_categories['Category'], top_categories['Paper Count'], color='darkorange')
plt.gca().invert_yaxis()  # put the highest count at the top of the chart
plt.ylabel('Category', fontsize=14)
plt.xlabel('Number of Papers', fontsize=14)
plt.title('Top 20 Most Common Categories', fontsize=16)

# Write each bar's value just past its right-hand end
for rect in bars:
    bar_len = rect.get_width()
    y_center = rect.get_y() + rect.get_height() / 2
    plt.text(bar_len + 0.5, y_center, f"{bar_len:,}",
             ha='left', va='center', fontweight='bold')

plt.tight_layout()
plt.show()

## 4. Database Health and Performance Metrics

Let's check database health metrics and collection stats.

In [None]:
# Summarise database-level and per-collection storage figures
try:
    BYTES_PER_MB = 1024 * 1024
    db_stats = db.command("dbStats")

    # Headline numbers; byte counts are converted to megabytes
    metrics = {}
    metrics["Collections"] = db_stats['collections']
    metrics["Objects"] = db_stats['objects']
    metrics["Data Size (MB)"] = round(db_stats['dataSize'] / BYTES_PER_MB, 2)
    metrics["Storage Size (MB)"] = round(db_stats['storageSize'] / BYTES_PER_MB, 2)
    metrics["Indexes"] = db_stats['indexes']
    metrics["Index Size (MB)"] = round(db_stats['indexSize'] / BYTES_PER_MB, 2)

    print("MongoDB Database Statistics:")
    for label in metrics:
        print(f"- {label}: {metrics[label]:,}")

    # One collStats call per collection
    print("\nCollection Statistics:")
    collections = db.list_collection_names()
    for coll_name in collections:
        coll_stats = db.command("collStats", coll_name)
        print(f"\n{coll_name}:")
        print(f"- Document Count: {coll_stats['count']:,}")
        print(f"- Data Size: {coll_stats['size'] / BYTES_PER_MB:.2f} MB")
        print(f"- Storage Size: {coll_stats['storageSize'] / BYTES_PER_MB:.2f} MB")
        print(f"- Index Size: {coll_stats['totalIndexSize'] / BYTES_PER_MB:.2f} MB")
        # avgObjSize is read with a default of 0 when the key is absent
        avg_obj_bytes = coll_stats.get('avgObjSize', 0)
        print(f"- Avg Document Size: {avg_obj_bytes / 1024:.2f} KB")

except Exception as err:
    # Report the failure without aborting the rest of the notebook
    print(f"Error getting database statistics: {err}")

## 5. Summary and Conclusions

Based on our analysis of the ArXiv paper metadata in MongoDB, we've observed:

1. **Publication Trends**: [Add observation about publication trends]
2. **Author Patterns**: [Add observation about author patterns]
3. **Category Distribution**: [Add observation about category distribution]
4. **Database Performance**: [Add observation about database metrics]

These insights can help inform research priorities and future enhancements to the ArXiv pipeline.

In [None]:
# Close the MongoClient created in the connection cell; cells that use `db`
# or `client` must be run before this one
client.close()
print("MongoDB connection closed")