# 03 - OpenAlex Citation Enrichment

This notebook enriches the dataset with citation data from the OpenAlex API.

**Input:** Clustered papers with DOIs  
**Output:** Papers enriched with citation networks and metadata

In [None]:
import ast
import json
import sys
import time
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

# Add src to path
sys.path.append('../src')

from openalex import OpenAlexClient, enrich_dataframe, save_cache_summary

# Plot appearance: apply the matplotlib style sheet first, then the seaborn palette.
# (Both calls configure global plotting defaults, so their order is kept as-is.)
plt.style.use('seaborn-v0_8')
sns.set_palette(palette="husl")

# Confirmation that the import/config cell ran to the end
print("Libraries loaded successfully!")

## Load Clustered Data

In [None]:
# Read the clustered papers CSV from the data directory
data_dir = Path('../data')
df = pd.read_csv(data_dir / 'papers_clustered.csv')

print(f"Loaded {len(df)} papers")
print(f"Columns: {list(df.columns)}")

# Summarise DOI availability; the boolean mask is computed once and reused
has_doi = df['doi'].notna()
doi_stats = {
    'total_papers': len(df),
    'papers_with_doi': has_doi.sum(),
    'unique_dois': df['doi'].nunique(),
    'doi_coverage': has_doi.mean() * 100
}

print(f"\n=== DOI Statistics ===")
for stat_name, stat_value in doi_stats.items():
    # Floats (the coverage percentage) are shown with one decimal; counts are shown verbatim
    shown = f"{stat_value:.1f}" if isinstance(stat_value, float) else f"{stat_value}"
    print(f"{stat_name}: {shown}")

## Initialize OpenAlex Client

In [None]:
# Build the OpenAlex client, pointing it at a cache directory inside the data folder
cache_dir = data_dir / 'openalex_cache'
client_options = {
    'cache_dir': str(cache_dir),
    'rate_limit': 0.1,  # 100ms between requests (be nice to the API)
    'timeout': 30,
    'max_retries': 3,
}
client = OpenAlexClient(**client_options)

print("OpenAlex client initialized")
print(f"Cache directory: {cache_dir}")

# Report what is already cached, if the cache directory is present
if not cache_dir.exists():
    print("No existing cache found")
else:
    cache_summary = save_cache_summary(str(cache_dir))
    cached_ok = cache_summary['successful']
    cached_err = cache_summary['errors']
    print(f"\nExisting cache: {cached_ok} successful, {cached_err} errors")

## Test OpenAlex API

In [None]:
# Smoke-test the client on the first few DOIs before launching the full enrichment
test_dois = df['doi'].dropna().head(3).tolist()
print(f"Testing OpenAlex API with {len(test_dois)} DOIs...")

for i, doi in enumerate(test_dois):
    print(f"\nTesting DOI {i+1}: {doi}")

    start_time = time.time()
    result = client.fetch_work(doi)
    elapsed = time.time() - start_time

    if result and 'error' not in result:
        # dict.get() only falls back to its default when the key is absent; a key that
        # is present with a null value returns None, which would break slicing / len().
        # `or` covers both the missing-key and the null-value case.
        title = result.get('title') or 'N/A'
        citation_count = result.get('cited_by_count') or 0
        referenced_works = result.get('referenced_works') or []

        print(f"  ✓ Success ({elapsed:.2f}s)")
        print(f"  Title: {str(title)[:60]}...")
        print(f"  Citations: {citation_count}")
        print(f"  References: {len(referenced_works)}")
    else:
        print(f"  ✗ Failed ({elapsed:.2f}s)")
        if result:
            print(f"  Error: {result.get('error', 'Unknown')}")

## Enrich Dataset with OpenAlex

In [None]:
# Reuse a previously saved enrichment when one exists; otherwise run it and save the result
enriched_path = data_dir / 'papers_enriched.csv'

if not enriched_path.exists():
    print("Starting OpenAlex enrichment...")
    print("This will take some time depending on the number of DOIs and API rate limits.")

    # Rough estimate: one request per non-null DOI at 0.1 s each, expressed in minutes
    doi_count = df['doi'].notna().sum()
    print(f"Estimated time: {doi_count * 0.1 / 60:.1f} minutes")

    # max_workers is kept small to avoid overwhelming the API
    enriched_df = enrich_dataframe(df, doi_column='doi', client=client, max_workers=3)

    # Persist so later runs take the load-from-disk branch
    enriched_df.to_csv(enriched_path, index=False)
    print(f"Saved enriched dataset to {enriched_path}")
else:
    print("Loading existing enriched dataset...")
    enriched_df = pd.read_csv(enriched_path)
    print(f"Loaded {len(enriched_df)} enriched papers")

# Columns present after enrichment that the input frame did not have
added_columns = set(enriched_df.columns).difference(df.columns)
print(f"\nEnriched dataset shape: {enriched_df.shape}")
print(f"New columns: {added_columns}")

## Analyze Enrichment Results

In [None]:
def _count_non_null(frame, column):
    """Return the number of non-null values in `column`, or 0 when the column is absent."""
    if column not in frame.columns:
        return 0
    return int(frame[column].notna().sum())


# Enrichment statistics. Every column is looked up defensively so that a missing
# enrichment field is reported as 0 instead of aborting the cell with a KeyError.
if 'openalex_id' in enriched_df.columns:
    enrichment_rate = float(enriched_df['openalex_id'].notna().mean() * 100)
else:
    enrichment_rate = 0.0

enrichment_stats = {
    'papers_with_openalex_id': _count_non_null(enriched_df, 'openalex_id'),
    'papers_with_citations': _count_non_null(enriched_df, 'cited_by_count'),
    'papers_with_references': _count_non_null(enriched_df, 'referenced_works'),
    'papers_with_concepts': _count_non_null(enriched_df, 'concepts'),
    'enrichment_rate': enrichment_rate
}

print("=== Enrichment Statistics ===")
for key, value in enrichment_stats.items():
    # The rate is a float and gets one decimal; the counts are plain ints
    if isinstance(value, float):
        print(f"{key}: {value:.1f}")
    else:
        print(f"{key}: {value}")

# Citation statistics
if 'cited_by_count' in enriched_df.columns:
    citation_stats = enriched_df['cited_by_count'].describe()
    print(f"\n=== Citation Statistics ===")
    print(citation_stats)

In [None]:
# Visualize citation distribution: linear histogram, log-count histogram, per-cluster box plot
if 'cited_by_count' in enriched_df.columns:
    citations = enriched_df['cited_by_count'].dropna()

    fig, (ax_hist, ax_log, ax_box) = plt.subplots(1, 3, figsize=(15, 5))

    # Histogram of all citation counts
    ax_hist.hist(citations, bins=50, alpha=0.7, edgecolor='black')
    ax_hist.set_xlabel('Citation Count')
    ax_hist.set_ylabel('Frequency')
    ax_hist.set_title('Citation Distribution')
    ax_hist.grid(True, alpha=0.3)

    # Histogram of cited papers only, with a logarithmic *frequency* axis.
    # The log scale is applied to y, so that is the axis that carries the label.
    ax_log.hist(citations[citations > 0], bins=50, alpha=0.7, edgecolor='black')
    ax_log.set_yscale('log')
    ax_log.set_xlabel('Citation Count')
    ax_log.set_ylabel('Frequency (log scale)')
    ax_log.set_title('Citation Distribution (Log Scale)')
    ax_log.grid(True, alpha=0.3)

    # Box plot for the (up to) eight largest clusters
    if 'cluster' in enriched_df.columns:
        top_clusters = enriched_df['cluster'].value_counts().head(8).index
        cluster_citations = [
            enriched_df.loc[enriched_df['cluster'] == c, 'cited_by_count'].dropna()
            for c in top_clusters
        ]
        if cluster_citations:
            ax_box.boxplot(cluster_citations)
            # Label the boxes through the tick API rather than boxplot(labels=...):
            # that keyword was renamed to tick_labels in Matplotlib 3.9, so setting the
            # ticks explicitly works on both older and newer releases.
            # Boxes are drawn at positions 1..N by default.
            ax_box.set_xticks(range(1, len(top_clusters) + 1))
            ax_box.set_xticklabels([f'C{c}' for c in top_clusters], rotation=45)
    ax_box.set_xlabel('Cluster')
    ax_box.set_ylabel('Citation Count')
    ax_box.set_title('Citations by Cluster')
    ax_box.grid(True, alpha=0.3)

    fig.tight_layout()
    plt.show()

## Reference Network Analysis

In [None]:
def _parse_reference_list(raw):
    """Return `raw` as a list of references.

    A list is returned unchanged. A string is parsed as a Python literal (the
    form a list takes after a CSV round trip). Anything that is not a list, or
    cannot be parsed into one, yields an empty list.
    """
    if isinstance(raw, list):
        return raw
    if isinstance(raw, str):
        try:
            parsed = ast.literal_eval(raw)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # These are the errors literal_eval documents for malformed input
            return []
        return parsed if isinstance(parsed, list) else []
    return []


# Analyze reference networks
if 'referenced_works' in enriched_df.columns:
    papers_with_refs = enriched_df[enriched_df['referenced_works'].notna()]

    print(f"Analyzing references from {len(papers_with_refs)} papers...")

    # Parse every cell exactly once; the totals and the per-paper counts below
    # are both derived from this single parsed series so they cannot disagree.
    parsed_refs = papers_with_refs['referenced_works'].map(_parse_reference_list)
    all_references = [ref for refs in parsed_refs for ref in refs]

    print(f"Total references: {len(all_references)}")
    print(f"Unique references: {len(set(all_references))}")

    # Find internal citations (references within our dataset)
    if 'openalex_id' in enriched_df.columns:
        our_openalex_ids = set(enriched_df['openalex_id'].dropna())
    else:
        our_openalex_ids = set()
    internal_refs = [ref for ref in all_references if ref in our_openalex_ids]

    # Guard the percentage: with no references at all the share is reported as 0
    internal_share = len(internal_refs) / len(all_references) * 100 if all_references else 0.0
    print(f"Internal citations: {len(internal_refs)} ({internal_share:.1f}%)")

    # Reference count distribution (cast to int so an empty series still has a numeric dtype)
    ref_counts = parsed_refs.map(len).astype(int)

    plt.figure(figsize=(10, 6))
    plt.hist(ref_counts, bins=30, alpha=0.7, edgecolor='black')
    plt.xlabel('Number of References')
    plt.ylabel('Frequency')
    plt.title('Distribution of Reference Counts per Paper')
    plt.grid(True, alpha=0.3)
    plt.show()

    print(f"Mean references per paper: {ref_counts.mean():.1f}")
    print(f"Median references per paper: {ref_counts.median():.1f}")

## Most Cited Papers Analysis

In [None]:
def _short_text(value, limit):
    """Return `value` as text cut to `limit` characters, or 'N/A' when it is missing.

    Missing means None/NaN/NA: a row lookup returns such a value (not the
    `.get` default) when the column exists but the cell is empty, and slicing
    it directly would raise TypeError.
    """
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return 'N/A'
    return str(value)[:limit]


# Identify most cited papers
if 'cited_by_count' in enriched_df.columns:
    most_cited = enriched_df.nlargest(10, 'cited_by_count')

    print("=== Top 10 Most Cited Papers ===")
    for i, (_, row) in enumerate(most_cited.iterrows(), 1):
        print(f"{i}. {_short_text(row.get('title', 'N/A'), 80)}...")
        print(f"   Authors: {_short_text(row.get('authors', 'N/A'), 60)}...")
        print(f"   Year: {row.get('year', 'N/A')}, Journal: {row.get('journal', 'N/A')}")
        print(f"   Citations: {row.get('cited_by_count', 0)}, Cluster: {row.get('cluster', 'N/A')}")
        print()

## Temporal Citation Patterns

In [None]:
# Relate citation counts to publication year (needs both columns to be present)
required_columns = {'cited_by_count', 'year'}
if required_columns.issubset(enriched_df.columns):
    # Per-year aggregates of the citation counts
    year_citations = (
        enriched_df
        .groupby('year')['cited_by_count']
        .agg(['mean', 'median', 'sum', 'count'])
    )

    plt.figure(figsize=(15, 10))

    # Top-left panel: mean citations of the papers published in each year
    plt.subplot(2, 2, 1)
    year_citations['mean'].plot.line(marker='o')
    plt.title('Mean Citations per Paper by Year')
    plt.xlabel('Year')
    plt.ylabel('Mean Citations')
    plt.grid(True, alpha=0.3)

    # Top-right panel: summed citations per publication year
    plt.subplot(2, 2, 2)
    year_citations['sum'].plot.bar(alpha=0.7)
    plt.title('Total Citations by Publication Year')
    plt.xlabel('Year')
    plt.ylabel('Total Citations')
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3)

    # Bottom-left panel: mean citations by paper age.
    # `current_year` is the latest publication year found in the data (not the calendar
    # year), and the derived 'age' column is stored on enriched_df itself.
    plt.subplot(2, 2, 3)
    current_year = enriched_df['year'].max()
    enriched_df['age'] = current_year - enriched_df['year']

    age_citations = enriched_df.groupby('age')['cited_by_count'].mean()
    age_citations.plot.line(marker='o')
    plt.title('Citations vs Paper Age')
    plt.xlabel('Paper Age (years)')
    plt.ylabel('Mean Citations')
    plt.grid(True, alpha=0.3)

    # Bottom-right panel: one point per paper
    plt.subplot(2, 2, 4)
    plt.scatter(x=enriched_df['year'], y=enriched_df['cited_by_count'], s=20, alpha=0.6)
    plt.title('Citations vs Publication Year')
    plt.xlabel('Publication Year')
    plt.ylabel('Citation Count')
    plt.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## Journal and Venue Analysis

In [None]:
# Host venues reported by OpenAlex, and how often they agree with the original journal field
if 'host_venue_name' in enriched_df.columns:
    venue_stats = enriched_df['host_venue_name'].value_counts().head(10)

    print("=== Top 10 Host Venues (OpenAlex) ===")
    display(venue_stats)

    if 'journal' in enriched_df.columns:
        venue_matches = 0
        total_with_both = 0

        # Walk the two columns in lockstep; names are compared case-insensitively
        name_pairs = zip(enriched_df['journal'], enriched_df['host_venue_name'])
        for raw_journal, raw_venue in name_pairs:
            journal_name = str(raw_journal).lower()
            venue_name = str(raw_venue).lower()

            # Skip rows where either side is empty or a stringified NaN
            if not journal_name or not venue_name:
                continue
            if journal_name == 'nan' or venue_name == 'nan':
                continue

            total_with_both += 1
            # A match is either name being contained in the other
            if journal_name in venue_name or venue_name in journal_name:
                venue_matches += 1

        if total_with_both > 0:
            match_rate = venue_matches / total_with_both * 100
        else:
            match_rate = 0
        print(f"\nVenue matching: {venue_matches}/{total_with_both} ({match_rate:.1f}%)")

## Open Access Analysis

In [None]:
# Open access overview: overall rate, value counts, and (when available) status breakdown
if 'is_oa' in enriched_df.columns:
    oa_flags = enriched_df['is_oa']
    oa_stats = oa_flags.value_counts()

    # The rate is only computed when at least one flag is non-null; otherwise it is 0
    if oa_flags.notna().any():
        oa_rate = oa_flags.mean() * 100
    else:
        oa_rate = 0

    print(f"=== Open Access Statistics ===")
    print(f"Open Access rate: {oa_rate:.1f}%")
    print(oa_stats)

    # The charts are drawn only when the finer-grained status column exists
    if 'oa_status' in enriched_df.columns:
        oa_status_counts = enriched_df['oa_status'].value_counts()
        print(f"\nOA Status breakdown:")
        display(oa_status_counts)

        plt.figure(figsize=(12, 5))

        # Left: share of each is_oa value
        plt.subplot(1, 2, 1)
        oa_stats.plot.pie(autopct='%1.1f%%', startangle=90)
        plt.title('Open Access Distribution')
        plt.ylabel('')

        # Right: number of papers per OA status
        plt.subplot(1, 2, 2)
        oa_status_counts.plot.bar(alpha=0.7)
        plt.title('Open Access Status Types')
        plt.xlabel('OA Status')
        plt.ylabel('Count')
        plt.xticks(rotation=45)
        plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

## Cache Summary and Performance

In [None]:
# Summarise the cache directory once more now that enrichment has finished
final_cache_summary = save_cache_summary(str(cache_dir))

print("=== Final Cache Summary ===")
for field, field_value in final_cache_summary.items():
    print(f"{field}: {field_value}")

# Share of cache entries counted as successful; 0 when the cache holds no files
total_cache_files = final_cache_summary['total_files']
if total_cache_files > 0:
    success_rate = final_cache_summary['successful'] / total_cache_files * 100
else:
    success_rate = 0
print(f"\nAPI Success Rate: {success_rate:.1f}%")

# Compare the number of distinct DOIs with the number of cache files
unique_dois = enriched_df['doi'].nunique()
print(f"Unique DOIs processed: {unique_dois}")
print(f"Cache files created: {final_cache_summary['total_files']}")

## Save Enriched Dataset

In [None]:
# Final save of enriched dataset
enriched_df.to_csv(data_dir / 'papers_enriched_final.csv', index=False)
print(f"Saved final enriched dataset: {len(enriched_df)} papers")

# Save enrichment summary.
# pandas reductions return NumPy scalars; NumPy integers are not JSON-serialisable and
# would fall through to `default=str` below, ending up as quoted strings in the file.
# Converting to built-in int/float keeps the numbers numeric in the JSON and makes the
# citation total print as an integer.
has_citations = 'cited_by_count' in enriched_df.columns
enrichment_summary = {
    'total_papers': len(enriched_df),
    'papers_with_dois': int(enriched_df['doi'].notna().sum()),
    'papers_enriched': int(enriched_df['openalex_id'].notna().sum()),
    'enrichment_rate': float(enriched_df['openalex_id'].notna().mean()),
    'total_citations': int(enriched_df['cited_by_count'].sum()) if has_citations else 0,
    # The reference lists only exist if the reference-network cell ran and found its
    # column, so they are looked up by name with an empty fallback.
    'total_references': len(globals().get('all_references', ())),
    'internal_citations': len(globals().get('internal_refs', ())),
    'api_performance': final_cache_summary
}

with open(data_dir / 'enrichment_summary.json', 'w') as f:
    # default=str is kept as a safety net for the nested cache summary, whose value
    # types are not visible in this notebook.
    json.dump(enrichment_summary, f, indent=2, default=str)
print("Saved enrichment summary")

# Display final statistics
print(f"\n=== Final Enrichment Results ===")
print(f"Papers enriched: {enrichment_summary['papers_enriched']}/{enrichment_summary['total_papers']} ({enrichment_summary['enrichment_rate']*100:.1f}%)")
print(f"Total citations collected: {enrichment_summary['total_citations']:,}")
print(f"Total references collected: {enrichment_summary['total_references']:,}")
print(f"Internal citation network edges: {enrichment_summary['internal_citations']:,}")

## Summary

This notebook successfully:

1. **Connected to the OpenAlex API** with robust caching and rate limiting
2. **Enriched paper metadata** with citation counts, referenced works, and venue information
3. **Analyzed citation patterns** across time, clusters, and journals
4. **Built reference networks** identifying internal citations within the corpus
5. **Examined open access status** and publication venue consistency
6. **Optimized API usage** with intelligent caching to avoid redundant requests

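**Key Results:** (Markdown cells do not interpolate Python expressions; the actual values are printed by the final code cell above and saved to `enrichment_summary.json` in the data directory.)
- Papers enriched with OpenAlex data: `enrichment_summary['papers_enriched']`
- Total citations collected: `enrichment_summary['total_citations']`
- Internal citation links identified: `enrichment_summary['internal_citations']`
- Enrichment rate: `enrichment_summary['enrichment_rate']` (stored as a fraction; multiply by 100 for a percentage)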

**Next Steps:**
- Proceed to `04_networks_and_backbones.ipynb` for citation network analysis
- Use enriched data for RPYS analysis and main path detection
- Generate comprehensive dialog cards with citation-based insights