# 01 - BibTeX Parsing and Data Preparation

This notebook parses the BibTeX file and prepares the dataset for analysis.

**Input:** `refs_2016_2025_AMR_MISQ_ORSC_ISR.bib`  
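**Output:** Cleaned DataFrame with paper metadata, saved under `../data/` as `parsed_papers_full.csv` and `parsed_papers_analysis.csv`, plus corpus statistics in `corpus_stats.json`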

In [None]:
import json
import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Make the project's src/ directory importable.
# The membership check keeps sys.path free of duplicate entries when this
# cell is re-run in the same kernel.
SRC_DIR = '../src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from parse_bib import load_bib, get_corpus_stats

# Set style.
# matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*"; older
# releases only know "seaborn". Use the first name this installation offers
# and fall back to matplotlib's default style instead of failing.
PREFERRED_STYLES = ('seaborn-v0_8', 'seaborn')
plot_style = next(
    (name for name in PREFERRED_STYLES if name in plt.style.available),
    'default',
)
plt.style.use(plot_style)
sns.set_palette("husl")

print("Libraries loaded successfully!")

## Load and Parse BibTeX File

In [None]:
# Location of the BibTeX export, relative to this notebook
bib_path = "../refs_2016_2025_AMR_MISQ_ORSC_ISR.bib"
print(f"Loading BibTeX file: {bib_path}")

# Parse the file into a DataFrame using the project's loader
df = load_bib(bib_path)

# Report how many records were parsed and which columns are available
print(
    f"\nLoaded {len(df)} papers",
    f"Columns: {list(df.columns)}",
    sep="\n",
)

## Dataset Overview

In [None]:
# Corpus-level summary computed by the project's helper
stats = get_corpus_stats(df)

print("=== Dataset Statistics ===")
# These two entries are left out of the flat key/value listing below
skipped_keys = ('journals', 'papers_per_year')
for stat_name, stat_value in stats.items():
    if stat_name in skipped_keys:
        continue
    print(f"{stat_name}: {stat_value}")

In [None]:
# Preview the first rows, restricted to the core bibliographic columns
print("=== Sample Records ===")
preview_columns = ['title', 'authors', 'year', 'journal', 'doi']
display(df[preview_columns].head())

## Data Quality Analysis

In [None]:
# Per-column missingness: absolute count and share of all rows
missing_data = df.isna().sum()
missing_pct = missing_data.div(len(df)).mul(100)

# One row per column, most incomplete columns first
missing_df = (
    missing_data
    .to_frame('Missing Count')
    .assign(**{'Missing %': missing_pct})
    .sort_values('Missing %', ascending=False)
)

print("=== Missing Data Analysis ===")
display(missing_df)

In [None]:
# Character-length profile of the 'text' column (skipped if the column is absent)
if 'text' in df.columns:
    # Adds a per-paper character count as a new column on df itself
    df['text_length'] = df['text'].str.len()

    print("=== Text Length Statistics ===")
    print(df['text_length'].describe())

    # Histogram of text lengths; rows without a length are left out
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(df['text_length'].dropna(), bins=50, alpha=0.7, edgecolor='black')
    ax.set_xlabel('Text Length (characters)')
    ax.set_ylabel('Frequency')
    ax.set_title('Distribution of Text Length (Title + Abstract)')
    ax.grid(True, alpha=0.3)
    plt.show()

## Temporal Analysis

In [None]:
# Yearly publication volume (skipped if the 'year' column is absent)
if 'year' in df.columns:
    # Papers per year, ordered chronologically
    year_counts = df['year'].value_counts().sort_index()

    fig, ax = plt.subplots(figsize=(12, 6))
    year_counts.plot.bar(ax=ax, alpha=0.8)
    ax.set_xlabel('Year')
    ax.set_ylabel('Number of Papers')
    ax.set_title('Publications per Year')
    # Tilt the year labels so they stay readable
    for tick_label in ax.get_xticklabels():
        tick_label.set_rotation(45)
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

    peak_year = year_counts.idxmax()
    peak_count = year_counts.max()
    print(f"Peak year: {peak_year} ({peak_count} papers)")
    print(f"Total papers: {year_counts.sum()}")

## Journal Analysis

In [None]:
# Paper counts per journal (skipped if the 'journal' column is absent)
if 'journal' in df.columns:
    # Full ranking is kept in journal_counts; the heatmap cell reads it too
    journal_counts = df['journal'].value_counts()
    top_ten = journal_counts.head(10)

    print("=== Journal Distribution ===")
    display(top_ten)

    # Bar chart of the ten most frequent journals
    fig, ax = plt.subplots(figsize=(12, 6))
    top_ten.plot.bar(ax=ax, alpha=0.8)
    ax.set_xlabel('Journal')
    ax.set_ylabel('Number of Papers')
    ax.set_title('Top 10 Journals by Paper Count')
    # Rotate and right-align the journal-name tick labels
    for tick_label in ax.get_xticklabels():
        tick_label.set_rotation(45)
        tick_label.set_ha('right')
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

In [None]:
# Journal x year publication matrix, drawn as an annotated heatmap
if all(column in df.columns for column in ('journal', 'year')):
    # Restrict to the four most frequent journals (journal_counts comes from
    # the journal distribution cell)
    top_journals = journal_counts.head(4).index
    in_top_journals = df['journal'].isin(top_journals)

    # Rows: journal, columns: year, values: paper counts (0 where none)
    journal_year = (
        df[in_top_journals]
        .groupby(['journal', 'year'])
        .size()
        .unstack(fill_value=0)
    )

    fig, ax = plt.subplots(figsize=(14, 6))
    sns.heatmap(
        journal_year,
        annot=True,
        fmt='d',
        cmap='YlOrRd',
        cbar_kws={'label': 'Number of Papers'},
        ax=ax,
    )
    ax.set_xlabel('Year')
    ax.set_ylabel('Journal')
    ax.set_title('Publications by Journal and Year (Top 4 Journals)')
    fig.tight_layout()
    plt.show()

## Citation Analysis

In [None]:
# Reference-count profile (skipped if the 'references_count' column is absent)
if 'references_count' in df.columns:
    reference_counts = df['references_count']
    ref_stats = reference_counts.describe()

    print("=== Reference Count Statistics ===")
    print(ref_stats)

    # Histogram of references per paper; missing counts are left out
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.hist(reference_counts.dropna(), bins=50, alpha=0.7, edgecolor='black')
    ax.set_xlabel('Number of References')
    ax.set_ylabel('Frequency')
    ax.set_title('Distribution of Reference Counts')
    ax.grid(True, alpha=0.3)
    plt.show()

In [None]:
# Cited DOI analysis (skipped if the 'cited_doi' column is absent)
if 'cited_doi' in df.columns:
    # Only the non-missing entries are relevant for coverage and sampling
    cited_dois = df['cited_doi'].dropna()
    papers_with_citations = len(cited_dois)
    # Guard the share against an empty corpus
    citation_coverage = papers_with_citations / len(df) * 100 if len(df) else 0.0

    print(f"Papers with cited DOIs: {papers_with_citations} ({citation_coverage:.1f}%)")

    if cited_dois.empty:
        # The column exists but holds no values, so there is nothing to sample
        print("\nNo cited DOIs available to sample.")
    else:
        # Show the first available entry, cut to 200 elements for readability
        # NOTE(review): the slice + "..." assumes entries are strings - confirm
        # against the parser's output type.
        sample_citations = cited_dois.iloc[0]
        print("\nSample cited DOIs (first paper):")
        print(sample_citations[:200] + "..." if len(sample_citations) > 200 else sample_citations)

## Data Preparation for Analysis

In [None]:
# Filter and prepare data for embedding/clustering.
# A paper is kept when its 'text' value is present and longer than
# MIN_TEXT_CHARS characters.
MIN_TEXT_CHARS = 10

if 'text' not in df.columns:
    # Everything below depends on this column, so fail with a clear message
    raise KeyError("Column 'text' is required to build the analysis dataset")

has_usable_text = df['text'].notna() & (df['text'].str.len() > MIN_TEXT_CHARS)
analysis_df = df[has_usable_text].copy()

# Guard the share against an empty corpus
total_papers = len(df)
kept_share = len(analysis_df) / total_papers * 100 if total_papers else 0.0
print(f"Papers suitable for analysis: {len(analysis_df)} / {total_papers} ({kept_share:.1f}%)")

# Add text statistics.
# 'abstract' is treated as optional: without the column, no paper is flagged.
if 'abstract' in analysis_df.columns:
    analysis_df['has_abstract'] = analysis_df['abstract'].notna()
else:
    analysis_df['has_abstract'] = False
# Whitespace-delimited token count of the text field
analysis_df['word_count'] = analysis_df['text'].str.split().str.len()

abstract_count = int(analysis_df['has_abstract'].sum())
# mean() of an empty column is NaN; report 0.0% in that case
abstract_share = analysis_df['has_abstract'].mean() * 100 if len(analysis_df) else 0.0
print(f"Papers with abstracts: {abstract_count} ({abstract_share:.1f}%)")
print(f"Mean word count: {analysis_df['word_count'].mean():.1f}")
print(f"Median word count: {analysis_df['word_count'].median():.1f}")

## Save Processed Data

In [None]:
# Create the output directory if it doesn't exist.
# parents=True also creates missing intermediate directories.
data_dir = Path('../data')
data_dir.mkdir(parents=True, exist_ok=True)

# Save full dataset (every parsed record)
df.to_csv(data_dir / 'parsed_papers_full.csv', index=False)
print(f"Saved full dataset: {len(df)} papers")

# Save analysis-ready dataset (records that passed the text filter)
analysis_df.to_csv(data_dir / 'parsed_papers_analysis.csv', index=False)
print(f"Saved analysis dataset: {len(analysis_df)} papers")

# Save statistics as JSON.
# default=str stringifies any value json cannot serialise natively; the
# explicit encoding keeps the file independent of the platform locale.
with open(data_dir / 'corpus_stats.json', 'w', encoding='utf-8') as f:
    json.dump(stats, f, indent=2, default=str)
print("Saved corpus statistics")

## Summary

This notebook successfully:

1. **Parsed the BibTeX file** with robust handling of encoding and LaTeX formatting
2. **Analyzed data quality** including missing values and text lengths
3. **Explored temporal patterns** showing publication trends over time
4. **Examined journal distribution** across the four target journals
5. **Analyzed citation data** including reference counts and cited DOIs
6. **Prepared clean datasets** for downstream analysis

**Next Steps:**
- Proceed to `02_embed_cluster.ipynb` for embedding generation and clustering
- The analysis-ready dataset (`../data/parsed_papers_analysis.csv`) contains only papers whose text is longer than 10 characters, making them suitable for text analysis
- Citation data is available for network analysis in later notebooks