# Bibtex Analyzer - Interactive Demo

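This notebook demonstrates the interactive features of the Bibtex Analyzer: upload a `.bib` file, generate tags from a sample of the entries' abstracts (requires an OpenAI API key), and view the assigned tags as a word cloud.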

In [None]:
# Install required packages if not already installed
!pip install bibtexparser openai plotly matplotlib numpy ipykernel

In [5]:
# Dependencies

import os
import random
import tempfile
import traceback
from collections import Counter
from pathlib import Path
from typing import Optional
from typing import List, Dict, Any

import ipywidgets as widgets
import matplotlib.pyplot as plt
import openai  # Add this line for OpenAI integration
import pandas as pd
from dotenv import load_dotenv
from IPython.display import display, HTML
from wordcloud import WordCloud

# Read key/value pairs from a local .env file into the process environment
load_dotenv()

# Tell the reader up front whether the OpenAI credential is available
if os.getenv("OPENAI_API_KEY"):
    print("✅ OpenAI API key found")
else:
    # Missing or empty key: explain how to provide one
    print("⚠️ Warning: OPENAI_API_KEY not found in environment variables")
    print("Please create a .env file with your OpenAI API key like this:")
    print("OPENAI_API_KEY=your-api-key-here")

# Import bibtex analyzer components
from bibtex_analyzer import process_bibtex_file, TagGenerator
from bibtex_analyzer.visualization import create_tag_network

# Updated word cloud function
def _split_tags(cell):
    """Return the cleaned, lower-cased tags stored in one cell of the tag column."""
    if cell is None:
        # A missing value must not be stringified into the bogus tag "none"
        return []
    if isinstance(cell, (list, tuple, set, frozenset)):
        # Already a collection of tags: use the items instead of splitting its repr
        raw_tags = cell
    else:
        # Comma-separated string (NaN becomes "nan" here and is dropped below)
        raw_tags = str(cell).split(",")

    cleaned = (str(tag).strip().lower() for tag in raw_tags)
    return [tag for tag in cleaned if tag and tag != "nan"]


def create_wordcloud_visualization(
    data: pd.DataFrame,
    tag_column: str = "tags",
    width: int = 1000,
    height: int = 800,
    max_words: int = 100,
    background_color: str = "white",
    colormap: str = "viridis",
    **kwargs: Any
) -> None:
    """
    Create a word cloud visualization using the wordcloud package.

    Each cell of ``tag_column`` may be a comma-separated string or a
    list/tuple/set of tags. Tags are stripped and lower-cased before they are
    counted; empty tags, ``None`` cells and NaN cells are ignored.

    Args:
        data: DataFrame containing the data
        tag_column: Name of the column containing tags
        width: Width of the output figure in pixels
        height: Height of the output figure in pixels
        max_words: Maximum number of words to include
        background_color: Background color of the word cloud
        colormap: Matplotlib colormap to use
        **kwargs: Additional arguments passed to WordCloud

    Returns:
        None. The figure is shown with matplotlib. Any failure (including a
        missing ``tag_column``) is reported with a printed message instead of
        being raised.
    """
    try:
        # Count how often each tag occurs across all rows
        tag_counts = Counter()
        for cell in data[tag_column]:
            tag_counts.update(_split_tags(cell))

        if not tag_counts:
            print("⚠️ No valid tags found for word cloud")
            return

        # Create word cloud
        wordcloud = WordCloud(
            width=width,
            height=height,
            background_color=background_color,
            max_words=max_words,
            colormap=colormap,
            **kwargs
        ).generate_from_frequencies(dict(tag_counts))

        # Display the generated image; at dpi=100 the figure is width x height pixels
        plt.figure(figsize=(width / 100, height / 100), dpi=100)
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis("off")
        plt.tight_layout(pad=0)
        plt.show()

    except Exception as e:
        # Best-effort visualization: report the problem without stopping the notebook
        print(f"⚠️ Error generating word cloud: {str(e)}")

✅ OpenAI API key found


In [7]:
# Input controls: a single-file .bib uploader, the trigger button, and an output area
file_upload = widgets.FileUpload(
    accept='.bib',
    multiple=False,
    description="Upload .bib file",
)
generate_btn = widgets.Button(description="Generate Analysis")
output = widgets.Output()

# Render the controls stacked vertically: heading, labelled upload row, button, output
display(
    widgets.VBox(
        children=[
            widgets.HTML("<h3>Bibtex Analyzer</h3>"),
            widgets.HBox(children=[widgets.Label("Upload BibTeX file:"), file_upload]),
            generate_btn,
            output,
        ]
    )
)

VBox(children=(HTML(value='<h3>Bibtex Analyzer</h3>'), HBox(children=(Label(value='Upload BibTeX file:'), File…

In [8]:
def _first_uploaded_content(upload_value):
    """
    Return the raw content of the first file held by a FileUpload widget value.

    ipywidgets 8 exposes the uploads as a sequence of dicts, while
    ipywidgets 7 exposes a dict keyed by file name. Both layouts keep the
    payload under the 'content' key.
    """
    if isinstance(upload_value, dict):
        file_info = next(iter(upload_value.values()))
    else:
        file_info = upload_value[0]
    return file_info['content']


def _tag_sample_and_plot(entries_with_abstracts, sample_size=10, min_entries=3, seed=42):
    """
    Generate tags from one random sample, tag a second random sample, and plot the tags.

    Args:
        entries_with_abstracts: Entry dicts that all carry an abstract.
        sample_size: Upper bound on the number of entries drawn for each sample.
        min_entries: Minimum number of entries required to attempt tag generation.
        seed: Seed of the private random generator, so repeated runs pick the same samples.
    """
    sample_count = min(sample_size, len(entries_with_abstracts))
    if sample_count < min_entries:
        print(f"❌ Need at least {min_entries} entries with abstracts to generate meaningful tags")
        return

    # A private generator keeps the run reproducible without reseeding the global RNG
    rng = random.Random(seed)
    tag_entries = rng.sample(entries_with_abstracts, sample_count)
    entries_to_tag = rng.sample(entries_with_abstracts, sample_count)

    print(f"\n🔍 Analyzing {sample_count} entries to generate tags...")
    tagger = TagGenerator()

    try:
        # Generate tags from the first sample, passing the full entry dicts
        print("Generating tags from abstracts...")
        tags = tagger.generate_tags_for_abstracts(tag_entries)

        if not tags:
            print("⚠️ No tags were generated. The abstracts might be too short or not in English.")
            return

        print(f"🎯 Generated {len(tags)} unique tags")
        print("🏷️  Tags:", ", ".join(tags))

        # Tag the second sample one entry at a time so each result can be reported
        print(f"\n🏷️  Tagging {sample_count} random entries...\n")
        all_tagged_entries = []
        for i, entry in enumerate(entries_to_tag, 1):
            entry_tags = tagger.assign_tags_to_abstracts([entry], list(tags))
            print(f"📄 Entry {i}:")
            print(f"   Title: {entry.get('title', 'No title')}")
            print(f"   Year: {entry.get('year', 'N/A')}")
            print(f"   Tags: {entry_tags[0].get('tags', 'No tags')}\n")
            all_tagged_entries.append(entry_tags[0])

        # Generate word cloud if we have tagged entries
        if all_tagged_entries:
            print("\n🌐 Generating word cloud of tags...")
            try:
                # Create a DataFrame for visualization
                df = pd.DataFrame(all_tagged_entries)
                if not df.empty and 'tags' in df.columns and not df['tags'].isna().all():
                    create_wordcloud_visualization(
                        df,
                        width=1200,
                        height=800,
                        max_words=100,
                        background_color='white',
                        colormap='viridis',
                        prefer_horizontal=0.9,
                        scale=2,
                        min_font_size=10,
                        max_font_size=120
                    )
                else:
                    print("⚠️ Not enough data to generate word cloud")
            except Exception as e:
                print(f"⚠️ Could not generate word cloud: {str(e)}")

    except Exception as e:
        print(f"⚠️ Error during tagging: {str(e)}")
        traceback.print_exc()


def on_generate_clicked(b):
    """
    Button handler: analyze the uploaded .bib file and show the results in `output`.

    The upload is written to a private temporary directory, parsed with
    `process_bibtex_file`, filtered to entries that have an abstract, and then
    handed to the tagging and word cloud step. All progress and error messages
    are printed into the `output` widget; nothing is raised to the caller.

    Args:
        b: Argument supplied by the button's click callback (unused).
    """
    with output:
        output.clear_output()

        if not file_upload.value:
            print("Please upload a .bib file first")
            return

        try:
            content = _first_uploaded_content(file_upload.value)

            # A private temporary directory avoids clobbering a file in the working
            # directory and is removed on exit even when processing fails or returns early.
            with tempfile.TemporaryDirectory() as temp_dir:
                temp_file = os.path.join(temp_dir, "upload.bib")
                with open(temp_file, 'wb') as f:
                    f.write(content)

                try:
                    # Process the BibTeX file
                    print("📚 Processing BibTeX file...")
                    entries = process_bibtex_file(temp_file)

                    # Filter out entries without abstracts
                    entries_with_abstracts = [e for e in entries if e.get('abstract')]
                    skipped = len(entries) - len(entries_with_abstracts)
                    if skipped > 0:
                        print(f"ℹ️ {skipped} entries skipped (no abstract)")

                    if not entries_with_abstracts:
                        print("❌ No entries with abstracts found. Please upload a .bib file with entries containing abstracts.")
                        return

                    print(f"✅ Found {len(entries_with_abstracts)} entries with abstracts")

                    _tag_sample_and_plot(entries_with_abstracts)

                except Exception as e:
                    print(f"❌ An error occurred: {str(e)}")
                    traceback.print_exc()

        except Exception as e:
            print(f"❌ Error handling file upload: {str(e)}")
            traceback.print_exc()

# Run the analysis whenever the "Generate Analysis" button is clicked.
# NOTE(review): re-running this cell redefines the handler; the previously registered
# function may stay attached to the button - confirm against ipywidgets' on_click semantics.
generate_btn.on_click(callback=on_generate_clicked)