In [None]:
import os
import json
import networkx as nx
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from collections import defaultdict
import itertools
import re

KNOWN_DOMAINS = {
    # STEM Core
    "Medicine", "Biology", "Chemistry", "Physics", "Mathematics", "Computer Science",
    "Engineering", "Earth Science", "Environmental Science", "Materials Science",
    "Astronomy", "Astrophysics", "Geology", "Meteorology", "Oceanography",
    
    # Life Sciences
    "Genetics", "Molecular Biology", "Cell Biology", "Biochemistry", "Microbiology",
    "Immunology", "Pharmacology", "Pathology", "Physiology", "Anatomy",
    "Botany", "Zoology", "Ecology", "Evolutionary Biology", "Marine Biology",
    
    # Medical & Health
    "Public Health", "Epidemiology", "Clinical Medicine", "Surgery", "Psychiatry",
    "Neurology", "Cardiology", "Oncology", "Pediatrics", "Geriatrics",
    "Radiology", "Anesthesiology", "Emergency Medicine", "Family Medicine",
    
    # Technology & Computing
    "Artificial Intelligence", "Machine Learning", "Data Science", "Robotics",
    "Software Engineering", "Cybersecurity", "Human-Computer Interaction",
    "Information Systems", "Telecommunications", "Biotechnology",
    "Nanotechnology", "Quantum Computing", "Bioinformatics",
    
    # Engineering Disciplines
    "Mechanical Engineering", "Electrical Engineering", "Civil Engineering",
    "Chemical Engineering", "Aerospace Engineering", "Biomedical Engineering",
    "Environmental Engineering", "Industrial Engineering", "Nuclear Engineering",
    
    # Social Sciences
    "Psychology", "Sociology", "Political Science", "Economics", "Anthropology",
    "Education", "Criminology", "Social Work", "International Relations",
    "Public Policy", "Urban Planning", "Communication",
    
    # Humanities & Arts
    "History", "Philosophy", "Literature", "Art", "Music", "Theater",
    "Linguistics", "Languages", "Cultural Studies", "Religious Studies",
    "Media Studies", "Film Studies",
    
    # Business & Management
    "Business", "Management", "Marketing", "Finance", "Accounting",
    "Operations Research", "Supply Chain", "Entrepreneurship",
    
    # Interdisciplinary
    "Neuroscience", "Cognitive Science", "Environmental Studies", 
    "Climate Science", "Sustainability", "Gender Studies", "Area Studies",
    "Science and Technology Studies", "Bioethics", "Digital Humanities",
    
    # Geography & Earth
    "Geography", "Cartography", "Geographic Information Systems", "Remote Sensing",
    
    # Law & Policy
    "Law", "Legal Studies", "Constitutional Law", "International Law",
    
    # Applied Sciences
    "Agriculture", "Forestry", "Veterinary Science", "Food Science",
    "Sports Science", "Nutrition", "Architecture", "Design"
}

# Convert to set for O(1) lookup
DOMAIN_SET = set(KNOWN_DOMAINS)

def create_domain_keywords():
    """Create keyword mappings to domains for faster matching"""
    keyword_to_domain = {}
    
    for domain in DOMAIN_SET:
        keyword_to_domain[domain.lower()] = domain
        
        words = domain.lower().split()
        for word in words:
            if len(word) > 3:  # Only meaningful words
                keyword_to_domain[word] = domain
    
    # Add specific keyword mappings
    additional_mappings = {
        # Technology
        "ai": "Artificial Intelligence", "ml": "Machine Learning",
        "deep learning": "Machine Learning", "neural network": "Machine Learning",
        "algorithm": "Computer Science", "programming": "Computer Science",
        "software": "Software Engineering", "hardware": "Engineering",
        "database": "Computer Science", "network": "Telecommunications",
        
        # Medicine
        "medical": "Medicine", "clinical": "Clinical Medicine", "patient": "Medicine",
        "treatment": "Medicine", "diagnosis": "Medicine", "therapy": "Medicine",
        "disease": "Medicine", "health": "Public Health", "healthcare": "Medicine",
        "hospital": "Medicine", "nursing": "Medicine", "pharmaceutical": "Pharmacology",
        
        # Biology
        "cell": "Cell Biology", "gene": "Genetics", "dna": "Genetics", "rna": "Genetics",
        "protein": "Biochemistry", "enzyme": "Biochemistry", "organism": "Biology",
        "species": "Biology", "evolution": "Evolutionary Biology", "genome": "Genetics",
        
        # Physics/Chemistry
        "quantum": "Physics", "particle": "Physics", "energy": "Physics",
        "molecule": "Chemistry", "reaction": "Chemistry", "catalyst": "Chemistry",
        "material": "Materials Science", "crystal": "Materials Science",
        
        # Social Sciences
        "social": "Sociology", "society": "Sociology", "culture": "Anthropology",
        "behavior": "Psychology", "psychological": "Psychology", "cognitive": "Psychology",
        "economic": "Economics", "political": "Political Science", "policy": "Public Policy",
        "education": "Education", "learning": "Education", "teaching": "Education",
        
        # Environment
        "climate": "Climate Science", "environment": "Environmental Science",
        "sustainability": "Sustainability", "ecology": "Ecology", "conservation": "Environmental Science",
        "pollution": "Environmental Science", "renewable": "Environmental Science",
        
        # Business
        "business": "Business", "management": "Management", "marketing": "Marketing",
        "finance": "Finance", "economic": "Economics", "market": "Economics",
        
        # Geography
        "geographic": "Geography", "spatial": "Geography", "mapping": "Geography",
        "urban": "Urban Planning", "city": "Urban Planning"
    }
    
    keyword_to_domain.update(additional_mappings)
    return keyword_to_domain

def extract_domains_from_article_fast(article, keyword_to_domain):
    """
    Fast domain extraction using keyword matching instead of fuzzy string matching
    """
    found_domains = set()
    
    # Collect all text to search
    text_sources = []
    text_sources.extend(article.get("domains", []))
    text_sources.extend(article.get("fields", []))
    text_sources.extend(article.get("keywords", []))
    
    # Also check title for domain keywords
    title = article.get("title", "")
    if title:
        text_sources.append(title)
    
    # Search for domain matches
    for text in text_sources:
        if not text:
            continue
            
        text_lower = text.lower()
        
        # Direct domain match
        if text_lower in keyword_to_domain:
            found_domains.add(keyword_to_domain[text_lower])
            continue
        
        # Check if any keyword is contained in the text
        for keyword, domain in keyword_to_domain.items():
            if keyword in text_lower:
                found_domains.add(domain)
    
    return list(found_domains)

def build_domain_graph_fast(year, data_folder="articles_{year}_new"):
    """Build domain graph with improved performance and statistics"""
    file_path = os.path.join(data_folder.format(year=year), "all_articles_enhanced.jsonl")
    if not os.path.exists(file_path):
        return None, {}

    keyword_to_domain = create_domain_keywords()
    G = nx.Graph()
    domain_pairs_counter = defaultdict(int)
    domain_article_count = defaultdict(int)
    total_articles = 0
    multidisciplinary_articles = 0

    
    with open(file_path, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            if line_num % 1000 == 0:
                
            try:
                article = json.loads(line)
                total_articles += 1
                
                domains = extract_domains_from_article_fast(article, keyword_to_domain)
                
                for domain in domains:
                    domain_article_count[domain] += 1
                
                if len(domains) > 1:
                    multidisciplinary_articles += 1
                    # Add all possible pairs
                    for d1, d2 in itertools.combinations(sorted(domains), 2):
                        domain_pairs_counter[(d1, d2)] += 1
                        
            except json.JSONDecodeError:
                continue

    for (d1, d2), weight in domain_pairs_counter.items():
        G.add_edge(d1, d2, weight=weight)

    for domain in domain_article_count:
        if domain not in G.nodes():
            G.add_node(domain)

    stats = {
        "total_articles": total_articles,
        "multidisciplinary_articles": multidisciplinary_articles,
        "total_domains": len(domain_article_count),
        "connected_domains": len([n for n in G.nodes() if G.degree(n) > 0]),
        "total_connections": len(G.edges()),
        "top_domains": sorted(domain_article_count.items(), key=lambda x: x[1], reverse=True)[:10],
        "strongest_connections": sorted(domain_pairs_counter.items(), key=lambda x: x[1], reverse=True)[:10]
    }


    return G, stats

def save_graph_enhanced(G, year, stats, output_dir="domain_graphs"):
    """Save graph with enhanced visualizations and statistics"""
    os.makedirs(output_dir, exist_ok=True)

    if len(G.nodes()) == 0:
        print(f"⚠ No nodes to visualize for {year}")
        return

    # Filter graph to show only meaningful connections
    min_weight = max(1, len(G.edges()) // 100) if len(G.edges()) > 50 else 1
    G_filtered = nx.Graph()
    
    for u, v, data in G.edges(data=True):
        if data['weight'] >= min_weight:
            G_filtered.add_edge(u, v, weight=data['weight'])
    
    # If filtered graph is too small, use original
    if len(G_filtered.nodes()) < 10:
        G_filtered = G

    pos = nx.spring_layout(G_filtered, seed=42, k=1, iterations=50)

    # === MATPLOTLIB VISUALIZATION (JPG) ===
    plt.figure(figsize=(16, 12))
    
    # Calculate node sizes based on degree
    node_sizes = [G_filtered.degree(n) * 100 + 300 for n in G_filtered.nodes()]
    
    # Calculate edge widths
    edge_weights = [G_filtered[u][v]['weight'] for u, v in G_filtered.edges()]
    max_weight = max(edge_weights) if edge_weights else 1
    edge_widths = [w/max_weight * 5 + 0.5 for w in edge_weights]

    # Draw the graph
    nx.draw_networkx_edges(G_filtered, pos, width=edge_widths, edge_color="lightgray", alpha=0.6)
    nx.draw_networkx_nodes(G_filtered, pos, node_size=node_sizes, 
                          node_color=range(len(G_filtered.nodes())), 
                          cmap=plt.cm.Set3, alpha=0.8)
    nx.draw_networkx_labels(G_filtered, pos, font_size=8, font_weight='bold')
    
    plt.title(f"Domain Network {year}\n{stats['total_articles']} articles, {stats['total_domains']} domains, {stats['total_connections']} connections", 
              fontsize=16, pad=20)
    plt.axis('off')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f"domains_{year}.jpg"), dpi=300, bbox_inches='tight')
    plt.close()

    # === PLOTLY VISUALIZATION (HTML) ===
    # Prepare edge traces
    edge_x = []
    edge_y = []
    edge_info = []
    
    for u, v in G_filtered.edges():
        x0, y0 = pos[u]
        x1, y1 = pos[v]
        edge_x += [x0, x1, None]
        edge_y += [y0, y1, None]
        weight = G_filtered[u][v]['weight']
        edge_info.append(f"{u} ↔ {v}<br>Collaborations: {weight}")

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.8, color='rgba(125, 125, 125, 0.5)'),
        hoverinfo='none',
        mode='lines'
    )

    # Prepare node traces
    node_x = []
    node_y = []
    node_text = []
    node_info = []
    node_sizes = []
    node_colors = []
    
    for i, node in enumerate(G_filtered.nodes()):
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        
        # Calculate metrics for this node
        degree = G_filtered.degree(node)
        neighbors = list(G_filtered.neighbors(node))
        
        # Node size based on degree
        size = min(degree * 8 + 15, 60)  # Cap maximum size
        node_sizes.append(size)
        node_colors.append(degree)
        
        # Node label
        node_text.append(node)
        
        # Hover info
        neighbor_text = "<br>".join([f"• {n}" for n in neighbors[:10]])  # Show max 10 neighbors
        if len(neighbors) > 10:
            neighbor_text += f"<br>... and {len(neighbors)-10} more"
            
        hover_text = f"<b>{node}</b><br>" \
                    f"Connections: {degree}<br>" \
                    f"Connected to:<br>{neighbor_text}"
        node_info.append(hover_text)

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        text=node_text,
        textposition="middle center",
        textfont=dict(size=10, color="white"),
        hovertext=node_info,
        hoverinfo='text',
        marker=dict(
            size=node_sizes,
            color=node_colors,
            # colorscale='Viridis',
            showscale=True,
            colorbar=dict(
                title="Number of<br>Connections",
                tickmode="linear",
                thickness=15
            ),
            line=dict(width=2, color='white')
        )
    )

    # Create the figure
    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        title=dict(
                            text=f"<b>Domain Collaboration Network {year}</b><br>" +
                                 f"<i>{stats['total_articles']:,} articles • {stats['total_domains']} domains • " +
                                 f"{stats['total_connections']} connections • {stats['multidisciplinary_articles']:,} multidisciplinary</i>",
                            x=0.5,
                            font=dict(size=18)
                        ),
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=20, l=5, r=5, t=100),
                        annotations=[
                            dict(
                                text="Node size = number of connections<br>" +
                                     "Edge thickness = collaboration strength<br>" +
                                     "Hover over nodes for details",
                                showarrow=False,
                                xref="paper", yref="paper",
                                x=0.005, y=-0.002,
                                xanchor='left', yanchor='bottom',
                                font=dict(size=12, color="gray")
                            )
                        ],
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        plot_bgcolor='white',
                        paper_bgcolor='white'
                    ))

    # Save HTML
    html_file = os.path.join(output_dir, f"domains_{year}.html")
    fig.write_html(html_file)

    # Save statistics
    with open(os.path.join(output_dir, f"stats_{year}.json"), "w") as f:
        json.dump(stats, f, indent=2)


# ========= Main execution =========
def main():
    years = [2018, 2019, 2020, 2021, 2022, 2023, 2024]
    
    for year in years:
        print(f"\n🔄 Processing {year}...")
        G, stats = build_domain_graph_fast(year)
        
        if G and len(G.nodes()) > 0:
            save_graph_enhanced(G, year, stats)
        else:
            print(f"⚠ No valid graph generated for {year}")

if __name__ == "__main__":
    main()

In [None]:
## FIX


In [15]:
import os
import json
import networkx as nx
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from collections import defaultdict
import itertools
import re

# ========= Expanded domain list with better categorization =========
KNOWN_DOMAINS = {
    # STEM Core
    "Medicine", "Biology", "Chemistry", "Physics", "Mathematics", "Computer Science",
    "Engineering", "Earth Science", "Environmental Science", "Materials Science",
    "Astronomy", "Astrophysics", "Geology", "Meteorology", "Oceanography",
    
    # Life Sciences
    "Genetics", "Molecular Biology", "Cell Biology", "Biochemistry", "Microbiology",
    "Immunology", "Pharmacology", "Pathology", "Physiology", "Anatomy",
    "Botany", "Zoology", "Ecology", "Evolutionary Biology", "Marine Biology",
    
    # Medical & Health
    "Public Health", "Epidemiology", "Clinical Medicine", "Surgery", "Psychiatry",
    "Neurology", "Cardiology", "Oncology", "Pediatrics", "Geriatrics",
    "Radiology", "Anesthesiology", "Emergency Medicine", "Family Medicine",
    
    # Technology & Computing
    "Artificial Intelligence", "Machine Learning", "Data Science", "Robotics",
    "Software Engineering", "Cybersecurity", "Human-Computer Interaction",
    "Information Systems", "Telecommunications", "Biotechnology",
    "Nanotechnology", "Quantum Computing", "Bioinformatics",
    
    # Engineering Disciplines
    "Mechanical Engineering", "Electrical Engineering", "Civil Engineering",
    "Chemical Engineering", "Aerospace Engineering", "Biomedical Engineering",
    "Environmental Engineering", "Industrial Engineering", "Nuclear Engineering",
    
    # Social Sciences
    "Psychology", "Sociology", "Political Science", "Economics", "Anthropology",
    "Education", "Criminology", "Social Work", "International Relations",
    "Public Policy", "Urban Planning", "Communication",
    
    # Humanities & Arts
    "History", "Philosophy", "Literature", "Art", "Music", "Theater",
    "Linguistics", "Languages", "Cultural Studies", "Religious Studies",
    "Media Studies", "Film Studies",
    
    # Business & Management
    "Business", "Management", "Marketing", "Finance", "Accounting",
    "Operations Research", "Supply Chain", "Entrepreneurship",
    
    # Interdisciplinary
    "Neuroscience", "Cognitive Science", "Environmental Studies", 
    "Climate Science", "Sustainability", "Gender Studies", "Area Studies",
    "Science and Technology Studies", "Bioethics", "Digital Humanities",
    
    # Geography & Earth
    "Geography", "Cartography", "Geographic Information Systems", "Remote Sensing",
    
    # Law & Policy
    "Law", "Legal Studies", "Constitutional Law", "International Law",
    
    # Applied Sciences
    "Agriculture", "Forestry", "Veterinary Science", "Food Science",
    "Sports Science", "Nutrition", "Architecture", "Design"
}

# Convert to set for O(1) lookup
DOMAIN_SET = set(KNOWN_DOMAINS)

# ========= Create keyword mappings for faster lookup =========
def create_domain_keywords():
    """Create keyword mappings to domains for faster matching"""
    keyword_to_domain = {}
    
    # Direct mappings
    for domain in DOMAIN_SET:
        # Add the domain itself
        keyword_to_domain[domain.lower()] = domain
        
        # Add common variations and keywords
        words = domain.lower().split()
        for word in words:
            if len(word) > 3:  # Only meaningful words
                keyword_to_domain[word] = domain
    
    # Add specific keyword mappings
    additional_mappings = {
        # Technology
        "ai": "Artificial Intelligence", "ml": "Machine Learning",
        "deep learning": "Machine Learning", "neural network": "Machine Learning",
        "algorithm": "Computer Science", "programming": "Computer Science",
        "software": "Software Engineering", "hardware": "Engineering",
        "database": "Computer Science", "network": "Telecommunications",
        
        # Medicine
        "medical": "Medicine", "clinical": "Clinical Medicine", "patient": "Medicine",
        "treatment": "Medicine", "diagnosis": "Medicine", "therapy": "Medicine",
        "disease": "Medicine", "health": "Public Health", "healthcare": "Medicine",
        "hospital": "Medicine", "nursing": "Medicine", "pharmaceutical": "Pharmacology",
        
        # Biology
        "cell": "Cell Biology", "gene": "Genetics", "dna": "Genetics", "rna": "Genetics",
        "protein": "Biochemistry", "enzyme": "Biochemistry", "organism": "Biology",
        "species": "Biology", "evolution": "Evolutionary Biology", "genome": "Genetics",
        
        # Physics/Chemistry
        "quantum": "Physics", "particle": "Physics", "energy": "Physics",
        "molecule": "Chemistry", "reaction": "Chemistry", "catalyst": "Chemistry",
        "material": "Materials Science", "crystal": "Materials Science",
        
        # Social Sciences
        "social": "Sociology", "society": "Sociology", "culture": "Anthropology",
        "behavior": "Psychology", "psychological": "Psychology", "cognitive": "Psychology",
        "economic": "Economics", "political": "Political Science", "policy": "Public Policy",
        "education": "Education", "learning": "Education", "teaching": "Education",
        
        # Environment
        "climate": "Climate Science", "environment": "Environmental Science",
        "sustainability": "Sustainability", "ecology": "Ecology", "conservation": "Environmental Science",
        "pollution": "Environmental Science", "renewable": "Environmental Science",
        
        # Business
        "business": "Business", "management": "Management", "marketing": "Marketing",
        "finance": "Finance", "economic": "Economics", "market": "Economics",
        
        # Geography
        "geographic": "Geography", "spatial": "Geography", "mapping": "Geography",
        "urban": "Urban Planning", "city": "Urban Planning"
    }
    
    keyword_to_domain.update(additional_mappings)
    return keyword_to_domain

# ========= Fast domain extraction =========
def extract_domains_from_article_fast(article, keyword_to_domain):
    """
    Fast domain extraction using keyword matching instead of fuzzy string matching
    """
    found_domains = set()
    
    # Collect all text to search
    text_sources = []
    text_sources.extend(article.get("domains", []))
    text_sources.extend(article.get("fields", []))
    text_sources.extend(article.get("keywords", []))
    
    # Also check title for domain keywords
    title = article.get("title", "")
    if title:
        text_sources.append(title)
    
    # Search for domain matches
    for text in text_sources:
        if not text:
            continue
            
        text_lower = text.lower()
        
        # Direct domain match
        if text_lower in keyword_to_domain:
            found_domains.add(keyword_to_domain[text_lower])
            continue
        
        # Check if any keyword is contained in the text
        for keyword, domain in keyword_to_domain.items():
            if keyword in text_lower:
                found_domains.add(domain)
    
    return list(found_domains)

# ========= Enhanced graph building with statistics =========
def build_domain_graph_fast(year, data_folder="articles_{year}_new"):
    """Build domain graph with improved performance and statistics"""
    file_path = os.path.join(data_folder.format(year=year), "all_articles_enhanced.jsonl")
    if not os.path.exists(file_path):
        print(f"⚠ No data for {year}")
        return None, {}

    keyword_to_domain = create_domain_keywords()
    G = nx.Graph()
    domain_pairs_counter = defaultdict(int)
    domain_article_count = defaultdict(int)
    total_articles = 0
    multidisciplinary_articles = 0

    print(f"Processing articles for {year}...")
    
    with open(file_path, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            if line_num % 1000 == 0:
                print(f"  Processed {line_num} articles...")
                
            try:
                article = json.loads(line)
                total_articles += 1
                
                domains = extract_domains_from_article_fast(article, keyword_to_domain)
                
                # Count domain occurrences
                for domain in domains:
                    domain_article_count[domain] += 1
                
                if len(domains) > 1:
                    multidisciplinary_articles += 1
                    # Add all possible pairs
                    for d1, d2 in itertools.combinations(sorted(domains), 2):
                        domain_pairs_counter[(d1, d2)] += 1
                        
            except json.JSONDecodeError:
                print(f"  Warning: Skipped malformed JSON at line {line_num}")
                continue

    # Build graph with edge weights
    for (d1, d2), weight in domain_pairs_counter.items():
        G.add_edge(d1, d2, weight=weight)

    # Add isolated nodes (domains that appear but don't have connections)
    for domain in domain_article_count:
        if domain not in G.nodes():
            G.add_node(domain)

    # Calculate statistics
    stats = {
        "total_articles": total_articles,
        "multidisciplinary_articles": multidisciplinary_articles,
        "total_domains": len(domain_article_count),
        "connected_domains": len([n for n in G.nodes() if G.degree(n) > 0]),
        "total_connections": len(G.edges()),
        "top_domains": sorted(domain_article_count.items(), key=lambda x: x[1], reverse=True)[:10],
        "strongest_connections": sorted(domain_pairs_counter.items(), key=lambda x: x[1], reverse=True)[:10]
    }

    print(f"✅ Graph built for {year}:")
    print(f"   📊 {stats['total_articles']} articles processed")
    print(f"   🔬 {stats['total_domains']} unique domains found")
    print(f"   🔗 {stats['total_connections']} domain connections")
    print(f"   🌐 {stats['multidisciplinary_articles']} multidisciplinary articles")

    return G, stats

# ========= Enhanced visualization =========
def save_graph_enhanced(G, year, stats, output_dir="domain_graphs"):
    """Save graph with enhanced visualizations and statistics"""
    os.makedirs(output_dir, exist_ok=True)

    if len(G.nodes()) == 0:
        print(f"⚠ No nodes to visualize for {year}")
        return

    # Filter graph to show only meaningful connections
    # Calculate better thresholds based on data distribution
    edge_weights = [data['weight'] for u, v, data in G.edges(data=True)]
    if len(edge_weights) > 0:
        # Use 75th percentile as threshold to show only strong connections
        import numpy as np
        threshold_75 = np.percentile(edge_weights, 75)
        threshold_90 = np.percentile(edge_weights, 90)
        
        # For very dense networks, use 90th percentile
        min_weight = threshold_90 if len(G.edges()) > 1000 else threshold_75
        min_weight = max(min_weight, 2)  # At least 2 collaborations
    else:
        min_weight = 1
    
    G_filtered = nx.Graph()
    
    for u, v, data in G.edges(data=True):
        if data['weight'] >= min_weight:
            G_filtered.add_edge(u, v, weight=data['weight'])
    
    # If filtered graph is too small, lower the threshold
    if len(G_filtered.nodes()) < 20:
        min_weight = max(1, min_weight // 2)
        G_filtered = nx.Graph()
        for u, v, data in G.edges(data=True):
            if data['weight'] >= min_weight:
                G_filtered.add_edge(u, v, weight=data['weight'])
    
    print(f"   🎯 Using threshold: {min_weight} (showing {len(G_filtered.edges())}/{len(G.edges())} connections)")

    # Better layout for readability
    pos = nx.spring_layout(G_filtered, seed=42, k=2.0, iterations=100)

    # === MATPLOTLIB VISUALIZATION (JPG) ===
    plt.figure(figsize=(20, 16))
    
    # Calculate node sizes based on degree (bigger for more connections)
    node_sizes = [G_filtered.degree(n) * 150 + 500 for n in G_filtered.nodes()]
    
    # Calculate edge widths based on collaboration strength
    edge_weights = [G_filtered[u][v]['weight'] for u, v in G_filtered.edges()]
    max_weight = max(edge_weights) if edge_weights else 1
    edge_widths = [w/max_weight * 6 + 1 for w in edge_weights]

    # Draw edges first (behind nodes)
    nx.draw_networkx_edges(G_filtered, pos, width=edge_widths, 
                          edge_color="lightgray", alpha=0.4)
    
    # Draw nodes with better colors
    nx.draw_networkx_nodes(G_filtered, pos, node_size=node_sizes, 
                          node_color=[G_filtered.degree(n) for n in G_filtered.nodes()], 
                          cmap=plt.cm.viridis, alpha=0.8, 
                          edgecolors='white', linewidths=2)
    
    # Draw labels with better positioning
    labels = {}
    for node in G_filtered.nodes():
        # Shorten long domain names for readability
        label = node.replace(' and ', ' & ').replace('Science', 'Sci.')
        if len(label) > 15:
            words = label.split()
            if len(words) > 1:
                label = words[0] + '\n' + ' '.join(words[1:])
        labels[node] = label
    
    nx.draw_networkx_labels(G_filtered, pos, labels, font_size=9, 
                           font_weight='bold', font_color='white')
    
    plt.title(f"Domain Collaboration Network {year}\n" +
              f"{stats['total_articles']:,} articles • {len(G_filtered.nodes())} domains shown • " +
              f"Min {min_weight} collaborations per connection", 
              fontsize=18, pad=30)
    plt.axis('off')
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f"domains_{year}.jpg"), 
                dpi=300, bbox_inches='tight', facecolor='white')
    plt.close()

    # === PLOTLY VISUALIZATION (HTML) ===
    # Prepare edge traces
    edge_x = []
    edge_y = []
    edge_info = []
    
    for u, v in G_filtered.edges():
        x0, y0 = pos[u]
        x1, y1 = pos[v]
        edge_x += [x0, x1, None]
        edge_y += [y0, y1, None]
        weight = G_filtered[u][v]['weight']
        edge_info.append(f"{u} ↔ {v}<br>Collaborations: {weight}")

    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        line=dict(width=0.8, color='rgba(125, 125, 125, 0.5)'),
        hoverinfo='none',
        mode='lines'
    )

    # Prepare node traces
    node_x = []
    node_y = []
    node_text = []
    node_info = []
    node_sizes = []
    node_colors = []
    
    for i, node in enumerate(G_filtered.nodes()):
        x, y = pos[node]
        node_x.append(x)
        node_y.append(y)
        
        # Calculate metrics for this node
        degree = G_filtered.degree(node)
        neighbors = list(G_filtered.neighbors(node))
        
        # Node size based on degree (bigger for more connections)
        size = min(degree * 12 + 20, 80)  # Cap maximum size
        node_sizes.append(size)
        node_colors.append(degree)
        
        # Shorter node labels for readability
        short_name = node.replace(' and ', ' & ').replace('Science', 'Sci.')
        if len(short_name) > 12:
            words = short_name.split()
            if len(words) > 1:
                short_name = words[0] + '<br>' + ' '.join(words[1:])
        node_text.append(short_name)
        
        # Enhanced hover info with collaboration strengths
        neighbor_info = []
        for neighbor in neighbors:
            weight = G_filtered[node][neighbor]['weight']
            neighbor_info.append(f"• {neighbor}: {weight} articles")
        
        neighbor_text = "<br>".join(neighbor_info[:8])  # Show max 8 neighbors
        if len(neighbors) > 8:
            neighbor_text += f"<br>... and {len(neighbors)-8} more"
            
        hover_text = f"<b>{node}</b><br>" \
                    f"<b>Total Connections:</b> {degree}<br>" \
                    f"<b>Collaboration Details:</b><br>{neighbor_text}"
        node_info.append(hover_text)

    node_trace = go.Scatter(
        x=node_x, y=node_y,
        mode='markers+text',
        text=node_text,
        textposition="middle center",
        textfont=dict(size=8, color="white", family="Arial Black"),
        hovertext=node_info,
        hoverinfo='text',
        marker=dict(
            size=node_sizes,
            color=node_colors,
            # colorscale='Viridis',
            showscale=True,
            colorbar=dict(
                title=dict(
                    text="Connections",
                    side="right"
                ),
                tickmode="linear",
                thickness=12,
                len=0.7
            ),
            line=dict(width=3, color='white'),
            opacity=0.9
        )
    )

    # Create the figure with better layout
    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(
                        title=dict(
                            text=f"<b>Domain Collaboration Network {year}</b><br>" +
                                 f"<i>{stats['total_articles']:,} articles • {len(G_filtered.nodes())} domains shown • " +
                                 f"Min {min_weight} collaborations • Threshold: {min_weight}+ articles</i>",
                            x=0.5,
                            font=dict(size=16)
                        ),
                        showlegend=False,
                        hovermode='closest',
                        margin=dict(b=40, l=20, r=60, t=120),
                        annotations=[
                            dict(
                                text="<b>Guide:</b> Node size = # connections | Edge thickness = collaboration strength<br>" +
                                     f"Only showing connections with {min_weight}+ shared articles | Hover for details",
                                showarrow=False,
                                xref="paper", yref="paper",
                                x=0.5, y=-0.08,
                                xanchor='center', yanchor='top',
                                font=dict(size=11, color="gray")
                            )
                        ],
                        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                        plot_bgcolor='white',
                        paper_bgcolor='white',
                        width=1200,
                        height=800
                    ))

    # Save HTML
    html_file = os.path.join(output_dir, f"domains_{year}.html")
    fig.write_html(html_file)

    # Save statistics
    with open(os.path.join(output_dir, f"stats_{year}.json"), "w") as f:
        json.dump(stats, f, indent=2)

    print(f"💾 Saved graph and statistics for {year}:")
    print(f"   📊 JPG: domains_{year}.jpg")
    print(f"   🌐 HTML: domains_{year}.html") 
    print(f"   📈 Stats: stats_{year}.json")

# ========= Main execution =========
def main():
    years = [2018, 2019, 2020, 2021, 2022, 2023, 2024]
    
    for year in years:
        print(f"\n🔄 Processing {year}...")
        G, stats = build_domain_graph_fast(year)
        
        if G and len(G.nodes()) > 0:
            save_graph_enhanced(G, year, stats)
        else:
            print(f"⚠ No valid graph generated for {year}")

if __name__ == "__main__":
    main()


🔄 Processing 2018...
Processing articles for 2018...
  Processed 1000 articles...
  Processed 2000 articles...
  Processed 3000 articles...
  Processed 4000 articles...
  Processed 5000 articles...
  Processed 6000 articles...
  Processed 7000 articles...
  Processed 8000 articles...
  Processed 9000 articles...
  Processed 10000 articles...
  Processed 11000 articles...
  Processed 12000 articles...
  Processed 13000 articles...
  Processed 14000 articles...
  Processed 15000 articles...
✅ Graph built for 2018:
   📊 15797 articles processed
   🔬 122 unique domains found
   🔗 5721 domain connections
   🌐 15797 multidisciplinary articles
   🎯 Using threshold: 298.0 (showing 574/5721 connections)
💾 Saved graph and statistics for 2018:
   📊 JPG: domains_2018.jpg
   🌐 HTML: domains_2018.html
   📈 Stats: stats_2018.json

🔄 Processing 2019...
Processing articles for 2019...
  Processed 1000 articles...
  Processed 2000 articles...
  Processed 3000 articles...
  Processed 4000 articles...
  

In [18]:
import os
import json
import networkx as nx
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from collections import defaultdict
import itertools
import numpy as np
from sklearn.cluster import SpectralClustering
import pandas as pd

# ========= Expanded domain list with better categorization =========
KNOWN_DOMAINS = {
    # STEM Core
    "Medicine", "Biology", "Chemistry", "Physics", "Mathematics", "Computer Science",
    "Engineering", "Earth Science", "Environmental Science", "Materials Science",
    "Astronomy", "Astrophysics", "Geology", "Meteorology", "Oceanography",
    
    # Life Sciences
    "Genetics", "Molecular Biology", "Cell Biology", "Biochemistry", "Microbiology",
    "Immunology", "Pharmacology", "Pathology", "Physiology", "Anatomy",
    "Botany", "Zoology", "Ecology", "Evolutionary Biology", "Marine Biology",
    
    # Medical & Health
    "Public Health", "Epidemiology", "Clinical Medicine", "Surgery", "Psychiatry",
    "Neurology", "Cardiology", "Oncology", "Pediatrics", "Geriatrics",
    "Radiology", "Anesthesiology", "Emergency Medicine", "Family Medicine",
    
    # Technology & Computing
    "Artificial Intelligence", "Machine Learning", "Data Science", "Robotics",
    "Software Engineering", "Cybersecurity", "Human-Computer Interaction",
    "Information Systems", "Telecommunications", "Biotechnology",
    "Nanotechnology", "Quantum Computing", "Bioinformatics",
    
    # Engineering Disciplines
    "Mechanical Engineering", "Electrical Engineering", "Civil Engineering",
    "Chemical Engineering", "Aerospace Engineering", "Biomedical Engineering",
    "Environmental Engineering", "Industrial Engineering", "Nuclear Engineering",
    
    # Social Sciences
    "Psychology", "Sociology", "Political Science", "Economics", "Anthropology",
    "Education", "Criminology", "Social Work", "International Relations",
    "Public Policy", "Urban Planning", "Communication",
    
    # Humanities & Arts
    "History", "Philosophy", "Literature", "Art", "Music", "Theater",
    "Linguistics", "Languages", "Cultural Studies", "Religious Studies",
    "Media Studies", "Film Studies",
    
    # Business & Management
    "Business", "Management", "Marketing", "Finance", "Accounting",
    "Operations Research", "Supply Chain", "Entrepreneurship",
    
    # Interdisciplinary
    "Neuroscience", "Cognitive Science", "Environmental Studies", 
    "Climate Science", "Sustainability", "Gender Studies", "Area Studies",
    "Science and Technology Studies", "Bioethics", "Digital Humanities",
    
    # Geography & Earth
    "Geography", "Cartography", "Geographic Information Systems", "Remote Sensing",
    
    # Law & Policy
    "Law", "Legal Studies", "Constitutional Law", "International Law",
    
    # Applied Sciences
    "Agriculture", "Forestry", "Veterinary Science", "Food Science",
    "Sports Science", "Nutrition", "Architecture", "Design"
}

DOMAIN_SET = set(KNOWN_DOMAINS)

def create_domain_keywords():
    """Create keyword mappings to domains for faster matching"""
    keyword_to_domain = {}
    
    for domain in DOMAIN_SET:
        keyword_to_domain[domain.lower()] = domain
        words = domain.lower().split()
        for word in words:
            if len(word) > 3:
                keyword_to_domain[word] = domain
    
    additional_mappings = {
        # Technology
        "ai": "Artificial Intelligence", "ml": "Machine Learning",
        "deep learning": "Machine Learning", "neural network": "Machine Learning",
        "algorithm": "Computer Science", "programming": "Computer Science",
        "software": "Software Engineering", "hardware": "Engineering",
        "database": "Computer Science", "network": "Telecommunications",
        
        # Medicine
        "medical": "Medicine", "clinical": "Clinical Medicine", "patient": "Medicine",
        "treatment": "Medicine", "diagnosis": "Medicine", "therapy": "Medicine",
        "disease": "Medicine", "health": "Public Health", "healthcare": "Medicine",
        "hospital": "Medicine", "nursing": "Medicine", "pharmaceutical": "Pharmacology",
        
        # Biology
        "cell": "Cell Biology", "gene": "Genetics", "dna": "Genetics", "rna": "Genetics",
        "protein": "Biochemistry", "enzyme": "Biochemistry", "organism": "Biology",
        "species": "Biology", "evolution": "Evolutionary Biology", "genome": "Genetics",
        
        # Physics/Chemistry
        "quantum": "Physics", "particle": "Physics", "energy": "Physics",
        "molecule": "Chemistry", "reaction": "Chemistry", "catalyst": "Chemistry",
        "material": "Materials Science", "crystal": "Materials Science",
        
        # Social Sciences
        "social": "Sociology", "society": "Sociology", "culture": "Anthropology",
        "behavior": "Psychology", "psychological": "Psychology", "cognitive": "Psychology",
        "economic": "Economics", "political": "Political Science", "policy": "Public Policy",
        "education": "Education", "learning": "Education", "teaching": "Education",
        
        # Environment
        "climate": "Climate Science", "environment": "Environmental Science",
        "sustainability": "Sustainability", "ecology": "Ecology", "conservation": "Environmental Science",
        "pollution": "Environmental Science", "renewable": "Environmental Science",
        
        # Business
        "business": "Business", "management": "Management", "marketing": "Marketing",
        "finance": "Finance", "economic": "Economics", "market": "Economics",
        
        # Geography
        "geographic": "Geography", "spatial": "Geography", "mapping": "Geography",
        "urban": "Urban Planning", "city": "Urban Planning"
    }
    
    keyword_to_domain.update(additional_mappings)
    return keyword_to_domain

def extract_domains_from_article_fast(article, keyword_to_domain):
    """Fast domain extraction using keyword matching"""
    found_domains = set()
    
    text_sources = []
    text_sources.extend(article.get("domains", []))
    text_sources.extend(article.get("fields", []))
    text_sources.extend(article.get("keywords", []))
    
    title = article.get("title", "")
    if title:
        text_sources.append(title)
    
    for text in text_sources:
        if not text:
            continue
            
        text_lower = text.lower()
        
        if text_lower in keyword_to_domain:
            found_domains.add(keyword_to_domain[text_lower])
            continue
        
        for keyword, domain in keyword_to_domain.items():
            if keyword in text_lower:
                found_domains.add(domain)
    
    return list(found_domains)

def build_domain_graph_fast(year, data_folder="articles_{year}_new"):
    """Build domain graph with improved performance and statistics"""
    file_path = os.path.join(data_folder.format(year=year), "all_articles_enhanced.jsonl")
    if not os.path.exists(file_path):
        print(f"⚠ No data for {year}")
        return None, {}

    keyword_to_domain = create_domain_keywords()
    G = nx.Graph()
    domain_pairs_counter = defaultdict(int)
    domain_article_count = defaultdict(int)
    total_articles = 0
    multidisciplinary_articles = 0

    print(f"Processing articles for {year}...")
    
    with open(file_path, "r", encoding="utf-8") as f:
        for line_num, line in enumerate(f, 1):
            if line_num % 1000 == 0:
                print(f"  Processed {line_num} articles...")
                
            try:
                article = json.loads(line)
                total_articles += 1
                
                domains = extract_domains_from_article_fast(article, keyword_to_domain)
                
                for domain in domains:
                    domain_article_count[domain] += 1
                
                if len(domains) > 1:
                    multidisciplinary_articles += 1
                    for d1, d2 in itertools.combinations(sorted(domains), 2):
                        domain_pairs_counter[(d1, d2)] += 1
                        
            except json.JSONDecodeError:
                print(f"  Warning: Skipped malformed JSON at line {line_num}")
                continue

    for (d1, d2), weight in domain_pairs_counter.items():
        G.add_edge(d1, d2, weight=weight)

    for domain in domain_article_count:
        if domain not in G.nodes():
            G.add_node(domain)

    stats = {
        "total_articles": total_articles,
        "multidisciplinary_articles": multidisciplinary_articles,
        "total_domains": len(domain_article_count),
        "connected_domains": len([n for n in G.nodes() if G.degree(n) > 0]),
        "total_connections": len(G.edges()),
        "top_domains": sorted(domain_article_count.items(), key=lambda x: x[1], reverse=True)[:15],
        "strongest_connections": sorted(domain_pairs_counter.items(), key=lambda x: x[1], reverse=True)[:15],
        "domain_article_count": domain_article_count
    }

    print(f"✅ Graph built for {year}:")
    print(f"   📊 {stats['total_articles']} articles processed")
    print(f"   🔬 {stats['total_domains']} unique domains found")
    print(f"   🔗 {stats['total_connections']} domain connections")
    print(f"   🌐 {stats['multidisciplinary_articles']} multidisciplinary articles")

    return G, stats

def create_domain_clusters(G, stats):
    """Create meaningful domain clusters"""
    
    # Define domain categories manually for better interpretation
    domain_categories = {
        "Life Sciences": ["Biology", "Genetics", "Molecular Biology", "Cell Biology", "Biochemistry", 
                         "Microbiology", "Immunology", "Botany", "Zoology", "Ecology", "Evolutionary Biology", 
                         "Marine Biology", "Physiology", "Anatomy"],
        
        "Medicine & Health": ["Medicine", "Clinical Medicine", "Public Health", "Epidemiology", "Surgery", 
                             "Psychiatry", "Neurology", "Cardiology", "Oncology", "Pediatrics", "Geriatrics",
                             "Radiology", "Anesthesiology", "Emergency Medicine", "Family Medicine", 
                             "Pharmacology", "Pathology"],
        
        "Technology & Computing": ["Computer Science", "Artificial Intelligence", "Machine Learning", 
                                  "Data Science", "Software Engineering", "Robotics", "Cybersecurity",
                                  "Human-Computer Interaction", "Information Systems", "Telecommunications",
                                  "Quantum Computing", "Bioinformatics"],
        
        "Physical Sciences": ["Physics", "Chemistry", "Mathematics", "Astronomy", "Astrophysics", 
                             "Materials Science", "Nanotechnology"],
        
        "Engineering": ["Engineering", "Mechanical Engineering", "Electrical Engineering", "Civil Engineering",
                       "Chemical Engineering", "Aerospace Engineering", "Biomedical Engineering",
                       "Environmental Engineering", "Industrial Engineering", "Nuclear Engineering"],
        
        "Earth & Environment": ["Earth Science", "Environmental Science", "Geology", "Meteorology", 
                               "Oceanography", "Climate Science", "Environmental Studies", "Sustainability"],
        
        "Social Sciences": ["Psychology", "Sociology", "Political Science", "Economics", "Anthropology",
                           "Education", "Criminology", "Social Work", "International Relations",
                           "Public Policy", "Urban Planning", "Communication", "Neuroscience", 
                           "Cognitive Science"],
        
        "Humanities": ["History", "Philosophy", "Literature", "Art", "Music", "Theater", "Linguistics",
                      "Languages", "Cultural Studies", "Religious Studies", "Media Studies", "Film Studies"],
        
        "Business & Applied": ["Business", "Management", "Marketing", "Finance", "Accounting",
                              "Operations Research", "Supply Chain", "Entrepreneurship", "Law", 
                              "Legal Studies", "Architecture", "Design"]
    }
    
    # Assign domains to categories
    domain_to_category = {}
    for category, domains in domain_categories.items():
        for domain in domains:
            if domain in G.nodes():
                domain_to_category[domain] = category
    
    # Handle unassigned domains
    for domain in G.nodes():
        if domain not in domain_to_category:
            domain_to_category[domain] = "Other"
    
    return domain_to_category

def save_multiple_views(G, year, stats, output_dir="domain_graphs"):
    """Create multiple clear views of the network"""
    os.makedirs(output_dir, exist_ok=True)
    
    if len(G.nodes()) == 0:
        print(f"⚠ No nodes to visualize for {year}")
        return
    
    domain_to_category = create_domain_clusters(G, stats)
    
    # ========= VIEW 1: TOP DOMAINS BAR CHART =========
    top_domains = stats['top_domains'][:20]
    
    fig_bar = go.Figure(data=[
        go.Bar(
            x=[count for domain, count in top_domains],
            y=[domain for domain, count in top_domains],
            orientation='h',
            # marker_color='viridis',
            text=[f'{count:,}' for domain, count in top_domains],
            textposition='auto',
        )
    ])
    
    fig_bar.update_layout(
        title=f"Top 20 Research Domains by Article Count ({year})",
        xaxis_title="Number of Articles",
        yaxis_title="Domain",
        height=600,
        yaxis={'categoryorder':'total ascending'}
    )
    
    fig_bar.write_html(os.path.join(output_dir, f"top_domains_{year}.html"))
    
    # ========= VIEW 2: STRONGEST COLLABORATIONS =========
    top_collabs = stats['strongest_connections'][:15]
    
    collab_data = []
    for (d1, d2), weight in top_collabs:
        collab_data.append({
            'Domain 1': d1,
            'Domain 2': d2,
            'Collaborations': weight,
            'Pair': f"{d1} ↔ {d2}"
        })
    
    df_collab = pd.DataFrame(collab_data)
    
    fig_collab = go.Figure(data=[
        go.Bar(
            x=df_collab['Collaborations'],
            y=df_collab['Pair'],
            orientation='h',
            # marker_color='plasma',
            text=df_collab['Collaborations'],
            textposition='auto',
        )
    ])
    
    fig_collab.update_layout(
        title=f"Top 15 Domain Collaborations ({year})",
        xaxis_title="Number of Collaborative Articles",
        yaxis_title="Domain Pairs",
        height=600,
        yaxis={'categoryorder':'total ascending'}
    )
    
    fig_collab.write_html(os.path.join(output_dir, f"top_collaborations_{year}.html"))
    
    # ========= VIEW 3: CATEGORY-BASED NETWORK =========
    # Create a simplified network showing only category-level connections
    category_graph = nx.Graph()
    category_connections = defaultdict(int)
    
    for u, v, data in G.edges(data=True):
        cat_u = domain_to_category.get(u, "Other")
        cat_v = domain_to_category.get(v, "Other")
        if cat_u != cat_v:  # Only inter-category connections
            pair = tuple(sorted([cat_u, cat_v]))
            category_connections[pair] += data['weight']
    
    for (cat1, cat2), weight in category_connections.items():
        category_graph.add_edge(cat1, cat2, weight=weight)
    
    # Visualize category network
    if len(category_graph.nodes()) > 0:
        pos_cat = nx.spring_layout(category_graph, seed=42, k=3, iterations=100)
        
        edge_x = []
        edge_y = []
        for u, v in category_graph.edges():
            x0, y0 = pos_cat[u]
            x1, y1 = pos_cat[v]
            edge_x += [x0, x1, None]
            edge_y += [y0, y1, None]
        
        edge_trace = go.Scatter(
            x=edge_x, y=edge_y,
            line=dict(width=2, color='gray'),
            hoverinfo='none',
            mode='lines'
        )
        
        node_x = []
        node_y = []
        node_text = []
        node_size = []
        node_info = []
        
        for node in category_graph.nodes():
            x, y = pos_cat[node]
            node_x.append(x)
            node_y.append(y)
            node_text.append(node)
            
            # Count domains in this category
            domains_in_cat = [d for d, cat in domain_to_category.items() if cat == node]
            node_size.append(len(domains_in_cat) * 20 + 30)
            
            # Hover info
            domain_list = "<br>".join([f"• {d}" for d in domains_in_cat[:10]])
            if len(domains_in_cat) > 10:
                domain_list += f"<br>... and {len(domains_in_cat)-10} more"
            
            node_info.append(f"<b>{node}</b><br>Domains: {len(domains_in_cat)}<br>{domain_list}")
        
        node_trace = go.Scatter(
            x=node_x, y=node_y,
            mode='markers+text',
            text=node_text,
            textposition="middle center",
            textfont=dict(size=12, color="white", family="Arial Black"),
            hovertext=node_info,
            hoverinfo='text',
            marker=dict(
                size=node_size,
                color=list(range(len(node_x))),
                # colorscale='Set3',
                line=dict(width=2, color='white')
            )
        )
        
        fig_cat = go.Figure(data=[edge_trace, node_trace],
                           layout=go.Layout(
                               title=f"Research Domain Categories Network ({year})",
                               showlegend=False,
                               hovermode='closest',
                               margin=dict(b=20, l=5, r=5, t=40),
                               xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                               yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                               width=800,
                               height=600
                           ))
        
        fig_cat.write_html(os.path.join(output_dir, f"category_network_{year}.html"))
    
    # ========= VIEW 4: FOCUSED SUBNETWORKS =========
    # Create focused views of the most connected domains
    top_domain_names = [domain for domain, count in stats['top_domains'][:15]]
    subgraph = G.subgraph(top_domain_names).copy()
    
    if len(subgraph.edges()) > 0:
        # Filter to strong connections only
        edges_to_keep = [(u, v) for u, v, d in subgraph.edges(data=True) if d['weight'] >= 5]
        focused_graph = subgraph.edge_subgraph(edges_to_keep).copy()
        
        if len(focused_graph.nodes()) > 0:
            pos_focused = nx.spring_layout(focused_graph, seed=42, k=2, iterations=100)
            
            # Create focused network visualization
            edge_x = []
            edge_y = []
            for u, v in focused_graph.edges():
                x0, y0 = pos_focused[u]
                x1, y1 = pos_focused[v]
                edge_x += [x0, x1, None]
                edge_y += [y0, y1, None]
            
            edge_trace = go.Scatter(
                x=edge_x, y=edge_y,
                line=dict(width=1, color='lightgray'),
                hoverinfo='none',
                mode='lines'
            )
            
            node_x = []
            node_y = []
            node_text = []
            node_size = []
            node_info = []
            
            for node in focused_graph.nodes():
                x, y = pos_focused[node]
                node_x.append(x)
                node_y.append(y)
                
                # Shorter labels
                short_name = node.replace(' and ', ' & ').replace('Science', 'Sci.')
                node_text.append(short_name)
                
                degree = focused_graph.degree(node)
                node_size.append(degree * 15 + 25)
                
                # Hover with collaboration details
                neighbors = []
                for neighbor in focused_graph.neighbors(node):
                    weight = focused_graph[node][neighbor]['weight']
                    neighbors.append(f"• {neighbor}: {weight}")
                
                neighbor_text = "<br>".join(neighbors)
                node_info.append(f"<b>{node}</b><br>Connections: {degree}<br>{neighbor_text}")
            
            node_trace = go.Scatter(
                x=node_x, y=node_y,
                mode='markers+text',
                text=node_text,
                textposition="middle center",
                textfont=dict(size=10, color="white"),
                hovertext=node_info,
                hoverinfo='text',
                marker=dict(
                    size=node_size,
                    color=[focused_graph.degree(n) for n in focused_graph.nodes()],
                    # colorscale='Viridis',
                    showscale=True,
                    line=dict(width=2, color='white')
                )
            )
            
            fig_focused = go.Figure(data=[edge_trace, node_trace],
                                   layout=go.Layout(
                                       title=f"Top 15 Domains Network ({year}) - Strong Connections Only",
                                       showlegend=False,
                                       hovermode='closest',
                                       margin=dict(b=20, l=5, r=5, t=40),
                                       xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                                       yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
                                       width=900,
                                       height=700
                                   ))
            
            fig_focused.write_html(os.path.join(output_dir, f"focused_network_{year}.html"))
    
    # Save statistics
    with open(os.path.join(output_dir, f"stats_{year}.json"), "w") as f:
        json.dump(stats, f, indent=2)
    
    print(f"💾 Created multiple visualizations for {year}:")
    print(f"   📊 Top domains: top_domains_{year}.html")
    print(f"   🤝 Collaborations: top_collaborations_{year}.html") 
    print(f"   🏷️  Categories: category_network_{year}.html")
    print(f"   🎯 Focused network: focused_network_{year}.html")
    print(f"   📈 Stats: stats_{year}.json")

def main():
    years = [2018, 2019, 2020, 2021, 2022, 2023, 2024]
    
    for year in years:
        print(f"\n🔄 Processing {year}...")
        G, stats = build_domain_graph_fast(year)
        
        if G and len(G.nodes()) > 0:
            save_multiple_views(G, year, stats)
        else:
            print(f"⚠ No valid graph generated for {year}")

if __name__ == "__main__":
    main()


🔄 Processing 2018...
Processing articles for 2018...
  Processed 1000 articles...
  Processed 2000 articles...
  Processed 3000 articles...
  Processed 4000 articles...
  Processed 5000 articles...
  Processed 6000 articles...
  Processed 7000 articles...
  Processed 8000 articles...
  Processed 9000 articles...
  Processed 10000 articles...
  Processed 11000 articles...
  Processed 12000 articles...
  Processed 13000 articles...
  Processed 14000 articles...
  Processed 15000 articles...
✅ Graph built for 2018:
   📊 15797 articles processed
   🔬 122 unique domains found
   🔗 5721 domain connections
   🌐 15797 multidisciplinary articles
💾 Created multiple visualizations for 2018:
   📊 Top domains: top_domains_2018.html
   🤝 Collaborations: top_collaborations_2018.html
   🏷️  Categories: category_network_2018.html
   🎯 Focused network: focused_network_2018.html
   📈 Stats: stats_2018.json

🔄 Processing 2019...
Processing articles for 2019...
  Processed 1000 articles...
  Processed 200