# In [1]:
"""
Charting Collaboration and Impact: 
A Data Analysis of SFB1002’s Research Output

"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import networkx as nx
from collections import Counter, defaultdict
import os
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set style for static plots
# NOTE(review): the 'seaborn-v0_8' style name presumably requires a recent
# matplotlib release — confirm the minimum supported version.
plt.style.use('seaborn-v0_8')
# Use seaborn's "husl" palette as the colour cycle for the plots drawn below.
sns.set_palette("husl")

class SFB1002Analyzer:
    def __init__(self, file_path):
        """Initialize the analyzer with the Excel file"""
        self.file_path = file_path
        self.output_dir = os.path.join(os.path.expanduser("~"), "Desktop", "SFB1002_Analysis")
        self.create_output_structure()
        
    def create_output_structure(self):
        """Create organized output folder structure"""
        directories = [
            "01_cleaned_data",
            "02_visualizations/static",
            "02_visualizations/interactive", 
            "02_visualizations/networks",
            "03_reports",
            "04_statistics",
            "05_temporal_analysis",
            "06_collaboration_metrics",
            "07_productivity_analysis",
            "08_impact_analysis",
            "09_strategic_insights"
        ]
        
        for directory in directories:
            path = os.path.join(self.output_dir, directory)
            os.makedirs(path, exist_ok=True)
            
        print(f"✅ Output structure created at: {self.output_dir}")
    
    def load_and_clean_data(self):
        """Read the workbook, split it, clean it and persist the results.

        The first 23 columns become ``self.main_df`` and all remaining columns
        ``self.external_df``. The main table is then cleaned, the derived
        datasets are built and every dataset is written to disk.
        """
        print("📊 Loading and cleaning data...")

        raw = pd.read_excel(self.file_path)
        row_count, column_count = len(raw), len(raw.columns)

        print(f"📋 Loaded {row_count} rows and {column_count} columns")
        print("🔍 First 10 column names:")
        for position, column_name in enumerate(raw.columns[:10]):
            print(f"   {position}: {column_name}")

        # The publication record occupies the first 23 columns; everything
        # after that is treated as external-resource data.
        core_width = 23
        core_part = raw.columns[:core_width].tolist()
        external_part = raw.columns[core_width:].tolist()

        self.main_df = raw[core_part].copy()
        self.external_df = raw[external_part].copy()

        # Order matters: the derived datasets need the cleaned main table and
        # saving needs all of them.
        self.clean_main_dataset()
        self.create_specialized_datasets()
        self.save_cleaned_datasets()

        years = self.main_df['Publication_Year']
        print(f"✅ Data cleaned: {len(self.main_df)} publications from {years.min()}-{years.max()}")
    
    def clean_main_dataset(self):
        """Clean the main dataset"""
        df = self.main_df
        
        print("🧹 Starting data cleaning...")
        print(f"   Original columns: {list(df.columns)}")
        
        # Create a mapping of original to clean column names
        # Handle the exact column names from your Excel file
        expected_columns = [
            'RDP - Link', 'Working Groups', 'Subproject', 'Open Access', 'Publication Type',
            'Peer Reviewed', 'PMID', 'DOI', 'Publication Year', 'Title', 'Journal',
            'ISSN', 'eISSN', 'URL', 'Pages', 'Issue', 'Volume', 'Journal Abbreviation',
            'Extra', 'Authors', 'First Author', 'Last Author', 'Creation Time'
        ]
        
        # Map actual columns to expected columns (handle slight differences)
        column_mapping = {}
        for i, actual_col in enumerate(df.columns[:len(expected_columns)]):
            if i < len(expected_columns):
                column_mapping[actual_col] = expected_columns[i]
        
        # Rename columns
        df = df.rename(columns=column_mapping)
        
        # Create clean column names for processing
        clean_column_names = [
            'RDP_Link', 'Working_Groups', 'Subproject', 'Open_Access', 'Publication_Type',
            'Peer_Reviewed', 'PMID', 'DOI', 'Publication_Year', 'Title', 'Journal',
            'ISSN', 'eISSN', 'URL', 'Pages', 'Issue', 'Volume', 'Journal_Abbreviation',
            'Extra', 'Authors', 'First_Author', 'Last_Author', 'Creation_Time'
        ]
        
        # Create another mapping for clean names
        final_mapping = {}
        for i, col in enumerate(df.columns[:len(clean_column_names)]):
            if i < len(clean_column_names):
                final_mapping[col] = clean_column_names[i]
        
        df = df.rename(columns=final_mapping)
        
        print(f"   Cleaned columns: {list(df.columns[:10])}...")
        
        # Clean publication year
        if 'Publication_Year' in df.columns:
            df['Publication_Year'] = pd.to_numeric(df['Publication_Year'], errors='coerce')
            df = df.dropna(subset=['Publication_Year'])
            df['Publication_Year'] = df['Publication_Year'].astype(int)
            
            # Filter to complete years (2014-2024)
            df = df[(df['Publication_Year'] >= 2014) & (df['Publication_Year'] <= 2024)]
            print(f"   Filtered to years 2014-2024: {len(df)} publications")
        else:
            print("   ⚠️ Warning: Publication_Year column not found!")
            return
        
        # Clean Working Groups
        if 'Working_Groups' in df.columns:
            df['Working_Groups'] = df['Working_Groups'].fillna('Unknown')
            df['Working_Groups_Clean'] = df['Working_Groups'].str.replace('ag_', '', regex=False)
        
        # Clean Publication Type
        if 'Publication_Type' in df.columns:
            df['Publication_Type'] = df['Publication_Type'].fillna('Unknown')
        
        # Clean Open Access
        if 'Open_Access' in df.columns:
            df['Open_Access_Binary'] = df['Open_Access'].map({'Yes': 1, 'No': 0})
            df['Open_Access_Binary'] = df['Open_Access_Binary'].fillna(0)
        
        # Add derived columns
        df['Year_Period'] = pd.cut(df['Publication_Year'], 
                                 bins=[2013, 2016, 2019, 2022, 2024], 
                                 labels=['2014-2016', '2017-2019', '2020-2022', '2023-2024'])
        
        # Extract number of collaborating groups
        if 'Working_Groups' in df.columns:
            df['Num_Collaborating_Groups'] = df['Working_Groups'].str.count(',') + 1
            df['Is_Collaboration'] = df['Num_Collaborating_Groups'] > 1
        
        # Clean journal information
        if 'Journal' in df.columns:
            df['Journal'] = df['Journal'].fillna('Unknown Journal')
        
        self.main_df = df
        print(f"✅ Data cleaning completed: {len(df)} publications ready for analysis")
    
    def create_specialized_datasets(self):
        """Create specialized datasets for different analysis perspectives"""
        
        print("📊 Creating specialized datasets...")
        
        # Check required columns exist
        required_cols = ['Working_Groups', 'Subproject', 'Authors']
        missing_cols = [col for col in required_cols if col not in self.main_df.columns]
        
        if missing_cols:
            print(f"   ⚠️ Warning: Missing columns {missing_cols}")
            print(f"   Available columns: {list(self.main_df.columns)}")
        
        # Dataset A: Individual groups (split multi-group publications)
        individual_rows = []
        for _, row in self.main_df.iterrows():
            if pd.notna(row['Working_Groups']):
                groups = [g.strip() for g in str(row['Working_Groups']).split(',')]
                subprojects = [s.strip() for s in str(row.get('Subproject', '')).split(',')]
                
                # Ensure equal length arrays
                max_len = max(len(groups), len(subprojects))
                groups.extend([''] * (max_len - len(groups)))
                subprojects.extend([''] * (max_len - len(subprojects)))
                
                for group, subproject in zip(groups, subprojects):
                    new_row = row.copy()
                    new_row['Working_Groups'] = group
                    new_row['Working_Groups_Clean'] = group.replace('ag_', '')
                    new_row['Subproject'] = subproject
                    individual_rows.append(new_row)
        
        self.individual_df = pd.DataFrame(individual_rows)
        print(f"   ✅ Individual groups dataset: {len(self.individual_df)} rows")
        
        # Dataset B: Collaboration-focused (keep combined groups)
        if 'Is_Collaboration' in self.main_df.columns:
            self.collaboration_df = self.main_df[self.main_df['Is_Collaboration'] == True].copy()
        else:
            self.collaboration_df = pd.DataFrame()
        print(f"   ✅ Collaboration dataset: {len(self.collaboration_df)} rows")
        
        # Author-based dataset
        self.create_author_dataset()
    
    def create_author_dataset(self):
        """Create author-based dataset for person-level network analysis"""
        print("👥 Creating author dataset...")
        
        author_rows = []
        
        if 'Authors' not in self.main_df.columns:
            print("   ⚠️ Warning: 'Authors' column not found, creating empty author dataset")
            self.author_df = pd.DataFrame()
            return
        
        for _, row in self.main_df.iterrows():
            if pd.notna(row['Authors']):
                # Split authors by comma
                authors = [a.strip() for a in str(row['Authors']).split(',')]
                
                for author in authors:
                    if author and len(author) > 2:  # Filter out initials only
                        new_row = row.copy()
                        new_row['Author_Name'] = author
                        
                        # Check if first/last author columns exist
                        if 'First_Author' in row.index:
                            new_row['Is_First_Author'] = (author == row['First_Author'])
                        else:
                            new_row['Is_First_Author'] = False
                            
                        if 'Last_Author' in row.index:
                            new_row['Is_Last_Author'] = (author == row['Last_Author'])
                        else:
                            new_row['Is_Last_Author'] = False
                            
                        author_rows.append(new_row)
        
        self.author_df = pd.DataFrame(author_rows)
        print(f"   ✅ Author dataset: {len(self.author_df)} rows")
    
    def save_cleaned_datasets(self):
        """Save all cleaned datasets"""
        datasets = {
            'main_dataset.csv': self.main_df,
            'individual_groups.csv': self.individual_df,
            'collaboration_dataset.csv': self.collaboration_df,
            'author_dataset.csv': self.author_df,
            'external_resources.csv': self.external_df
        }
        
        for filename, df in datasets.items():
            path = os.path.join(self.output_dir, "01_cleaned_data", filename)
            df.to_csv(path, index=False)
            print(f"💾 Saved: {filename} ({len(df)} rows)")
    
    def generate_basic_statistics(self):
        """Generate comprehensive basic statistics"""
        print("📈 Generating basic statistics...")
        
        try:
            # Check if we have the required data
            if len(self.main_df) == 0:
                print("   ⚠️ Warning: No data available for statistics")
                return {}
            
            # Safe statistics generation with error handling
            stats = {
                'Overview': {
                    'Total Publications': len(self.main_df),
                    'Unique Working Groups': self.individual_df['Working_Groups'].nunique() if len(self.individual_df) > 0 else 0,
                    'Unique Authors': self.author_df['Author_Name'].nunique() if len(self.author_df) > 0 else 0,
                    'Unique Journals': self.main_df['Journal'].nunique() if 'Journal' in self.main_df.columns else 0,
                    'Year Range': f"{self.main_df['Publication_Year'].min()}-{self.main_df['Publication_Year'].max()}" if 'Publication_Year' in self.main_df.columns else 'Unknown',
                    'Collaboration Rate': f"{(self.main_df['Is_Collaboration'].sum() / len(self.main_df) * 100):.1f}%" if 'Is_Collaboration' in self.main_df.columns else 'N/A',
                    'Open Access Rate': f"{(self.main_df['Open_Access_Binary'].sum() / len(self.main_df) * 100):.1f}%" if 'Open_Access_Binary' in self.main_df.columns else 'N/A'
                }
            }
            
            # Add additional statistics only if data is available
            if 'Publication_Type' in self.main_df.columns:
                stats['Publication Types'] = dict(self.main_df['Publication_Type'].value_counts())
                
            if len(self.individual_df) > 0 and 'Working_Groups_Clean' in self.individual_df.columns:
                stats['Top Working Groups'] = dict(self.individual_df['Working_Groups_Clean'].value_counts().head(10))
                
            if 'Journal' in self.main_df.columns:
                stats['Top Journals'] = dict(self.main_df['Journal'].value_counts().head(10))
                
            if len(self.author_df) > 0 and 'Author_Name' in self.author_df.columns:
                stats['Top Authors'] = dict(self.author_df['Author_Name'].value_counts().head(15))
                
            if 'Publication_Year' in self.main_df.columns:
                stats['Yearly Publications'] = dict(self.main_df['Publication_Year'].value_counts().sort_index())
            
            # Save statistics
            stats_path = os.path.join(self.output_dir, "04_statistics", "basic_statistics.txt")
            with open(stats_path, 'w', encoding='utf-8') as f:
                for section, data in stats.items():
                    f.write(f"\n{'='*50}\n{section.upper()}\n{'='*50}\n")
                    if isinstance(data, dict):
                        for key, value in data.items():
                            f.write(f"{key}: {value}\n")
                    else:
                        f.write(f"{data}\n")
            
            print(f"   ✅ Statistics saved to: {stats_path}")
            return stats
            
        except Exception as e:
            print(f"   ⚠️ Error generating statistics: {str(e)}")
            return {}
    
    def create_temporal_analysis(self):
        """Plot and export yearly publication and collaboration trends.

        Reads ``self.individual_df`` (publications per year and group) and
        ``self.main_df`` (share of collaborative publications per year).

        Writes:
            * ``02_visualizations/static/temporal_trends_groups.png``
            * ``02_visualizations/interactive/temporal_trends_interactive.html``
            * ``02_visualizations/static/collaboration_evolution.png``
            * ``05_temporal_analysis/yearly_publications_by_group.csv``
            * ``05_temporal_analysis/collaboration_evolution.csv``
        """
        print("⏰ Creating temporal analysis...")

        static_dir = os.path.join(self.output_dir, "02_visualizations/static")
        interactive_dir = os.path.join(self.output_dir, "02_visualizations/interactive")

        # 1. Publications over time by group
        yearly_groups = self.individual_df.groupby(['Publication_Year', 'Working_Groups_Clean']).size().unstack(fill_value=0)
        top_groups = self.individual_df['Working_Groups_Clean'].value_counts().head(10).index

        # Static plot. DataFrame.plot() opens a figure of its own unless it is
        # handed an axes, which would ignore the requested size and leave an
        # empty figure open — so the axes is passed explicitly.
        trend_fig, trend_ax = plt.subplots(figsize=(15, 8))
        yearly_groups[top_groups].plot(kind='line', marker='o', linewidth=2, markersize=6, ax=trend_ax)
        trend_ax.set_title('Publication Trends: Top 10 Working Groups Over Time', fontsize=16, fontweight='bold')
        trend_ax.set_xlabel('Year', fontsize=12)
        trend_ax.set_ylabel('Number of Publications', fontsize=12)
        trend_ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
        trend_ax.grid(True, alpha=0.3)
        trend_fig.tight_layout()
        trend_fig.savefig(os.path.join(static_dir, "temporal_trends_groups.png"), dpi=300, bbox_inches='tight')
        plt.close(trend_fig)

        # Interactive temporal analysis
        interactive_fig = px.line(
            yearly_groups[top_groups].reset_index(),
            x='Publication_Year',
            y=top_groups.tolist(),
            title='Interactive Publication Trends by Working Group',
            labels={'value': 'Publications', 'variable': 'Working Group'}
        )
        interactive_fig.write_html(os.path.join(interactive_dir, "temporal_trends_interactive.html"))

        # 2. Collaboration evolution over time
        collab_evolution = self.main_df.groupby(['Publication_Year', 'Is_Collaboration']).size().unstack(fill_value=0)
        # If every publication is (or none is) a collaboration, unstack yields
        # a single column; make sure both exist before computing the rate.
        collab_evolution = collab_evolution.reindex(columns=[False, True], fill_value=0)
        collab_evolution['Collaboration_Rate'] = collab_evolution[True] / (collab_evolution[True] + collab_evolution[False]) * 100

        rate_fig, rate_ax = plt.subplots(figsize=(12, 6))
        collab_evolution['Collaboration_Rate'].plot(kind='line', marker='o', linewidth=3, markersize=8, color='red', ax=rate_ax)
        rate_ax.set_title('Collaboration Rate Evolution Over Time', fontsize=16, fontweight='bold')
        rate_ax.set_xlabel('Year', fontsize=12)
        rate_ax.set_ylabel('Collaboration Rate (%)', fontsize=12)
        rate_ax.grid(True, alpha=0.3)
        rate_fig.tight_layout()
        rate_fig.savefig(os.path.join(static_dir, "collaboration_evolution.png"), dpi=300, bbox_inches='tight')
        plt.close(rate_fig)

        # Save temporal data
        temporal_data = {
            'yearly_publications_by_group': yearly_groups,
            'collaboration_evolution': collab_evolution
        }

        for name, data in temporal_data.items():
            data.to_csv(os.path.join(self.output_dir, "05_temporal_analysis", f"{name}.csv"))
    
    def create_collaboration_networks(self):
        """Create comprehensive collaboration networks.

        Runs the four network builders one after another: working groups,
        authors, subprojects and per-period working-group networks. Their
        return values are discarded here.
        """
        print("🕸️ Creating collaboration networks...")
        
        # 1. Working Group Collaboration Network
        self.create_group_network()
        
        # 2. Author Collaboration Network
        self.create_author_network()
        
        # 3. Cross-Project Collaboration Network
        self.create_project_network()
        
        # 4. Temporal collaboration networks
        self.create_temporal_networks()
    
    def create_group_network(self):
        """Build, measure, draw and export the working-group collaboration graph.

        Nodes are the working groups found in ``self.collaboration_df`` (with
        ``ag_`` removed). A node's ``publications`` attribute counts the rows it
        appears in; an edge's ``weight`` counts the rows two groups share.

        Side effects: writes ``group_centrality_metrics.csv`` to
        ``06_collaboration_metrics`` and ``group_collaboration_network.png``
        plus ``group_network.gexf`` to ``02_visualizations/networks``.

        Returns:
            tuple: ``(graph, centrality_metrics)``. An empty graph and an empty
            dict are returned when there is nothing to analyse or when an
            unexpected error occurs (the error is printed, not raised).
        """

        try:
            G = nx.Graph()

            if len(self.collaboration_df) == 0:
                print("   ⚠️ No collaboration data available for group network")
                return G, {}

            # Add nodes and edges based on co-publications
            for _, row in self.collaboration_df.iterrows():
                if pd.notna(row.get('Working_Groups', '')):
                    groups = [g.strip().replace('ag_', '') for g in str(row['Working_Groups']).split(',')]

                    # Add nodes
                    for group in groups:
                        if group:  # Only add non-empty groups
                            if G.has_node(group):
                                G.nodes[group]['publications'] += 1
                            else:
                                G.add_node(group, publications=1)

                    # Add edges (collaborations)
                    for i in range(len(groups)):
                        for j in range(i+1, len(groups)):
                            if groups[i] and groups[j]:  # Only add edges for non-empty groups
                                if G.has_edge(groups[i], groups[j]):
                                    G[groups[i]][groups[j]]['weight'] += 1
                                else:
                                    G.add_edge(groups[i], groups[j], weight=1)

            if len(G.nodes()) == 0:
                print("   ⚠️ No valid groups found for network")
                return G, {}

            # Calculate network metrics
            centrality_metrics = {}

            try:
                centrality_metrics['betweenness'] = nx.betweenness_centrality(G)
                centrality_metrics['closeness'] = nx.closeness_centrality(G)
                centrality_metrics['degree'] = nx.degree_centrality(G)
            except Exception as e:
                print(f"   ⚠️ Error calculating centrality metrics: {str(e)}")
                centrality_metrics = {'degree': nx.degree_centrality(G)}

            # Eigenvector centrality is computed separately so that a failure
            # here does not throw away the metrics computed above.
            try:
                # Only calculate eigenvector centrality if graph is connected
                if nx.is_connected(G):
                    centrality_metrics['eigenvector'] = nx.eigenvector_centrality(G)
                else:
                    # For disconnected graphs, calculate for largest component
                    largest_cc = max(nx.connected_components(G), key=len)
                    G_largest = G.subgraph(largest_cc)
                    eig_centrality = nx.eigenvector_centrality(G_largest)
                    # Fill in zeros for nodes not in largest component
                    centrality_metrics['eigenvector'] = {node: eig_centrality.get(node, 0) for node in G.nodes()}
            except Exception as e:
                print(f"   ⚠️ Error calculating eigenvector centrality: {str(e)}")

            # Save network metrics
            if centrality_metrics:
                metrics_df = pd.DataFrame(centrality_metrics)
                metrics_df.to_csv(os.path.join(self.output_dir, "06_collaboration_metrics", "group_centrality_metrics.csv"))

            # Visualize network
            plt.figure(figsize=(16, 12))
            pos = nx.spring_layout(G, k=3, iterations=50)

            # Node sizes based on publications
            node_sizes = [max(G.nodes[node]['publications'] * 100, 100) for node in G.nodes()]

            # Edge weights
            edge_weights = [G[u][v]['weight'] for u, v in G.edges()]

            nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color='lightblue', alpha=0.7)
            if edge_weights:  # Only draw edges if they exist
                nx.draw_networkx_edges(G, pos, width=[w*0.5 for w in edge_weights], alpha=0.5)
            nx.draw_networkx_labels(G, pos, font_size=8, font_weight='bold')

            plt.title('Working Group Collaboration Network\n(Node size = Publications, Edge width = Collaborations)',
                     fontsize=16, fontweight='bold')
            plt.axis('off')
            plt.tight_layout()
            plt.savefig(os.path.join(self.output_dir, "02_visualizations/networks", "group_collaboration_network.png"),
                       dpi=300, bbox_inches='tight')
            plt.close()

            # Save network
            nx.write_gexf(G, os.path.join(self.output_dir, "02_visualizations/networks", "group_network.gexf"))

            print(f"   ✅ Group network created with {len(G.nodes())} nodes and {len(G.edges())} edges")
            return G, centrality_metrics

        except Exception as e:
            print(f"   ⚠️ Error creating group network: {str(e)}")
            return nx.Graph(), {}
    
    def create_author_network(self):
        """Build, draw and export the co-authorship graph of prolific authors.

        Only authors with at least three rows in ``self.author_df`` are
        considered. Two of them are linked when they appear on the same
        publication; the edge ``weight`` counts such publications and the node
        attribute ``publications`` holds the author's row count.

        Side effects: writes ``author_collaboration_network.png``,
        ``top20_author_collaboration_network.png`` and ``author_network.gexf``
        to ``02_visualizations/networks`` and, when the graph has nodes,
        ``author_centrality_metrics.csv`` to ``06_collaboration_metrics``.

        Returns:
            networkx.Graph: the co-authorship graph (empty when no author data
            is available).
        """
        print("👥 Creating author network...")

        # create_author_dataset() stores a column-less frame when the source
        # has no 'Authors' column; bail out instead of raising KeyError.
        if len(self.author_df) == 0 or 'Author_Name' not in self.author_df.columns:
            print("   ⚠️ No author data available for author network")
            return nx.Graph()

        networks_dir = os.path.join(self.output_dir, "02_visualizations/networks")

        # Focus on prolific authors (>= 3 publications)
        author_counts = self.author_df['Author_Name'].value_counts()
        prolific_authors = author_counts[author_counts >= 3].index

        G = nx.Graph()

        # Group authors by publication
        pub_authors = defaultdict(list)
        for row_label, row in self.author_df.iterrows():
            if row['Author_Name'] in prolific_authors:
                doi = row.get('DOI')
                if pd.notna(doi):
                    pub_id = f"{doi}_{row['Publication_Year']}"
                else:
                    # Without a DOI the key would be "nan_<year>", lumping all
                    # DOI-less papers of a year into one fake publication.
                    # NOTE(review): assumes the row label identifies the source
                    # publication, as it does for frames built by
                    # create_author_dataset() — confirm for other sources.
                    pub_id = f"row_{row_label}_{row['Publication_Year']}"
                pub_authors[pub_id].append(row['Author_Name'])

        # Create collaboration edges
        for pub_id, authors in pub_authors.items():
            for i in range(len(authors)):
                for j in range(i+1, len(authors)):
                    if G.has_edge(authors[i], authors[j]):
                        G[authors[i]][authors[j]]['weight'] += 1
                    else:
                        G.add_edge(authors[i], authors[j], weight=1)

        # Add node attributes (plain int rather than a numpy integer, which
        # keeps the attribute serialisable)
        for author in prolific_authors:
            if author in G.nodes():
                G.nodes[author]['publications'] = int(author_counts[author])

        # Network layout and visualization
        plt.figure(figsize=(20, 15))
        pos = nx.spring_layout(G, k=2, iterations=50)

        node_sizes = [G.nodes[node].get('publications', 1) * 50 for node in G.nodes()]
        edge_weights = [G[u][v]['weight'] for u, v in G.edges()]

        nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color='lightcoral', alpha=0.7)
        if edge_weights:  # Only draw edges if they exist
            nx.draw_networkx_edges(G, pos, width=[w*0.3 for w in edge_weights], alpha=0.4)
        nx.draw_networkx_labels(G, pos, font_size=6)

        plt.title('Author Collaboration Network\n(Prolific Authors: ≥3 Publications)',
                 fontsize=16, fontweight='bold')
        plt.axis('off')
        plt.tight_layout()
        plt.savefig(os.path.join(networks_dir, "author_collaboration_network.png"),
                   dpi=300, bbox_inches='tight')
        plt.close()

        # Get the 20 most connected authors based on degree centrality
        degree_centrality = nx.degree_centrality(G)
        top_20_authors = sorted(degree_centrality.items(), key=lambda x: x[1], reverse=True)[:20]
        top_20_nodes = [author for author, centrality in top_20_authors]

        # Create subgraph with only top 20 authors and their connections
        G_top20 = G.subgraph(top_20_nodes).copy()

        # Network layout and visualization
        plt.figure(figsize=(20, 15))
        pos = nx.spring_layout(G_top20, k=2, iterations=50)

        node_sizes = [G_top20.nodes[node].get('publications', 1) * 100 for node in G_top20.nodes()]
        edge_weights = [G_top20[u][v]['weight'] for u, v in G_top20.edges()]

        nx.draw_networkx_nodes(G_top20, pos, node_size=node_sizes, node_color='lightcoral', alpha=0.7)
        if edge_weights:  # Only draw edges if they exist
            nx.draw_networkx_edges(G_top20, pos, width=[w*0.5 for w in edge_weights], alpha=0.4)
        nx.draw_networkx_labels(G_top20, pos, font_size=8)

        plt.title('Author Collaboration Network\n(Top 20 Most Connected Authors)',
                fontsize=16, fontweight='bold')
        plt.axis('off')
        plt.tight_layout()
        plt.savefig(os.path.join(networks_dir, "top20_author_collaboration_network.png"),
                dpi=300, bbox_inches='tight')
        plt.close()

        # Calculate author centrality (degree centrality was computed above)
        if len(G.nodes()) > 0:
            author_centrality = {
                'betweenness': nx.betweenness_centrality(G),
                'degree': degree_centrality,
                'closeness': nx.closeness_centrality(G)
            }

            author_metrics_df = pd.DataFrame(author_centrality)
            author_metrics_df.to_csv(os.path.join(self.output_dir, "06_collaboration_metrics", "author_centrality_metrics.csv"))

        nx.write_gexf(G, os.path.join(networks_dir, "author_network.gexf"))

        return G
    
    def create_project_network(self):
        """Build, draw and export the cross-subproject collaboration graph.

        For every row of ``self.collaboration_df`` whose ``Subproject`` cell
        lists several comma-separated codes (each at least two characters),
        all code pairs are linked. An edge stores ``weight`` (number of shared
        publications) and ``years`` (list of their publication years).

        Side effects: writes ``project_collaboration_network.png`` and
        ``project_network.gexf`` to ``02_visualizations/networks``.

        Returns:
            networkx.Graph: the project graph; an empty graph when there is no
            data or when an unexpected error occurs (printed, not raised).
        """
        print("🔬 Creating project collaboration network...")

        try:
            # Clean subproject data
            project_collabs = []

            if len(self.collaboration_df) == 0:
                print("   ⚠️ No collaboration data available for project network")
                return nx.Graph()

            for _, row in self.collaboration_df.iterrows():
                if pd.notna(row.get('Subproject', '')) and ',' in str(row.get('Subproject', '')):
                    projects = [p.strip() for p in str(row['Subproject']).split(',')]
                    projects = [p for p in projects if p and len(p) >= 2]  # Filter valid project codes
                    if len(projects) > 1:
                        for i in range(len(projects)):
                            for j in range(i+1, len(projects)):
                                project_collabs.append((projects[i], projects[j], row['Publication_Year']))

            if not project_collabs:
                print("   ⚠️ No cross-project collaborations found")
                return nx.Graph()

            # Create network
            G = nx.Graph()
            for proj1, proj2, year in project_collabs:
                if G.has_edge(proj1, proj2):
                    G[proj1][proj2]['weight'] += 1
                    G[proj1][proj2]['years'].append(year)
                else:
                    G.add_edge(proj1, proj2, weight=1, years=[year])

            if len(G.nodes()) > 0:
                networks_dir = os.path.join(self.output_dir, "02_visualizations/networks")

                # Visualize
                plt.figure(figsize=(14, 10))
                pos = nx.spring_layout(G, k=3, iterations=50)

                edge_weights = [G[u][v]['weight'] for u, v in G.edges()]
                node_sizes = [G.degree(node) * 200 + 300 for node in G.nodes()]

                nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color='lightgreen', alpha=0.8)
                nx.draw_networkx_edges(G, pos, width=[w*2 for w in edge_weights], alpha=0.6)
                nx.draw_networkx_labels(G, pos, font_size=10, font_weight='bold')

                plt.title('Cross-Project Collaboration Network\n(Subproject Collaborations)',
                         fontsize=16, fontweight='bold')
                plt.axis('off')
                plt.tight_layout()
                plt.savefig(os.path.join(networks_dir, "project_collaboration_network.png"),
                           dpi=300, bbox_inches='tight')
                plt.close()

                # The GEXF writer treats list-valued attributes as dynamic
                # (value, start, end) entries, so the plain list of years makes
                # the export fail. Export a copy whose 'years' is a string and
                # keep the list on the returned graph.
                export_graph = G.copy()
                for _, _, edge_data in export_graph.edges(data=True):
                    edge_data['years'] = ','.join(str(year) for year in edge_data['years'])
                nx.write_gexf(export_graph, os.path.join(networks_dir, "project_network.gexf"))
                print(f"   ✅ Project network created with {len(G.nodes())} nodes and {len(G.edges())} edges")
            else:
                print("   ⚠️ No project network nodes to visualize")

            return G

        except Exception as e:
            print(f"   ⚠️ Error creating project network: {str(e)}")
            return nx.Graph()
    
    def create_temporal_networks(self):
        """Draw one working-group collaboration network per time period.

        For each of the four ``Year_Period`` labels the collaborative rows of
        ``self.main_df`` are turned into a weighted graph (edge ``weight`` =
        number of shared publications) and saved as
        ``02_visualizations/networks/network_<period>.png``. Periods without
        data are skipped. Errors are printed, not raised.
        """
        print("📅 Creating temporal collaboration networks...")

        try:
            # Both columns are needed for every period, so check them once.
            if 'Year_Period' not in self.main_df.columns:
                print("   ⚠️ Year_Period column not found, skipping temporal networks")
                return
            if 'Is_Collaboration' not in self.main_df.columns:
                print("   ⚠️ Is_Collaboration column not found, skipping temporal networks")
                return

            periods = ['2014-2016', '2017-2019', '2020-2022', '2023-2024']
            networks_dir = os.path.join(self.output_dir, "02_visualizations/networks")

            for period in periods:
                period_data = self.main_df[self.main_df['Year_Period'] == period]

                if len(period_data) == 0:
                    continue

                period_collabs = period_data[period_data['Is_Collaboration'] == True]

                if len(period_collabs) == 0:
                    continue

                G = nx.Graph()

                for _, row in period_collabs.iterrows():
                    if pd.notna(row.get('Working_Groups', '')):
                        groups = [g.strip().replace('ag_', '') for g in str(row['Working_Groups']).split(',')]
                        # Drop empty names (e.g. from a trailing comma) so they
                        # do not become a node — same rule as the group network.
                        groups = [g for g in groups if g]

                        for i in range(len(groups)):
                            for j in range(i+1, len(groups)):
                                if G.has_edge(groups[i], groups[j]):
                                    G[groups[i]][groups[j]]['weight'] += 1
                                else:
                                    G.add_edge(groups[i], groups[j], weight=1)

                if len(G.nodes()) > 0:
                    plt.figure(figsize=(12, 8))
                    pos = nx.spring_layout(G, k=2, iterations=50)

                    edge_weights = [G[u][v]['weight'] for u, v in G.edges()]
                    node_sizes = [G.degree(node) * 100 + 200 for node in G.nodes()]

                    nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color='orange', alpha=0.7)
                    nx.draw_networkx_edges(G, pos, width=edge_weights, alpha=0.5)
                    nx.draw_networkx_labels(G, pos, font_size=8)

                    plt.title(f'Working Group Collaborations: {period}', fontsize=14, fontweight='bold')
                    plt.axis('off')
                    plt.tight_layout()
                    plt.savefig(os.path.join(networks_dir, f"network_{period.replace('-', '_')}.png"),
                               dpi=300, bbox_inches='tight')
                    plt.close()

                    print(f"   ✅ Created network for {period}: {len(G.nodes())} nodes, {len(G.edges())} edges")
                else:
                    print(f"   ⚠️ No collaboration data for {period}")

        except Exception as e:
            print(f"   ⚠️ Error creating temporal networks: {str(e)}")
    
    def create_comprehensive_visualizations(self):
        """Create comprehensive static and interactive visualizations"""
        print("🎨 Creating comprehensive visualizations...")
        
        # 1. Publication timeline
        self.create_publication_timeline()
        
        # 2. Working group activity
        self.create_group_activity_charts()
        
        # 3. Collaboration analysis
        self.create_collaboration_analysis()
        
        # 4. Publication type analysis
        self.create_publication_type_analysis()
        
        # 5. Journal and impact analysis
        self.create_journal_analysis()
        
        # 6. Open access analysis
        self.create_open_access_analysis()
    
    def create_publication_timeline(self):
        """Plot publications per year as a static PNG and an interactive page.

        The PNG (``publication_timeline.png``) overlays a line on a bar chart
        of the yearly publication counts in ``self.main_df``. The HTML page
        (``publication_timeline_interactive.html``) stacks two line charts:
        the same yearly counts, and the yearly mean of ``Is_Collaboration``
        expressed as a percentage.
        """
        static_dir = os.path.join(self.output_dir, "02_visualizations/static")
        interactive_dir = os.path.join(self.output_dir, "02_visualizations/interactive")

        # --- Static chart: bars with a trend line on top ---
        plt.figure(figsize=(14, 8))
        per_year = self.main_df['Publication_Year'].value_counts().sort_index()
        years, counts = per_year.index, per_year.values

        year_bars = plt.bar(years, counts, color='skyblue', alpha=0.8, edgecolor='navy')
        plt.plot(years, counts, color='red', marker='o', linewidth=2, markersize=6)

        # Write each count just above its bar, centred horizontally.
        for rect in year_bars:
            top = rect.get_height()
            centre_x = rect.get_x() + rect.get_width()/2.
            plt.text(centre_x, top + 1, f'{int(top)}',
                     ha='center', va='bottom', fontweight='bold')

        plt.title('SFB1002 Publications Timeline (2014-2024)', fontsize=16, fontweight='bold')
        plt.xlabel('Year', fontsize=12)
        plt.ylabel('Number of Publications', fontsize=12)
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig(os.path.join(static_dir, "publication_timeline.png"), dpi=300, bbox_inches='tight')
        plt.close()

        # --- Interactive chart: counts on top, collaboration share below ---
        fig = make_subplots(
            rows=2, cols=1,
            subplot_titles=('Publications per Year', 'Collaboration Rate per Year'),
            vertical_spacing=0.1
        )

        count_trace = go.Scatter(x=years, y=counts, mode='lines+markers', name='Publications')
        fig.add_trace(count_trace, row=1, col=1)

        # Mean of the collaboration flag per year, scaled to percent.
        share_by_year = self.main_df.groupby('Publication_Year')['Is_Collaboration'].mean() * 100
        share_trace = go.Scatter(
            x=share_by_year.index,
            y=share_by_year.values,
            mode='lines+markers',
            name='Collaboration Rate (%)',
            line=dict(color='red'),
        )
        fig.add_trace(share_trace, row=2, col=1)

        fig.update_layout(height=600, title_text="SFB1002 Publication Metrics Over Time")
        fig.write_html(os.path.join(interactive_dir, "publication_timeline_interactive.html"))
    
    def create_group_activity_charts(self):
        """Chart how often each working group appears in ``self.individual_df``.

        Writes a horizontal bar chart of the 15 most frequent
        ``Working_Groups_Clean`` values (``top_working_groups.png``) and an
        interactive sunburst of row counts per ``Year_Period`` and group
        (``groups_by_period_sunburst.html``).
        """
        # The 15 groups with the most rows, most frequent first.
        leaders = self.individual_df['Working_Groups_Clean'].value_counts().head(15)
        slots = range(len(leaders))

        # --- Static horizontal bar chart ---
        plt.figure(figsize=(12, 10))
        plt.barh(slots, leaders.values, color='lightcoral')
        plt.yticks(slots, leaders.index)
        plt.xlabel('Number of Publications', fontsize=12)
        plt.title('Top 15 Most Active Working Groups', fontsize=16, fontweight='bold')

        # Print each count slightly to the right of the end of its bar.
        for slot, count in enumerate(leaders.values):
            plt.text(count + 0.5, slot, str(count), va='center', fontweight='bold')

        plt.tight_layout()
        plt.savefig(
            os.path.join(self.output_dir, "02_visualizations/static", "top_working_groups.png"),
            dpi=300,
            bbox_inches='tight',
        )
        plt.close()

        # --- Interactive sunburst: period on the inner ring, group outside ---
        hierarchy = ['Year_Period', 'Working_Groups_Clean']
        per_period = (
            self.individual_df
            .groupby(hierarchy)
            .size()
            .reset_index(name='Publications')
        )

        sunburst = px.sunburst(
            per_period,
            path=['Year_Period', 'Working_Groups_Clean'],
            values='Publications',
            title='Working Group Activity by Time Period'
        )
        sunburst.write_html(
            os.path.join(self.output_dir, "02_visualizations/interactive", "groups_by_period_sunburst.html")
        )
    
    def create_collaboration_analysis(self):
        """Summarise who collaborates with whom.

        Produces three artefacts:

        * ``collaboration_patterns.png`` - bar chart of how many publications
          have each value of ``Num_Collaborating_Groups``;
        * ``top_collaborating_pairs.csv`` - the 20 most frequent unordered
          group pairs found in ``self.collaboration_df['Working_Groups']``;
        * ``collaboration_heatmap.png`` - symmetric pair-count matrix limited
          to the 15 most frequent groups of ``self.individual_df``.
        """
        static_dir = os.path.join(self.output_dir, "02_visualizations/static")

        # --- Distribution of the number of groups per publication ---
        size_distribution = self.main_df['Num_Collaborating_Groups'].value_counts().sort_index()

        plt.figure(figsize=(10, 6))
        size_bars = plt.bar(size_distribution.index, size_distribution.values, color='lightgreen', alpha=0.8)
        plt.xlabel('Number of Collaborating Groups', fontsize=12)
        plt.ylabel('Number of Publications', fontsize=12)
        plt.title('Distribution of Collaboration Patterns', fontsize=16, fontweight='bold')

        for rect in size_bars:
            top = rect.get_height()
            plt.text(rect.get_x() + rect.get_width()/2., top + 1,
                     f'{int(top)}', ha='center', va='bottom', fontweight='bold')

        plt.tight_layout()
        plt.savefig(os.path.join(static_dir, "collaboration_patterns.png"), dpi=300, bbox_inches='tight')
        plt.close()

        # --- Count every unordered pair of groups sharing a publication ---
        pair_collabs = Counter()
        for _, pub in self.collaboration_df.iterrows():
            members = [name.strip().replace('ag_', '') for name in pub['Working_Groups'].split(',')]
            for pos, left in enumerate(members):
                for right in members[pos + 1:]:
                    # Sorting makes (a, b) and (b, a) the same key.
                    pair_collabs[tuple(sorted([left, right]))] += 1

        # Persist the 20 most frequent pairs, with the two names split out.
        top_pairs_df = pd.DataFrame(pair_collabs.most_common(20), columns=['Group_Pair', 'Collaborations'])
        top_pairs_df['Group1'] = [pair[0] for pair in top_pairs_df['Group_Pair']]
        top_pairs_df['Group2'] = [pair[1] for pair in top_pairs_df['Group_Pair']]
        top_pairs_df.to_csv(
            os.path.join(self.output_dir, "06_collaboration_metrics", "top_collaborating_pairs.csv"),
            index=False,
        )

        # --- Heatmap restricted to the 15 most frequent groups ---
        top_15_groups = self.individual_df['Working_Groups_Clean'].value_counts().head(15).index
        collab_matrix = pd.DataFrame(0, index=top_15_groups, columns=top_15_groups)

        for _, pub in self.collaboration_df.iterrows():
            members = [name.strip().replace('ag_', '') for name in pub['Working_Groups'].split(',')]
            shortlisted = [name for name in members if name in top_15_groups]

            for pos, left in enumerate(shortlisted):
                for right in shortlisted[pos + 1:]:
                    # Fill both triangles so the matrix stays symmetric.
                    collab_matrix.loc[left, right] += 1
                    collab_matrix.loc[right, left] += 1

        plt.figure(figsize=(14, 12))
        sns.heatmap(collab_matrix, annot=True, cmap='Blues', fmt='d', square=True)
        plt.title('Collaboration Heatmap: Top 15 Working Groups', fontsize=16, fontweight='bold')
        plt.xlabel('Working Groups', fontsize=12)
        plt.ylabel('Working Groups', fontsize=12)
        plt.xticks(rotation=45, ha='right')
        plt.yticks(rotation=0)
        plt.tight_layout()
        plt.savefig(os.path.join(static_dir, "collaboration_heatmap.png"), dpi=300, bbox_inches='tight')
        plt.close()
    
    def create_publication_type_analysis(self):
        """Chart the mix of publication types, overall and per year.

        Writes ``publication_types_pie.png`` (share of each
        ``Publication_Type`` in ``self.main_df``) and
        ``publication_types_timeline.html`` (stacked area chart of a
        year-by-type cross-tabulation).
        """
        type_counts = self.main_df['Publication_Type'].value_counts()

        # --- Pie chart of overall shares ---
        plt.figure(figsize=(10, 8))
        slice_colors = sns.color_palette("husl", len(type_counts))
        _, _, pct_labels = plt.pie(
            type_counts.values,
            labels=type_counts.index,
            autopct='%1.1f%%',
            colors=slice_colors,
            startangle=90,
        )

        # Make the percentage labels inside the slices white and bold.
        for pct_label in pct_labels:
            pct_label.set_color('white')
            pct_label.set_fontweight('bold')

        plt.title('Distribution of Publication Types', fontsize=16, fontweight='bold')
        plt.axis('equal')
        plt.tight_layout()
        plt.savefig(
            os.path.join(self.output_dir, "02_visualizations/static", "publication_types_pie.png"),
            dpi=300,
            bbox_inches='tight',
        )
        plt.close()

        # --- Interactive stacked area chart: one series per type ---
        type_by_year = pd.crosstab(self.main_df['Publication_Year'], self.main_df['Publication_Type'])
        series_names = type_by_year.columns.tolist()

        area_fig = px.area(
            type_by_year.reset_index(),
            x='Publication_Year',
            y=series_names,
            title='Publication Types Evolution Over Time'
        )
        area_fig.write_html(
            os.path.join(self.output_dir, "02_visualizations/interactive", "publication_types_timeline.html")
        )
    
    def create_journal_analysis(self):
        """Chart where the publications in ``self.main_df`` appeared.

        Writes ``top_journals.png`` (the 20 most frequent ``Journal`` values)
        and ``journal_diversity.png`` (number of distinct journals per
        ``Publication_Year``).
        """
        static_dir = os.path.join(self.output_dir, "02_visualizations/static")

        # --- The 20 most frequent journals as horizontal bars ---
        leading_journals = self.main_df['Journal'].value_counts().head(20)
        slots = range(len(leading_journals))

        plt.figure(figsize=(14, 10))
        plt.barh(slots, leading_journals.values, color='mediumpurple')
        plt.yticks(slots, leading_journals.index)
        plt.xlabel('Number of Publications', fontsize=12)
        plt.title('Top 20 Journals by Publication Count', fontsize=16, fontweight='bold')

        # Print each count just past the end of its bar.
        for slot, count in enumerate(leading_journals.values):
            plt.text(count + 0.1, slot, str(count), va='center', fontweight='bold')

        plt.tight_layout()
        plt.savefig(os.path.join(static_dir, "top_journals.png"), dpi=300, bbox_inches='tight')
        plt.close()

        # --- Distinct journals used in each year ---
        distinct_per_year = self.main_df.groupby('Publication_Year')['Journal'].nunique()

        plt.figure(figsize=(12, 6))
        plt.plot(
            distinct_per_year.index,
            distinct_per_year.values,
            marker='o',
            linewidth=3,
            markersize=8,
            color='purple',
        )
        plt.title('Journal Diversity Over Time', fontsize=16, fontweight='bold')
        plt.xlabel('Year', fontsize=12)
        plt.ylabel('Number of Unique Journals', fontsize=12)
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig(os.path.join(static_dir, "journal_diversity.png"), dpi=300, bbox_inches='tight')
        plt.close()
    
    def create_open_access_analysis(self):
        """Chart and tabulate open-access (OA) patterns.

        Produces three artefacts:

        * ``open_access_trends.png`` - yearly mean of ``Open_Access_Binary``
          in ``self.main_df``, as a percentage;
        * ``open_access_by_type.csv`` - count, sum, mean and OA rate of
          ``Open_Access_Binary`` per ``Publication_Type``;
        * ``open_access_by_groups.html`` - stacked bars of the ``'Yes'`` and
          ``'No'`` values of ``Open_Access`` for the 15 groups with the most
          rows in ``self.individual_df``.
        """

        # Open access rate over time
        oa_time = self.main_df.groupby('Publication_Year')['Open_Access_Binary'].mean() * 100

        plt.figure(figsize=(12, 6))
        bars = plt.bar(oa_time.index, oa_time.values, color='forestgreen', alpha=0.8)
        plt.plot(oa_time.index, oa_time.values, color='darkgreen', marker='o', linewidth=2, markersize=6)

        # Add percentage labels
        for bar in bars:
            height = bar.get_height()
            plt.text(bar.get_x() + bar.get_width()/2., height + 1,
                    f'{height:.1f}%', ha='center', va='bottom', fontweight='bold')

        plt.title('Open Access Rate Over Time', fontsize=16, fontweight='bold')
        plt.xlabel('Year', fontsize=12)
        plt.ylabel('Open Access Rate (%)', fontsize=12)
        plt.ylim(0, 100)
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, "02_visualizations/static", "open_access_trends.png"), dpi=300, bbox_inches='tight')
        plt.close()

        # Open access by publication type
        oa_by_type = self.main_df.groupby('Publication_Type')['Open_Access_Binary'].agg(['count', 'sum', 'mean']).round(3)
        oa_by_type['oa_rate'] = oa_by_type['mean'] * 100
        oa_by_type.to_csv(os.path.join(self.output_dir, "08_impact_analysis", "open_access_by_type.csv"))

        # Interactive stacked bar for OA by groups
        oa_groups = self.individual_df.groupby(['Working_Groups_Clean', 'Open_Access']).size().unstack(fill_value=0)

        # unstack() only creates columns for statuses that actually occur. The
        # chart below plots both 'Yes' and 'No', so add an all-zero column for
        # whichever one is absent instead of letting plotly fail on it.
        for status in ('Yes', 'No'):
            if status not in oa_groups.columns:
                oa_groups[status] = 0

        oa_groups['Total'] = oa_groups.sum(axis=1)
        oa_groups = oa_groups.sort_values('Total', ascending=False).head(15)

        fig = px.bar(
            oa_groups.reset_index(),
            x='Working_Groups_Clean',
            y=['Yes', 'No'],
            title='Open Access Distribution by Top 15 Working Groups',
            labels={'value': 'Publications', 'variable': 'Open Access'}
        )
        fig.update_xaxes(tickangle=45)
        fig.write_html(os.path.join(self.output_dir, "02_visualizations/interactive", "open_access_by_groups.html"))
    
    def create_productivity_analysis(self):
        """Compute per-group and per-author productivity tables.

        Writes ``group_productivity_metrics.csv`` (one row per
        ``Working_Groups_Clean`` value), a bubble chart of publications per
        year against collaboration rate for the 20 largest groups
        (``productivity_vs_collaboration.png``), and
        ``author_productivity_metrics.csv`` (authors with at least three rows
        in ``self.author_df``).
        """
        print("📊 Creating productivity analysis...")

        productivity_dir = os.path.join(self.output_dir, "07_productivity_analysis")

        # --- Per-group metrics ---
        group_aggregations = {
            'Publication_Year': ['count', 'min', 'max'],
            'Is_Collaboration': 'mean',
            'Open_Access_Binary': 'mean'
        }
        group_stats = self.individual_df.groupby('Working_Groups_Clean').agg(group_aggregations).round(3)
        group_stats.columns = ['Total_Publications', 'First_Year', 'Last_Year', 'Collaboration_Rate', 'Open_Access_Rate']

        # Span between first and last publication year, counted inclusively.
        year_span = group_stats['Last_Year'] - group_stats['First_Year'] + 1
        group_stats['Active_Years'] = year_span
        group_stats['Publications_Per_Year'] = group_stats['Total_Publications'] / group_stats['Active_Years']

        group_stats = group_stats.sort_values('Total_Publications', ascending=False)
        group_stats.to_csv(os.path.join(productivity_dir, "group_productivity_metrics.csv"))

        # --- Bubble chart for the 20 largest groups ---
        largest = group_stats.head(20)

        plt.figure(figsize=(12, 8))
        plt.scatter(
            largest['Publications_Per_Year'],
            largest['Collaboration_Rate'] * 100,
            s=largest['Total_Publications'] * 10,
            alpha=0.6,
            c='coral',
        )

        # Only the ten largest groups get a text label.
        labelled = largest.head(10)
        for group_name, per_year, collab_share in zip(
            labelled.index, labelled['Publications_Per_Year'], labelled['Collaboration_Rate']
        ):
            plt.annotate(group_name, (per_year, collab_share * 100),
                         xytext=(5, 5), textcoords='offset points', fontsize=8)

        plt.xlabel('Publications per Year', fontsize=12)
        plt.ylabel('Collaboration Rate (%)', fontsize=12)
        plt.title('Productivity vs Collaboration Rate\n(Bubble size = Total Publications)', fontsize=16, fontweight='bold')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig(
            os.path.join(self.output_dir, "02_visualizations/static", "productivity_vs_collaboration.png"),
            dpi=300,
            bbox_inches='tight',
        )
        plt.close()

        # --- Per-author metrics ---
        author_aggregations = {
            'Publication_Year': ['count', 'min', 'max'],
            'Is_First_Author': 'sum',
            'Is_Last_Author': 'sum',
            'Working_Groups_Clean': 'nunique'
        }
        author_stats = self.author_df.groupby('Author_Name').agg(author_aggregations).round(3)
        author_stats.columns = ['Total_Publications', 'First_Year', 'Last_Year', 'First_Author_Count', 'Last_Author_Count', 'Working_Groups_Count']
        author_stats['Active_Years'] = author_stats['Last_Year'] - author_stats['First_Year'] + 1

        # Keep authors with three or more rows, most prolific first.
        has_enough = author_stats['Total_Publications'] >= 3
        author_stats = author_stats[has_enough].sort_values('Total_Publications', ascending=False)

        author_stats.to_csv(os.path.join(productivity_dir, "author_productivity_metrics.csv"))
    
    def create_strategic_insights(self):
        """Generate strategic insights and recommendations"""
        print("🎯 Generating strategic insights...")
        
        insights = {
            'Research Output Trends': self.analyze_output_trends(),
            'Collaboration Effectiveness': self.analyze_collaboration_effectiveness(),
            'Impact and Visibility': self.analyze_impact_visibility(),
            'Strategic Recommendations': self.generate_recommendations()
        }
        
        # Create comprehensive report
        report_html = self.create_executive_report(insights)
        
        with open(os.path.join(self.output_dir, "03_reports", "executive_summary.html"), 'w', encoding='utf-8') as f:
            f.write(report_html)
        
        # Save insights as structured data
        import json
        with open(os.path.join(self.output_dir, "09_strategic_insights", "insights_summary.json"), 'w') as f:
            json.dump(insights, f, indent=2, default=str)
    
    def analyze_output_trends(self):
        """Analyze research output trends"""
        current_year = 2024
        recent_years = self.main_df[self.main_df['Publication_Year'] >= current_year - 2]
        earlier_years = self.main_df[self.main_df['Publication_Year'] < current_year - 2]
        
        trends = {
            'total_publications': len(self.main_df),
            'recent_output': len(recent_years),
            'earlier_output': len(earlier_years),
            'growth_rate': (len(recent_years) / max(len(earlier_years), 1) - 1) * 100,
            'peak_year': int(self.main_df['Publication_Year'].value_counts().idxmax()),
            'peak_year_count': int(self.main_df['Publication_Year'].value_counts().max()),
            'avg_publications_per_year': len(self.main_df) / (current_year - 2014 + 1),
            'most_productive_groups': dict(self.individual_df['Working_Groups_Clean'].value_counts().head(5)),
            'emerging_groups': self.identify_emerging_groups()
        }
        
        return trends
    
    def identify_emerging_groups(self):
        """Identify groups with increasing publication trends"""
        recent_groups = self.individual_df[self.individual_df['Publication_Year'] >= 2022]['Working_Groups_Clean'].value_counts()
        earlier_groups = self.individual_df[self.individual_df['Publication_Year'] < 2022]['Working_Groups_Clean'].value_counts()
        
        emerging = {}
        for group in recent_groups.index:
            recent_count = recent_groups.get(group, 0)
            earlier_count = earlier_groups.get(group, 0)
            if recent_count > 0 and earlier_count > 0:
                growth = (recent_count / earlier_count - 1) * 100
                if growth > 50:  # 50% growth threshold
                    emerging[group] = round(growth, 1)
        
        return dict(sorted(emerging.items(), key=lambda x: x[1], reverse=True)[:5])
    
    def analyze_collaboration_effectiveness(self):
        """Analyze collaboration patterns and effectiveness"""
        
        collab_stats = {
            'collaboration_rate': round(self.main_df['Is_Collaboration'].mean() * 100, 1),
            'avg_collaborators_per_publication': round(self.main_df['Num_Collaborating_Groups'].mean(), 2),
            'max_collaboration_size': int(self.main_df['Num_Collaborating_Groups'].max()),
            'most_collaborative_groups': dict(
                self.individual_df[self.individual_df['Is_Collaboration'] == True]['Working_Groups_Clean'].value_counts().head(5)
            ),
            'top_collaboration_pairs': self.get_top_collaboration_pairs(),
            'cross_project_collaborations': self.count_cross_project_collaborations(),
            'collaboration_trends': self.analyze_collaboration_trends()
        }
        
        return collab_stats
    
    def get_top_collaboration_pairs(self):
        """Get top collaborating pairs"""
        pair_collabs = Counter()
        for _, row in self.collaboration_df.iterrows():
            groups = [g.strip().replace('ag_', '') for g in row['Working_Groups'].split(',')]
            for i in range(len(groups)):
                for j in range(i+1, len(groups)):
                    pair = f"{groups[i]} + {groups[j]}"
                    pair_collabs[pair] += 1
        
        return dict(pair_collabs.most_common(5))
    
    def count_cross_project_collaborations(self):
        """Count collaborations across different subprojects"""
        cross_project = 0
        for _, row in self.collaboration_df.iterrows():
            if pd.notna(row['Subproject']) and ',' in row['Subproject']:
                projects = [p.strip() for p in row['Subproject'].split(',')]
                unique_projects = set([p[0] for p in projects if len(p) > 0])  # First letter (A, B, C, etc.)
                if len(unique_projects) > 1:
                    cross_project += 1
        
        return cross_project
    
    def analyze_collaboration_trends(self):
        """Analyze how collaboration has evolved"""
        yearly_collab = self.main_df.groupby('Publication_Year')['Is_Collaboration'].mean() * 100
        
        return {
            'early_period_rate': round(yearly_collab[yearly_collab.index <= 2018].mean(), 1),
            'recent_period_rate': round(yearly_collab[yearly_collab.index >= 2019].mean(), 1),
            'trend_direction': 'increasing' if yearly_collab.iloc[-1] > yearly_collab.iloc[0] else 'decreasing'
        }
    
    def analyze_impact_visibility(self):
        """Analyze impact and visibility metrics"""
        
        impact_stats = {
            'open_access_rate': round(self.main_df['Open_Access_Binary'].mean() * 100, 1),
            'journal_diversity': self.main_df['Journal'].nunique(),
            'top_journals': dict(self.main_df['Journal'].value_counts().head(5)),
            'publication_types': dict(self.main_df['Publication_Type'].value_counts()),
            'international_visibility': self.assess_international_visibility(),
            'research_field_diversity': self.assess_field_diversity()
        }
        
        return impact_stats
    
    def assess_international_visibility(self):
        """Assess international research visibility"""
        # This is a simplified assessment based on journal names and patterns
        international_journals = ['Cell', 'Nature', 'Science', 'PNAS', 'Journal', 'European', 'International']
        
        international_pubs = 0
        for journal in self.main_df['Journal']:
            if any(keyword in str(journal) for keyword in international_journals):
                international_pubs += 1
        
        return {
            'international_publications': international_pubs,
            'international_rate': round(international_pubs / len(self.main_df) * 100, 1)
        }
    
    def assess_field_diversity(self):
        """Assess research field diversity based on journal patterns"""
        # Simplified field classification based on journal keywords
        field_keywords = {
            'Cardiology': ['cardio', 'heart', 'cardiac'],
            'Cell Biology': ['cell', 'molecular', 'biology'],
            'Physiology': ['physiol', 'function'],
            'Biochemistry': ['biochem', 'protein', 'enzyme'],
            'Medical': ['medical', 'clinical', 'medicine']
        }
        
        field_counts = defaultdict(int)
        for journal in self.main_df['Journal']:
            journal_lower = str(journal).lower()
            for field, keywords in field_keywords.items():
                if any(keyword in journal_lower for keyword in keywords):
                    field_counts[field] += 1
                    break
            else:
                field_counts['Other'] += 1
        
        return dict(field_counts)
    
    def generate_recommendations(self):
        """Generate strategic recommendations"""
        
        recommendations = {
            'Collaboration Enhancement': [
                "Consider facilitating more cross-project (A-B-C) collaborations to increase innovation",
                "Organize regular collaboration workshops for underconnected groups",
                "Create incentives for multi-group publications"
            ],
            'Research Output Optimization': [
                "Support emerging high-growth working groups with additional resources",
                "Encourage consistent publication output from all groups",
                "Consider establishing publication targets and support mechanisms"
            ],
            'Visibility and Impact': [
                f"Increase open access rate from current {self.main_df['Open_Access_Binary'].mean() * 100:.1f}% to >80%",
                "Target more high-impact international journals",
                "Develop strategic communication plan for research dissemination"
            ],
            'Network Strengthening': [
                "Identify and support key connector groups in collaboration networks",
                "Foster new collaboration pathways between isolated groups",
                "Create mentorship programs pairing high-output with emerging groups"
            ]
        }
        
        return recommendations
    
    def create_executive_report(self, insights):
        """Create comprehensive HTML executive report"""
        
        html_template = f"""
        <!DOCTYPE html>
        <html>
        <head>
            <title>SFB1002 Publication Analysis - Executive Summary</title>
            <style>
                body {{ font-family: 'Segoe UI', Arial, sans-serif; margin: 40px; line-height: 1.6; color: #333; }}
                .header {{ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 30px; border-radius: 10px; margin-bottom: 30px; }}
                .section {{ margin: 30px 0; padding: 20px; border-left: 4px solid #667eea; background: #f8f9fa; border-radius: 5px; }}
                .metric {{ display: inline-block; margin: 10px 20px 10px 0; padding: 15px; background: white; border-radius: 8px; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }}
                .metric-value {{ font-size: 2em; font-weight: bold; color: #667eea; }}
                .metric-label {{ font-size: 0.9em; color: #666; }}
                .recommendation {{ background: #e8f5e8; padding: 15px; margin: 10px 0; border-radius: 5px; border-left: 4px solid #28a745; }}
                ul {{ margin: 15px 0; }}
                li {{ margin: 5px 0; }}
                .top-list {{ background: white; padding: 15px; border-radius: 5px; margin: 10px 0; }}
            </style>
        </head>
        <body>
            <div class="header">
                <h1>SFB1002 Publication Analysis</h1>
                <h2>Executive Summary & Strategic Insights</h2>
                <p>Analysis Period: 2014-2024 | Generated: {datetime.now().strftime('%Y-%m-%d %H:%M')}</p>
            </div>
            
            <div class="section">
                <h2>📊 Key Performance Metrics</h2>
                <div class="metric">
                    <div class="metric-value">{insights['Research Output Trends']['total_publications']}</div>
                    <div class="metric-label">Total Publications</div>
                </div>
                <div class="metric">
                    <div class="metric-value">{len(set(self.individual_df['Working_Groups_Clean']))}</div>
                    <div class="metric-label">Active Working Groups</div>
                </div>
                <div class="metric">
                    <div class="metric-value">{insights['Collaboration Effectiveness']['collaboration_rate']}%</div>
                    <div class="metric-label">Collaboration Rate</div>
                </div>
                <div class="metric">
                    <div class="metric-value">{insights['Impact and Visibility']['open_access_rate']}%</div>
                    <div class="metric-label">Open Access Rate</div>
                </div>
            </div>
            
            <div class="section">
                <h2>📈 Research Output Trends</h2>
                <p><strong>Peak Performance:</strong> {insights['Research Output Trends']['peak_year']} with {insights['Research Output Trends']['peak_year_count']} publications</p>
                <p><strong>Average Output:</strong> {insights['Research Output Trends']['avg_publications_per_year']:.1f} publications per year</p>
                
                <div class="top-list">
                    <h3>Most Productive Working Groups:</h3>
                    <ul>
                        {''.join([f"<li>{group}: {count} publications</li>" for group, count in insights['Research Output Trends']['most_productive_groups'].items()])}
                    </ul>
                </div>
                
                <div class="top-list">
                    <h3>Emerging High-Growth Groups:</h3>
                    <ul>
                        {''.join([f"<li>{group}: +{growth}% growth</li>" for group, growth in insights['Research Output Trends']['emerging_groups'].items()])}
                    </ul>
                </div>
            </div>
            
            <div class="section">
                <h2>🤝 Collaboration Analysis</h2>
                <p><strong>Collaboration Effectiveness:</strong> {insights['Collaboration Effectiveness']['collaboration_rate']}% of publications involve multiple groups</p>
                <p><strong>Average Collaborators:</strong> {insights['Collaboration Effectiveness']['avg_collaborators_per_publication']} groups per publication</p>
                <p><strong>Cross-Project Collaborations:</strong> {insights['Collaboration Effectiveness']['cross_project_collaborations']} publications span multiple subprojects</p>
                
                <div class="top-list">
                    <h3>Top Collaboration Pairs:</h3>
                    <ul>
                        {''.join([f"<li>{pair}: {count} joint publications</li>" for pair, count in insights['Collaboration Effectiveness']['top_collaboration_pairs'].items()])}
                    </ul>
                </div>
            </div>
            
            <div class="section">
                <h2>🎯 Impact & Visibility</h2>
                <p><strong>Journal Diversity:</strong> {insights['Impact and Visibility']['journal_diversity']} unique journals</p>
                <p><strong>International Visibility:</strong> {insights['Impact and Visibility']['international_visibility']['international_rate']}% publications in international journals</p>
                
                <div class="top-list">
                    <h3>Top Publication Venues:</h3>
                    <ul>
                        {''.join([f"<li>{journal}: {count} publications</li>" for journal, count in insights['Impact and Visibility']['top_journals'].items()])}
                    </ul>
                </div>
            </div>
            
            <div class="section">
                <h2>💡 Strategic Recommendations</h2>
                {self.format_recommendations(insights['Strategic Recommendations'])}
            </div>
            
            <div class="section">
                <h2>📁 Generated Outputs</h2>
                <p>This analysis has generated comprehensive outputs organized in:</p>
                <ul>
                    <li><strong>Cleaned Data:</strong> 5 specialized datasets for different analysis perspectives</li>
                    <li><strong>Visualizations:</strong> 15+ static and interactive charts, plus network diagrams</li>
                    <li><strong>Network Analysis:</strong> Collaboration networks with centrality metrics</li>
                    <li><strong>Temporal Analysis:</strong> Evolution of collaboration patterns over time</li>
                    <li><strong>Productivity Metrics:</strong> Group and author performance indicators</li>
                    <li><strong>Strategic Insights:</strong> Data-driven recommendations for future planning</li>
                </ul>
            </div>
        </body>
        </html>
        """
        
        return html_template
    
    def format_recommendations(self, recommendations):
        """Render the recommendations mapping as an HTML fragment.

        Every key of ``recommendations`` becomes an ``<h3>`` heading, followed
        by one ``<div class="recommendation">`` per item of the iterable stored
        under that key. Headings and items are inserted verbatim (no HTML
        escaping is applied). An empty mapping yields an empty string.
        """
        fragments = []
        for heading, entries in recommendations.items():
            fragments.append(f"<h3>{heading}</h3>")
            fragments.extend(
                f'<div class="recommendation">{entry}</div>' for entry in entries
            )
        return "".join(fragments)
    
    def run_complete_analysis(self):
        """Execute every stage of the analysis pipeline in order and print a recap.

        Stages run sequentially: data loading/cleaning, basic statistics,
        temporal analysis, collaboration networks, visualizations, productivity
        analysis and strategic insights. Only the statistics stage's return
        value is used here; its ``'Overview'`` entry feeds the closing summary.
        """
        divider = "=" * 60

        print("🚀 Starting SFB1002 Complete Publication Analysis...")
        print(divider)

        # Steps 1-2: load/clean the data, then compute the statistics that the
        # final recap is printed from.
        self.load_and_clean_data()
        stats = self.generate_basic_statistics()

        # Steps 3-7: stages whose return values are not needed here, executed
        # in this fixed order.
        remaining_stages = (
            self.create_temporal_analysis,             # Step 3: temporal analysis
            self.create_collaboration_networks,        # Step 4: network analysis
            self.create_comprehensive_visualizations,  # Step 5: visualizations
            self.create_productivity_analysis,         # Step 6: productivity analysis
            self.create_strategic_insights,            # Step 7: strategic insights
        )
        for run_stage in remaining_stages:
            run_stage()

        print("\n" + divider)
        print("✅ Analysis Complete!")
        print(f"📁 All outputs saved to: {self.output_dir}")
        print("\n📋 Summary:")

        overview = stats['Overview']
        print(f"   • {overview['Total Publications']} publications analyzed")
        print(f"   • {overview['Unique Working Groups']} working groups")
        print(f"   • {overview['Unique Authors']} unique authors")
        print(f"   • {overview['Collaboration Rate']} collaboration rate")
        print("   • 10 organized output folders with 50+ analysis files")

        print("\n🎯 Key files to review:")
        print("   • 03_reports/executive_summary.html")
        print("   • 02_visualizations/interactive/ (for detailed exploration)")
        print("   • 06_collaboration_metrics/ (for network insights)")

# Usage Instructions
if __name__ == "__main__":
    # Script entry point. The bare string below is a usage note kept as a
    # no-op expression statement; it is not printed or otherwise used.
    """
    To run this analysis:
    
    1. Install required packages:
       pip install pandas numpy matplotlib seaborn plotly networkx openpyxl
    
    2. Update the file path below to your Excel file location
    
    3. Run the analysis:
       python sfb1002_analysis.py
    """

    # CHANGE THIS PATH TO YOUR EXCEL FILE LOCATION
    file_path = "2025-07-01_excel_export.xlsx"

    # Build the analyzer (this also creates the output folders) and run
    # every stage of the pipeline.
    analyzer = SFB1002Analyzer(file_path)
    analyzer.run_complete_analysis()

    # Closing banner: blank line, emoji row, message, emoji row.
    print(
        "\n" + "🎉" * 20,
        "ANALYSIS COMPLETED SUCCESSFULLY!",
        "🎉" * 20,
        sep="\n",
    )

# Additional utility functions for extended analysis

def create_advanced_network_metrics(analyzer):
    """Compute extra centrality and graph-level metrics for the saved group network.

    Reads ``02_visualizations/networks/group_network.gexf`` below
    ``analyzer.output_dir`` and writes two files into the
    ``06_collaboration_metrics`` folder:

    * ``advanced_centrality_metrics.csv`` - one row per node with clustering,
      PageRank, Katz, harmonic and load centrality.
    * ``network_statistics.txt`` - graph-level figures (density, transitivity,
      average clustering, node/edge counts, average degree, diameter, radius).

    If the GEXF file does not exist, or the graph it contains has no nodes,
    a warning is printed and nothing is written.

    Args:
        analyzer: Object exposing an ``output_dir`` attribute (the root of the
            analysis output tree).

    Returns:
        None.
    """
    network_file = os.path.join(analyzer.output_dir, "02_visualizations/networks", "group_network.gexf")
    if not os.path.exists(network_file):
        print(f"⚠️ Group network file not found, skipping advanced network metrics: {network_file}")
        return

    G = nx.read_gexf(network_file)
    node_count = G.number_of_nodes()
    if node_count == 0:
        # The average degree below divides by the node count, and
        # nx.is_connected is documented to raise on a graph without nodes.
        print("⚠️ Group network contains no nodes, skipping advanced network metrics")
        return

    # Node-level centrality metrics (each call returns a {node: value} dict).
    # NOTE(review): katz_centrality runs with its default alpha; per the
    # NetworkX docs it fails to converge when alpha exceeds 1/lambda_max, which
    # may happen on dense graphs - confirm for the real network.
    advanced_metrics = {
        'clustering': nx.clustering(G),
        'pagerank': nx.pagerank(G),
        'katz_centrality': nx.katz_centrality(G, max_iter=1000),
        'harmonic_centrality': nx.harmonic_centrality(G),
        'load_centrality': nx.load_centrality(G)
    }

    # Diameter and radius are only computed for a connected graph; evaluate
    # connectivity once and reuse the answer for both entries.
    connected = nx.is_connected(G)
    not_connected = 'Graph not connected'

    network_stats = {
        'density': nx.density(G),
        'transitivity': nx.transitivity(G),
        'average_clustering': nx.average_clustering(G),
        'number_of_nodes': node_count,
        'number_of_edges': G.number_of_edges(),
        'average_degree': sum(degree for _, degree in G.degree()) / node_count,
        'diameter': nx.diameter(G) if connected else not_connected,
        'radius': nx.radius(G) if connected else not_connected
    }

    metrics_dir = os.path.join(analyzer.output_dir, "06_collaboration_metrics")

    # Save advanced metrics (rows = nodes, columns = metrics)
    pd.DataFrame(advanced_metrics).to_csv(os.path.join(metrics_dir, "advanced_centrality_metrics.csv"))

    # Save network statistics; explicit UTF-8 keeps the file independent of
    # the platform's default encoding.
    with open(os.path.join(metrics_dir, "network_statistics.txt"), 'w', encoding='utf-8') as f:
        f.write("NETWORK-LEVEL STATISTICS\n")
        f.write("=" * 30 + "\n\n")
        f.writelines(
            f"{metric.replace('_', ' ').title()}: {value}\n"
            for metric, value in network_stats.items()
        )

def create_longitudinal_collaboration_analysis(analyzer, start_year=2014, end_year=2024):
    """Track collaboration year by year, then save and plot the metrics.

    For every year in ``start_year..end_year`` (inclusive) that has at least
    one row in ``analyzer.main_df`` the following values are recorded:
    collaboration rate (%), mean ``Num_Collaborating_Groups`` of the
    collaborative publications, number of distinct group names found in the
    comma-separated ``Working_Groups`` values of those publications, total
    publications and collaborative publications.

    Years with publications but no collaboration are kept with a rate of 0
    (their average is NaN) so the time series has no silent gaps. Years with
    no publications are omitted.

    Outputs (relative to ``analyzer.output_dir``):
        * ``05_temporal_analysis/longitudinal_collaboration_metrics.csv``
        * ``02_visualizations/static/longitudinal_collaboration_analysis.png``

    Args:
        analyzer: Object exposing ``main_df`` (with the columns
            ``Publication_Year``, ``Is_Collaboration``,
            ``Num_Collaborating_Groups`` and ``Working_Groups``) and
            ``output_dir``.
        start_year: First year to include (default 2014).
        end_year: Last year to include (default 2024).

    Returns:
        None. When no year in the range has publications a warning is printed
        and no file is written.
    """
    print("📊 Creating longitudinal collaboration analysis...")

    main_df = analyzer.main_df
    yearly_collaboration_strength = {}

    for year in range(start_year, end_year + 1):
        year_data = main_df[main_df['Publication_Year'] == year]
        if year_data.empty:
            # No publications at all: there is no rate to compute for this year.
            continue

        year_collabs = year_data[year_data['Is_Collaboration'] == True]

        # Collect the distinct group names over all collaborative publications.
        # Missing values are skipped and blank fragments (e.g. from a trailing
        # comma) are not counted as a group.
        unique_collaborating_groups = set()
        for group_list in year_collabs['Working_Groups'].dropna():
            for group in str(group_list).split(','):
                group = group.strip()
                if group:
                    unique_collaborating_groups.add(group)

        yearly_collaboration_strength[year] = {
            'collaboration_rate': len(year_collabs) / len(year_data) * 100,
            # Mean over an empty selection is NaN, which is plotted as a gap.
            'avg_collaborators_per_pub': year_collabs['Num_Collaborating_Groups'].mean(),
            'unique_collaborating_groups': len(unique_collaborating_groups),
            'total_publications': len(year_data),
            'collaborative_publications': len(year_collabs)
        }

    if not yearly_collaboration_strength:
        # Without any row the metric columns would not exist and the plotting
        # code below would fail with a KeyError.
        print(f"⚠️ No publications found between {start_year} and {end_year}; skipping longitudinal analysis")
        return

    # Save longitudinal data (rows = years, columns = metrics)
    longitudinal_df = pd.DataFrame(yearly_collaboration_strength).T
    longitudinal_df.to_csv(os.path.join(analyzer.output_dir, "05_temporal_analysis", "longitudinal_collaboration_metrics.csv"))

    # Create visualization
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    years = longitudinal_df.index

    # Plot 1: Collaboration rate over time
    axes[0,0].plot(years, longitudinal_df['collaboration_rate'], marker='o', linewidth=2, markersize=6)
    axes[0,0].set_title('Collaboration Rate Over Time')
    axes[0,0].set_ylabel('Collaboration Rate (%)')
    axes[0,0].grid(True, alpha=0.3)

    # Plot 2: Average collaborators per publication
    axes[0,1].plot(years, longitudinal_df['avg_collaborators_per_pub'], marker='s', linewidth=2, markersize=6, color='orange')
    axes[0,1].set_title('Average Collaborators per Publication')
    axes[0,1].set_ylabel('Average Collaborators')
    axes[0,1].grid(True, alpha=0.3)

    # Plot 3: Number of collaborating groups
    axes[1,0].bar(years, longitudinal_df['unique_collaborating_groups'], alpha=0.7, color='green')
    axes[1,0].set_title('Unique Collaborating Groups per Year')
    axes[1,0].set_ylabel('Number of Groups')
    axes[1,0].set_xlabel('Year')

    # Plot 4: Total vs Collaborative publications (collaborative bars are
    # drawn on top of the total bars)
    axes[1,1].bar(years, longitudinal_df['total_publications'], alpha=0.7, label='Total Publications', color='lightblue')
    axes[1,1].bar(years, longitudinal_df['collaborative_publications'], alpha=0.9, label='Collaborative Publications', color='darkblue')
    axes[1,1].set_title('Publications: Total vs Collaborative')
    axes[1,1].set_ylabel('Number of Publications')
    axes[1,1].set_xlabel('Year')
    axes[1,1].legend()

    plt.tight_layout()
    plt.savefig(os.path.join(analyzer.output_dir, "02_visualizations/static", "longitudinal_collaboration_analysis.png"), dpi=300, bbox_inches='tight')
    plt.close(fig)

def create_author_influence_network(analyzer):
    """Build, score, plot and export the co-authorship network of influential authors.

    Steps:
        1. Aggregate ``analyzer.author_df`` per ``Author_Name`` and compute an
           influence score: ``2 * first-author count + 3 * last-author count
           + total publications``.
        2. Keep the authors whose score is at or above the 70th percentile.
        3. Link two kept authors for every publication they share (edge
           ``weight`` = number of shared publications).
        4. Compute betweenness, closeness, degree and eigenvector centrality.

    Publications are identified by ``(DOI, Publication_Year)``. Rows without
    a DOI fall back to ``(Title, Publication_Year)`` when ``author_df`` has a
    ``Title`` column; otherwise they are skipped for edge construction,
    because a missing DOI would lump every DOI-less publication of a year
    into one fake publication. An author listed more than once for the same
    publication is counted once, so no self-loops are created.

    Outputs (relative to ``analyzer.output_dir``), written only when the graph
    has at least one node:
        * ``06_collaboration_metrics/author_influence_analysis.csv``
        * ``02_visualizations/networks/author_influence_network.png``
        * ``02_visualizations/networks/author_influence_network.gexf``

    Args:
        analyzer: Object exposing ``author_df`` (with the columns
            ``Author_Name``, ``Publication_Year``, ``Is_First_Author``,
            ``Is_Last_Author``, ``Working_Groups_Clean`` and ``DOI``) and
            ``output_dir``.

    Returns:
        None.
    """
    # Function-scope import so this block does not depend on edits to the
    # file-level import section.
    from itertools import combinations

    print("👤 Creating author influence network...")

    author_df = analyzer.author_df

    # Focus on authors with significant contributions
    author_stats = author_df.groupby('Author_Name').agg({
        'Publication_Year': ['count', 'min', 'max'],
        'Is_First_Author': 'sum',
        'Is_Last_Author': 'sum',
        'Working_Groups_Clean': lambda x: list(set(x))
    })

    author_stats.columns = ['Total_Pubs', 'First_Year', 'Last_Year', 'First_Author_Count', 'Last_Author_Count', 'Working_Groups']
    author_stats['Influence_Score'] = (author_stats['First_Author_Count'] * 2 +
                                     author_stats['Last_Author_Count'] * 3 +
                                     author_stats['Total_Pubs'])

    # Select influential authors (top 30% by influence score)
    threshold = author_stats['Influence_Score'].quantile(0.7)
    influential_authors = author_stats[author_stats['Influence_Score'] >= threshold]

    # Create author collaboration network
    G = nx.Graph()

    # Add nodes with attributes
    for author, stats in influential_authors.iterrows():
        G.add_node(author,
                  total_pubs=stats['Total_Pubs'],
                  influence_score=stats['Influence_Score'],
                  working_groups=len(stats['Working_Groups']))

    def _has_value(value):
        """Return True when ``value`` is neither missing nor a blank string."""
        return not pd.isna(value) and str(value).strip() != ''

    # Group the influential authors by publication. A dict is used as an
    # insertion-ordered set so a repeated author is stored only once.
    influential_rows = author_df[author_df['Author_Name'].isin(influential_authors.index)]
    if 'Title' in influential_rows.columns:
        titles = influential_rows['Title']
    else:
        titles = [None] * len(influential_rows)

    author_publications = defaultdict(dict)
    rows_without_identifier = 0
    for name, doi, year, title in zip(influential_rows['Author_Name'],
                                      influential_rows['DOI'],
                                      influential_rows['Publication_Year'],
                                      titles):
        if _has_value(doi):
            pub_key = ('doi', doi, year)
        elif _has_value(title):
            pub_key = ('title', title, year)
        else:
            rows_without_identifier += 1
            continue
        author_publications[pub_key][name] = None

    if rows_without_identifier:
        print(f"   ⚠️ {rows_without_identifier} author rows without DOI/title were ignored for co-authorship edges")

    # Add collaboration edges: one weight unit per shared publication
    for authors in author_publications.values():
        for first, second in combinations(authors, 2):
            if G.has_edge(first, second):
                G[first][second]['weight'] += 1
            else:
                G.add_edge(first, second, weight=1)

    # Calculate centrality metrics
    if len(G.nodes()) > 0:
        author_centrality = {
            'betweenness': nx.betweenness_centrality(G),
            'closeness': nx.closeness_centrality(G),
            'degree': nx.degree_centrality(G),
            'eigenvector': nx.eigenvector_centrality(G, max_iter=1000)
        }

        # Combine with influence scores (rows are indexed by author name)
        influence_analysis = pd.DataFrame(author_centrality)
        influence_analysis['Influence_Score'] = [influential_authors.loc[author, 'Influence_Score'] for author in influence_analysis.index]
        influence_analysis['Total_Publications'] = [influential_authors.loc[author, 'Total_Pubs'] for author in influence_analysis.index]

        influence_analysis.to_csv(os.path.join(analyzer.output_dir, "06_collaboration_metrics", "author_influence_analysis.csv"))

        # Visualize author network
        plt.figure(figsize=(20, 16))
        pos = nx.spring_layout(G, k=3, iterations=100)

        # Node sizes based on influence score
        node_sizes = [G.nodes[node]['influence_score'] * 20 for node in G.nodes()]

        # Node colors based on number of working groups
        node_colors = [G.nodes[node]['working_groups'] for node in G.nodes()]

        # Edge weights
        edge_weights = [G[u][v]['weight'] for u, v in G.edges()]

        # Draw network
        nodes = nx.draw_networkx_nodes(G, pos, node_size=node_sizes, node_color=node_colors,
                                     cmap='viridis', alpha=0.7)
        nx.draw_networkx_edges(G, pos, width=[w*0.5 for w in edge_weights], alpha=0.4)

        # Add labels for top authors only (last whitespace-separated token of
        # the name); every other node gets an empty label
        top_authors = set(influence_analysis.nlargest(15, 'Influence_Score').index)
        labels = {author: author.split()[-1] if author in top_authors else '' for author in G.nodes()}
        nx.draw_networkx_labels(G, pos, labels, font_size=8, font_weight='bold')

        # Add colorbar
        plt.colorbar(nodes, label='Number of Working Groups')

        plt.title('Author Influence Network\n(Node size = Influence Score, Color = Working Group Diversity)',
                 fontsize=16, fontweight='bold')
        plt.axis('off')
        plt.tight_layout()
        plt.savefig(os.path.join(analyzer.output_dir, "02_visualizations/networks", "author_influence_network.png"),
                   dpi=300, bbox_inches='tight')
        plt.close()

        # Save network
        nx.write_gexf(G, os.path.join(analyzer.output_dir, "02_visualizations/networks", "author_influence_network.gexf"))

def create_publication_impact_analysis(analyzer):
    """Classify journals by a keyword heuristic and summarize impact per working group.

    A journal is labelled ``'High Impact'`` when its lower-cased name contains
    one of a fixed list of keywords (plain substring match, so e.g. any name
    containing "cell" matches); every other journal is ``'Standard'``. The
    label is stored in ``analyzer.main_df['Journal_Impact_Category']``.

    The per-group aggregation runs on ``analyzer.individual_df``. If that
    frame does not already carry a ``Journal_Impact_Category`` column, the
    label is derived from its own ``Journal`` column on a local copy (the
    stored frame is not modified); previously the aggregation failed with a
    ``KeyError`` in that situation.
    NOTE(review): this assumes ``individual_df`` has a ``Journal`` column -
    confirm against the code that builds it.

    Outputs (relative to ``analyzer.output_dir``):
        * ``08_impact_analysis/impact_analysis_by_group.csv``
        * ``02_visualizations/static/impact_analysis_by_group.png`` (top 15
          groups by high-impact rate)

    Args:
        analyzer: Object exposing ``main_df``, ``individual_df`` and
            ``output_dir``.

    Returns:
        None.
    """
    print("📈 Creating publication impact analysis...")

    # Journal impact classification (simplified)
    high_impact_keywords = ['nature', 'science', 'cell', 'lancet', 'nejm', 'pnas']
    # The keywords are plain lowercase words, so joining them with "|" gives
    # a regex that means "contains any of them".
    keyword_pattern = '|'.join(high_impact_keywords)

    def classify(journals):
        """Return an array of 'High Impact'/'Standard' labels, one per journal."""
        is_high_impact = journals.astype(str).str.lower().str.contains(keyword_pattern)
        return np.where(is_high_impact.astype(bool), 'High Impact', 'Standard')

    analyzer.main_df['Journal_Impact_Category'] = classify(analyzer.main_df['Journal'])

    # Make sure the frame that is aggregated actually carries the label
    individual_df = analyzer.individual_df
    if 'Journal_Impact_Category' not in individual_df.columns:
        individual_df = individual_df.assign(
            Journal_Impact_Category=classify(individual_df['Journal'])
        )

    # Impact analysis by groups
    impact_by_group = individual_df.groupby('Working_Groups_Clean').agg({
        'Journal_Impact_Category': lambda x: (x == 'High Impact').sum(),
        'Open_Access_Binary': 'mean',
        'Publication_Year': 'count'
    })

    impact_by_group.columns = ['High_Impact_Count', 'Open_Access_Rate', 'Total_Publications']
    impact_by_group['High_Impact_Rate'] = impact_by_group['High_Impact_Count'] / impact_by_group['Total_Publications'] * 100
    impact_by_group = impact_by_group.sort_values('High_Impact_Rate', ascending=False)

    impact_by_group.to_csv(os.path.join(analyzer.output_dir, "08_impact_analysis", "impact_analysis_by_group.csv"))

    # Create impact visualization
    top_impact_groups = impact_by_group.head(15)
    bar_positions = range(len(top_impact_groups))

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 8))

    # High impact rate
    ax1.barh(bar_positions, top_impact_groups['High_Impact_Rate'], color='gold')
    ax1.set_yticks(bar_positions)
    ax1.set_yticklabels(top_impact_groups.index)
    ax1.set_xlabel('High Impact Publication Rate (%)')
    ax1.set_title('High Impact Publication Rate by Working Group')

    # Open access rate (mean of the binary flag, shown as a percentage)
    ax2.barh(bar_positions, top_impact_groups['Open_Access_Rate'] * 100, color='lightgreen')
    ax2.set_yticks(bar_positions)
    ax2.set_yticklabels(top_impact_groups.index)
    ax2.set_xlabel('Open Access Rate (%)')
    ax2.set_title('Open Access Rate by Working Group')

    plt.tight_layout()
    plt.savefig(os.path.join(analyzer.output_dir, "02_visualizations/static", "impact_analysis_by_group.png"), dpi=300, bbox_inches='tight')
    plt.close(fig)

def generate_final_summary_report(analyzer):
    """Write a plain-text summary of the key statistics to the insights folder.

    The report is saved as
    ``<analyzer.output_dir>/09_strategic_insights/comprehensive_summary.txt``
    (UTF-8; the folder is created if missing). It contains four sections -
    data overview, collaboration metrics, productivity leaders and
    quality/impact indicators - followed by a fixed list of the output
    folders.

    The "Data Source" line reports ``analyzer.file_path``; if the analyzer has
    no such attribute the original default file name is used.

    Args:
        analyzer: Object exposing ``main_df``, ``individual_df``,
            ``author_df`` and ``output_dir`` (and optionally ``file_path``).

    Returns:
        None.
    """
    print("📋 Generating final comprehensive summary...")

    main_df = analyzer.main_df
    individual_df = analyzer.individual_df
    author_df = analyzer.author_df

    # Publications per year, computed once for both "peak" entries. idxmax()
    # raises on an empty series, so empty data is reported as N/A instead.
    publications_per_year = main_df['Publication_Year'].value_counts()
    if publications_per_year.empty:
        peak_year, peak_count = 'N/A', 0
    else:
        peak_year, peak_count = publications_per_year.idxmax(), publications_per_year.max()

    collaborative_rows = individual_df[individual_df['Is_Collaboration'] == True]

    # Compile all key statistics
    summary_stats = {
        'Data Overview': {
            'Total Publications': len(main_df),
            'Time Period': f"{main_df['Publication_Year'].min()}-{main_df['Publication_Year'].max()}",
            'Unique Working Groups': individual_df['Working_Groups_Clean'].nunique(),
            'Unique Authors': author_df['Author_Name'].nunique(),
            'Unique Journals': main_df['Journal'].nunique(),
        },

        'Collaboration Metrics': {
            'Overall Collaboration Rate': f"{main_df['Is_Collaboration'].mean() * 100:.1f}%",
            'Average Collaborators per Publication': f"{main_df['Num_Collaborating_Groups'].mean():.2f}",
            'Maximum Collaboration Size': main_df['Num_Collaborating_Groups'].max(),
            'Most Collaborative Groups': dict(collaborative_rows['Working_Groups_Clean'].value_counts().head(5))
        },

        'Productivity Leaders': {
            'Most Productive Groups': dict(individual_df['Working_Groups_Clean'].value_counts().head(10)),
            'Most Prolific Authors': dict(author_df['Author_Name'].value_counts().head(10)),
            'Peak Publication Year': peak_year,
            'Peak Year Count': peak_count
        },

        'Quality & Impact Indicators': {
            'Open Access Rate': f"{main_df['Open_Access_Binary'].mean() * 100:.1f}%",
            'Journal Article Rate': f"{(main_df['Publication_Type'] == 'Journal Article').mean() * 100:.1f}%",
            'Review Article Rate': f"{(main_df['Publication_Type'] == 'Review').mean() * 100:.1f}%",
            'Top Publishing Journals': dict(main_df['Journal'].value_counts().head(5))
        }
    }

    # Report the file that was actually analyzed rather than a fixed name.
    data_source = getattr(analyzer, 'file_path', "2025-07-01_excel_export.xlsx")

    # Create final summary file
    summary_dir = os.path.join(analyzer.output_dir, "09_strategic_insights")
    os.makedirs(summary_dir, exist_ok=True)
    summary_path = os.path.join(summary_dir, "comprehensive_summary.txt")

    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write("SFB1002 PUBLICATION ANALYSIS - COMPREHENSIVE SUMMARY\n")
        f.write("=" * 60 + "\n\n")
        f.write(f"Analysis Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Data Source: {data_source}\n\n")

        for section, data in summary_stats.items():
            f.write(f"\n{section.upper()}\n")
            f.write("-" * len(section) + "\n")

            for key, value in data.items():
                if isinstance(value, dict):
                    # Ranked lists are written as an indented bullet list
                    f.write(f"\n{key}:\n")
                    for k, v in value.items():
                        f.write(f"  • {k}: {v}\n")
                else:
                    f.write(f"{key}: {value}\n")

        f.write("\n\nANALYSIS OUTPUTS GENERATED\n")
        f.write("=" * 30 + "\n")
        f.write("• 5 cleaned datasets in 01_cleaned_data/\n")
        f.write("• 20+ visualizations in 02_visualizations/\n")
        f.write("• Network analysis files in 02_visualizations/networks/\n")
        f.write("• Executive summary in 03_reports/\n")
        f.write("• Statistical summaries in 04_statistics/\n")
        f.write("• Temporal analysis in 05_temporal_analysis/\n")
        f.write("• Collaboration metrics in 06_collaboration_metrics/\n")
        f.write("• Productivity analysis in 07_productivity_analysis/\n")
        f.write("• Impact analysis in 08_impact_analysis/\n")
        f.write("• Strategic insights in 09_strategic_insights/\n")

# Extended analysis runner
def run_extended_analysis(file_path="2025-07-01_excel_export.xlsx"):
    """Run the main pipeline followed by all extended analysis components.

    Args:
        file_path: Path of the Excel export to analyze. Defaults to the file
            name that was previously hard-coded, so existing calls without
            arguments behave as before.

    Returns:
        None. Progress and a closing summary are printed.
    """
    # Run main analysis
    analyzer = SFB1002Analyzer(file_path)
    analyzer.run_complete_analysis()

    # Run extended analyses, in this fixed order; each one receives the
    # analyzer that has just completed the main pipeline.
    print("\n🔬 Running Extended Analysis Components...")

    extended_steps = (
        create_advanced_network_metrics,
        create_longitudinal_collaboration_analysis,
        create_author_influence_network,
        create_publication_impact_analysis,
        generate_final_summary_report,
    )
    for run_step in extended_steps:
        run_step(analyzer)

    print("\n" + "🎊" * 25)
    print("COMPLETE EXTENDED ANALYSIS FINISHED!")
    print("🎊" * 25)
    print(f"\n📂 Check your results in: {analyzer.output_dir}")
    print("\n🏆 ANALYSIS HIGHLIGHTS:")
    print("   • Complete collaboration network analysis")
    print("   • Author influence and centrality metrics")
    print("   • Longitudinal trend analysis")
    print("   • Publication impact assessment")
    print("   • Strategic recommendations")
    print("   • 60+ output files across 10 categories")

# Uncomment the line below to run the extended analysis
# run_extended_analysis()

✅ Output structure created at: C:\Users\Kuntz1\Desktop\SFB1002_Analysis
🚀 Starting SFB1002 Complete Publication Analysis...
📊 Loading and cleaning data...
📋 Loaded 550 rows and 142 columns
🔍 First 10 column names:
   0: RDP - Link
   1: Working Groups
   2: Subproject
   3: Open Access
   4: Publication Type
   5: Peer Reviewed
   6: PMID
   7: DOI
   8: Publication Year
   9: Title
🧹 Starting data cleaning...
   Original columns: ['RDP - Link', 'Working Groups', 'Subproject', 'Open Access', 'Publication Type', 'Peer Reviewed', 'PMID', 'DOI', 'Publication Year', 'Title', 'Journal', 'ISSN', 'eISSN', 'URL', 'Pages', 'Issue', 'Volume', 'Journal Abbreviation', 'Extra', 'Authors', 'First Author', 'Last Author', 'Creation Time']
   Cleaned columns: ['RDP_Link', 'Working_Groups', 'Subproject', 'Open_Access', 'Publication_Type', 'Peer_Reviewed', 'PMID', 'DOI', 'Publication_Year', 'Title']...
   Filtered to years 2014-2024: 498 publications
✅ Data cleaning completed: 498 publications ready for 

<Figure size 1500x800 with 0 Axes>