# In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path

def _metric_statistics(df, columns):
    """Return describe() output for *columns* plus median, mode, sum and variance rows."""
    metrics = df[columns]
    stats_df = metrics.describe().round(2)

    stats_df.loc['median'] = metrics.median()
    # mode() returns no rows at all when there is no data to count, so
    # indexing its first row unconditionally would raise IndexError.
    modes = metrics.mode()
    stats_df.loc['mode'] = modes.iloc[0] if not modes.empty else np.nan
    stats_df.loc['sum'] = metrics.sum()
    stats_df.loc['variance'] = metrics.var()
    return stats_df.round(2)


def _summarize_by_star_range(df, columns):
    """Aggregate *columns* per star bucket; requires a 'stargazers_count' column."""
    # right=False makes every bucket lower-inclusive: [0, 10), [10, 100), ...
    # With the default right-closed bins, zero-star repositories fall outside
    # (0, 10] and silently vanish from the summary, and '<10' would contain 10.
    star_range = pd.cut(
        df['stargazers_count'],
        bins=[0, 10, 100, 1000, 10000, float('inf')],
        labels=['<10', '10-100', '100-1000', '1000-10000', '>10000'],
        right=False,
    ).rename('star_range')

    # observed=False keeps every bucket in the output, even empty ones, and
    # states the choice explicitly instead of relying on the pandas default.
    return df.groupby(star_range, observed=False)[columns].agg(
        ['count', 'min', 'mean', 'median', 'max', 'sum']
    )


def _save_and_close(fig, path, description):
    """Save *fig* to *path*, release it, and report where it went."""
    fig.tight_layout()
    fig.savefig(path, dpi=300)
    # pyplot keeps a reference to every open figure; close it once written.
    plt.close(fig)
    print(f"{description} saved to {path}")


def _plot_correlation_heatmap(df, columns, out_dir):
    """Draw the pairwise correlation heatmap of *columns*."""
    fig = plt.figure(figsize=(10, 8))
    correlation = df[columns].corr()
    sns.heatmap(correlation, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
    plt.title('Correlation Between Repository Metrics', fontsize=16)
    _save_and_close(fig, out_dir / "correlation_heatmap.png", "Correlation heatmap")


def _plot_stars_vs_contributors(df, out_dir):
    """Draw the stars/contributors scatter plot with a regression line on log axes."""
    fig = plt.figure(figsize=(12, 8))
    sns.regplot(x='stargazers_count', y='contributors_count', data=df,
                scatter_kws={'alpha': 0.5}, line_kws={'color': 'red'})
    plt.title('Relationship Between Stars and Contributors', fontsize=16)
    plt.xlabel('Stars Count', fontsize=14)
    plt.ylabel('Contributors Count', fontsize=14)
    plt.xscale('log')
    plt.yscale('log')
    _save_and_close(fig, out_dir / "stars_vs_contributors.png", "Scatter plot")


def _plot_distributions(df, columns, out_dir):
    """Draw one log-x histogram per metric on a 2x2 grid."""
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    axes = axes.flatten()

    for ax, col in zip(axes, columns):
        label = col.replace("_", " ").title()
        # Zero (or negative) values have no position on a log axis, so only
        # strictly positive values are passed to the log-scaled histogram.
        positive = df.loc[df[col] > 0, col]
        if not positive.empty:
            sns.histplot(positive, kde=True, ax=ax, log_scale=(True, False))
        ax.set_title(f'Distribution of {label}', fontsize=14)
        ax.set_xlabel(label, fontsize=12)
        ax.set_ylabel('Frequency', fontsize=12)

    # Hide grid cells that have no metric to show.
    for ax in axes[len(columns):]:
        ax.set_visible(False)

    _save_and_close(fig, out_dir / "distributions.png", "Distribution plots")


def _plot_boxplots(df, columns, out_dir):
    """Draw side-by-side boxplots of *columns* on a log y axis."""
    fig = plt.figure(figsize=(14, 8))
    # Melt dataframe to long format for boxplot
    df_melt = pd.melt(df, value_vars=columns, var_name='Metric', value_name='Value')
    sns.boxplot(x='Metric', y='Value', data=df_melt)
    plt.title('Distribution of Repository Metrics', fontsize=16)
    plt.yscale('log')
    plt.ylabel('Value (log scale)', fontsize=14)
    plt.xlabel('Repository Metric', fontsize=14)
    plt.xticks(rotation=45)
    _save_and_close(fig, out_dir / "boxplots.png", "Boxplots")


def analyze_github_data(csv_path, output_dir="analysis_output"):
    """
    Analyze GitHub repository data and generate statistics and visualizations

    Reads the CSV, coerces the known metric columns to numbers, writes a
    statistics table (and, for more than 200 rows, a per-star-range summary)
    as CSV, and saves four PNG charts into the output directory.

    Args:
        csv_path: Path to the CSV file with repository data
        output_dir: Directory to save the output files (created if missing)

    Returns:
        A tuple ``(stats_df, summary_by_stars)``. ``summary_by_stars`` is None
        when the dataset has 200 rows or fewer, or when it has no
        'stargazers_count' column to bucket by.

    Raises:
        KeyError: If the CSV contains none of the expected metric columns.
    """
    # Create output directory if it doesn't exist
    out_dir = Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Load the data
    print(f"Loading data from {csv_path}...")
    df = pd.read_csv(csv_path)

    # Basic info
    print(f"Dataset has {df.shape[0]} rows and {df.shape[1]} columns")

    # Work only with the metric columns the file actually has, so one missing
    # column does not abort the whole analysis with a KeyError further down.
    numeric_columns = ['stargazers_count', 'forks_count', 'open_issues_count', 'contributors_count']
    available = [col for col in numeric_columns if col in df.columns]
    missing = [col for col in numeric_columns if col not in df.columns]
    if not available:
        raise KeyError(f"None of the expected metric columns are present: {numeric_columns}")
    if missing:
        print(f"Warning: skipping missing columns: {', '.join(missing)}")

    # Convert columns to numeric if needed; unparseable cells become NaN
    for col in available:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Generate descriptive statistics
    print("\nGenerating descriptive statistics...")
    stats_df = _metric_statistics(df, available)

    # Save statistics to CSV
    stats_file = out_dir / "github_stats.csv"
    stats_df.to_csv(stats_file)
    print(f"Statistics saved to {stats_file}")

    # Create compact representation for large datasets
    summary_by_stars = None
    if df.shape[0] > 200 and 'stargazers_count' in available:
        print("\nCompacting dataset for analysis...")
        summary_by_stars = _summarize_by_star_range(df, available)

        # Save compact summary to CSV
        compact_file = out_dir / "compact_summary.csv"
        summary_by_stars.to_csv(compact_file)
        print(f"Compact summary saved to {compact_file}")

    # Generate visualizations
    print("\nGenerating visualizations...")
    _plot_correlation_heatmap(df, available, out_dir)
    # The scatter plot is only meaningful when both of its axes exist.
    if 'stargazers_count' in available and 'contributors_count' in available:
        _plot_stars_vs_contributors(df, out_dir)
    _plot_distributions(df, available, out_dir)
    _plot_boxplots(df, available, out_dir)

    # Return the statistics dataframe for display
    return stats_df, summary_by_stars

if __name__ == "__main__":
    # Input CSV; point this at your own file of repository data.
    input_csv = "repositories_with_contributors.csv"

    # Run the analysis with the default output directory.
    stats_table, star_summary = analyze_github_data(input_csv)

    # Show the statistics table.
    print("\n=== DESCRIPTIVE STATISTICS ===")
    print(stats_table)

    # The second return value is None when no compact summary was produced.
    if star_summary is not None:
        print("\n=== COMPACT SUMMARY BY STAR RANGE ===")
        # One table per aggregate reads more easily than the full
        # two-level column layout of the summary.
        readable_aggregates = ('mean', 'median', 'max', 'sum')
        for agg_name in readable_aggregates:
            heading = f"\n{agg_name.upper()} VALUES BY STAR RANGE:"
            print(heading)
            per_range_table = star_summary.xs(agg_name, level=1, axis=1)
            print(per_range_table)

    print("\nAnalysis complete! Check the 'analysis_output' directory for CSV files and visualizations.")