In [5]:
#!/usr/bin/env python3
"""
PyPI Download Statistics using pepy.tech API (Free Tier)
Requires API key from https://pepy.tech/user
"""

import requests
import pandas as pd
from datetime import datetime, timedelta
import time
import json
import os

# List of libraries to analyze with their PyPI package names.
# Each entry is used verbatim as the project name in both the pepy.tech
# request and the PyPI JSON request, so it must be the distribution name
# (what you would `pip install`), not the import name.
LIBRARIES = [
    'pandas',
    'pyspark', 
    'dask',
    'ibis-framework',
    'cudf',
    'modin',
    'vaex-core',  # vaex main package is vaex-core
    'datatable',
    'duckdb',
    'dask-cudf',
    'polars',
    'pyarrow',
    'datasets',
    'ray'
]

# API Configuration
# Base URL that the per-project endpoint path is appended to.
PEPY_BASE_URL = "https://api.pepy.tech"
FREE_TIER_RATE_LIMIT = 10  # requests per minute for free tier
# Seconds to sleep between consecutive packages: the even spacing implied by
# the rate limit (60 / 10 = 6 s) plus a safety margin, i.e. 7.0 s.
REQUEST_DELAY = 60 / FREE_TIER_RATE_LIMIT + 1  # Add 1 second buffer

def get_api_key():
    """
    Return the pepy.tech API key, or None when no key is available.

    The PEPY_API_KEY environment variable is consulted first. If it is unset
    or blank, setup instructions are printed and the key is requested
    interactively on stdin.

    Returns:
        The key with surrounding whitespace removed, or None if neither the
        environment nor the prompt produced a non-empty value.
    """
    # Strip the environment value as well: a whitespace-only or
    # newline-padded variable must not be sent to the API as a real key.
    api_key = (os.getenv('PEPY_API_KEY') or '').strip()

    if not api_key:
        print("Pepy.tech API Key Required")
        print("=" * 35)
        print("You can get your free API key from: https://pepy.tech/user")
        print("After getting your key, you can:")
        print("1. Set environment variable: export PEPY_API_KEY=your_key_here")
        print("2. Or enter it below (will be used for this session only)")
        print()
        try:
            api_key = input("Enter your pepy.tech API key: ").strip()
        except EOFError:
            # No interactive stdin (piped or scheduled run): same as "no key".
            api_key = ''

    if not api_key:
        print("❌ No API key provided. Exiting.")
        return None

    return api_key

def check_dependencies():
    """
    Verify that the third-party packages this script relies on are importable.

    Prints the names of any packages that fail to import together with a
    ready-to-run pip command.

    Returns:
        True when both `requests` and `pandas` import cleanly, else False.
    """
    unavailable = []

    # Probe each dependency in the same order the install hint lists them.
    for module_name in ('requests', 'pandas'):
        try:
            __import__(module_name)
        except ImportError:
            unavailable.append(module_name)

    if not unavailable:
        return True

    print(f"Missing packages: {unavailable}")
    print(f"Please install them with: pip install {' '.join(unavailable)}")
    return False

def _pepy_failure(package_name, status):
    """Build the result for a failed pepy.tech lookup, with every counter zeroed."""
    return {
        'package': package_name,
        'total_downloads': 0,
        'recent_downloads_30d': 0,
        'estimated_2025_downloads': 0,
        'pepy_status': status,
    }


def get_pepy_download_stats(package_name, api_key):
    """
    Get download statistics from pepy.tech API v2 (free tier)

    Args:
        package_name: PyPI project name, inserted into the request path.
        api_key: value sent in the ``X-API-Key`` request header.

    Returns:
        A dict that always contains 'package', 'total_downloads',
        'recent_downloads_30d', 'estimated_2025_downloads' and 'pepy_status'.
        On success 'pepy_status' is 'success' and 'versions_count' and
        'latest_data_date' are present as well. On failure the counters are 0
        and 'pepy_status' is one of 'invalid_api_key' (401), 'not_found'
        (404), 'rate_limited' (429), 'error_<code>', 'timeout' or
        'error: <message>'.

    This function never raises: every exception is converted into a failure
    result so one bad package cannot abort the whole run.
    """
    # HTTP status codes that get a dedicated status label.
    known_failures = {
        401: 'invalid_api_key',
        404: 'not_found',
        429: 'rate_limited',
    }

    try:
        url = f"{PEPY_BASE_URL}/api/v2/projects/{package_name}"
        headers = {
            'X-API-Key': api_key,
            'User-Agent': 'PyPI-Stats-Collector/1.0',
            'Accept': 'application/json'
        }

        print("    🔍 Fetching from pepy.tech API...")
        response = requests.get(url, headers=headers, timeout=15)

        if response.status_code != 200:
            status = known_failures.get(
                response.status_code, f'error_{response.status_code}'
            )
            return _pepy_failure(package_name, status)

        data = response.json()

        # `or` rather than a .get() default: a JSON null would otherwise slip
        # through as None and break len() / iteration below.
        total_downloads = data.get('total_downloads', 0)
        versions = data.get('versions') or []
        # The parsing below assumes 'downloads' maps 'YYYY-MM-DD' strings to
        # {version: count} dicts; entries of any other shape are skipped.
        downloads_by_date = data.get('downloads') or {}

        # Calculate recent downloads (last 30 days)
        recent_downloads = 0
        cutoff_date = datetime.now() - timedelta(days=30)
        for date_str, version_downloads in downloads_by_date.items():
            try:
                date_obj = datetime.strptime(date_str, '%Y-%m-%d')
                if date_obj >= cutoff_date and isinstance(version_downloads, dict):
                    # Sum all versions for this date
                    recent_downloads += sum(version_downloads.values())
            except (ValueError, TypeError):
                # Malformed date key or non-numeric count: ignore this entry.
                continue

        # Calculate 2025 downloads estimate
        estimated_2025 = calculate_2025_downloads(downloads_by_date)

        return {
            'package': package_name,
            'total_downloads': total_downloads,
            'recent_downloads_30d': recent_downloads,
            'estimated_2025_downloads': estimated_2025,
            'versions_count': len(versions),
            # ISO date strings sort chronologically, so max() is the newest day.
            'latest_data_date': max(downloads_by_date) if downloads_by_date else 'Unknown',
            'pepy_status': 'success'
        }

    except requests.exceptions.Timeout:
        return _pepy_failure(package_name, 'timeout')
    except Exception as e:
        # Deliberate catch-all boundary: report the problem in the status
        # field (truncated to keep the results table readable).
        return _pepy_failure(package_name, f'error: {str(e)[:50]}')

def calculate_2025_downloads(downloads_by_date):
    """
    Sum every download recorded on or after January 1, 2025.

    Args:
        downloads_by_date: mapping of 'YYYY-MM-DD' strings to per-version
            count dicts. A falsy value (None, empty dict) yields 0.

    Returns:
        The combined count across all versions and all qualifying days.
        Entries with an unparsable date key or un-summable counts are
        skipped; a non-dict day value contributes nothing.
    """
    if not downloads_by_date:
        return 0

    year_start = datetime(2025, 1, 1)
    running_total = 0

    for day, per_version in downloads_by_date.items():
        try:
            if datetime.strptime(day, '%Y-%m-%d') < year_start:
                continue
            if isinstance(per_version, dict):
                # Collapse the per-version breakdown into one figure for the day.
                running_total += sum(per_version.values())
        except (ValueError, TypeError):
            continue

    return running_total

def get_pypi_metadata(package_name):
    """
    Get package metadata from PyPI JSON API

    Args:
        package_name: PyPI project name, inserted into the request URL.

    Returns:
        On HTTP 200, a dict with 'version', 'summary' (cut to 100 characters
        plus '...'), 'author', 'maintainer', 'home_page',
        'latest_release_date', 'license', 'requires_python' and
        'pypi_status' == 'found'. Fields that are absent *or null* in the
        response fall back to their placeholder ('Unknown', 'No summary' or
        ''). On any other HTTP status the dict holds only 'pypi_status' ==
        'not_found_<code>'; on an exception only 'pypi_status' ==
        'error: <message>'.

    This function never raises.
    """
    try:
        url = f"https://pypi.org/pypi/{package_name}/json"
        headers = {'User-Agent': 'Mozilla/5.0 (compatible; PyPI-Stats-Collector/1.0)'}
        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code != 200:
            return {'pypi_status': f'not_found_{response.status_code}'}

        data = response.json()
        # `or` instead of a .get() default so that JSON nulls also fall back.
        info = data.get('info') or {}
        releases = data.get('releases') or {}

        # Get latest release info
        latest_version = info.get('version') or ''
        latest_release_date = 'Unknown'

        release_files = releases.get(latest_version) if latest_version else None
        if release_files:
            # The first uploaded file stands in for the release as a whole.
            latest_release_date = release_files[0].get('upload_time') or 'Unknown'
            if latest_release_date != 'Unknown':
                try:
                    dt = datetime.fromisoformat(latest_release_date.replace('Z', '+00:00'))
                    latest_release_date = dt.strftime('%Y-%m-%d')
                except (ValueError, TypeError, AttributeError):
                    # Unrecognised timestamp: keep the raw value as reported.
                    pass

        summary = info.get('summary') or 'No summary'
        if len(summary) > 100:
            summary = summary[:100] + '...'

        return {
            'version': latest_version,
            'summary': summary,
            'author': info.get('author') or 'Unknown',
            'maintainer': info.get('maintainer') or '',
            'home_page': info.get('home_page') or '',
            'latest_release_date': latest_release_date,
            'license': info.get('license') or 'Unknown',
            'requires_python': info.get('requires_python') or '',
            'pypi_status': 'found'
        }

    except Exception as e:
        # Deliberate catch-all boundary: metadata is best-effort and must not
        # abort the run; the truncated message is surfaced via the status.
        return {'pypi_status': f'error: {str(e)[:50]}'}

def analyze_package(package_name, api_key, request_count):
    """
    Comprehensive package analysis using pepy.tech API and PyPI

    Fetches download statistics from pepy.tech, then merges in PyPI
    metadata and prints a one-line outcome for the package.

    Args:
        package_name: PyPI project name to look up.
        api_key: pepy.tech API key forwarded to the download-stats lookup.
        request_count: 1-based position of this package in the run; used only
            for the "requests remaining" hint.

    Returns:
        The pepy.tech result dict merged with the PyPI metadata dict. When the
        API key is rejected, or the package is still rate limited after one
        retry, the pepy.tech result is returned alone without contacting PyPI.
    """
    print(f"  📦 {package_name}")

    # Get download stats from pepy.tech API
    pepy_data = get_pepy_download_stats(package_name, api_key)

    if pepy_data.get('pepy_status') == 'rate_limited':
        print("    ⏰ Rate limited! Waiting extra time...")
        time.sleep(30)  # Wait longer if rate limited
        # Retry once after backing off; otherwise the wait achieves nothing
        # and the package is recorded as failed for good.
        pepy_data = get_pepy_download_stats(package_name, api_key)
        if pepy_data.get('pepy_status') == 'rate_limited':
            print("    ❌ Still rate limited after retry")
            return pepy_data

    if pepy_data.get('pepy_status') == 'invalid_api_key':
        print("    ❌ Invalid API key! Please check your key.")
        return pepy_data

    # Get metadata from PyPI
    pypi_data = get_pypi_metadata(package_name)

    # Combine the data (PyPI keys win on a name clash; the two lookups use
    # disjoint key names apart from their separate *_status fields).
    result = {**pepy_data, **pypi_data}

    # Show results
    pepy_status = result.get('pepy_status')
    if pepy_status == 'success':
        total = result.get('total_downloads', 0)
        recent_30d = result.get('recent_downloads_30d', 0)
        est_2025 = result.get('estimated_2025_downloads', 0)
        version = result.get('version', 'Unknown')

        print(f"    ✅ v{version} | Total: {total:,} | 30d: {recent_30d:,} | 2025: {est_2025:,}")
    elif pepy_status == 'not_found':
        if result.get('pypi_status') == 'found':
            print(f"    ⚠️  Found on PyPI v{result.get('version', 'Unknown')} but not on pepy.tech")
        else:
            print("    ❌ Not found on either pepy.tech or PyPI")
    else:
        print(f"    ❌ Error: {result.get('pepy_status', 'unknown')}")

    # Show rate limiting info.
    # NOTE(review): this is a position-based hint, not a real quota reading;
    # it counts packages modulo the limit and ignores elapsed time.
    remaining_requests = FREE_TIER_RATE_LIMIT - (request_count % FREE_TIER_RATE_LIMIT)
    if remaining_requests <= 2:
        print(f"    ⏱️  Rate limit: {remaining_requests} requests remaining this minute")

    return result

def _cell_text(value, default='Unknown'):
    """Return str(value), or *default* when the table cell is missing (None/NaN)."""
    is_missing = value is None or (isinstance(value, float) and value != value)
    return default if is_missing else str(value)


def main():
    """
    Main function to collect download statistics using pepy.tech API

    Analyzes every entry in LIBRARIES (sleeping REQUEST_DELAY seconds between
    packages), prints a download table, a metadata table and summary
    statistics, and writes the full results to a timestamped CSV file in the
    current directory.

    Returns:
        A pandas DataFrame with one row per package, sorted by
        'estimated_2025_downloads' descending, or None when a dependency is
        missing or no API key was supplied.
    """
    print("Pepy.tech API Download Statistics Collector (Free Tier)")
    print("=" * 60)
    print(f"Analyzing {len(LIBRARIES)} data processing libraries...")
    print(f"Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    print("Rate limit: 10 requests per minute (Free tier)")
    print()

    # Check dependencies
    if not check_dependencies():
        return None

    # Get API key
    api_key = get_api_key()
    if not api_key:
        return None

    print("✅ API key configured")
    print(f"⏱️  Request delay: {REQUEST_DELAY:.1f} seconds between requests")
    print()

    results = []

    for i, library in enumerate(LIBRARIES, 1):
        print(f"[{i}/{len(LIBRARIES)}]", end=" ")

        result = analyze_package(library, api_key, i)
        results.append(result)

        # Rate limiting for free tier (10 requests per minute)
        if i < len(LIBRARIES):  # Don't wait after the last request
            print(f"    ⏳ Waiting {REQUEST_DELAY:.1f}s for rate limit...")
            time.sleep(REQUEST_DELAY)

        print()

    # Convert to DataFrame
    df = pd.DataFrame(results)
    if df.empty:
        # Nothing to sort, print or divide by; avoid KeyError / ZeroDivisionError.
        print("No packages were analyzed.")
        return df

    # Failed lookups may lack the counter keys. Without this step the sort
    # below raises KeyError when every lookup failed, and a partial failure
    # turns the counts into floats with NaN ("1,234.0" / "nan" in the table).
    for column in ('total_downloads', 'recent_downloads_30d', 'estimated_2025_downloads'):
        if column not in df.columns:
            df[column] = 0
        df[column] = pd.to_numeric(df[column], errors='coerce').fillna(0).astype('int64')

    # Sort by estimated 2025 downloads
    df = df.sort_values('estimated_2025_downloads', ascending=False)

    # Display results
    print("\n" + "="*120)
    print("DOWNLOAD STATISTICS SUMMARY - 2025 FOCUS")
    print("="*120)
    print(f"{'Package':<18} {'Total Downloads':<15} {'Last 30 Days':<12} {'Est. 2025':<12} {'Version':<12} {'Status'}")
    print("-"*120)

    for _, row in df.iterrows():
        package = row['package'][:17]
        total = int(row['total_downloads'])
        recent_30d = int(row['recent_downloads_30d'])
        est_2025 = int(row['estimated_2025_downloads'])
        version = _cell_text(row.get('version'))[:11]
        status = '✅' if row.get('pepy_status') == 'success' else '❌'

        print(f"{package:<18} {total:<15,} {recent_30d:<12,} {est_2025:<12,} {version:<12} {status}")

    # Additional metadata table
    print(f"\n{'Package':<18} {'Author':<25} {'Latest Release':<15} {'License':<15}")
    print("-"*85)

    for _, row in df.iterrows():
        package = row['package'][:17]
        author = _cell_text(row.get('author'))[:24]
        release_date = _cell_text(row.get('latest_release_date'))
        license_info = _cell_text(row.get('license'))[:14]

        print(f"{package:<18} {author:<25} {release_date:<15} {license_info}")

    # Save results
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    output_file = f"pepy_api_download_stats_{timestamp}.csv"
    df.to_csv(output_file, index=False)
    print(f"\nDetailed results saved to: {output_file}")

    # Summary statistics
    successful_packages = df[df['pepy_status'] == 'success']
    packages_with_2025_data = df[df['estimated_2025_downloads'] > 0]

    print("\nSUMMARY STATISTICS:")
    print(f"Total packages analyzed: {len(df)}")
    print(f"Successfully found on pepy.tech: {len(successful_packages)}")
    print(f"Packages with 2025 download data: {len(packages_with_2025_data)}")

    if len(packages_with_2025_data) > 0:
        # df is already sorted descending, so the first row is the leader.
        top_package = packages_with_2025_data.iloc[0]
        total_2025_downloads = int(packages_with_2025_data['estimated_2025_downloads'].sum())

        print("\n🏆 TOP PERFORMERS IN 2025:")
        print(f"Most downloaded (2025): {top_package['package']} ({int(top_package['estimated_2025_downloads']):,} downloads)")
        print(f"Total 2025 downloads across all packages: {total_2025_downloads:,}")

        # Show top 5 for 2025
        print("\n📊 TOP 5 BY 2025 DOWNLOADS:")
        for i, (_, row) in enumerate(packages_with_2025_data.head(5).iterrows(), 1):
            print(f"{i}. {row['package']}: {int(row['estimated_2025_downloads']):,} downloads")

    # Show success rate (df is non-empty here, so the division is safe)
    success_rate = len(successful_packages) / len(df) * 100
    print(f"\n📈 API Success Rate: {success_rate:.1f}%")

    if success_rate < 80:
        print("💡 Tip: Some packages might not be available on pepy.tech or have different names")

    return df

# Run the analysis
# Script / notebook-cell entry point: runs the full collection and prints a
# closing message. `results` is bound at module level, so the DataFrame
# returned by main() stays available after the run.
if __name__ == "__main__":
    try:
        results = main()
        # main() returns None when it bailed out early (missing dependency or
        # no API key); it has already printed the reason in that case.
        if results is not None:
            print("\n✅ Analysis completed successfully!")
            print("📊 Data includes actual download counts from pepy.tech API")
            print("🗓️  Special focus on 2025 download estimates")
            print("\n💡 Pro tip: Set PEPY_API_KEY environment variable to avoid entering key each time")
    except KeyboardInterrupt:
        # Ctrl-C during the long rate-limit sleeps is an expected way to stop.
        print("\n❌ Analysis interrupted by user")
    except Exception as e:
        # Last-resort boundary so the user sees a short message, not a traceback.
        print(f"\n❌ Unexpected error: {str(e)}")
        print("Please check your API key and internet connection.")

Pepy.tech API Download Statistics Collector (Free Tier)
Analyzing 14 data processing libraries...
Date: 2025-08-20 12:54:48
Rate limit: 10 requests per minute (Free tier)

Pepy.tech API Key Required
You can get your free API key from: https://pepy.tech/user
After getting your key, you can:
1. Set environment variable: export PEPY_API_KEY=your_key_here
2. Or enter it below (will be used for this session only)

✅ API key configured
⏱️  Request delay: 7.0 seconds between requests

[1/14]   📦 pandas
    🔍 Fetching from pepy.tech API...
    ✅ v2.3.1 | Total: 9,300,523,262 | 30d: 375,590,376 | 2025: 1,092,026,615
    ⏳ Waiting 7.0s for rate limit...

[2/14]   📦 pyspark
    🔍 Fetching from pepy.tech API...
    ✅ v4.0.0 | Total: 1,489,568,760 | 30d: 43,530,972 | 2025: 135,253,095
    ⏳ Waiting 7.0s for rate limit...

[3/14]   📦 dask
    🔍 Fetching from pepy.tech API...
    ✅ v2025.7.0 | Total: 518,013,075 | 30d: 16,650,133 | 2025: 47,513,798
    ⏳ Waiting 7.0s for rate limit...

[4/14]   📦 ibi