In [1]:
import requests
from Levenshtein import ratio
import pandas as pd
from datetime import datetime, timedelta
import re
import time
import sys
import os
from joblib import dump, load
from tqdm import tqdm
from pathlib import Path


# Make the directory one level above the current working directory importable
# so that the `config` package resolves. `Path()` is the working directory, so
# this presumably expects the notebook to be launched from its own folder,
# directly below the project root — TODO confirm.
parent_dir = str(Path().resolve().parent)
# Guard against appending the same entry again when the cell is re-run.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
# Must stay below the sys.path tweak: the import relies on it.
from config.settings import API_KEYS, API_ENDPOINTS

# Run banner: title, normalised cache root and wall-clock start time.
print("üìä Solana DeFi Tracker - Data Collection")
print(f"Cache directory: {os.path.normpath('../data')}")
print(f"Collection timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

üìä Solana DeFi Tracker - Data Collection
Cache directory: ..\data
Collection timestamp: 2025-09-05 12:07:39


#### Verify Cache Directories

In [2]:
# Ensure each cache location exists before any cell tries to write into it.
cache_dirs = ['../data/api_responses', '../data/processed', '../data/temp']
for cache_dir in cache_dirs:
    if os.path.exists(cache_dir):
        # Already present: just report it and move on.
        print(f"‚úÖ Cache directory exists: {cache_dir}")
        continue
    # Missing: create it (including any missing parents) and report.
    os.makedirs(cache_dir, exist_ok=True)
    print(f"‚úÖ Created cache directory: {cache_dir}")

‚úÖ Cache directory exists: ../data/api_responses
‚úÖ Cache directory exists: ../data/processed
‚úÖ Cache directory exists: ../data/temp


#### Helper Functions

In [3]:
def make_request(url, headers=None, params=None, max_retries=3, is_post=False):
    """Make an API request with retry logic and return the decoded JSON body.

    Args:
        url: Endpoint to call.
        headers: Optional HTTP headers.
        params: Query-string parameters for GET requests; sent as the JSON
            body when ``is_post`` is true.
        max_retries: Maximum number of attempts before giving up.
        is_post: Issue a POST instead of a GET.

    Returns:
        The parsed JSON payload, or ``None`` when every attempt failed
        (network/HTTP error or a body that is not valid JSON).
    """
    for attempt in range(max_retries):
        try:
            if is_post:
                response = requests.post(url, headers=headers, json=params, timeout=30)
            else:
                response = requests.get(url, headers=headers, params=params, timeout=30)
            response.raise_for_status()
            return response.json()
        # ValueError covers a non-JSON body: depending on the installed
        # requests version the decode error is not a RequestException, and
        # would otherwise escape the retry loop and abort the notebook.
        except (requests.exceptions.RequestException, ValueError) as e:
            print(f"‚ùå Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff

    # All attempts exhausted (or max_retries <= 0).
    return None

def save_raw_data(data, filename, description=""):
    """Persist a raw API payload under ``../data/api_responses``.

    The object is serialised with ``dump`` and a one-line confirmation is
    printed. Returns the normalised path that was written.
    """
    target_path = os.path.normpath(f"../data/api_responses/{filename}")
    dump(data, target_path)
    confirmation = f"üíæ Saved raw: {description} ‚Üí {target_path}"
    print(confirmation)
    return target_path

def save_processed_data(data, filename, description=""):
    """Persist a processed dataset under ``../data/processed``.

    The object is serialised with ``dump`` and a one-line confirmation is
    printed. Returns the normalised path that was written.
    """
    target_path = os.path.normpath(f"../data/processed/{filename}")
    dump(data, target_path)
    confirmation = f"üíæ Saved processed: {description} ‚Üí {target_path}"
    print(confirmation)
    return target_path

def save_cache(data, filename, description=""):
    """Persist a cache object under ``../data/temp``.

    The object is serialised with ``dump`` and a one-line confirmation is
    printed. Returns the normalised path that was written.
    """
    target_path = os.path.normpath(f"../data/temp/{filename}")
    dump(data, target_path)
    confirmation = f"üíæ Saved cache: {description} ‚Üí {target_path}"
    print(confirmation)
    return target_path

def load_cache(filename):
    """Return the object cached under ``../data/temp/<filename>``, or ``None``.

    ``None`` is returned both when the file does not exist and when it exists
    but cannot be deserialised; the latter case is reported on stdout.
    """
    filepath = os.path.normpath(f"../data/temp/{filename}")

    # Guard clause: nothing cached under this name.
    if not os.path.exists(filepath):
        return None

    try:
        cached = load(filepath)
    except Exception as e:
        print(f"‚ö†Ô∏è Failed to load cache {filename}: {e}")
        return None
    return cached

def format_currency(amount):
    """Format a currency amount with appropriate units (K, M, B).

    Args:
        amount: Numeric amount in dollars; ``None`` and NaN are treated as
            missing.

    Returns:
        ``"$0"`` for missing or zero amounts; otherwise the amount with two
        decimals and a K/M/B suffix chosen by magnitude. Negative amounts are
        scaled by their absolute value and rendered with a leading minus
        sign, e.g. ``"-$2.50M"``.
    """
    # `amount != amount` is true only for NaN, which pandas produces for
    # missing numeric cells; without this check it would print as "$nan".
    if amount is None or amount != amount or amount == 0:
        return "$0"

    # Pick the unit from the magnitude so that negative values are scaled
    # too instead of always falling through to the plain-dollar branch.
    sign = "-" if amount < 0 else ""
    magnitude = abs(amount)

    for threshold, suffix in ((1_000_000_000, "B"), (1_000_000, "M"), (1_000, "K")):
        if magnitude >= threshold:
            return f"{sign}${magnitude / threshold:.2f}{suffix}"
    return f"{sign}${magnitude:.2f}"

#### Step 1: Collect DefiLlama Protocol Data (TVL)

In [4]:
print("\nüîç Collecting Solana Protocol TVL Data...")

def get_all_solana_tvl_data():
    """
    Collect TVL data for Solana DeFi protocols from DefiLlama, excluding CEX and CEX-related protocols.
    
    """
 
    # DefiLlama API endpoint for all protocols
    base_url = API_ENDPOINTS['defillama']['base_url']
    protocols_url = f"{base_url}/protocols"
    
    # Make the API request
    all_protocols = make_request(protocols_url)
    
    if not all_protocols:
        print("‚ùå DefiLlama API failed")
        return None
    
    print(f"‚úÖ DefiLlama API working! Found {len(all_protocols)} total protocols")
    
    # Comprehensive list of CEX names to exclude
    cex_list = [
        'binance', 'bybit', 'coinbase', 'kraken', 'kucoin', 'okx',
        'crypto.com', 'crypto', 'bitfinex', 'huobi', 'htx', 'gate', 'gate.io',
        'mexc', 'bitget', 'gemini', 'bitstamp', 'bithumb', 'bitpanda',
        'bitmex', 'coinex', 'upbit', 'revolut', 'coindcx', 'bitflyer',
        'coincheck', 'bitbank', 'swissborg', 'deribit'
    ]
    
    # Filter for Solana DeFi protocols (excluding CEX and CEX-related)
    solana_protocols = []
    excluded_protocols = []
    
    for protocol in all_protocols:
        chains = protocol.get('chains', [])
        category = protocol.get('category', '').lower()
        name = protocol.get('name', '').lower()
        
        is_solana = (
            'Solana' in chains or 
            'solana' in chains or
            any('solana' in str(chain).lower() for chain in chains) or
            protocol.get('chain') == 'Solana'
        )
        
        # Exclude CEX and CEX-related protocols
        is_cex_related = (
            category == 'cex' or
            any(cex in name for cex in cex_list)
        )
        
        if is_solana and not is_cex_related:
            tvl_value = protocol.get('tvl') or 0
            
            solana_protocols.append({
                'name': protocol.get('name', 'Unknown'),
                'slug': protocol.get('slug', ''),
                'tvl': tvl_value,
                'chains': chains,
                'category': protocol.get('category', 'Unknown'),
                'change_1h': protocol.get('change_1h'),
                'change_1d': protocol.get('change_1d'),
                'change_7d': protocol.get('change_7d'),
                'mcap': protocol.get('mcap'),
                'symbol': protocol.get('symbol', ''),
                'url': protocol.get('url', ''),
                'description': protocol.get('description', ''),
                'gecko_id': protocol.get("coingeckoId"),
                'timestamp': datetime.now()
            })
        elif is_solana and is_cex_related:
            excluded_protocols.append((protocol.get('name'), category))
    
    if excluded_protocols:
        print(f"‚ö†Ô∏è Excluded {len(excluded_protocols)} CEX-related protocols: {', '.join([name for name, _ in excluded_protocols[:5]])}")
        # Save excluded protocols log to cache directory
        save_cache(excluded_protocols, f'excluded_protocols_{datetime.now().strftime("%Y%m%d_%H%M%S")}.joblib', 
                    "Excluded CEX protocols log")  
    
    if not solana_protocols:
        print("‚ùå No Solana DeFi protocols found")
        return None
    
    # Convert to DataFrame
    df = pd.DataFrame(solana_protocols)
    
    # Sort by TVL descending
    df = df.sort_values(by="tvl", ascending=False).reset_index(drop=True)
    
    print(f"üåü Found {len(df)} Solana DeFi protocols:")
    
    # Calculate statistics
    total_tvl = df['tvl'].sum()
    active_protocols = (df['tvl'] > 0).sum()
    
    print(f"üìä Total Solana DeFi TVL: ${total_tvl:,.0f}")
    print(f"üìà Active protocols (TVL > 0): {active_protocols}/{len(df)}")
    
    # Display top protocols
    print(f"\n{'Rank':<5} {'Protocol':<25} {'TVL':<15} {'Category':<20} {'1d Change':<10}")
    print("=" * 85)
    
    for i, row in df.head(20).iterrows():
        tvl_formatted = format_currency(row['tvl'])
        change_1d = row['change_1d']
        change_str = f"{change_1d:+.1f}%" if change_1d is not None else "N/A"
        
        print(f"{i+1:<5} {row['name'][:24]:<25} {tvl_formatted:<15} "
              f"{row['category'][:19]:<20} {change_str:<10}")
    
    # Show category breakdown
    category_breakdown = (
        df.groupby("category")['tvl']
        .agg(['count', 'sum'])
        .rename(columns={'count': 'protocols', 'sum': 'total_tvl'})
        .sort_values(by="total_tvl", ascending=False)
    )
    
    print(f"\nüìã Category Breakdown:")
    print(f"{'Category':<25} {'Count':<8} {'Total TVL':<15}")
    print("-" * 50)
    
    for category, row in category_breakdown.head(10).iterrows():
        print(f"{category[:24]:<25} {row['protocols']:<8} {format_currency(row['total_tvl']):<15}")
    
    # Save DataFrame to joblib
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'solana_defi_tvl_{timestamp}.joblib'
    save_raw_data(df, filename, 'Solana DeFi protocols TVL data')
    print(f"\nüíæ DataFrame saved to {filename}")
    
    return df

# Run the TVL collector; a None result signals that nothing was gathered.
tvl_df = get_all_solana_tvl_data()

if tvl_df is None:
    print("\n‚ùå Failed to collect TVL data")
else:
    print(f"\n‚úÖ Successfully collected TVL data for {len(tvl_df)} Solana DeFi protocols")
    print("üìÅ DataFrame saved to joblib file for further analysis")
    print(f"üìä Dataset includes {len(tvl_df)} protocols worth ${tvl_df['tvl'].sum():,.0f} in total TVL")

# Closing separator for this step's output.
print("\n" + "=" * 50)


üîç Collecting Solana Protocol TVL Data...
‚úÖ DefiLlama API working! Found 6366 total protocols
‚ö†Ô∏è Excluded 42 CEX-related protocols: Binance CEX, OKX, Bitfinex, Bybit, Gate
üíæ Saved cache: Excluded CEX protocols log ‚Üí ..\data\temp\excluded_protocols_20250905_120743.joblib
üåü Found 252 Solana DeFi protocols:
üìä Total Solana DeFi TVL: $77,652,866,558
üìà Active protocols (TVL > 0): 220/252

Rank  Protocol                  TVL             Category             1d Change 
1     Lido                      $38.28B         Liquid Staking       +0.5%     
2     Jito Liquid Staking       $3.03B          Liquid Staking       -1.7%     
3     Portal                    $2.76B          Bridge               +1.6%     
4     Kamino Lend               $2.66B          Lending              +2.3%     
5     Sanctum Validator LSTs    $2.42B          Liquid Staking       +0.9%     
6     Jupiter Perpetual Exchan  $2.40B          Derivatives          +0.4%     
7     Raydium AMM              

#### Step 2: Collect DefiLlama Revenue Data

In [5]:
print("\nüîç Collecting Solana Protocol Revenue Data...")

def get_solana_revenue_data():
    """Collect REVENUE data from DefiLlama"""
    base_url = API_ENDPOINTS['defillama']['base_url']
    revenue_url = f"{base_url}/overview/fees/solana"
    
    #
    params = {
        'dataType': 'dailyRevenue', 
        'excludeTotalDataChart': 'true',
        'excludeTotalDataChartBreakdown': 'true'
    }
    
    data = make_request(revenue_url, params=params)
    
    if not data:
        print("‚ùå Solana Revenue API failed")
        return None
    
    print(f"‚úÖ Collected Solana revenue data")
    print(f"Total protocols found: {len(data.get('protocols', []))}")
    
    protocols = data.get('protocols', [])
    if not protocols:
        print("No protocol data found.")
        return None
    
    # Sort protocols by total24h in descending order
    sorted_protocols = sorted(protocols, 
                            key=lambda x: x.get('total24h', 0) or 0, 
                            reverse=True)
    
    # Build DataFrame
    revenue_list = []
    for protocol in sorted_protocols:
        revenue_list.append({
            'protocol': protocol.get('name', 'Unknown'),
            'revenue_24h': protocol.get('total24h', 0),  
            'revenue_7d': protocol.get('total7d', 0),    
            'revenue_30d': protocol.get('total30d', 0),  
            'revenue_all_time': protocol.get('totalAllTime', 0),
            'data_type': 'revenue',  
            'chain': 'solana',
            'timestamp': datetime.now()
        })
    
    df = pd.DataFrame(revenue_list)
    
    # Display summary
    protocols_with_data = (df['revenue_24h'] > 0).sum()
    print(f"üíµ Found revenue data for {protocols_with_data} active protocols:")
    
    # Show top 10
    top_protocols = df.sort_values(by="revenue_24h", ascending=False).head(10)
    print(f"\n{'Protocol':<25} {'24h Revenue':<15} {'7d Revenue':<15} {'30d Revenue':<15}")
    print("=" * 75)
    
    for _, row in top_protocols.iterrows():
        if row['revenue_24h'] > 0:
            print(f"{row['protocol'][:24]:<25} {format_currency(row['revenue_24h']):<15} "
                  f"{format_currency(row['revenue_7d']):<15} {format_currency(row['revenue_30d']):<15}")
    
    # Save DataFrame to joblib
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'solana_revenue_{timestamp}.joblib'
    save_raw_data(df, filename, 'Solana revenue data')
    print(f"\nüíæ DataFrame saved to {filename}")
    
    return df


print("üìä Solana Protocol Revenue Tracker")
print("=" * 50)

revenue_df = get_solana_revenue_data()

if revenue_df is not None:
    print(f"\n‚úÖ Successfully collected revenue data for {len(revenue_df)} protocols")
    print("üìÅ DataFrame saved to joblib file for further analysis")
else:
    print("\n‚ùå Failed to collect revenue data")

print("\n" + "=" * 50)


üîç Collecting Solana Protocol Revenue Data...
üìä Solana Protocol Revenue Tracker
‚úÖ Collected Solana revenue data
Total protocols found: 104
üíµ Found revenue data for 91 active protocols:

Protocol                  24h Revenue     7d Revenue      30d Revenue    
pump.fun                  $1.62M          $10.26M         $45.90M        
Axiom                     $1.56M          $10.65M         $53.68M        
Jupiter Perpetual Exchan  $785.31K        $5.74M          $23.71M        
PumpSwap                  $541.92K        $2.16M          $6.39M         
Phantom Wallet            $500.40K        $3.48M          $16.79M        
Photon                    $159.93K        $1.05M          $5.94M         
Solana                    $146.25K        $1.06M          $4.56M         
Meteora DAMM V2           $124.96K        $895.45K        $6.17M         
Binance Staked SOL        $124.92K        $572.36K        $1.61M         
Raydium AMM               $97.50K         $803.08K        $3.58

#### Step 3: Collect Solana Fees Data

In [6]:
print("\nüîç Collecting Solana Protocol Fees Data...")

def get_solana_fees_data():
    
    base_url = API_ENDPOINTS['defillama']['base_url']
    fees_url = f"{base_url}/overview/fees/solana"
    
  
    params = {
        'dataType': 'dailyFees',  
        'excludeTotalDataChart': 'true',
        'excludeTotalDataChartBreakdown': 'true'
    }
    
    data = make_request(fees_url, params=params)
    
    if not data:
        print("‚ùå Solana Fees API failed")
        return None
    
    print(f"‚úÖ Collected Solana fees data")
    print(f"Total protocols found: {len(data.get('protocols', []))}")
    
    protocols = data.get('protocols', [])
    
    if not protocols:
        print("No protocol data found.")
        return None
    
    # Sort protocols by total24h in descending order
    sorted_protocols = sorted(protocols, 
                            key=lambda x: x.get('total24h', 0) or 0, 
                            reverse=True)
    
    # Build list for DataFrame
    fees_list = []
    for protocol in sorted_protocols:
        fees_list.append({
            'protocol': protocol.get('name', 'Unknown'),
            'fees_24h': protocol.get('total24h', 0),     
            'fees_7d': protocol.get('total7d', 0),       
            'fees_30d': protocol.get('total30d', 0),     
            'fees_all_time': protocol.get('totalAllTime', 0),
            'data_type': 'fees',  # Add data type identifier
            'chain': 'solana',
            'timestamp': datetime.now()
        })
    
    # Convert to DataFrame
    df2 = pd.DataFrame(fees_list)
    
    # Display summary
    protocols_with_data = (df2['fees_24h'] > 0).sum()
    print(f"üí∞ Found fee data for {protocols_with_data} active protocols:")
    
    # Show top 10
    top_protocols = df2.sort_values(by="fees_24h", ascending=False).head(10)
    print(f"\n{'Protocol':<25} {'24h Fees':<15} {'7d Fees':<15} {'30d Fees':<15}")
    print("=" * 70)
    for _, row in top_protocols.iterrows():
        if row['fees_24h'] > 0:
            print(f"{row['protocol'][:24]:<25} {format_currency(row['fees_24h']):<15} "
                  f"{format_currency(row['fees_7d']):<15} {format_currency(row['fees_30d']):<15}")
    
    # Save DataFrame to joblib
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'solana_fees_{timestamp}.joblib'
    save_raw_data(df2, filename, 'Solana fees data')
    print(f"\nüíæ DataFrame saved to {filename}")
    
    return df2


print("üìä Solana Protocol Fees Tracker")
print("=" * 50)
    
fees_df = get_solana_fees_data()
    
if fees_df is not None:
    print(f"\n‚úÖ Successfully collected fees data for {len(fees_df)} protocols")
    print("Data saved to joblib file for further analysis")
else:
    print("\n‚ùå Failed to collect fees data")
    
print("\n" + "=" * 50)


üîç Collecting Solana Protocol Fees Data...
üìä Solana Protocol Fees Tracker
‚úÖ Collected Solana fees data
Total protocols found: 117
üí∞ Found fee data for 104 active protocols:

Protocol                  24h Fees        7d Fees         30d Fees       
Jupiter Perpetual Exchan  $3.14M          $22.95M         $94.84M        
PumpSwap                  $2.99M          $12.54M         $37.80M        
pump.fun                  $1.62M          $10.26M         $45.90M        
Axiom                     $1.56M          $10.65M         $53.68M        
Solana                    $1.24M          $9.06M          $42.67M        
Jito Liquid Staking       $1.06M          $4.13M          $12.50M        
Meteora DLMM              $988.57K        $7.56M          $53.84M        
Sanctum Validator LSTs    $905.43K        $3.64M          $12.86M        
Binance Staked SOL        $793.21K        $3.26M          $10.59M        
Raydium AMM               $650.80K        $5.37M          $24.34M        


#### Fetch the Full Solana Token List via the Jupiter API

In [7]:
# Fetch Jupiter token list
url = "https://token.jup.ag/all"
# The timeout keeps the notebook from hanging indefinitely on a stalled
# connection (requests applies none by default), and raise_for_status makes
# an HTTP error fail here rather than as a confusing JSON or KeyError below.
resp = requests.get(url, timeout=30)
resp.raise_for_status()
tokens = resp.json()

# Convert to DataFrame
df = pd.DataFrame(tokens)

# Select useful columns; copy so the result is an independent frame rather
# than a view of `df`.
jupiter_df = df[["address", "symbol", "name", "decimals", "logoURI"]].copy()

#### Step 4: Collect CoinGecko Price and Supply Data

In [None]:
print("\nüîç Collecting CoinGecko Price and Supply Data for Solana DeFi Protocols...")

def collect_coingecko_data_for_solana_protocols():
    """Simplified price retrieval for Solana protocols using CoinGecko Pro API."""
    print("\nüîç Collecting CoinGecko Pro Data for Solana Protocols...")

    # Initialize output dictionary and counters
    coingecko_data = {}
    processed_count = 0
    successful_count = 0

    # Load or initialize cache
    cache_filepath = os.path.normpath('../data/temp/coingecko_pro_price_cache.joblib')
    price_cache = load_cache('coingecko_pro_price_cache.joblib') or {}

    # CoinGecko Pro API setup
    base_url = API_ENDPOINTS['coingecko']['pro_base_url']   
    headers = {"x-cg-pro-api-key": API_KEYS['coingecko']}
    platform_id = "solana"

    # Match protocols with Jupiter token list for contract addresses
    token_map = {}
    unmatched_tokens = []
    
    # Convert DataFrame to records for compatibility
    solana_protocols = tvl_df.to_dict('records') if not tvl_df.empty else []
    
    for row in solana_protocols:
        symbol = str(row.get('symbol', '')).lower().strip()
        protocol_name = row.get('name', '')
        if symbol in ['-', '', 'nan'] or not symbol:  # Skip invalid symbols
            continue
        
        # Find matching token in Jupiter list
        jupiter_match = jupiter_df[jupiter_df['symbol'].str.lower() == symbol]
        if not jupiter_match.empty:
            token_map[protocol_name] = jupiter_match.iloc[0]['address']
        else:
            unmatched_tokens.append((protocol_name, symbol))
        processed_count += 1

    if unmatched_tokens:
        print(f"‚ö†Ô∏è {len(unmatched_tokens)} tokens not found in Jupiter list: {', '.join([f'{name} ({sym})' for name, sym in unmatched_tokens[:5]])}")
        save_cache(unmatched_tokens, f'unmatched_tokens_{datetime.now().strftime("%Y%m%d_%H%M%S")}.joblib', "Unmatched tokens log")

    # Batch query for prices (Pro API supports multiple addresses)
    contract_addresses = list(token_map.values())
    if not contract_addresses:
        print("‚ùå No valid contract addresses found")
        return coingecko_data, processed_count, successful_count

    # Split into chunks to respect API limits (e.g., 100 addresses per request)
    chunk_size = 100
    for i in range(0, len(contract_addresses), chunk_size):
        chunk = contract_addresses[i:i + chunk_size]
        chunk_str = ','.join(chunk)
        
        # Check cache first
        cache_key = f"solana_{chunk_str}"
        if cache_key in price_cache:
            prices = price_cache[cache_key]
        else:
            url = f"{base_url}/simple/token_price/{platform_id}?contract_addresses={chunk_str}&vs_currencies=usd&include_market_cap=true&include_24hr_change=true&include_24hr_vol=true&include_last_updated_at=true"
            prices = make_request(url, headers=headers)
            if prices:
                price_cache[cache_key] = prices
            else:
                print(f"‚ùå Failed to fetch prices for chunk {i//chunk_size + 1}")
                continue
            
            # Add delay for Pro API rate limiting
            time.sleep(0.2)

        # Process each protocol in the chunk
        for protocol_name, address in [(k, v) for k, v in token_map.items() if v in chunk]:
            if address not in prices:
                continue
            
            price_data = prices[address]
            
            # Find protocol data from tvl_df for additional info
            protocol_match = tvl_df[tvl_df['name'] == protocol_name]
            if not protocol_match.empty:
                protocol_row = protocol_match.iloc[0]
                protocol_key = protocol_row.get('slug', protocol_name.lower().replace(' ', '_'))
                symbol = protocol_row.get('symbol', '').upper() if protocol_row.get('symbol') else ''
                tvl = protocol_row.get('tvl', 0)
                category = protocol_row.get('category', '')
            else:
                protocol_key = protocol_name.lower().replace(' ', '_')
                symbol = ''
                tvl = 0
                category = ''
            
            # Create simplified data structure with only available fields
            coingecko_data[protocol_key] = {
                'protocol_name': protocol_name,
                'symbol': symbol,
                'current_price_usd': price_data.get('usd', 0) or 0,
                'market_cap_usd': price_data.get('usd_market_cap', 0) or 0,
                'price_change_24h_percent': price_data.get('usd_24h_change', 0) or 0,
                'tvl': tvl,
                'category': category,
                'collection_timestamp': datetime.now()
            }
            successful_count += 1

    # Save cache
    save_cache(price_cache, 'coingecko_pro_price_cache.joblib', "CoinGecko Pro price cache")

    return coingecko_data, processed_count, successful_count

# Execute the collection
# Start from empty results so later cells can rely on `coingecko_data`
# existing even when the prerequisites are missing.
coingecko_data, processed_count, successful_count = {}, 0, 0

# An earlier step may have left `tvl_df` as None (failed collection), so
# check the type before touching `.empty` instead of only checking that the
# name exists.
inputs_ready = all(
    isinstance(globals().get(frame_name), pd.DataFrame) and not globals()[frame_name].empty
    for frame_name in ('tvl_df', 'jupiter_df')
)

if inputs_ready:
    coingecko_data, processed_count, successful_count = collect_coingecko_data_for_solana_protocols()

    if coingecko_data:
        # Guard the ratio even though a non-empty result implies at least
        # one processed protocol.
        success_rate = (successful_count / processed_count * 100) if processed_count else 0.0

        print(f"\n‚úÖ Successfully collected CoinGecko Pro data!")
        print(f"üìä Processed: {processed_count} protocols")
        print(f"üéØ Successful matches: {successful_count}")
        print(f"üìà Success rate: {success_rate:.1f}%")

        # Save raw data
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        save_raw_data(coingecko_data, f'solana_coingecko_enhanced_{timestamp}.joblib',
                     'Enhanced CoinGecko data for Solana DeFi protocols')

        # Display sample: the 15 entries with the largest market cap
        print(f"\nüìã Sample of collected data:")
        print(f"{'Protocol':<25} {'Symbol':<8} {'Price':<12} {'Market Cap':<15} {'TVL':<15}")
        print("=" * 75)
        sorted_data = sorted(coingecko_data.items(), key=lambda x: x[1].get('market_cap_usd', 0), reverse=True)
        for protocol_key, data in sorted_data[:15]:
            protocol_name = data.get('protocol_name', protocol_key)[:24]
            symbol = data.get('symbol', 'N/A')[:7]
            price = f"${data.get('current_price_usd', 0):.4f}"
            mcap = format_currency(data.get('market_cap_usd', 0))
            tvl = format_currency(data.get('tvl', 0))
            print(f"{protocol_name:<25} {symbol:<8} {price:<12} {mcap:<15} {tvl:<15}")

        # Summary stats
        # NOTE: protocols that share a token each contribute that token's
        # market cap, so the total can count one token more than once.
        total_market_cap = sum(data.get('market_cap_usd', 0) for data in coingecko_data.values())
        total_tvl = sum(data.get('tvl', 0) for data in coingecko_data.values())
        positive_24h = sum(1 for data in coingecko_data.values() if data.get('price_change_24h_percent', 0) > 0)

        print(f"\nüìä Portfolio Summary:")
        print(f"  ‚Ä¢ Total Market Cap: {format_currency(total_market_cap)}")
        print(f"  ‚Ä¢ Total TVL: {format_currency(total_tvl)}")
        print(f"  ‚Ä¢ Tokens with positive 24h change: {positive_24h}/{len(coingecko_data)}")

else:
    print("‚ùå Required data (tvl_df or jupiter_df) not available.")


üîç Collecting CoinGecko Price and Supply Data for Solana DeFi Protocols...

üîç Collecting CoinGecko Pro Data for Solana Protocols...
‚ö†Ô∏è 8 tokens not found in Jupiter list: Adrena Protocol (adx), Balanced Exchange (baln), Amun (amun), Divvy.Bet (dvy), Renec Lend (rel)
üíæ Saved cache: Unmatched tokens log ‚Üí ..\data\temp\unmatched_tokens_20250905_120827.joblib
üíæ Saved cache: CoinGecko Pro price cache ‚Üí ..\data\temp\coingecko_pro_price_cache.joblib

‚úÖ Successfully collected CoinGecko Pro data!
üìä Processed: 125 protocols
üéØ Successful matches: 84
üìà Success rate: 67.2%
üíæ Saved raw: Enhanced CoinGecko data for Solana DeFi protocols ‚Üí ..\data\api_responses\solana_coingecko_enhanced_20250905_120831.joblib

üìã Sample of collected data:
Protocol                  Symbol   Price        Market Cap      TVL            
Jupiter Perpetual Exchan  JUP      $0.4947      $1.54B          $2.40B         
Jupiter Staked SOL        JUP      $0.4947      $1.54B          $1.21

#### Step 5: Collect Helius Token Holder Data

In [9]:
print("\nüîç Collecting Helius Token Holder Data for Solana DeFi Protocols...")
print("=" * 50)

helius_url = f"https://mainnet.helius-rpc.com/?api-key={API_KEYS['helius']}"
headers = {"Content-Type": "application/json"}

def get_solana_token_holders(jupiter_df, coingecko_data, tvl_df):
    # Basic input validation
    if not (coingecko_data and isinstance(coingecko_data, dict)):
        print("‚ùå No CoinGecko data available.")
        return pd.DataFrame()
    if jupiter_df.empty or 'address' not in jupiter_df.columns:
        print("‚ùå Jupiter token data missing or incomplete.")
        return pd.DataFrame()
    if tvl_df.empty or 'symbol' not in tvl_df.columns:
        print("‚ùå TVL data missing or incomplete.")
        return pd.DataFrame()

    # Map protocol names to token addresses
    token_map, unmatched = {}, []
    for key, data in coingecko_data.items():
        symbol = data.get('symbol', '').lower().strip()
        protocol = data.get('protocol_name', key)
        match = jupiter_df[jupiter_df['symbol'].str.lower() == symbol]
        if not match.empty:
            token_map[protocol] = match.iloc[0]['address']
        else:
            unmatched.append((protocol, symbol, data.get('coingecko_id', '')))
    if unmatched:
        print(f"‚ö†Ô∏è {len(unmatched)} tokens not found in Jupiter list: {', '.join([f'{n} ({s})' for n, s, _ in unmatched[:5]])}")
        save_cache(unmatched, f'unmatched_tokens_{datetime.now():%Y%m%d_%H%M%S}.joblib', "Unmatched tokens log")

    # Collect holders
    holders, logs = [], []
    for name, address in tqdm(token_map.items(), desc="Fetching token holders"):
        try:
            payload = {
                "jsonrpc": "2.0", 
                "id": "1", 
                "method": "getTokenLargestAccounts", 
                "params": [address]
                }
            resp = requests.post(helius_url, json=payload, headers=headers).json()
            accounts = resp.get('result', {}).get('value', [])[:10]
            if not accounts:
                logs.append(f"‚ö†Ô∏è No accounts found for {name}")
                continue
            # Symbol fallback
            symbol = coingecko_data.get(name.lower().replace(' ', '_'), {}).get('symbol', '')
            if not symbol:
                match = tvl_df[tvl_df['name'].str.lower() == name.lower()]
                symbol = match.iloc[0]['symbol'] if not match.empty else name
            # Add holder info
            for rank, acc in enumerate(accounts, 1):
                holders.append({
                    'token_name': name, 'token_symbol': symbol, 'token_address': address,
                    'rank': rank, 'account_address': acc['address'],
                    'ui_amount': acc.get('uiAmount', 0), 'raw_amount': acc.get('amount', '0'),
                    'decimals': acc.get('decimals', 0), 'timestamp': datetime.now()
                })
            logs.append(f"‚úÖ {name}: {len(accounts)} accounts, top holder: {accounts[0].get('uiAmount', 0):,.0f} tokens")
            time.sleep(1.0)
        except Exception as e:
            logs.append(f"‚ùå Error processing {name}: {e}")

    save_cache(logs, f'token_holder_logs_{datetime.now():%Y%m%d_%H%M%S}.joblib', "Token holder collection logs")
    df = pd.DataFrame(holders)
    # Add percentage column
    if not df.empty:
        df['percentage_of_top10'] = df.groupby('token_name')['ui_amount'].transform(lambda x: x / x.sum() * 100 if x.sum() > 0 else 0)
    print(f"\n‚úÖ Fetched data for {len(token_map)} tokens. Records: {len(df)}")
    save_raw_data(df, f'solana_token_holders_{datetime.now():%Y%m%d_%H%M%S}.joblib', 'Solana token holders DataFrame')
    return df

# Run the holder collection and report whether any rows came back.

holders_data = get_solana_token_holders(jupiter_df, coingecko_data, tvl_df)
if holders_data.empty:
    print("\n‚ùå Failed to collect token holders data")
else:
    print("Data saved to joblib file for further analysis")
print("\n" + "=" * 50)


üîç Collecting Helius Token Holder Data for Solana DeFi Protocols...


Fetching token holders: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 80/80 [03:47<00:00,  2.84s/it]

üíæ Saved cache: Token holder collection logs ‚Üí ..\data\temp\token_holder_logs_20250905_121236.joblib

‚úÖ Fetched data for 80 tokens. Records: 760
üíæ Saved raw: Solana token holders DataFrame ‚Üí ..\data\api_responses\solana_token_holders_20250905_121236.joblib
Data saved to joblib file for further analysis




