In [1]:
import requests
import pandas as pd
from datetime import datetime, timedelta
import time
import sys
import os
from joblib import dump
from pathlib import Path

# Add parent directory to sys.path for config imports
parent_dir = str(Path().resolve().parent)
if parent_dir not in sys.path:
    sys.path.append(parent_dir)
from config.settings import API_KEYS, API_ENDPOINTS, SOLANA_PROTOCOLS, CACHE_CONFIG

print("📊 Solana DeFi Tracker - Data Collection")
print(f"Collecting data for {len(SOLANA_PROTOCOLS)} protocols")
print(f"Cache directory: {os.path.abspath('../data')}")
print(f"Collection timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

📊 Solana DeFi Tracker - Data Collection
Collecting data for 7 protocols
Cache directory: c:\Users\DELL\Documents\Data Science and Analytics\Web3\grp_project\data
Collection timestamp: 2025-08-31 13:38:07


#### Verify Cache Directories

In [2]:
# Verify cache directories exist, create if missing
cache_dirs = ['../data/api_responses', '../data/processed', '../data/temp']
for cache_dir in cache_dirs:
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir, exist_ok=True)
        print(f"✅ Created cache directory: {cache_dir}")
    else:
        print(f"✅ Cache directory exists: {cache_dir}")

✅ Cache directory exists: ../data/api_responses
✅ Cache directory exists: ../data/processed
✅ Cache directory exists: ../data/temp


#### Helper Functions

In [3]:
def make_request(url, headers=None, params=None, max_retries=3, is_post=False):
    """Make API request with retry logic"""
    for attempt in range(max_retries):
        try:
            if is_post:
                response = requests.post(url, headers=headers, json=params, timeout=30)
            else:
                response = requests.get(url, headers=headers, params=params, timeout=30)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            print(f"❌ Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff
            else:
                return None

def save_raw_data(data, filename, description=""):
    """Save raw API response to api_responses directory"""
    filepath = os.path.normpath(f"../data/api_responses/{filename}")
    dump(data, filepath)
    print(f"💾 Saved raw: {description} → {filepath}")
    return filepath

def save_processed_data(data, filename, description=""):
    """Save processed data to processed directory"""
    filepath = os.path.normpath(f"../data/processed/{filename}")
    dump(data, filepath)
    print(f"💾 Saved processed: {description} → {filepath}")
    return filepath

def format_currency(amount):
    """Format currency amount with appropriate units (K, M, B)"""
    if amount is None or amount == 0:
        return "$0"
    
    if amount >= 1_000_000_000:
        return f"${amount/1_000_000_000:.2f}B"
    elif amount >= 1_000_000:
        return f"${amount/1_000_000:.2f}M"
    elif amount >= 1_000:
        return f"${amount/1_000:.2f}K"
    else:
        return f"${amount:.2f}"

#### Step 1: Collect CoinGecko Price and Supply Data

In [None]:
print("\n🔍 Collecting CoinGecko Price and Supply Data...")
# Get token IDs for protocols with CoinGecko IDs
token_ids = [protocol_info['coingecko_id'] for protocol_key, protocol_info in SOLANA_PROTOCOLS.items() if protocol_info['coingecko_id']]
print(f"Token IDs to collect: {token_ids}")

coingecko_data = {}
if API_KEYS.get('coingecko'):
    headers = {'X-Cg-Pro-Api-Key': API_KEYS['coingecko']}
    
    # Use /coins/{id} endpoint for detailed data including supply
    for token_id in token_ids:
        coin_url = f"{API_ENDPOINTS['coingecko']['base_url']}/coins/{token_id}"
        params = {
            'localization': 'false',
            'tickers': 'false',
            'market_data': 'true',
            'community_data': 'false',
            'developer_data': 'false',
            'sparkline': 'false'
        }
        
        coin_data = make_request(coin_url, headers=headers, params=params)
        if coin_data:
            coingecko_data[token_id] = {
                'usd': coin_data.get('market_data', {}).get('current_price', {}).get('usd', 0),
                'usd_market_cap': coin_data.get('market_data', {}).get('market_cap', {}).get('usd', 0),
                'usd_24h_vol': coin_data.get('market_data', {}).get('total_volume', {}).get('usd', 0),
                'usd_24h_change': coin_data.get('market_data', {}).get('price_change_percentage_24h', 0),
                'circulating_supply': coin_data.get('market_data', {}).get('circulating_supply', 0),
                'total_supply': coin_data.get('market_data', {}).get('total_supply', 0)
            }
        time.sleep(1)  # Respect CoinGecko rate limits (~10-50 calls/min for free tier)
    
    if coingecko_data:
        print(f"✅ Collected data for {len(coingecko_data)} tokens")
        # Save raw data
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        save_raw_data(coingecko_data, f'coingecko_data_{timestamp}.joblib', 'CoinGecko price and supply data')
        
        # Display sample data
        for token, data in list(coingecko_data.items())[:3]:
            print(f" • {token}: ${data.get('usd', 'N/A')} (MCap: ${data.get('usd_market_cap', 'N/A'):,}, Circulating: {data.get('circulating_supply', 'N/A'):,.0f})")
    else:
        print("❌ CoinGecko API failed for all tokens")
else:
    print("⚠️ CoinGecko API key not found in .env")

#### Step 2: Collect DefiLlama Protocol Data (TVL)

In [8]:
print("\n🔍 Collecting Solana Protocol TVL Data...")

def get_all_solana_tvl_data():
    """
    Collect TVL data for Solana DeFi protocols from DefiLlama, excluding CEX
    """
    print("\n🔍 Collecting Solana DeFi Protocol TVL Data...")
    
    # DefiLlama API endpoint for all protocols
    base_url = "https://api.llama.fi"
    protocols_url = f"{base_url}/protocols"
    
    # Make the API request
    all_protocols = make_request(protocols_url)
    
    if not all_protocols:
        print("❌ DefiLlama API failed")
        return None
    
    print(f"✅ DefiLlama API working! Found {len(all_protocols)} total protocols")
    
    # Filter for Solana DeFi protocols (excluding CEX)
    solana_protocols = []
    for protocol in all_protocols:
        # Check if protocol is on Solana
        chains = protocol.get('chains', [])
        category = protocol.get('category', '').lower()
        
        # Multiple ways to identify Solana protocols
        is_solana = (
            'Solana' in chains or 
            'solana' in chains or
            any('solana' in str(chain).lower() for chain in chains) or
            protocol.get('chain') == 'Solana'
        )
        
        # Exclude CEX protocols
        if is_solana and category != 'cex':
            tvl_value = protocol.get('tvl', 0)
            # Handle None values
            if tvl_value is None:
                tvl_value = 0
                
            solana_protocols.append({
                'name': protocol.get('name', 'Unknown'),
                'slug': protocol.get('slug', ''),
                'tvl': tvl_value,
                'chains': chains,
                'category': protocol.get('category', 'Unknown'),
                'change_1h': protocol.get('change_1h'),
                'change_1d': protocol.get('change_1d'),
                'change_7d': protocol.get('change_7d'),
                'mcap': protocol.get('mcap'),
                'symbol': protocol.get('symbol', ''),
                'url': protocol.get('url', ''),
                'description': protocol.get('description', ''),
                'timestamp': datetime.now()
            })
    
    if not solana_protocols:
        print("❌ No Solana DeFi protocols found")
        return None
    
    # Sort by TVL descending
    solana_protocols.sort(key=lambda x: x['tvl'], reverse=True)
    
    print(f"🌟 Found {len(solana_protocols)} Solana DeFi protocols:")
    
    # Calculate statistics
    total_tvl = sum(p['tvl'] for p in solana_protocols)
    active_protocols = sum(1 for p in solana_protocols if p['tvl'] > 0)
    
    print(f"📊 Total Solana DeFi TVL: ${total_tvl:,.0f}")
    print(f"📈 Active protocols (TVL > 0): {active_protocols}/{len(solana_protocols)}")
    
    # Display top protocols
    print(f"\n{'Rank':<5} {'Protocol':<25} {'TVL':<15} {'Category':<20} {'1d Change':<10}")
    print("=" * 85)
    
    for i, protocol in enumerate(solana_protocols[:20], 1):  # Top 20
        tvl_formatted = format_currency(protocol['tvl'])
        change_1d = protocol.get('change_1d')
        change_str = f"{change_1d:+.1f}%" if change_1d is not None else "N/A"
        
        print(f"{i:<5} {protocol['name'][:24]:<25} {tvl_formatted:<15} "
              f"{protocol['category'][:19]:<20} {change_str:<10}")
    
    # Show category breakdown
    category_breakdown = {}
    for protocol in solana_protocols:
        category = protocol['category']
        if category not in category_breakdown:
            category_breakdown[category] = {'count': 0, 'tvl': 0}
        category_breakdown[category]['count'] += 1
        category_breakdown[category]['tvl'] += protocol['tvl']
    
    print(f"\n📋 Category Breakdown:")
    print(f"{'Category':<25} {'Count':<8} {'Total TVL':<15}")
    print("-" * 50)
    
    sorted_categories = sorted(category_breakdown.items(), 
                              key=lambda x: x[1]['tvl'], reverse=True)
    
    for category, data in sorted_categories[:10]:  # Top 10 categories
        print(f"{category[:24]:<25} {data['count']:<8} {format_currency(data['tvl']):<15}")
    
    # Create summary data structure
    solana_tvl_data = {
        'protocols': solana_protocols,
        'summary': {
            'total_protocols': len(solana_protocols),
            'active_protocols': active_protocols,
            'total_tvl': total_tvl,
            'category_breakdown': category_breakdown,
            'collection_timestamp': datetime.now()
        }
    }
    
    # Save data
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'solana_defi_tvl_{timestamp}.joblib'
    save_raw_data(solana_tvl_data, filename, 'Solana DeFi protocols TVL data')
    
    return solana_tvl_data

print("🌟 Solana DeFi Ecosystem TVL Tracker")
print("=" * 50)
    
tvl_data = get_all_solana_tvl_data()
    
if tvl_data:
    print(f"\n✅ Successfully collected TVL data for {len(tvl_data['protocols'])} Solana DeFi protocols")
    print("📁 Data saved to joblib file for further analysis")
        
    # Show file info
    protocols_count = tvl_data['summary']['total_protocols']
    total_tvl = tvl_data['summary']['total_tvl']
    print(f"📊 Dataset includes {protocols_count} protocols worth ${total_tvl:,.0f} in total TVL")
else:
    print("\n❌ Failed to collect TVL data")
    
print("\n" + "=" * 50)


🔍 Collecting Solana Protocol TVL Data...
🌟 Solana DeFi Ecosystem TVL Tracker

🔍 Collecting Solana DeFi Protocol TVL Data...
✅ DefiLlama API working! Found 6344 total protocols
🌟 Found 254 Solana DeFi protocols:
📊 Total Solana DeFi TVL: $81,319,976,918
📈 Active protocols (TVL > 0): 222/254

Rank  Protocol                  TVL             Category             1d Change 
1     Lido                      $39.19B         Liquid Staking       +2.1%     
2     Jito Liquid Staking       $3.04B          Liquid Staking       +0.3%     
3     Portal                    $2.76B          Bridge               -0.4%     
4     Kamino Lend               $2.69B          Lending              +0.4%     
5     BlackRock BUIDL           $2.41B          RWA                  -0.0%     
6     Binance Staked SOL        $2.34B          Liquid Staking       +0.6%     
7     Sanctum Validator LSTs    $2.33B          Liquid Staking       +0.7%     
8     Jupiter Perpetual Exchan  $2.29B          Derivatives         

#### Step 3: Collect DefiLlama Revenue Data

In [5]:
print("\n🔍 Collecting Solana Protocol Revenue Data...")

def get_solana_revenue_data():
    """
    Fetch protocol revenue data for all Solana-based protocols
    """
    print("\n🔍 Collecting Solana Protocol Revenue Data...")
    
    # API endpoint for Solana chain revenue data
    base_url = "https://api.llama.fi"
    revenue_url = f"{base_url}/overview/fees/solana"
    
    # Parameters to get revenue data
    params = {
        'dataType': 'dailyRevenue',
        'excludeTotalDataChart': 'true',
        'excludeTotalDataChartBreakdown': 'true'
    }
    
    # Make the API request
    data = make_request(revenue_url, params)
    
    if not data:
        print("❌ Solana Revenue API failed")
        return None
    
    print(f"✅ Collected Solana revenue data")
    print(f"Total protocols found: {len(data.get('protocols', []))}")
    
    # Process and store revenue data
    solana_revenue = {}
    protocols = data.get('protocols', [])
    
    if not protocols:
        print("No protocol data found.")
        return None
    
    # Sort protocols by total24h in descending order
    sorted_protocols = sorted(protocols, 
                            key=lambda x: x.get('total24h', 0) or 0, 
                            reverse=True)
    
    # Store data for each protocol
    for protocol in sorted_protocols:
        protocol_name = protocol.get('name', 'Unknown')
        protocol_key = protocol_name.lower().replace(' ', '_')
        
        solana_revenue[protocol_key] = {
            'name': protocol['name'],
            'total_24h': protocol.get('total24h', 0),
            'total_7d': protocol.get('total7d', 0),
            'total_30d': protocol.get('total30d', 0),
            'total_all_time': protocol.get('totalAllTime', 0),
            'chain': 'solana',
            'timestamp': datetime.now()
        }
    
    # Display summary
    protocols_with_data = sum(1 for p in solana_revenue.values() if p['total_24h'] > 0)
    
    print(f"💵 Found revenue data for {protocols_with_data} active protocols:")
    
    # Show top 10
    top_protocols = sorted(solana_revenue.items(), 
                          key=lambda x: x[1]['total_24h'], 
                          reverse=True)[:10]
    
    print(f"\n{'Protocol':<25} {'24h Revenue':<15} {'7d Revenue':<15} {'30d Revenue':<15}")
    print("=" * 75)
    
    for key, data in top_protocols:
        if data['total_24h'] > 0:
            print(f"{data['name'][:24]:<25} {format_currency(data['total_24h']):<15} "
                  f"{format_currency(data['total_7d']):<15} {format_currency(data['total_30d']):<15}")
    
    # Save raw data
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'solana_revenue_{timestamp}.joblib'
    save_raw_data(solana_revenue, filename, 'Solana revenue data')
    
    return solana_revenue


print("📊 Solana Protocol Revenue Tracker")
print("=" * 50)
    
revenue_data = get_solana_revenue_data()
    
if revenue_data:
    print(f"\n✅ Successfully collected revenue data for {len(revenue_data)} protocols")
    print("Data saved to joblib file for further analysis")
else:
    print("\n❌ Failed to collect revenue data")
    
print("\n" + "=" * 50)


🔍 Collecting Solana Protocol Revenue Data...
📊 Solana Protocol Revenue Tracker

🔍 Collecting Solana Protocol Revenue Data...
✅ Collected Solana revenue data
Total protocols found: 117
💵 Found revenue data for 97 active protocols:

Protocol                  24h Revenue     7d Revenue      30d Revenue    
Jupiter Perpetual Exchan  $1.98M          $27.61M         $94.54M        
Axiom                     $1.26M          $11.92M         $53.06M        
pump.fun                  $1.19M          $11.37M         $39.98M        
Solana                    $1.00M          $9.91M          $41.68M        
PumpSwap                  $839.19K        $7.43M          $28.81M        
Meteora DLMM              $677.93K        $8.73M          $54.30M        
Raydium AMM               $547.61K        $4.72M          $26.23M        
Meteora DAMM V2           $529.73K        $4.82M          $32.05M        
Jito MEV Tips             $521.00K        $6.73M          $34.55M        
Phantom Wallet            $4

#### Step 4: Collect Solana Fees Data

In [6]:
def get_solana_fees_data():
    """
    Fetch protocol fees data for all Solana-based protocols
    """
    print("\n🔍 Collecting Solana Protocol Fees Data...")
    
    # API endpoint for Solana chain fees data
    base_url = "https://api.llama.fi"
    fees_url = f"{base_url}/overview/fees/solana"
    
    # Parameters to get fees data
    params = {
        'dataType': 'dailyFees',
        'excludeTotalDataChart': 'true',
        'excludeTotalDataChartBreakdown': 'true'
    }
    
    # Make the API request
    data = make_request(fees_url, params)
    
    if not data:
        print("❌ Solana Fees API failed")
        return None
    
    print(f"✅ Collected Solana fees data")
    print(f"Total protocols found: {len(data.get('protocols', []))}")
    
    # Process and store fees data
    solana_fees = {}
    protocols = data.get('protocols', [])
    
    if not protocols:
        print("No protocol data found.")
        return None
    
    # Sort protocols by total24h in descending order
    sorted_protocols = sorted(protocols, 
                            key=lambda x: x.get('total24h', 0) or 0, 
                            reverse=True)
    
    # Store data for each protocol
    for protocol in sorted_protocols:
        protocol_name = protocol.get('name', 'Unknown')
        protocol_key = protocol_name.lower().replace(' ', '_')
        
        solana_fees[protocol_key] = {
            'name': protocol['name'],
            'total_24h': protocol.get('total24h', 0),
            'total_7d': protocol.get('total7d', 0),
            'total_30d': protocol.get('total30d', 0),
            'total_all_time': protocol.get('totalAllTime', 0),
            'chain': 'solana',
            'timestamp': datetime.now()
        }
    
    # Display summary
    protocols_with_data = sum(1 for p in solana_fees.values() if p['total_24h'] > 0)
    
    print(f"💰 Found fee data for {protocols_with_data} active protocols:")
    
    # Show top 10
    top_protocols = sorted(solana_fees.items(), 
                          key=lambda x: x[1]['total_24h'], 
                          reverse=True)[:10]
    
    print(f"\n{'Protocol':<25} {'24h Fees':<15} {'7d Fees':<15} {'30d Fees':<15}")
    print("=" * 70)
    
    for key, data in top_protocols:
        if data['total_24h'] > 0:
            print(f"{data['name'][:24]:<25} {format_currency(data['total_24h']):<15} "
                  f"{format_currency(data['total_7d']):<15} {format_currency(data['total_30d']):<15}")
    
    # Save raw data
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'solana_fees_{timestamp}.joblib'
    save_raw_data(solana_fees, filename, 'Solana fees data')
    
    return solana_fees



print("📊 Solana Protocol Fees Tracker")
print("=" * 50)
    
fees_data = get_solana_fees_data()
    
if fees_data:
    print(f"\n✅ Successfully collected fees data for {len(fees_data)} protocols")
    print("Data saved to joblib file for further analysis")
else:
    print("\n❌ Failed to collect fees data")
    
print("\n" + "=" * 50)

📊 Solana Protocol Fees Tracker

🔍 Collecting Solana Protocol Fees Data...
✅ Collected Solana fees data
Total protocols found: 117
💰 Found fee data for 97 active protocols:

Protocol                  24h Fees        7d Fees         30d Fees       
Jupiter Perpetual Exchan  $1.98M          $27.61M         $94.54M        
Axiom                     $1.26M          $11.92M         $53.06M        
pump.fun                  $1.19M          $11.37M         $39.98M        
Solana                    $1.00M          $9.91M          $41.68M        
PumpSwap                  $839.19K        $7.43M          $28.81M        
Meteora DLMM              $677.93K        $8.73M          $54.30M        
Raydium AMM               $547.61K        $4.72M          $26.23M        
Meteora DAMM V2           $529.73K        $4.82M          $32.05M        
Jito MEV Tips             $521.00K        $6.73M          $34.55M        
Phantom Wallet            $416.34K        $3.75M          $16.66M        
💾 Saved raw: 

#### Step 5: Collect Helius Token Holder Data

In [None]:
print("\n🔍 Collecting Helius Token Holder Data...")

# Helius API setup
helius_url = f"https://mainnet.helius-rpc.com/?api-key={API_KEYS['helius']}"
headers = {"Content-Type": "application/json"}

# Solana protocol tokens
SOLANA_PROTOCOL_TOKENS = {
    # DeFi Protocols
    "Jupiter (JUP)": "JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN",
    "Marinade (MNDE)": "MNDEFzGvMt87ueuHvVU9VcTqsAP5b3fTGPsHuuPA5ey",
    "Raydium (RAY)": "4k3Dyjzvzp8eMZWUXbBCjEvwSkkk59S5iCNLY3QrkX6R",
    "Orca (ORCA)": "orcaEKTdK7LKz57vaAYr9QeNsVEPfiu6QeMU1kektZE",
    "Serum (SRM)": "SRMuApVNdxXokk5GT7XD5cUUgXMBCoAz2LHeuAoKWRt",
    "Saber (SBR)": "Saber2gLauYim4Mvftnrasomsv6NvAuncvMEZwcLpD1",
    "Drift (DRIFT)": "DriFtupJYLTosbwoN8koMbEYSx54aFAVLddWsbksjwg7",
    "Mango (MNGO)": "MangoCzJ36AjZyKwVj3VnYU4GTonjfVEnJmvvWaxLac",
    # Infrastructure & Tools
    "Pyth (PYTH)": "HZ1JovNiVvGrGNiiYvEozEVgZ58xaU3RKwX8eACQBCt3",
    "Helium (HNT)": "hntyVP6YFm1Hg25TN9WGLqM12b8TQmcknKrdu1oxWux",
    "Jito (JTO)": "jtojtomepa8beP8AuQc6eXt5FriJwfFMwQx2v2f9mCL",
    "Bonk (BONK)": "DezXAZ8z7PnrnRJjz3wXBoRgixCa6xjnB7YaB1pPB263",
    # Gaming & NFTs
    "Star Atlas (ATLAS)": "ATLASXmbPQxBUYbxPsV97usA3fPQYEqzQBUHgiFCUsXx",
    "Star Atlas DAO (POLIS)": "poLisWXnNRwC6oBu1vHiuKQzFjGL4XDSu4g9qjz9qVk",
    "Step Finance (STEP)": "StepAscQoEioFxxWGnh2sLBDFp9d8rvKz2Yp39iDpyT",
    # Staking & Liquid Staking
    "Lido (stSOL)": "7dHbWXmci3dT8UFYWYZweBLXgycu7Y3iL6trKn1Y7ARj",
    "Marinade Staked SOL (mSOL)": "mSoLzYCxHdYgdzU16g5QSh3i5K3z3KZK7ytfqcJm7So",
}

# Function to get largest accounts for a token
def get_largest_accounts(token_address):
    payload = {
        "jsonrpc": "2.0",
        "id": "1",
        "method": "getTokenLargestAccounts",
        "params": [token_address]
    }
    response = requests.post(helius_url, json=payload, headers=headers)
    return response.json()

# Function to format and print results in a table
def print_largest_accounts(token_name, token_address, result):
    if 'result' not in result or 'value' not in result['result']:
        print(f"\nError fetching data for {token_name} ({token_address}): {result.get('error', 'No data returned')}")
        return

    accounts = result['result']['value'][:10]  # Limit to top 10 accounts
    if not accounts:
        print(f"\nNo accounts found for {token_name} ({token_address})")
        return

    # Prepare table data
    table_data = [
        [acc['address'], f"{acc['uiAmount']:.6f}", acc['amount']]
        for acc in accounts
    ]
    headers = ["Address", "Amount (UI)", "Amount (Raw)"]

    # Print table
    print(f"\nTop 10 Largest Accounts for {token_name} ({token_address}):")
    print(tabulate(table_data, headers=headers, tablefmt="grid", stralign="left", numalign="right"))

# Loop through tokens and get largest accounts
for token_name, token_address in SOLANA_PROTOCOL_TOKENS.items():
    result = get_largest_accounts(token_address)
    print_largest_accounts(token_name, token_address, result)

#### Step 6: Consolidate and Process Data

In [None]:
print("\n⚙️ Consolidating and Processing Data...")
# Initialize list for consolidated data
consolidated_data = []
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# Combine data from all sources
for protocol_key, protocol_info in SOLANA_PROTOCOLS.items():
    metrics = {
        'protocol': protocol_info['name'],
        'symbol': protocol_info['token_symbol'],
        'category': protocol_info['category'],
        'timestamp': datetime.now(),
        'price_usd': 0,
        'market_cap': 0,
        'circulating_supply': 0,
        'total_supply': 0,
        'volume_24h': 0,
        'price_change_24h': 0,
        'tvl': 0,
        'fees_24h': 0,
        'revenue_24h': 0,
        'total_holders_helius': 0,
        'top_100_holders_balance': 0,
        'top_100_holders_share': 0,
    }
    
    # Add CoinGecko data
    if protocol_info['coingecko_id'] and coingecko_data and protocol_info['coingecko_id'] in coingecko_data:
        token_data = coingecko_data[protocol_info['coingecko_id']]
        metrics.update({
            'price_usd': token_data.get('usd', 0),
            'market_cap': token_data.get('usd_market_cap', 0),
            'circulating_supply': token_data.get('circulating_supply', 0),
            'total_supply': token_data.get('total_supply', 0),
            'volume_24h': token_data.get('usd_24h_vol', 0),
            'price_change_24h': token_data.get('usd_24h_change', 0)
        })
    
    # Add DefiLlama protocol data
    if defillama_protocols and protocol_key in defillama_protocols:
        metrics['tvl'] = defillama_protocols[protocol_key]['tvl']
    
    # Add DefiLlama fees data
    if defillama_fees and protocol_key in defillama_fees:
        metrics.update({
            'fees_24h': defillama_fees[protocol_key]['total_24h'],
            'revenue_24h': defillama_fees[protocol_key]['revenue_24h']
        })
    
    # Add Helius holder data
    if helius_data and protocol_key in helius_data:
        metrics.update({
            'total_holders_helius': helius_data[protocol_key]['total_holders'],
            'top_100_holders_balance': helius_data[protocol_key]['top_100_holders_balance'],
            'top_100_holders_share': helius_data[protocol_key]['top_100_holders_share']
        })
    
    consolidated_data.append(metrics)

# Create DataFrame
df_consolidated = pd.DataFrame(consolidated_data)

if not df_consolidated.empty:
    print("✅ Consolidated DataFrame created!")
    print(f"📊 Shape: {df_consolidated.shape}")
    print("\nSample data:")
    print(df_consolidated[['protocol', 'price_usd', 'market_cap', 'circulating_supply', 'tvl', 'fees_24h', 'revenue_24h', 'total_holders_helius', 'top_100_holders_balance', 'top_100_holders_share']].head())
    
    # Save processed data
    save_processed_data(df_consolidated, f'consolidated_metrics_{timestamp}.joblib', 'Consolidated metrics DataFrame')
else:
    print("⚠️ No data collected for consolidation")

#### Data Quality Check

In [None]:
print("\n🔍 Data Quality Check...")
# Check for missing data
missing_data = df_consolidated.isna().sum()
print("\nMissing values per column:")
print(missing_data)

# Check data completeness for key metrics
completeness = {}
for protocol_key, protocol_info in SOLANA_PROTOCOLS.items():
    protocol_data = df_consolidated[df_consolidated['protocol'] == protocol_info['name']]
    completeness[protocol_info['name']] = {
        'has_price': not protocol_data['price_usd'].isna().all() if not protocol_data.empty else False,
        'has_market_cap': not protocol_data['market_cap'].isna().all() if not protocol_data.empty else False,
        'has_circulating_supply': not protocol_data['circulating_supply'].isna().all() if not protocol_data.empty else False,
        'has_tvl': not protocol_data['tvl'].isna().all() if not protocol_data.empty else False,
        'has_fees': not protocol_data['fees_24h'].isna().all() if not protocol_data.empty else False,
        'has_revenue': not protocol_data['revenue_24h'].isna().all() if not protocol_data.empty else False,
        'has_holders_helius': not protocol_data['total_holders_helius'].isna().all() if not protocol_data.empty else False
    }

print("\nData completeness per protocol:")
for protocol, status in completeness.items():
    print(f" • {protocol}: Price={'✅' if status['has_price'] else '❌'}, MCap={'✅' if status['has_market_cap'] else '❌'}, Circulating={'✅' if status['has_circulating_supply'] else '❌'}, TVL={'✅' if status['has_tvl'] else '❌'}, Fees={'✅' if status['has_fees'] else '❌'}, Revenue={'✅' if status['has_revenue'] else '❌'}, Holders (Helius)={'✅' if status['has_holders_helius'] else '❌'}")

#### Next Steps 

In [None]:
print("\n📋 Data Collection Summary")
print("=" * 50)
# Summarize collected data
summary = {
    'timestamp': datetime.now(),
    'protocols_collected': len(df_consolidated),
    'coingecko_tokens': len(coingecko_data),
    'defillama_protocols': len(defillama_protocols),
    'defillama_fees_protocols': len(defillama_fees),
    'helius_protocols': len(helius_data),
    'cached_files': os.listdir('../data/api_responses/') + os.listdir('../data/processed/')
}

print("Collection Summary:")
print(f" • Protocols with data: {summary['protocols_collected']}")
print(f" • CoinGecko tokens: {summary['coingecko_tokens']}")
print(f" • DefiLlama protocols: {summary['defillama_protocols']}")
print(f" • DefiLlama fees/revenue protocols: {summary['defillama_fees_protocols']}")
print(f" • Helius holder protocols: {summary['helius_protocols']}")
print(f" • Cached files: {len(summary['cached_files'])}")

print("\n📁 Cached Files Created:")
for file in summary['cached_files']:
    file_path = os.path.normpath(f"../data/{'api_responses' if file.startswith('coingecko') or file.startswith('defillama') or file.startswith('helius') else 'processed'}/{file}")
    file_size = os.path.getsize(file_path)
    print(f" • {file} ({file_size:,} bytes)")

# Save summary
save_processed_data(summary, f'collection_summary_{timestamp}.joblib', 'Data collection summary')


print("\n🎯 Data collection complete! Check ../data/ for all saved files.")