In [11]:
import pandas as pd
import numpy as np
import os
import sys
import joblib
from pathlib import Path
from datetime import datetime
import glob

# Make the parent of the current working directory importable.
# NOTE(review): presumably this is the project root when the notebook is run
# from a sub-folder — confirm against the repository layout.
parent_dir = str(Path().resolve().parent)
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Data locations, all relative to the current working directory.
processed_dir = os.path.normpath('../data/processed')
api_responses_dir = os.path.normpath('../data/api_responses')
streamlit_data_dir = os.path.normpath('../data/streamlit')

# Create the two output directories if they are missing.
# api_responses_dir is only read from below and is not created here.
os.makedirs(processed_dir, exist_ok=True)
os.makedirs(streamlit_data_dir, exist_ok=True)

# Banner with the local wall-clock time at which this run started.
print("📊 Solana DeFi Tracker - Analysis for Streamlit")
print(f"Analysis timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

def load_latest_file(directory, pattern):
    """Load the newest joblib file in ``directory`` matching ``pattern``.

    "Newest" is decided by ``os.path.getctime`` of each matching path.

    Args:
        directory: Folder to search.
        pattern: Glob pattern applied inside ``directory``.

    Returns:
        A ``(loaded_object, file_path)`` tuple, or ``(None, None)`` when no
        file matches the pattern.
    """
    candidates = glob.glob(os.path.join(directory, pattern))
    if candidates:
        newest_path = max(candidates, key=os.path.getctime)
        return joblib.load(newest_path), newest_path
    return None, None

def format_currency(amount):
    """Format a dollar amount with a B / M / K unit suffix.

    The unit is chosen from the magnitude of ``amount``, so negative values
    are scaled the same way as positive ones and carry a leading minus sign
    (for example ``-$2.50M``).

    Args:
        amount: Numeric amount in USD, or ``None``.

    Returns:
        ``"$0"`` for ``None`` or zero, otherwise a string with two decimals
        and, from one thousand upwards, a ``K``, ``M`` or ``B`` suffix.
    """
    if amount is None or amount == 0:
        return "$0"

    sign = "-" if amount < 0 else ""
    magnitude = abs(amount)

    # Largest unit first so the first threshold reached wins
    units = ((1_000_000_000, "B"), (1_000_000, "M"), (1_000, "K"))
    for threshold, suffix in units:
        if magnitude >= threshold:
            return f"{sign}${magnitude / threshold:.2f}{suffix}"
    return f"{sign}${magnitude:.2f}"

print("\n🔍 Loading Collected Data...")
# Load the most recent snapshot of each dataset; a name is None when no file
# matches its pattern. The file path returned alongside is discarded.
tvl_data, _ = load_latest_file(api_responses_dir, 'solana_defi_tvl_*.joblib')
revenue_data, _ = load_latest_file(api_responses_dir, 'solana_revenue_*.joblib')
fees_data, _ = load_latest_file(api_responses_dir, 'solana_fees_*.joblib')
coingecko_data, _ = load_latest_file(api_responses_dir, 'solana_coingecko_enhanced_*.joblib')
holders_data, _ = load_latest_file(api_responses_dir, 'solana_token_holders_*.joblib')

# Convert fees_data to DataFrame if it's a list
# NOTE(review): revenue_data gets no equivalent conversion — confirm it is
# always stored as a DataFrame.
if isinstance(fees_data, list):
    fees_data = pd.DataFrame(fees_data)

# Convert CoinGecko data to DataFrame. A dict is treated as
# {protocol_key: record}; a list or DataFrame is used as rows directly.
# coingecko_df stays an empty DataFrame when nothing usable was loaded.
coingecko_df = pd.DataFrame()
if isinstance(coingecko_data, dict):
    coingecko_df = pd.DataFrame.from_dict(coingecko_data, orient='index').reset_index()
    coingecko_df.rename(columns={'index': 'protocol_key'}, inplace=True)
elif isinstance(coingecko_data, (list, pd.DataFrame)):
    coingecko_df = pd.DataFrame(coingecko_data) if isinstance(coingecko_data, list) else coingecko_data

# Report the shape of each dataset, or 'Not available' when it is not a
# (non-empty, for CoinGecko) DataFrame.
print(f"✅ Loaded TVL: {tvl_data.shape if isinstance(tvl_data, pd.DataFrame) else 'Not available'}")
print(f"✅ Loaded Revenue: {revenue_data.shape if isinstance(revenue_data, pd.DataFrame) else 'Not available'}")
print(f"✅ Loaded Fees: {fees_data.shape if isinstance(fees_data, pd.DataFrame) else 'Not available'}")
print(f"✅ Loaded CoinGecko: {coingecko_df.shape if not coingecko_df.empty else 'Not available'}")
print(f"✅ Loaded Holders: {holders_data.shape if isinstance(holders_data, pd.DataFrame) else 'Not available'}")

# Save raw revenue and fees as DataFrames.
# `timestamp` is reused below as the suffix of every file written by this run.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
# revenue_clean is only defined when revenue data is present;
# create_streamlit_datasets() reads it under the same condition.
if isinstance(revenue_data, pd.DataFrame) and not revenue_data.empty:
    revenue_clean = revenue_data.copy()
    # Normalised (lower-cased, stripped) protocol name
    revenue_clean['protocol_clean'] = revenue_clean['protocol'].astype(str).str.lower().str.strip()
    revenue_clean = revenue_clean.rename(columns={
        'total_24h': 'Revenue_24h',
        'total_7d': 'Revenue_7d',
        'total_30d': 'Revenue_30d'
    })
    revenue_filepath = os.path.join(streamlit_data_dir, f'raw_revenue_{timestamp}.joblib')
    joblib.dump(revenue_clean, revenue_filepath, compress='zlib')
    print(f"💾 Saved raw revenue: {revenue_clean.shape}")

# Same treatment for fees; fees_clean is likewise only defined when fees
# data is present.
if isinstance(fees_data, pd.DataFrame) and not fees_data.empty:
    fees_clean = fees_data.copy()
    fees_clean['protocol_clean'] = fees_clean['protocol'].astype(str).str.lower().str.strip()
    fees_clean = fees_clean.rename(columns={
        'total_24h': 'Fees_24h',
        'total_7d': 'Fees_7d',
        'total_30d': 'Fees_30d'
    })
    fees_filepath = os.path.join(streamlit_data_dir, f'raw_fees_{timestamp}.joblib')
    joblib.dump(fees_clean, fees_filepath, compress='zlib')
    print(f"💾 Saved raw fees: {fees_clean.shape}")

def process_tvl_data(tvl_df=None, cg_df=None):
    """Process TVL data for the Overview tab.

    Args:
        tvl_df: TVL frame with at least a ``name`` column. Defaults to the
            module-level ``tvl_data``.
        cg_df: CoinGecko frame carrying ``protocol_name`` or ``protocol_key``.
            Defaults to the module-level ``coingecko_df``.

    Returns:
        A copy of the TVL frame with a ``protocol_clean`` join key, any
        matching CoinGecko columns, numeric ``tvl`` and ``market_cap_usd``
        (missing or unparsable values become 0) and ``mcap_tvl_ratio``
        (NaN where ``tvl`` is not positive). An empty DataFrame is returned
        when there is no TVL data.
    """
    # Fall back to the notebook-level frames when no explicit input is given
    if tvl_df is None:
        tvl_df = tvl_data
    if cg_df is None:
        cg_df = coingecko_df

    if not isinstance(tvl_df, pd.DataFrame) or tvl_df.empty:
        print("❌ No TVL data available")
        return pd.DataFrame()

    df_base = tvl_df.copy()
    df_base['protocol_clean'] = df_base['name'].astype(str).str.lower().str.strip()

    print(f"\n📊 Processing {len(df_base)} protocols for TVL data")

    # Merge CoinGecko data on the normalised protocol name
    if isinstance(cg_df, pd.DataFrame) and not cg_df.empty:
        cg = cg_df.copy()
        if 'protocol_name' in cg.columns:
            cg['protocol_clean'] = cg['protocol_name'].astype(str).str.lower().str.strip()
        elif 'protocol_key' in cg.columns:
            cg['protocol_clean'] = cg['protocol_key'].astype(str).str.lower().str.strip()

        cg_cols = ['protocol_clean', 'symbol', 'current_price_usd', 'market_cap_usd',
                   'price_change_24h_percent', 'price_change_7d_percent', 'price_change_30d_percent']
        cg_cols = [c for c in cg_cols if c in cg.columns]

        if 'protocol_clean' in cg_cols:
            cg_merge = cg[cg_cols].dropna(subset=['protocol_clean']).drop_duplicates('protocol_clean')
            df_base = df_base.merge(cg_merge, on='protocol_clean', how='left')
            # CoinGecko may not carry a market cap column at all
            if 'market_cap_usd' in df_base.columns:
                matched_cg = df_base['market_cap_usd'].notna().sum()
            else:
                matched_cg = 0
            print(f"✅ CoinGecko for TVL: {matched_cg}/{len(df_base)} protocols matched")

    # Convert to numeric and fill NaN. Columns that are absent (for example
    # when no CoinGecko data was merged) are created first so the ratio below
    # cannot fail with a KeyError.
    numeric_cols = ['market_cap_usd', 'tvl']
    for col in numeric_cols:
        if col not in df_base.columns:
            df_base[col] = np.nan
        df_base[col] = pd.to_numeric(df_base[col], errors='coerce').fillna(0)

    # Calculate MCAP/TVL ratio; undefined where TVL is not positive.
    # NOTE(review): protocols without a market cap get a ratio of 0 here,
    # because their market cap was filled with 0 above.
    df_base['mcap_tvl_ratio'] = np.where(
        df_base['tvl'] > 0,
        df_base['market_cap_usd'] / df_base['tvl'],
        np.nan
    )

    return df_base

def process_financial_metrics(revenue_df=None, fees_df=None, cg_df=None):
    """Process revenue, fees, and CoinGecko for financial metrics (P/F, P/R ratios).

    Revenue and fees are outer-joined on a normalised protocol name, market
    cap is left-joined from CoinGecko, and the ratios are computed as market
    cap divided by the 24h figure multiplied by 365.

    Args:
        revenue_df: Revenue frame with ``protocol`` and ``total_*`` columns.
            Defaults to the module-level ``revenue_data``.
        fees_df: Fees frame with the same layout. Defaults to the
            module-level ``fees_data``.
        cg_df: CoinGecko frame carrying ``protocol_name`` or ``protocol_key``.
            Defaults to the module-level ``coingecko_df``.

    Returns:
        A DataFrame that always contains the ``Revenue_*``, ``Fees_*`` and
        ``market_cap_usd`` columns (missing values become 0) plus
        ``PF_Ratio`` and ``PR_Ratio`` (NaN unless both operands are
        positive). An empty DataFrame when neither revenue nor fees data is
        available.
    """
    # Fall back to the notebook-level frames when no explicit input is given
    if revenue_df is None:
        revenue_df = revenue_data
    if fees_df is None:
        fees_df = fees_data
    if cg_df is None:
        cg_df = coingecko_df

    df_metrics = pd.DataFrame()

    # Start with revenue if available
    if isinstance(revenue_df, pd.DataFrame) and not revenue_df.empty:
        df_metrics = revenue_df.copy()
        df_metrics['protocol_clean'] = df_metrics['protocol'].astype(str).str.lower().str.strip()
        df_metrics = df_metrics.rename(columns={
            'total_24h': 'Revenue_24h',
            'total_7d': 'Revenue_7d',
            'total_30d': 'Revenue_30d'
        })
        print(f"\n📊 Starting metrics with {len(df_metrics)} protocols from revenue data")

    # Add fees if available
    if isinstance(fees_df, pd.DataFrame) and not fees_df.empty:
        fees_clean = fees_df.copy()
        fees_clean['protocol_clean'] = fees_clean['protocol'].astype(str).str.lower().str.strip()
        fees_clean = fees_clean.rename(columns={
            'total_24h': 'Fees_24h',
            'total_7d': 'Fees_7d',
            'total_30d': 'Fees_30d'
        })
        # Only carry over the fee windows that are actually present
        fee_cols = ['protocol_clean'] + [
            c for c in ('Fees_24h', 'Fees_7d', 'Fees_30d') if c in fees_clean.columns
        ]
        fees_merge = fees_clean[fee_cols].drop_duplicates('protocol_clean')
        if df_metrics.empty:
            df_metrics = fees_merge.copy()
            print(f"\n📊 Starting metrics with {len(df_metrics)} protocols from fees data")
        else:
            df_metrics = df_metrics.merge(fees_merge, on='protocol_clean', how='outer')
        matched_fees = df_metrics['Fees_24h'].notna().sum() if 'Fees_24h' in df_metrics.columns else 0
        print(f"✅ Fees matched: {matched_fees}/{len(df_metrics)}")

    # Nothing to compute ratios on; callers already handle an empty frame
    if df_metrics.empty:
        print("❌ No revenue or fees data available")
        return df_metrics

    # Merge CoinGecko for market cap
    if isinstance(cg_df, pd.DataFrame) and not cg_df.empty:
        cg = cg_df.copy()
        if 'protocol_name' in cg.columns:
            cg['protocol_clean'] = cg['protocol_name'].astype(str).str.lower().str.strip()
        elif 'protocol_key' in cg.columns:
            cg['protocol_clean'] = cg['protocol_key'].astype(str).str.lower().str.strip()

        cg_cols = ['protocol_clean', 'market_cap_usd']
        cg_cols = [c for c in cg_cols if c in cg.columns]

        if 'protocol_clean' in cg_cols:
            cg_merge = cg[cg_cols].dropna(subset=['protocol_clean']).drop_duplicates('protocol_clean')
            df_metrics = df_metrics.merge(cg_merge, on='protocol_clean', how='left')
            if 'market_cap_usd' in df_metrics.columns:
                matched_cg = df_metrics['market_cap_usd'].notna().sum()
            else:
                matched_cg = 0
            print(f"✅ CoinGecko for metrics: {matched_cg}/{len(df_metrics)} protocols matched")

    # Convert to numeric. Absent columns are created first so that a missing
    # source (no fees, no revenue or no market cap) yields zeros instead of
    # a KeyError in the ratio calculation.
    numeric_cols = ['Revenue_24h', 'Revenue_7d', 'Revenue_30d', 'Fees_24h', 'Fees_7d', 'Fees_30d', 'market_cap_usd']
    for col in numeric_cols:
        if col not in df_metrics.columns:
            df_metrics[col] = np.nan
        df_metrics[col] = pd.to_numeric(df_metrics[col], errors='coerce').fillna(0)

    # Calculate ratios from the 24h figure annualised by a factor of 365
    df_metrics['PF_Ratio'] = np.where(
        (df_metrics['Fees_24h'] > 0) & (df_metrics['market_cap_usd'] > 0),
        df_metrics['market_cap_usd'] / (df_metrics['Fees_24h'] * 365),
        np.nan
    )

    df_metrics['PR_Ratio'] = np.where(
        (df_metrics['Revenue_24h'] > 0) & (df_metrics['market_cap_usd'] > 0),
        df_metrics['market_cap_usd'] / (df_metrics['Revenue_24h'] * 365),
        np.nan
    )

    # Debug ratios
    valid_pf = df_metrics['PF_Ratio'].notna().sum()
    valid_pr = df_metrics['PR_Ratio'].notna().sum()
    print(f"📊 Calculated ratios: P/F={valid_pf}, P/R={valid_pr}")

    return df_metrics

def _gini_coefficient(balances):
    """Return the Gini coefficient of the strictly positive entries of ``balances``.

    Computed as the sum of all pairwise absolute differences divided by
    ``2 * n * n * mean``. Returns 0 when fewer than two positive balances
    remain.
    """
    positive = balances[balances > 0]
    n = len(positive)
    if n <= 1:
        return 0
    mean_balance = np.mean(positive)
    if mean_balance <= 0:
        return 0
    diffs = np.abs(positive.reshape(-1, 1) - positive.reshape(1, -1))
    return diffs.sum() / (2 * n * n * mean_balance)


def process_token_holders_data(holders_df=None, cg_df=None):
    """Process token holders data for concentration analysis, add price from CoinGecko.

    Shares are measured against the sum of ``ui_amount`` over the holder
    rows present for a token, not against an externally supplied supply
    figure. Neither input frame is modified.

    Args:
        holders_df: Holder rows with ``token_name``, ``rank`` and
            ``ui_amount`` (and optionally ``token_symbol``). Defaults to the
            module-level ``holders_data``.
        cg_df: CoinGecko frame; prices are attached only when it has both
            ``symbol`` and ``current_price_usd``. Defaults to the
            module-level ``coingecko_df``.

    Returns:
        One row per token with top-1/5/10 holder shares (percent), the Gini
        coefficient and the first-ranked holder's amount, plus
        ``token_clean`` and ``token_price_usd`` when prices were available.
        An empty DataFrame when there is no holder data.
    """
    # Fall back to the notebook-level frames when no explicit input is given
    if holders_df is None:
        holders_df = holders_data
    if cg_df is None:
        cg_df = coingecko_df

    if not isinstance(holders_df, pd.DataFrame) or holders_df.empty:
        print("❌ No token holders data available")
        return pd.DataFrame()

    # Rows are ordered by rank within each token, so the "top N" holders are
    # simply the first N rows of every group.
    # NOTE(review): assumes rank 1 is the largest holder — confirm upstream.
    token_df_sorted = holders_df.sort_values(['token_name', 'rank'])
    concentration_metrics = []

    # sort=False keeps the groups in the order established by sort_values
    for token_name, token_df in token_df_sorted.groupby('token_name', sort=False):
        amounts = token_df['ui_amount']
        total_supply = amounts.sum()
        if total_supply > 0:
            top_1_share = amounts.iloc[0] / total_supply * 100
            top_5_share = amounts.head(5).sum() / total_supply * 100
            top_10_share = amounts.head(10).sum() / total_supply * 100
        else:
            top_1_share = top_5_share = top_10_share = 0

        concentration_metrics.append({
            'token_name': token_name,
            'token_symbol': token_df.iloc[0].get('token_symbol', ''),
            'total_accounts_analyzed': len(token_df),
            'top_1_holder_share': top_1_share,
            'top_5_holders_share': top_5_share,
            'top_10_holders_share': top_10_share,
            'gini_coefficient': _gini_coefficient(amounts.values),
            'largest_holder_amount': amounts.iloc[0]
        })

    concentration_df = pd.DataFrame(concentration_metrics)

    # Add token price from CoinGecko, matching on the normalised symbol
    has_prices = (
        isinstance(cg_df, pd.DataFrame)
        and not cg_df.empty
        and {'symbol', 'current_price_usd'}.issubset(cg_df.columns)
    )
    if has_prices and not concentration_df.empty:
        # Build the join key on a private copy so the caller's (shared)
        # CoinGecko frame does not gain a 'token_clean' column.
        prices = cg_df[['symbol', 'current_price_usd']].copy()
        prices['token_clean'] = prices['symbol'].astype(str).str.lower().str.strip()
        concentration_df['token_clean'] = concentration_df['token_symbol'].astype(str).str.lower().str.strip()

        price_merge = prices[['token_clean', 'current_price_usd']].drop_duplicates('token_clean')
        concentration_df = concentration_df.merge(price_merge, on='token_clean', how='left')
        concentration_df.rename(columns={'current_price_usd': 'token_price_usd'}, inplace=True)
        concentration_df['token_price_usd'] = pd.to_numeric(concentration_df['token_price_usd'], errors='coerce').fillna(0)

        matched_prices = concentration_df['token_price_usd'].gt(0).sum()
        print(f"✅ Matched prices for {matched_prices}/{len(concentration_df)} tokens")

    print(f"✅ Processed {len(concentration_df)} tokens for concentration analysis")
    return concentration_df

def process_raw_holders_data(holders_df=None):
    """Process raw token holders data using 'account_address' column.

    The input frame is not modified: placeholder columns are added to a
    private copy only.

    Args:
        holders_df: Holder rows with a ``token_name`` column. Defaults to
            the module-level ``holders_data``.

    Returns:
        A dict mapping each token name to a DataFrame with the columns
        ``rank``, ``account_address`` and ``ui_amount``. A missing
        ``account_address`` is filled with the row index as a string; a
        missing ``rank`` or ``ui_amount`` is filled with 0. An empty dict
        when there is no holder data.
    """
    # Fall back to the notebook-level frame when no explicit input is given
    if holders_df is None:
        holders_df = holders_data

    if not isinstance(holders_df, pd.DataFrame) or holders_df.empty:
        print("❌ No token holders data available")
        return {}

    # Work on a copy so the placeholder columns never leak into the shared frame
    holders = holders_df.copy()

    if 'account_address' not in holders.columns:
        print("⚠️ Warning: No 'account_address' column found, using index as placeholder")
        holders['account_address'] = holders.index.astype(str)

    required_cols = ['rank', 'account_address', 'ui_amount']
    for col in required_cols:
        if col not in holders.columns:
            holders[col] = 0

    # sort=False keeps tokens in order of first appearance
    raw_holders = {
        token_name: token_holders[required_cols].copy()
        for token_name, token_holders in holders.groupby('token_name', sort=False)
    }

    print(f"💾 Prepared raw token holders: {len(raw_holders)} tokens")
    return raw_holders


def create_streamlit_datasets():
    """Create datasets for Streamlit tabs.

    Reads the module-level ``revenue_data`` / ``fees_data`` and their cleaned
    counterparts ``revenue_clean`` / ``fees_clean``, and calls the
    ``process_*`` helpers for everything else. Columns a tab expects but the
    source lacks are backfilled instead of raising ``KeyError``.

    Returns:
        A tuple ``(df_tab1, df_revenue, df_fees, df_tab3, df_tab4,
        raw_holders)``: overview, revenue, fees, financial metrics and token
        distribution frames (each possibly empty) followed by the dict of
        per-token raw holder frames.
    """
    # Tab 1: Overview (TVL-based)
    df_tvl = process_tvl_data()
    if not df_tvl.empty:
        overview_cols = ['name', 'category', 'tvl', 'market_cap_usd', 'current_price_usd',
                        'change_1d', 'price_change_24h_percent', 'mcap_tvl_ratio']

        # Text columns default to '', numeric ones to NaN
        for col in overview_cols:
            if col not in df_tvl.columns:
                df_tvl[col] = np.nan if col not in ['name', 'category'] else ''
        df_tab1 = df_tvl[overview_cols].copy()
        df_tab1.rename(columns={
            'name': 'Protocol', 'category': 'Category', 'tvl': 'TVL_USD',
            'market_cap_usd': 'Market_Cap_USD', 'current_price_usd': 'Price_USD',
            'change_1d': 'TVL_Change_1d', 'price_change_24h_percent': 'Price_Change_24h',
            'mcap_tvl_ratio': 'MCap_TVL_Ratio'
        }, inplace=True)
        # MCap_TVL_Ratio is left out on purpose so undefined ratios stay NaN
        numeric_cols = ['Market_Cap_USD', 'Price_USD', 'TVL_Change_1d', 'Price_Change_24h']
        for col in numeric_cols:
            if col in df_tab1.columns:
                df_tab1[col] = pd.to_numeric(df_tab1[col], errors='coerce').fillna(0)
        df_tab1 = df_tab1.sort_values('TVL_USD', ascending=False).reset_index(drop=True)
        print(f"📋 Tab 1 (Overview): {df_tab1.shape}")
    else:
        df_tab1 = pd.DataFrame()

    # Tab 2: Revenue (already saved as raw_revenue).
    # revenue_clean only exists when revenue_data is a non-empty DataFrame,
    # so it is touched under exactly that condition.
    if isinstance(revenue_data, pd.DataFrame) and not revenue_data.empty:
        revenue_cols = ['protocol_clean', 'Revenue_24h', 'Revenue_7d', 'Revenue_30d']
        # reindex backfills absent columns with NaN, mirroring Tab 1
        df_revenue = revenue_clean.reindex(columns=revenue_cols).rename(columns={'protocol_clean': 'Protocol'})
        df_revenue = df_revenue.sort_values('Revenue_24h', ascending=False).reset_index(drop=True)
        print(f"📋 Tab 2 (Revenue): {df_revenue.shape}")
    else:
        df_revenue = pd.DataFrame()

    # Tab 2: Fees (already saved as raw_fees)
    if isinstance(fees_data, pd.DataFrame) and not fees_data.empty:
        fees_cols = ['protocol_clean', 'Fees_24h', 'Fees_7d', 'Fees_30d']
        df_fees = fees_clean.reindex(columns=fees_cols).rename(columns={'protocol_clean': 'Protocol'})
        df_fees = df_fees.sort_values('Fees_24h', ascending=False).reset_index(drop=True)
        print(f"📋 Tab 2 (Fees): {df_fees.shape}")
    else:
        df_fees = pd.DataFrame()

    # Tab 3: Financial Metrics
    df_tab3 = process_financial_metrics()
    if not df_tab3.empty:
        metrics_cols = ['protocol_clean', 'Revenue_24h', 'Fees_24h', 'Revenue_7d', 'Fees_7d', 'Revenue_30d', 'Fees_30d', 'market_cap_usd', 'PF_Ratio', 'PR_Ratio']
        df_tab3 = df_tab3.reindex(columns=metrics_cols).rename(columns={'protocol_clean': 'Protocol', 'market_cap_usd': 'Market_Cap_USD'})
        df_tab3 = df_tab3.sort_values('Revenue_24h', ascending=False).reset_index(drop=True)
        print(f"📋 Tab 3 (Financial Metrics): {df_tab3.shape}")

    # Tab 4: Token Distribution, most concentrated tokens first
    df_tab4 = process_token_holders_data()
    if not df_tab4.empty:
        df_tab4 = df_tab4.sort_values('gini_coefficient', ascending=False).reset_index(drop=True)
        print(f"📋 Tab 4 (Distribution): {df_tab4.shape}")

    # Raw token holders
    raw_holders = process_raw_holders_data()

    return df_tab1, df_revenue, df_fees, df_tab3, df_tab4, raw_holders

def create_category_analysis(df):
    """Aggregate the Tab 1 overview frame per category.

    Args:
        df: Overview frame with ``Category``, ``TVL_USD``, ``Market_Cap_USD``,
            ``TVL_Change_1d`` and ``Price_Change_24h`` columns.

    Returns:
        One row per category with protocol count, total / average / median
        TVL, total / average market cap and the average 1d TVL and 24h price
        changes, rounded to two decimals and ordered by total TVL descending.
        An empty DataFrame when ``df`` is empty.
    """
    if df.empty:
        return pd.DataFrame()

    # Named aggregation yields the final column names directly
    per_category = df.groupby('Category').agg(
        Protocol_Count=('TVL_USD', 'count'),
        Total_TVL=('TVL_USD', 'sum'),
        Avg_TVL=('TVL_USD', 'mean'),
        Median_TVL=('TVL_USD', 'median'),
        Total_Market_Cap=('Market_Cap_USD', 'sum'),
        Avg_Market_Cap=('Market_Cap_USD', 'mean'),
        Avg_TVL_Change_1d=('TVL_Change_1d', 'mean'),
        Avg_Price_Change_24h=('Price_Change_24h', 'mean'),
    )
    per_category = per_category.round(2).reset_index()

    ranked = per_category.sort_values('Total_TVL', ascending=False)
    return ranked.reset_index(drop=True)

def create_financial_rankings(df):
    """Build the ranking tables shown on Tab 3.

    Args:
        df: Financial metrics frame (``Protocol``, revenue / fees windows,
            ``Market_Cap_USD`` and the P/F and P/R ratios).

    Returns:
        A dict that may contain ``top_revenue`` and ``top_fees`` (up to 20
        protocols with a positive 24h value, largest first) and
        ``best_pf_ratios`` / ``best_pr_ratios`` (up to 15 protocols whose
        ratio lies strictly between 0 and 100, smallest first). A ratio
        table is omitted when no row qualifies. An empty dict when ``df`` is
        empty.
    """
    if df.empty:
        return {}

    rankings = {}

    # (result key, ranking column, columns kept in the table)
    largest_specs = (
        ('top_revenue', 'Revenue_24h', ['Protocol', 'Revenue_24h', 'Revenue_7d', 'Revenue_30d']),
        ('top_fees', 'Fees_24h', ['Protocol', 'Fees_24h', 'Fees_7d', 'Fees_30d']),
    )
    for key, metric, keep in largest_specs:
        if metric not in df.columns:
            continue
        positive = df[df[metric] > 0]
        rankings[key] = positive.nlargest(20, metric)[keep].reset_index(drop=True)

    smallest_specs = (
        ('best_pf_ratios', 'PF_Ratio', ['Protocol', 'Market_Cap_USD', 'Fees_24h', 'PF_Ratio']),
        ('best_pr_ratios', 'PR_Ratio', ['Protocol', 'Market_Cap_USD', 'Revenue_24h', 'PR_Ratio']),
    )
    for key, ratio, keep in smallest_specs:
        if ratio not in df.columns:
            continue
        in_range = df[(df[ratio].notna()) & (df[ratio] > 0) & (df[ratio] < 100)]
        if in_range.empty:
            continue
        rankings[key] = in_range.nsmallest(15, ratio)[keep].reset_index(drop=True)

    return rankings

def create_summary_stats(df_tab1, df_revenue, df_fees, df_tab3, df_tab4):
    """Collect headline numbers for the dashboard.

    Args:
        df_tab1: Overview frame (``Category``, ``TVL_USD``, ``Market_Cap_USD``).
        df_revenue: Revenue frame (``Protocol``, ``Revenue_24h``).
        df_fees: Fees frame (``Fees_24h``).
        df_tab3: Financial metrics frame (``PF_Ratio``, ``PR_Ratio``).
        df_tab4: Distribution frame (``gini_coefficient``,
            ``top_10_holders_share``).

    Returns:
        A dict with up to three sections, each stamped with the current
        time: ``overview`` (when ``df_tab1`` has rows), ``financial`` (when
        revenue or fees has rows) and ``distribution`` (when ``df_tab4`` has
        rows). The ``avg_pf_ratio`` / ``avg_pr_ratio`` entries hold the
        median of the respective ratio column, or ``None`` when unavailable.
    """
    stats = {}

    if not df_tab1.empty:
        tvl_by_category = df_tab1.groupby('Category')['TVL_USD'].sum()
        stats['overview'] = {
            'total_protocols': len(df_tab1),
            'total_tvl': df_tab1['TVL_USD'].sum(),
            'total_market_cap': df_tab1['Market_Cap_USD'].sum(),
            'top_category': tvl_by_category.idxmax(),
            'timestamp': datetime.now()
        }

    has_revenue = not df_revenue.empty
    has_fees = not df_fees.empty
    if has_revenue or has_fees:
        has_metrics = not df_tab3.empty

        if has_revenue:
            revenue_24h = df_revenue['Revenue_24h']
            revenue_count = len(df_revenue[revenue_24h > 0])
            revenue_total = revenue_24h.sum()
        else:
            revenue_count = 0
            revenue_total = 0

        if has_fees:
            fees_24h = df_fees['Fees_24h']
            fees_count = len(df_fees[fees_24h > 0])
            fees_total = fees_24h.sum()
        else:
            fees_count = 0
            fees_total = 0

        # Despite the key names, these are medians of the ratio columns
        pf_median = None
        if has_metrics and 'PF_Ratio' in df_tab3.columns:
            pf_median = df_tab3['PF_Ratio'].median()
        pr_median = None
        if has_metrics and 'PR_Ratio' in df_tab3.columns:
            pr_median = df_tab3['PR_Ratio'].median()

        top_earner = 'N/A'
        if has_revenue and df_revenue['Revenue_24h'].max() > 0:
            top_earner = df_revenue.loc[df_revenue['Revenue_24h'].idxmax(), 'Protocol']

        stats['financial'] = {
            'protocols_with_revenue': revenue_count,
            'protocols_with_fees': fees_count,
            'total_daily_revenue': revenue_total,
            'total_daily_fees': fees_total,
            'avg_pf_ratio': pf_median,
            'avg_pr_ratio': pr_median,
            'highest_revenue_protocol': top_earner,
            'timestamp': datetime.now()
        }

    if not df_tab4.empty:
        stats['distribution'] = {
            'tokens_analyzed': len(df_tab4),
            'avg_gini_coefficient': df_tab4['gini_coefficient'].mean(),
            'avg_top_10_share': df_tab4['top_10_holders_share'].mean(),
            'timestamp': datetime.now()
        }

    return stats

print("\n🔄 Processing Data...")
# Execute data processing: build the per-tab frames first, then the derived
# category table, ranking tables and summary statistics.
df_tab1, df_revenue, df_fees, df_tab3, df_tab4, raw_holders = create_streamlit_datasets()
category_analysis = create_category_analysis(df_tab1)
financial_rankings = create_financial_rankings(df_tab3)
summary_stats = create_summary_stats(df_tab1, df_revenue, df_fees, df_tab3, df_tab4)

print("\n💾 Saving Streamlit Datasets...")
# Name -> object; each non-empty entry is written to
# '<name>_<timestamp>.joblib' inside streamlit_data_dir.
datasets = {
    'tab1_overview': df_tab1,
    'tab2_revenue': df_revenue,
    'tab2_fees': df_fees,
    'tab3_metrics': df_tab3,
    'tab4_distribution': df_tab4,
    'category_analysis': category_analysis,
    'financial_rankings': financial_rankings,
    'summary_stats': summary_stats,
    'raw_token_holders': raw_holders
}
# Empty DataFrames and empty dicts are skipped, so no file is written for them.
for name, dataset in datasets.items():
    if isinstance(dataset, pd.DataFrame) and not dataset.empty:
        filepath = os.path.join(streamlit_data_dir, f'{name}_{timestamp}.joblib')
        joblib.dump(dataset, filepath, compress='zlib')
        print(f"💾 Saved {name}: {dataset.shape}")
    elif isinstance(dataset, dict) and dataset:
        filepath = os.path.join(streamlit_data_dir, f'{name}_{timestamp}.joblib')
        joblib.dump(dataset, filepath, compress='zlib')
        print(f"💾 Saved {name}: {len(dataset)} items")

# Save metadata: a fixed-name index describing the files of this run.
# NOTE(review): 'data_files' lists every expected filename unconditionally,
# including datasets skipped above for being empty and the raw revenue/fees
# files that are only written when their source data exists — readers should
# not assume each listed file is present.
metadata = {
    'last_updated': datetime.now(),
    'data_files': {
        'tab1_overview': f'tab1_overview_{timestamp}.joblib',
        'tab2_revenue': f'tab2_revenue_{timestamp}.joblib',
        'tab2_fees': f'tab2_fees_{timestamp}.joblib',
        'tab3_metrics': f'tab3_metrics_{timestamp}.joblib',
        'tab4_distribution': f'tab4_distribution_{timestamp}.joblib',
        'category_analysis': f'category_analysis_{timestamp}.joblib',
        'financial_rankings': f'financial_rankings_{timestamp}.joblib',
        'summary_stats': f'summary_stats_{timestamp}.joblib',
        'raw_token_holders': f'raw_token_holders_{timestamp}.joblib',
        'raw_revenue': f'raw_revenue_{timestamp}.joblib',
        'raw_fees': f'raw_fees_{timestamp}.joblib'
    },
    'record_counts': {
        'protocols_overview': len(df_tab1),
        'protocols_revenue': len(df_revenue),
        'protocols_fees': len(df_fees),
        'protocols_metrics': len(df_tab3),
        'tokens_distribution': len(df_tab4),
        'raw_token_holders': len(raw_holders)
    }
}
# Unlike the datasets, the metadata file has no timestamp in its name and is
# written without compression.
metadata_filepath = os.path.join(streamlit_data_dir, 'latest_data_metadata.joblib')
joblib.dump(metadata, metadata_filepath)
print(f"💾 Saved metadata: latest_data_metadata.joblib")

# Final console report
print(f"\n✅ Analysis Complete!")
print(f"📊 Overview: {len(df_tab1)} protocols")
print(f"💵 Revenue: {len(df_revenue)} protocols")
print(f"💸 Fees: {len(df_fees)} protocols")
print(f"📈 Metrics: {len(df_tab3)} protocols")
print(f"🎯 Distribution: {len(df_tab4)} tokens")
if 'financial' in summary_stats:
    fs = summary_stats['financial']
    print(f"\n📈 Financial Summary:")
    print(f" Daily Revenue: {format_currency(fs.get('total_daily_revenue', 0))}")
    print(f" Daily Fees: {format_currency(fs.get('total_daily_fees', 0))}")
    # The 'avg_*' entries are produced with .median() in create_summary_stats.
    # They are always present (possibly None), so the 'N/A' default is not
    # used for a dict built by that function.
    print(f" Avg P/F Ratio: {fs.get('avg_pf_ratio', 'N/A')}")
    print(f" Avg P/R Ratio: {fs.get('avg_pr_ratio', 'N/A')}")
print("🚀 Ready for Streamlit!")

📊 Solana DeFi Tracker - Analysis for Streamlit
Analysis timestamp: 2025-09-02 00:40:08

🔍 Loading Collected Data...
✅ Loaded TVL: (250, 14)
✅ Loaded Revenue: (117, 7)
✅ Loaded Fees: (117, 7)
✅ Loaded CoinGecko: (80, 20)
✅ Loaded Holders: (616, 10)
💾 Saved raw revenue: (117, 8)
💾 Saved raw fees: (117, 8)

🔄 Processing Data...

📊 Processing 250 protocols for TVL data
✅ CoinGecko for TVL: 80/250 protocols matched
📋 Tab 1 (Overview): (250, 8)
📋 Tab 2 (Revenue): (117, 4)
📋 Tab 2 (Fees): (117, 4)

📊 Starting metrics with 117 protocols from revenue data
✅ Fees matched: 117/117
✅ CoinGecko for metrics: 19/117 protocols matched
📊 Calculated ratios: P/F=14, P/R=14
📋 Tab 3 (Financial Metrics): (117, 10)
✅ Matched prices for 60/67 tokens
✅ Processed 67 tokens for concentration analysis
📋 Tab 4 (Distribution): (67, 10)
💾 Prepared raw token holders: 67 tokens

💾 Saving Streamlit Datasets...
💾 Saved tab1_overview: (250, 8)
💾 Saved tab2_revenue: (117, 4)
💾 Saved tab2_fees: (117, 4)
💾 Saved tab3_metric