# Data Exploration

Understand your data before backtesting. This notebook covers price distribution, correlation analysis, volume patterns, and data quality checks.

In [8]:
import pandas as pd
import numpy as np
from clyptq.data.loaders.ccxt import load_crypto_data
from clyptq.analytics.reporting.data_explorer import DataExplorer

## 1. Load Data

Load 10 symbols for analysis

In [9]:
symbols = [
    "BTC/USDT", "ETH/USDT", "BNB/USDT", "SOL/USDT", "XRP/USDT",
    "ADA/USDT", "AVAX/USDT", "DOGE/USDT", "DOT/USDT", "MATIC/USDT",
]

store = load_crypto_data(
    symbols=symbols,
    exchange="binance",
    timeframe="1d",
    days=180
)

print(f"Loaded {len(symbols)} symbols")
date_range = store.get_date_range()
print(f"Date range: {date_range.start.date()} to {date_range.end.date()}")

No data returned for symbol
No data for symbol


Loaded 10 symbols
Date range: 2025-07-09 to 2026-01-04


## 2. Statistical Summary

Overall statistics for each symbol

In [10]:
explorer = DataExplorer(store)
summary = explorer.statistical_summary()

print("STATISTICAL SUMMARY")
print("=" * 100)
print(f"{'Symbol':<12} {'Bars':<8} {'Mean Price':<15} {'Mean Return':<15} {'Volatility':<12}")
print("=" * 100)

for _, row in summary.iterrows():
    annualized_return = row['mean_return'] * 365
    annualized_vol = row['std_return'] * np.sqrt(365)
    print(
        f"{row['symbol']:<12} "
        f"{int(row['count']):<8} "
        f"${row['mean_price']:>13,.2f}  "
        f"{annualized_return:>13.2%}  "
        f"{annualized_vol:>10.2%}"
    )

STATISTICAL SUMMARY
Symbol       Bars     Mean Price      Mean Return     Volatility  
ADA/USDT     180      $         0.67        -54.29%      83.03%
AVAX/USDT    180      $        21.01        -24.99%      89.31%
BNB/USDT     180      $       915.73         74.61%      58.53%
BTC/USDT     180      $   106,688.05        -33.94%      36.08%
DOGE/USDT    180      $         0.20          7.19%      90.10%
DOT/USDT     180      $         3.28        -62.19%      90.76%
ETH/USDT     180      $     3,728.55         48.55%      68.00%
SOL/USDT     180      $       175.80         -2.46%      76.41%
XRP/USDT     180      $         2.61         -1.58%      70.49%


## 3. Price Analysis by Symbol

Detailed statistics including Sharpe ratio

In [11]:
print("\nPRICE STATISTICS")
print("=" * 100)
print(f"{'Symbol':<12} {'Return':<12} {'Vol':<12} {'Sharpe':<10} {'Skew':<10} {'Kurt':<10}")
print("=" * 100)

for symbol in symbols:
    stats = explorer.price_statistics(symbol)
    if stats:
        print(
            f"{symbol:<12} "
            f"{stats['return_mean']*365:>10.2%}  "
            f"{stats['return_std']*np.sqrt(365):>10.2%}  "
            f"{stats['sharpe_ratio']:>8.3f}  "
            f"{stats['return_skew']:>8.3f}  "
            f"{stats['return_kurtosis']:>8.3f}"
        )


PRICE STATISTICS
Symbol       Return       Vol          Sharpe     Skew       Kurt      
BTC/USDT        -33.94%      36.08%    -0.782    -0.386     1.376
ETH/USDT         48.55%      68.00%     0.593     0.264     1.926
BNB/USDT         74.61%      58.53%     1.059     0.232     4.020
SOL/USDT         -2.46%      76.41%    -0.027    -0.059     0.999
XRP/USDT         -1.58%      70.49%    -0.019     0.097     2.618
ADA/USDT        -54.29%      83.03%    -0.543    -0.352     3.230
AVAX/USDT       -24.99%      89.31%    -0.232    -0.904     5.900
DOGE/USDT         7.19%      90.10%     0.066    -0.207     2.392
DOT/USDT        -62.19%      90.76%    -0.569    -0.415     8.062


## 4. Correlation Matrix

Understand relationships between assets

In [12]:
correlation = explorer.correlation_matrix()

print("\nCORRELATION MATRIX")
print("=" * 100)
print(correlation.round(3))

# Find least correlated pairs
print("\nLEAST CORRELATED PAIRS (for diversification)")
print("=" * 80)

corr_pairs = []
for i in range(len(correlation.columns)):
    for j in range(i+1, len(correlation.columns)):
        sym1 = correlation.columns[i]
        sym2 = correlation.columns[j]
        corr_val = correlation.iloc[i, j]
        if not np.isnan(corr_val):
            corr_pairs.append((sym1, sym2, corr_val))

corr_pairs.sort(key=lambda x: abs(x[2]))
for sym1, sym2, corr_val in corr_pairs[:5]:
    print(f"{sym1:<12} - {sym2:<12}: {corr_val:>6.3f}")


CORRELATION MATRIX
           ADA/USDT  AVAX/USDT  BNB/USDT  BTC/USDT  DOGE/USDT  DOT/USDT  \
ADA/USDT      1.000      0.830     0.710     0.800      0.883     0.892   
AVAX/USDT     0.830      1.000     0.640     0.733      0.781     0.856   
BNB/USDT      0.710      0.640     1.000     0.704      0.703     0.685   
BTC/USDT      0.800      0.733     0.704     1.000      0.746     0.719   
DOGE/USDT     0.883      0.781     0.703     0.746      1.000     0.860   
DOT/USDT      0.892      0.856     0.685     0.719      0.860     1.000   
ETH/USDT      0.839      0.763     0.749     0.806      0.851     0.767   
SOL/USDT      0.846      0.780     0.732     0.804      0.854     0.793   
XRP/USDT      0.835      0.729     0.635     0.776      0.806     0.747   

           ETH/USDT  SOL/USDT  XRP/USDT  
ADA/USDT      0.839     0.846     0.835  
AVAX/USDT     0.763     0.780     0.729  
BNB/USDT      0.749     0.732     0.635  
BTC/USDT      0.806     0.804     0.776  
DOGE/USDT     0.851

## 5. Volume Analysis

Identify liquidity and trading activity

In [13]:
print("\nVOLUME STATISTICS")
print("=" * 100)
print(f"{'Symbol':<12} {'Avg Volume':<20} {'Vol Std':<20} {'CV':<10}")
print("=" * 100)

for _, row in summary.iterrows():
    cv = row['std_volume'] / row['mean_volume'] if row['mean_volume'] > 0 else 0
    print(
        f"{row['symbol']:<12} "
        f"{row['mean_volume']:>18,.0f}  "
        f"{row['std_volume']:>18,.0f}  "
        f"{cv:>8.3f}"
    )

print("\nInterpretation:")
print("  CV (Coefficient of Variation) = Std / Mean")
print("  Low CV (<0.5): Stable trading activity")
print("  High CV (>1.0): Sporadic activity, be careful")


VOLUME STATISTICS
Symbol       Avg Volume           Vol Std              CV        
ADA/USDT            163,831,549         110,378,087     0.674
AVAX/USDT             3,523,564           1,886,994     0.536
BNB/USDT                329,614             273,351     0.829
BTC/USDT                 18,238              10,309     0.565
DOGE/USDT         1,354,240,626         862,173,296     0.637
DOT/USDT              8,271,381           5,276,911     0.638
ETH/USDT                537,814             271,998     0.506
SOL/USDT              3,933,926           1,756,751     0.447
XRP/USDT            153,554,586         105,220,178     0.685

Interpretation:
  CV (Coefficient of Variation) = Std / Mean
  Low CV (<0.5): Stable trading activity
  High CV (>1.0): Sporadic activity, be careful


## 6. Data Quality Checks

Check for missing data and gaps

In [None]:
print("\nDATA QUALITY")
print("=" * 80)

total_bars = 0
expected_bars = (date_range.end - date_range.start).days + 1

for symbol in symbols:
    if symbol not in store._data:
        print(f"WARNING {symbol:<12}: No data")
        continue
    
    df = store._data[symbol]
    total_bars += len(df)
    completeness = len(df) / expected_bars if expected_bars > 0 else 0
    
    status = "OK" if completeness > 0.95 else "WARNING"
    print(f"{status:>7} {symbol:<12}: {len(df):>4} bars ({completeness:>6.2%} complete)")

overall_completeness = total_bars / (len(symbols) * expected_bars) if expected_bars > 0 else 0
print(f"\nOverall completeness: {overall_completeness:.2%}")

if overall_completeness > 0.95:
    print("Data quality is good")
else:
    print("Some symbols have missing data")

## Summary

Price analysis shows risk-adjusted performance via Sharpe ratio, with volatility indicating opportunity and risk. Skew reveals tail risk direction, and kurtosis measures extreme moves.

Correlation below 0.3 offers better diversification, while high correlation above 0.7 suggests redundancy. Volume with low coefficient of variation indicates stable trading activity.

## Next Steps

- **03_factor_research.ipynb**: Develop factors based on these insights
- **04_strategy_comparison.ipynb**: Test different strategy approaches
- Filter out low-quality or highly correlated symbols