In [None]:
import polars as pl
import matplotlib.pyplot as plt

# Read the four parquet inputs from the sibling data directory
returns_df = pl.read_parquet("../data/returns_df.parquet")
mkt_cap_df = pl.read_parquet("../data/mkt_cap_df.parquet")
sector_df = pl.read_parquet("../data/sector_df.parquet")
value_df = pl.read_parquet("../data/value_df.parquet")

# Label -> frame mapping, in the order the summary lines are printed.
datasets = {
    "returns_df": returns_df,
    "mkt_cap_df": mkt_cap_df,
    "sector_df": sector_df,
    "value_df": value_df,
}

# One summary line per dataset: shape, date span, and distinct symbol count.
# Each frame must expose 'date' and 'symbol' columns, since both are read here.
print("=== Data Summary ===")
for name, df in datasets.items():
    n_rows, n_cols = df.shape
    dates = df['date']
    n_symbols = df['symbol'].n_unique()
    print(f"\n{name}: {n_rows:,} rows x {n_cols} cols  |  "
          f"{dates.min()} → {dates.max()}  |  "
          f"{n_symbols} symbols")

In [None]:
# Peek at the returns frame: column names, schema, and the first rows
print("=== Returns ===")
print(f"Columns: {returns_df.columns}")
print(f"Schema: {returns_df.schema}")
print(returns_df.head(5))

# Summary statistics of the 'asset_returns' column, gathered in a single select.
# The column expression is built once and reused for every aggregate.
asset_ret = pl.col('asset_returns')
return_stats = returns_df.select(
    asset_ret.mean().alias('mean'),
    asset_ret.std().alias('std'),
    asset_ret.min().alias('min'),
    asset_ret.max().alias('max'),
    asset_ret.null_count().alias('nulls'),
)
print("\nReturn statistics:")
print(return_stats)

In [None]:
# Inspect market cap data
print("=== Market Cap ===")
print(mkt_cap_df.head(5))

# AAPL market cap over time; sorted by date so the first/last rows are the
# earliest/latest observations available for the symbol.
aapl_mc = mkt_cap_df.filter(pl.col('symbol') == 'AAPL').sort('date')

if aapl_mc.is_empty():
    # Indexing [0] / [-1] on an empty Series raises IndexError, which would
    # abort a full "Run All"; report the missing symbol instead.
    print("\nAAPL market cap: no rows found for symbol 'AAPL'")
else:
    aapl_caps = aapl_mc['market_cap']
    start_cap = aapl_caps[0]
    end_cap = aapl_caps[-1]
    # Dividing by 1e12 matches the "T" suffix in the message
    # (assumes market_cap is stored in plain dollars — TODO confirm).
    print(f"\nAAPL market cap: ${start_cap/1e12:.2f}T (start) → ${end_cap/1e12:.2f}T (end)")

In [None]:
# Sector data (noted as one-hot encoded): every column other than the
# 'date' / 'symbol' identifiers is treated as a sector column.
print("=== Sectors ===")
id_cols = ['date', 'symbol']
sector_cols = [column for column in sector_df.columns if column not in id_cols]
print(f"{len(sector_cols)} sectors: {sector_cols}")
print(sector_df.head(3))

# Show the first AAPL row to see how that symbol's sector is encoded
aapl_sector_row = sector_df.filter(pl.col('symbol') == 'AAPL').head(1)
print("\nAAPL sector encoding:")
print(aapl_sector_row)

In [None]:
# Peek at the value metrics frame
print("=== Value Metrics ===")
print(f"Columns: {value_df.columns}")
print(value_df.head(5))

# Missing-data audit: nulls and NaNs are counted separately for each metric
value_metric_cols = ['book_price', 'sales_price', 'cf_price']
print("\nNull/NaN check:")
for col in value_metric_cols:
    metric = value_df[col]
    null_ct = metric.null_count()
    nan_ct = metric.is_nan().sum()
    print(f"  {col}: {null_ct} nulls, {nan_ct} NaNs")

# AAPL value metrics in date order: row count, date span, then both ends
is_aapl = pl.col('symbol') == 'AAPL'
aapl_val = value_df.filter(is_aapl).sort('date')
aapl_dates = aapl_val['date']
n_aapl_rows = aapl_val.shape[0]
print(f"\nAAPL value data: {n_aapl_rows} rows, {aapl_dates.min()} → {aapl_dates.max()}")
for preview in (aapl_val.head(3), aapl_val.tail(3)):
    print(preview)

In [None]:
# Coverage check: number of distinct symbols present on each date
print("=== Data Coverage ===")

# Same aggregation for both frames: unique symbols per date, in date order
n_stocks_expr = pl.col('symbol').n_unique().alias('n_stocks')
returns_coverage = returns_df.group_by('date').agg(n_stocks_expr).sort('date')
value_coverage = value_df.group_by('date').agg(n_stocks_expr).sort('date')

# Two stacked panels sharing the date axis
fig, axes = plt.subplots(2, 1, figsize=(12, 6), sharex=True)

# (axis, data, title, line-specific plot kwargs) for each panel
panels = [
    (axes[0], returns_coverage, 'Returns Coverage (stocks per day)',
     {'label': 'Returns'}),
    (axes[1], value_coverage, 'Value Metrics Coverage (stocks per day)',
     {'label': 'Value', 'color': 'orange'}),
]
for ax, coverage, title, line_kwargs in panels:
    ax.plot(coverage['date'], coverage['n_stocks'], linewidth=0.8, **line_kwargs)
    ax.set_ylabel('# Stocks')
    ax.set_title(title)
    ax.grid(True, alpha=0.3)

# Only the bottom panel carries the x-axis label
axes[1].set_xlabel('Date')

plt.tight_layout()
plt.show()

# Row count of the value frame for each calendar year of 'date'
yearly = (
    value_df
    .with_columns(pl.col('date').dt.year().alias('year'))
    .group_by('year')
    .agg(pl.len().alias('count'))
    .sort('year')
)
print("\nValue data rows per year:")
print(yearly)