# Data Preparation for Volatility Prediction

This notebook demonstrates:
1. Loading the raw data and building the PyTorch Geometric datasets
2. Exploring the temporal graph structure
3. Analyzing feature distributions
4. Visualizing stock relationships and volatility patterns
5. Preparing train/test splits and DataLoaders for model comparison

In [None]:
import sys
sys.path.append('../')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

from src.datasets import VNStocksDataset
from src.datasets.VNStocksDataset import VNStocksVolatilityDataset

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

%matplotlib inline

## 1. Load Raw Data

First, let's load and examine the raw data files.

In [None]:
# Read the raw CSV and key every row by (Symbol, Date)
raw_table = pd.read_csv('../data/values.csv')
values = raw_table.set_index(['Symbol', 'Date'])

# Distinct entries on each index level
symbol_labels = values.index.get_level_values('Symbol').unique()
date_labels = values.index.get_level_values('Date').unique()

print(f"Dataset shape: {values.shape}")
print(f"\nFeatures: {list(values.columns)}")
print(f"\nNumber of stocks: {len(symbol_labels)}")
print(f"Number of trading days: {len(date_labels)}")

# Preview the first rows as the cell output
values.head(10)

In [None]:
# Read the stored adjacency matrix from disk
adj = np.load('../data/adj.npy')

# Non-zero entries are counted as edges; density is relative to all matrix cells
nonzero_entries = np.count_nonzero(adj)
total_cells = adj.shape[0] * adj.shape[1]

print(f"Adjacency matrix shape: {adj.shape}")
print(f"Number of edges: {nonzero_entries}")
print(f"Graph density: {nonzero_entries / total_cells:.4f}")

# Heatmap of the adjacency matrix
fig, ax = plt.subplots(figsize=(10, 8))
heatmap = ax.imshow(adj, cmap='YlOrRd', aspect='auto')
fig.colorbar(heatmap, ax=ax, label='Correlation')
ax.set_title('Stock Correlation Adjacency Matrix')
ax.set_xlabel('Stock Index')
ax.set_ylabel('Stock Index')
fig.tight_layout()
plt.show()

## 2. Create PyTorch Geometric Dataset

Now let's create temporal graph datasets using our custom dataset classes.

In [None]:
# Baseline dataset for price prediction
past_window = 25  # 5 weeks of trading days
future_window = 1  # Predict next day

# Collect the constructor arguments first so they are easy to read at a glance
baseline_kwargs = {
    'root': '../data/',
    'past_window': past_window,
    'future_window': future_window,
    'force_reload': True,
}
dataset = VNStocksDataset(**baseline_kwargs)

# Inspect the first snapshot to show the tensors each sample carries
sample = dataset[0]

print(f"Dataset size: {len(dataset)} temporal snapshots")
print(f"\nFirst sample:")
print(f"  x shape: {sample.x.shape}  # (nodes, features, timesteps)")
print(f"  y shape: {sample.y.shape}  # (nodes, future_window)")
print(f"  edge_index shape: {sample.edge_index.shape}")
print(f"  edge_weight shape: {sample.edge_weight.shape}")
print(f"  close_price shape: {sample.close_price.shape}")

# Display the sample object itself as the cell output
sample

In [None]:
# Volatility dataset (main task)
volatility_kwargs = {
    'root': '../data/',
    'past_window': 25,
    'future_window': 5,       # Predict volatility over next 5 days
    'volatility_window': 20,  # Calculate volatility using 20-day window
    'force_reload': True,
}
volatility_dataset = VNStocksVolatilityDataset(**volatility_kwargs)

# Inspect the first snapshot of the volatility task
vol_sample = volatility_dataset[0]

print(f"Volatility dataset size: {len(volatility_dataset)} temporal snapshots")
print(f"\nFirst sample:")
print(f"  x shape: {vol_sample.x.shape}")
print(f"  y shape (volatility): {vol_sample.y.shape}")
print(f"  volatility shape: {vol_sample.volatility.shape}")

# Display the sample object itself as the cell output
vol_sample

## 3. Feature Analysis

Analyze the distribution and statistics of features.

In [None]:
# Names of the feature columns, in the order they are read from `values` below
feature_names = ['NormClose', 'DailyLogReturn', 'ALR1W', 'ALR2W', 'ALR1M', 'ALR2M', 'RSI', 'MACD']

print("Feature Statistics:")
print("=" * 80)

# Columns are addressed by position: starting at 1 skips the leading 'Close' column
for column_pos, feat_name in enumerate(feature_names, start=1):
    column_values = values.iloc[:, column_pos].values
    mean_value = column_values.mean()
    std_value = column_values.std()
    min_value = column_values.min()
    max_value = column_values.max()

    print(f"\n{feat_name}:")
    print(f"  Mean: {mean_value:.6f}")
    print(f"  Std:  {std_value:.6f}")
    print(f"  Min:  {min_value:.6f}")
    print(f"  Max:  {max_value:.6f}")

In [None]:
# One histogram per feature on a 3x3 grid
fig, axes_grid = plt.subplots(nrows=3, ncols=3, figsize=(15, 12))
axes = axes_grid.flatten()

# Column position starts at 1 to skip the leading 'Close' column
for column_pos, (ax, feat_name) in enumerate(zip(axes, feature_names), start=1):
    column_values = values.iloc[:, column_pos].values
    ax.hist(column_values, bins=50, alpha=0.7, edgecolor='black')
    ax.set(
        title=f'{feat_name} Distribution',
        xlabel='Value',
        ylabel='Frequency',
    )
    ax.grid(True, alpha=0.3)

# The grid has nine panels; drop the last one, which is left unused
fig.delaxes(axes[-1])

fig.tight_layout()
plt.show()

## 4. Volatility Analysis

Analyze volatility patterns in the data.

In [None]:
# Calculate rolling volatility (std of daily log returns) for all stocks.
#
# Each row of `volatilities` has one entry per date of the stock, so it can be
# plotted directly against the stock's Date index. Leading entries are NaN until
# a full 20-observation window of returns is available.
stocks = values.index.get_level_values('Symbol').unique()
volatilities = []

for stock in stocks:
    stock_data = values.loc[stock]
    # Series.diff() keeps the original length (first entry is NaN), whereas
    # np.diff() would return one element fewer and leave every value shifted
    # one day early relative to the dates it is later plotted against.
    returns = np.log(stock_data['Close']).diff()
    volatility = returns.rolling(window=20).std().to_numpy()
    volatilities.append(volatility)

# Stack into a (stocks, dates) array; assumes every stock has the same number of rows
volatilities = np.array(volatilities)
print(f"Volatility matrix shape: {volatilities.shape}")

In [None]:
# Pick four stocks reproducibly and plot their rolling volatility
np.random.seed(42)
random_stocks = np.random.choice(len(stocks), 4, replace=False)

# Date axis taken from the first stock's index
dates = pd.to_datetime(values.loc[stocks[0]].index)

fig, axes_grid = plt.subplots(nrows=2, ncols=2, figsize=(15, 10))
axes = axes_grid.flatten()

for ax, stock_idx in zip(axes, random_stocks):
    ax.plot(dates, volatilities[stock_idx], linewidth=1.5)
    ax.set(
        title=f'{stocks[stock_idx]} - Volatility Over Time',
        xlabel='Date',
        ylabel='Volatility (20-day rolling std)',
    )
    ax.grid(True, alpha=0.3)
    ax.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()

In [None]:
# Cross-sectional mean of the per-stock volatilities at each time step (NaNs ignored)
avg_volatility = np.nanmean(volatilities, axis=0)

fig, ax = plt.subplots(figsize=(15, 5))
ax.plot(dates, avg_volatility, linewidth=2, color='darkblue')
ax.fill_between(dates, avg_volatility, alpha=0.3)
ax.set_title('Average Market Volatility (All Stocks)', fontsize=14, fontweight='bold')
ax.set_xlabel('Date')
ax.set_ylabel('Volatility')
ax.grid(True, alpha=0.3)
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

# Summary statistics of the averaged series, again skipping NaNs
avg_mean = np.nanmean(avg_volatility)
avg_std = np.nanstd(avg_volatility)
avg_max = np.nanmax(avg_volatility)

print(f"\nAverage volatility statistics:")
print(f"  Mean: {avg_mean:.6f}")
print(f"  Std:  {avg_std:.6f}")
print(f"  Max:  {avg_max:.6f}")

## 5. Graph Structure Analysis

Analyze the correlation-based graph structure.

In [None]:
# Per-node connectivity derived from the adjacency matrix
degrees = adj.sum(axis=1) > 0  # Binary: has connections or not (not used below)
positive_mask = adj > 0
out_degrees = positive_mask.sum(axis=1)  # count of positive entries in each row
edge_weights = adj[positive_mask]        # the positive entries themselves

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
degree_ax, weight_ax = axes

# Left panel: how many connections each node has
degree_ax.hist(out_degrees, bins=20, alpha=0.7, edgecolor='black')
degree_ax.set_title('Node Degree Distribution', fontsize=12, fontweight='bold')
degree_ax.set_xlabel('Degree (Number of Connections)')
degree_ax.set_ylabel('Number of Nodes')
degree_ax.grid(True, alpha=0.3)

# Right panel: distribution of the positive edge weights
weight_ax.hist(edge_weights, bins=30, alpha=0.7, edgecolor='black', color='orange')
weight_ax.set_title('Edge Weight Distribution', fontsize=12, fontweight='bold')
weight_ax.set_xlabel('Correlation Value')
weight_ax.set_ylabel('Number of Edges')
weight_ax.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

# Headline numbers for the graph
node_count = adj.shape[0]
edge_count = np.count_nonzero(adj)

print(f"Graph statistics:")
print(f"  Number of nodes: {node_count}")
print(f"  Number of edges: {edge_count}")
print(f"  Average degree: {out_degrees.mean():.2f}")
print(f"  Max degree: {out_degrees.max()}")
print(f"  Average edge weight: {edge_weights.mean():.4f}")

## 6. Train/Test Split

Prepare data splits for model training and evaluation.

In [None]:
import json

# Standard 80-20 train-test split for time series.
# The dataset is sliced by position (no shuffling): the first `train_size`
# snapshots form the training set and the remainder the test set.
# NOTE(review): this assumes `volatility_dataset` is ordered chronologically — confirm.
train_ratio = 0.8
train_size = int(len(volatility_dataset) * train_ratio)

train_dataset = volatility_dataset[:train_size]
test_dataset = volatility_dataset[train_size:]

print(f"Dataset splits:")
print(f"  Total samples: {len(volatility_dataset)}")
print(f"  Train samples: {len(train_dataset)} ({len(train_dataset)/len(volatility_dataset)*100:.1f}%)")
print(f"  Test samples:  {len(test_dataset)} ({len(test_dataset)/len(volatility_dataset)*100:.1f}%)")

# Save split indices for reproducibility.
# The split is taken from `volatility_dataset`, so record the windows that
# dataset was constructed with (past_window=25, future_window=5,
# volatility_window=20). The `past_window` / `future_window` globals belong to
# the price-prediction baseline (future_window=1) and would describe the wrong
# dataset here.
split_info = {
    'train_size': train_size,
    'test_size': len(volatility_dataset) - train_size,
    'train_ratio': train_ratio,
    'past_window': 25,
    'future_window': 5,
    'volatility_window': 20,
}

with open('../data/train_test_split.json', 'w') as f:
    json.dump(split_info, f, indent=2)

print(f"\nSplit configuration saved to '../data/train_test_split.json'")

In [None]:
# Wrap both splits in DataLoaders; only the training loader shuffles
batch_size = 32

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

n_train_batches = len(train_loader)
n_test_batches = len(test_loader)

print(f"DataLoaders created:")
print(f"  Batch size: {batch_size}")
print(f"  Train batches: {n_train_batches}")
print(f"  Test batches: {n_test_batches}")

# Pull a single training batch to check the collated shapes
train_iterator = iter(train_loader)
batch = next(train_iterator)

print(f"\nSample batch:")
print(f"  x shape: {batch.x.shape}")
print(f"  y shape: {batch.y.shape}")
print(f"  batch size: {batch.num_graphs}")

## 7. Summary

Data preparation is complete! We have:

### Dataset
- **98 Vietnamese FDI stocks** from 2022-01-01 to 2024-12-31
- **773 trading days** of historical data
- **9 features** per stock: Close, NormClose, DailyLogReturn, ALR1W, ALR2W, ALR1M, ALR2M, RSI, MACD

### Graph Structure
- **52 edges** based on correlation threshold (0.1)
- **Correlation-based adjacency matrix** capturing stock relationships
- **Sparse graph** with density of 0.54%

### Temporal Snapshots
- **Input window**: 25 trading days (~5 weeks)
- **Prediction window**: 5 trading days (volatility over next week)
- **Volatility calculation**: 20-day rolling standard deviation

### Splits
- **Train**: first 80% of temporal snapshots
- **Test**: remaining 20% of temporal snapshots

### Next Steps
1. Implement baseline models (ARIMA)
2. Implement ML models (Random Forest)
3. Implement DL models (LSTM, GRU)
4. Compare model performance on volatility prediction task