# Data Preparation for Volatility and Risk Prediction
## Predicting Volatility and Risk Level of Stock Prices for FDI Enterprises in Vietnam

This notebook prepares stock price data and calculates volatility and risk metrics for Graph Neural Network (GNN) analysis.


## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# For data handling and manipulation
from scipy import stats
from scipy.stats import norm

# For visualization
import plotly.graph_objects as go
import plotly.express as px

# For machine learning
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

# For GNN (PyTorch Geometric)
import torch
import torch.nn as nn
from torch_geometric.data import Data, DataLoader

# Confirm that every import above succeeded and record the installed
# PyTorch version in the notebook output for reproducibility.
print("Libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")


## 2. Load Stock Price Data

In this section, we'll load historical stock price data for FDI enterprises in Vietnam. 
You can use:
- CSV files with historical data
- APIs (yfinance, VnEX API)
- Database connections


In [None]:
# Example: Create sample data for FDI enterprises in Vietnam
# In practice, you would load real data from:
# - CSV files: pd.read_csv('path/to/data.csv')
# - APIs: yfinance, VnEX
# - Databases

def generate_sample_stock_data(num_stocks=20, num_days=252*2):
    """
    Generate synthetic daily closing prices for demonstration purposes.

    Every stock starts at a price of 100 and follows a multiplicative random
    walk whose daily simple returns are drawn from Normal(0.0001, 0.02).
    The output is reproducible: the same arguments always yield the same
    prices, in any interpreter session.

    Parameters
    ----------
    num_stocks : int
        Number of FDI enterprises (one price column per stock).
    num_days : int
        Number of trading days to simulate (252 * 2 is roughly two years).

    Returns
    -------
    pandas.DataFrame
        A 'Date' column followed by one 'FDI_Stock_XX' column per stock,
        with ``num_days`` rows.

    Raises
    ------
    ValueError
        If ``num_days`` is negative.
    """
    import zlib  # local import: only used to derive stable per-stock seeds

    if num_days < 0:
        raise ValueError("num_days must be non-negative")

    initial_price = 100.0
    drift = 0.0001       # mean daily return
    volatility = 0.02    # standard deviation of daily returns

    # Business-day calendar ending today at midnight: weekends are skipped so
    # that each row really is a trading day, and the timestamps carry no
    # time-of-day component.
    dates = pd.date_range(
        end=pd.Timestamp.today().normalize(), periods=num_days, freq='B'
    )
    stocks = [f'FDI_Stock_{i:02d}' for i in range(1, num_stocks + 1)]

    data_dict = {'Date': dates}
    n_steps = max(num_days - 1, 0)

    for stock in stocks:
        # The built-in hash() of a str is salted per interpreter process, so it
        # cannot be used as a reproducible seed; CRC32 of the name is stable.
        # A local Generator also leaves NumPy's global random state untouched.
        rng = np.random.default_rng(zlib.crc32(stock.encode('utf-8')))
        daily_returns = rng.normal(drift, volatility, size=n_steps)

        # Price path = initial price times the running product of (1 + return);
        # the leading 1.0 keeps the first row at exactly the initial price.
        growth = np.cumprod(np.concatenate(([1.0], 1.0 + daily_returns)))
        data_dict[stock] = (initial_price * growth)[:num_days]

    return pd.DataFrame(data_dict)

# Generate sample data (15 stocks, roughly two years of trading days)
stock_data = generate_sample_stock_data(num_stocks=15, num_days=252*2)
print(f"Data shape: {stock_data.shape}")
print("\nFirst few rows:")
print(stock_data.head())
print("\nData info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
stock_data.info()


## 3. Data Preprocessing and Cleaning

In [None]:
# Make sure the Date column holds real timestamps before sorting/deduplicating
stock_data['Date'] = pd.to_datetime(stock_data['Date'])

# Report how many values are missing in each column
missing_per_column = stock_data.isnull().sum()
print("Missing values:")
print(missing_per_column)

# Keep the first row seen for each date, order the rows chronologically and
# rebuild a clean 0..n-1 index
stock_data = (
    stock_data
    .drop_duplicates(subset=['Date'], keep='first')
    .sort_values('Date')
    .reset_index(drop=True)
)

# Summary of the cleaned table
first_day = stock_data['Date'].min()
last_day = stock_data['Date'].max()
print(f"\nCleaned data shape: {stock_data.shape}")
print(f"Date range: {first_day} to {last_day}")
print(f"Number of trading days: {len(stock_data)}")


## 4. Calculate Returns and Log Returns

Returns are fundamental for volatility and risk calculations:
- Simple Return: $$R_t = \frac{P_t - P_{t-1}}{P_{t-1}}$$
- Log Return: $$r_t = \ln\left(\frac{P_t}{P_{t-1}}\right)$$


In [None]:
# Every column whose name starts with 'FDI_Stock' is treated as a price series
price_columns = [col for col in stock_data.columns if col.startswith('FDI_Stock')]
prices = stock_data[price_columns]

# Simple returns: (P_t - P_{t-1}) / P_{t-1}
returns_data = prices.pct_change()

# Log returns: ln(P_t / P_{t-1})
log_returns_data = np.log(prices / prices.shift(1))

# Put the trading date in front of both return tables
for frame in (returns_data, log_returns_data):
    frame.insert(0, 'Date', stock_data['Date'])

# The first row has no previous price, so its returns are NaN; drop incomplete rows
returns_data = returns_data.dropna()
log_returns_data = log_returns_data.dropna()

print("Simple Returns:")
print(returns_data.head())
print(f"\nShape: {returns_data.shape}")

print("\n\nLog Returns:")
print(log_returns_data.head())
print(f"\nShape: {log_returns_data.shape}")

# Descriptive statistics of the log returns, one column per stock
print("\n\nSummary Statistics of Log Returns:")
print(log_returns_data[price_columns].describe())


## 5. Compute Volatility Metrics

Volatility measures the dispersion of returns and is crucial for risk assessment.


In [None]:
# Numeric return columns only (the 'Date' column is excluded)
stock_log_returns = log_returns_data[price_columns]
annualization_factor = np.sqrt(252)  # 252 trading days per year

# 1. Historical Volatility (annualized): sample std of daily log returns
historical_volatility = stock_log_returns.std() * annualization_factor

print("Historical Volatility (Annualized):")
print(historical_volatility)

# 2. Rolling Volatility (20-day window)
rolling_window = 20
rolling_volatility = stock_log_returns.rolling(window=rolling_window).std() * annualization_factor

print("\n\nRolling Volatility (20-day window) - Last 5 rows:")
print(rolling_volatility.tail())

# 3. Exponential Moving Volatility (EWMA)
# Zero-mean EWMA recursion driven by the decay parameter:
#     var_t = lambda * var_{t-1} + (1 - lambda) * r_t^2
# With adjust=False, pandas evaluates exactly this recursion when
# alpha = 1 - lambda. (A fixed span of 20 would correspond to a decay of
# about 0.905 and would leave lambda_param unused.)
lambda_param = 0.94  # Decay parameter
ewma_variance = (stock_log_returns ** 2).ewm(alpha=1 - lambda_param, adjust=False).mean()
exponential_volatility = np.sqrt(ewma_variance) * annualization_factor

print("\n\nExponential Moving Volatility - Last 5 rows:")
print(exponential_volatility.tail())

# Visualization
fig, axes = plt.subplots(3, 1, figsize=(14, 10))

# Historical Volatility
historical_volatility.sort_values().plot(kind='barh', ax=axes[0], color='steelblue')
axes[0].set_title('Historical Volatility (Annualized)', fontsize=12, fontweight='bold')
axes[0].set_xlabel('Volatility')

# Time-series panels: show up to three sample stocks and plot them against the
# trading dates, so the 'Date' axis label matches what is actually drawn
# (the volatility tables themselves keep their original row index).
sample_stocks = price_columns[:3]
trading_dates = pd.DatetimeIndex(log_returns_data['Date'], name='Date')
time_series_panels = [
    (axes[1], rolling_volatility, 'Rolling Volatility (20-day) - Sample Stocks'),
    (axes[2], exponential_volatility, 'Exponential Moving Volatility - Sample Stocks'),
]
for panel_ax, volatility_table, panel_title in time_series_panels:
    volatility_table[sample_stocks].set_axis(trading_dates, axis=0).plot(ax=panel_ax)
    panel_ax.set_title(panel_title, fontsize=12, fontweight='bold')
    panel_ax.set_xlabel('Date')
    panel_ax.set_ylabel('Volatility')
    panel_ax.legend()

plt.tight_layout()
plt.show()


## 6. Calculate Risk Indicators

Key risk metrics for portfolio analysis and GNN features.


In [None]:
# One row per stock, one column per risk metric
risk_indicators = pd.DataFrame(index=price_columns)
stock_log_returns = log_returns_data[price_columns]

# 1. Value at Risk (VaR) - 95% confidence level
# Historical VaR: the empirical 5% quantile of daily log returns
confidence_level = 0.95
tail_probability = 1 - confidence_level
var_95 = stock_log_returns.quantile(tail_probability)
risk_indicators['VaR_95'] = var_95

# 2. Conditional Value at Risk (CVaR) - Expected Shortfall
# Mean of the returns that are at or below each stock's own VaR threshold
in_tail = stock_log_returns.le(var_95, axis='columns')
cvar_95 = stock_log_returns.where(in_tail).mean()
risk_indicators['CVaR_95'] = cvar_95

# 3. Sharpe Ratio (assuming risk-free rate = 0)
risk_free_rate = 0
mean_returns = stock_log_returns.mean() * 252  # Annualized
volatility = stock_log_returns.std() * np.sqrt(252)  # Annualized
risk_indicators['Sharpe_Ratio'] = (mean_returns - risk_free_rate) / volatility

# 4. Sortino Ratio (downside risk)
# Downside deviation is the root-mean-square shortfall below the target
# return, averaged over ALL observations (days above target contribute 0).
# Taking the std of only the negative returns would instead measure their
# spread around their own mean, which is not the downside deviation.
daily_target = risk_free_rate / 252
downside_returns = (stock_log_returns - daily_target).clip(upper=0)
downside_volatility = np.sqrt((downside_returns ** 2).mean()) * np.sqrt(252)
risk_indicators['Sortino_Ratio'] = (mean_returns - risk_free_rate) / downside_volatility

# 5. Maximum Drawdown
# Log returns compound additively, so the wealth path is exp(cumulative sum);
# (1 + r).cumprod() is only valid for simple returns. The running peak is
# floored at the starting wealth of 1.0 so that a loss from day one counts.
cumulative_returns = np.exp(stock_log_returns.cumsum())
running_max = cumulative_returns.cummax().clip(lower=1.0)
drawdown = (cumulative_returns - running_max) / running_max
max_drawdown = drawdown.min()
risk_indicators['Max_Drawdown'] = max_drawdown

# 6. Skewness
risk_indicators['Skewness'] = stock_log_returns.skew()

# 7. Kurtosis (pandas reports excess kurtosis: a normal distribution gives 0)
risk_indicators['Kurtosis'] = stock_log_returns.kurtosis()

print("Risk Indicators Summary:")
print(risk_indicators)

# Visualization: one horizontal bar chart per metric on a 2 x 3 grid
# (row-major order; Sortino_Ratio is in the table but has no panel)
fig, axes = plt.subplots(2, 3, figsize=(16, 8))

panels = [
    ('VaR_95', 'Value at Risk (95%)', 'VaR', 'coral'),
    ('CVaR_95', 'Conditional Value at Risk (95%)', 'CVaR', 'lightcoral'),
    ('Sharpe_Ratio', 'Sharpe Ratio', 'Sharpe Ratio', 'lightgreen'),
    ('Max_Drawdown', 'Maximum Drawdown', 'Drawdown', 'gold'),
    ('Skewness', 'Skewness', 'Skewness', 'lightblue'),
    ('Kurtosis', 'Kurtosis (Excess)', 'Kurtosis', 'plum'),
]
for panel_ax, (metric, panel_title, x_label, bar_color) in zip(axes.flat, panels):
    risk_indicators[metric].sort_values().plot(kind='barh', ax=panel_ax, color=bar_color)
    panel_ax.set_title(panel_title, fontsize=11, fontweight='bold')
    panel_ax.set_xlabel(x_label)

plt.tight_layout()
plt.show()


## 7. Feature Engineering for GNN Input

Create feature matrices for Graph Neural Networks where:
- **Nodes** represent individual stocks
- **Node Features** include price movements, volatility, and risk metrics
- **Edges** represent correlations or sector relationships


In [None]:
# Create node features for each stock
num_stocks = len(price_columns)
stock_log_returns = log_returns_data[price_columns]

# Build the feature table column-wise (one row per stock, one column per
# feature). The pandas estimators are used on purpose so that the values agree
# with the volatility / skewness / kurtosis computed elsewhere in this
# notebook: sample standard deviation (ddof=1) and bias-corrected
# skewness / excess kurtosis.
feature_table = pd.DataFrame({
    # Feature 1: Mean return (annualized)
    'mean_return': stock_log_returns.mean() * 252,
    # Feature 2: Historical volatility (annualized)
    'hist_vol': stock_log_returns.std() * np.sqrt(252),
    # Feature 3: Skewness
    'skewness': stock_log_returns.skew(),
    # Feature 4: Kurtosis (excess)
    'kurtosis': stock_log_returns.kurtosis(),
    # Feature 5: Max daily return
    'max_return': stock_log_returns.max(),
    # Feature 6: Min daily return
    'min_return': stock_log_returns.min(),
    # Feature 7: VaR (95%) = empirical 5% quantile
    'var_95': stock_log_returns.quantile(0.05),
    # Feature 8: Average positive return (0 when a stock never had an up day)
    'avg_pos_return': stock_log_returns.where(stock_log_returns > 0).mean().fillna(0),
}).reindex(price_columns)  # guarantee node order == price_columns order

# Plain-Python and NumPy views of the same (num_stocks x 8) matrix
node_features_list = feature_table.to_numpy(dtype=float).tolist()
node_features = feature_table.to_numpy(dtype=float)
print(f"Node Features shape: {node_features.shape}")
print(f"Features per node: {node_features.shape[1]}")
print("\nNode Features Matrix (first 3 stocks):")
print(node_features[:3])

# Normalize features (zero mean / unit variance per feature, across stocks)
scaler = StandardScaler()
node_features_normalized = scaler.fit_transform(node_features)

print("\nNormalized Node Features (first 3 stocks):")
print(node_features_normalized[:3])

# Create correlation matrix for edge construction
# (pairwise Pearson correlation of daily log returns)
correlation_matrix = stock_log_returns.corr()

print(f"\nCorrelation Matrix shape: {correlation_matrix.shape}")
print("Correlation Matrix (first 3x3):")
print(correlation_matrix.iloc[:3, :3])

# Visualization of correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, cmap='coolwarm', center=0, square=True,
            xticklabels=False, yticklabels=False, cbar_kws={'label': 'Correlation'})
plt.title('Stock Correlation Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()


## 8. Prepare Data for Model Training

Create graph structures and prepare train/test splits for GNN models.


In [None]:
# Connect two stocks when the absolute correlation of their returns exceeds
# this threshold
correlation_threshold = 0.3

# Visit every unordered pair once (strict upper triangle of the matrix) and
# keep the pairs that are correlated strongly enough
candidate_pairs = [
    (src, dst, correlation_matrix.iloc[src, dst])
    for src in range(num_stocks)
    for dst in range(src + 1, num_stocks)
]
strong_pairs = [pair for pair in candidate_pairs if abs(pair[2]) > correlation_threshold]

# An undirected edge is stored as two directed edges (src->dst and dst->src)
# that carry the same signed correlation as their weight
edge_indices = []
edge_weights = []
for src, dst, weight in strong_pairs:
    edge_indices.extend(([src, dst], [dst, src]))
    edge_weights.extend((weight, weight))

# Transpose to shape (2, num_edges); fall back to an empty 2-row array when no
# pair passed the threshold
if edge_indices:
    edge_indices = np.array(edge_indices).T
else:
    edge_indices = np.array([[], []])
edge_weights = np.array(edge_weights)

print(f"Number of edges: {len(edge_weights)}")
print(f"Edge indices shape: {edge_indices.shape}")
print(f"Edge weights shape: {edge_weights.shape}")

# Convert the prepared arrays into tensors for PyTorch Geometric
x = torch.tensor(node_features_normalized, dtype=torch.float32)
edge_index = torch.tensor(edge_indices, dtype=torch.long)
# One scalar attribute (the correlation) per directed edge -> column vector
edge_attr = torch.tensor(edge_weights, dtype=torch.float32).unsqueeze(1)

# Regression target: each stock's annualized historical volatility, as a
# column vector with one row per node.
# NOTE(review): the node features built earlier already include a full-sample
# annualized volatility per stock, so this target looks (almost) identical to
# one of the inputs - confirm this is intended before training on it.
y = torch.tensor(historical_volatility.values, dtype=torch.float32).unsqueeze(1)

# Bundle nodes, edges and targets into a single graph object
graph_data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)

graph_summary = [
    ("Number of nodes", graph_data.num_nodes),
    ("Number of edges", graph_data.num_edges),
    ("Node feature dimension", graph_data.num_node_features),
    ("Edge attribute dimension", graph_data.num_edge_features),
    ("Target dimension", graph_data.y.shape),
]
print("\nGraph Data Object:")
for label, value in graph_summary:
    print(f"  {label}: {value}")

# Train/Test split on temporal data
# We'll use time-series split: train on earlier data, test on later data
train_ratio = 0.8
split_idx = int(len(log_returns_data) * train_ratio)

# Rows [0, split_idx) form the training window and rows [split_idx, end) the
# test window, so the last training day is row split_idx - 1; row split_idx is
# already the first test day and must not be reported as the end of training.
train_dates = log_returns_data['Date'].iloc[:split_idx]
test_dates = log_returns_data['Date'].iloc[split_idx:]


def _format_date_span(dates):
    """Return 'first to last' for a Series of timestamps, or a placeholder if it is empty."""
    if dates.empty:
        return "(no rows)"
    return f"{dates.iloc[0].date()} to {dates.iloc[-1].date()}"


print(f"\nTemporal Split:")
print(f"  Training data: {_format_date_span(train_dates)}")
print(f"  Testing data: {_format_date_span(test_dates)}")

# Save prepared data
import pickle
import os

# NOTE(review): this is an absolute, machine-specific location; adjust it when
# running the notebook on another computer.
save_dir = '/Users/hoc/Documents/NCKH/code/data'
os.makedirs(save_dir, exist_ok=True)
output_path = f'{save_dir}/prepared_data.pkl'

# Everything later steps need, bundled into one dictionary: raw prices, both
# return tables, raw and normalized node features, the correlation matrix,
# risk metrics, the graph object, the volatility targets, the stock column
# names and the train/test boundary.
data_dict = {
    'stock_data': stock_data,
    'returns_data': returns_data,
    'log_returns_data': log_returns_data,
    'node_features': node_features,
    'node_features_normalized': node_features_normalized,
    'correlation_matrix': correlation_matrix,
    'risk_indicators': risk_indicators,
    'graph_data': graph_data,
    'historical_volatility': historical_volatility,
    'price_columns': price_columns,
    'split_idx': split_idx
}

# Pickle can execute arbitrary code when loaded: only load this file back from
# a location you trust.
with open(output_path, 'wb') as f:
    pickle.dump(data_dict, f)

# The check mark was previously stored as mis-decoded UTF-8 bytes
print(f"\n✓ Data saved to {output_path}")
