# Data Preparation & Analysis

**Objective**: Load, validate, and analyze the processed Vietnamese FDI stock data.

- **Data Source**: `data/processed/values.csv` (98 stocks × 773 trading days × 9 features)
- **Graph Data**: `data/processed/adj.npy` (98×98 correlation-based adjacency matrix)

## 1. Load Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Read the processed feature table (its MultiIndex is built from the first
# two CSV columns) and the stock adjacency matrix.
values_path = Path('../data/processed/values.csv')
adj_path = Path('../data/processed/adj.npy')
values = pd.read_csv(values_path, index_col=[0, 1])
adj = np.load(adj_path)

sns.set_style("whitegrid")

# ---- Data quality report ----
banner = "=" * 70
print(banner)
print("DATA QUALITY REPORT")
print(banner)

# Table dimensions and the cardinality of each index level.
n_records, n_features = values.shape
print("\n[STRUCTURE]")
print(f"  Total records: {n_records:,}")
print(f"  Features: {n_features}")
symbol_level = values.index.get_level_values('Symbol')
print(f"  Unique stocks: {symbol_level.nunique()}")
date_level = values.index.get_level_values('Date')
print(f"  Trading days: {date_level.nunique()}")

# Total number of NaN cells across every feature column.
print("\n[DATA INTEGRITY]")
null_count = values.isnull().sum().sum()
print(f"  Missing values: {null_count}")
if not null_count:
    print("  ✓ Complete dataset, no NaN values")

# Numbered list of the feature columns, in file order.
print("\n[FEATURES]")
for position, column_name in enumerate(values.columns, start=1):
    print(f"  {position}. {column_name}")

# Calendar span covered by the 'Date' index level.
dates = pd.to_datetime(date_level)
print("\n[DATE RANGE]")
first_day = dates.min()
print(f"  Start: {first_day.date()}")
last_day = dates.max()
print(f"  End: {last_day.date()}")
print(f"  Duration: {(last_day - first_day).days} days")

print("\n" + banner)

## 2. Sample Data Generation (Demo)

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# Example: Create sample data for FDI enterprises in Vietnam
# In practice, you would load real data from:
# - CSV files: pd.read_csv('path/to/data.csv')
# - APIs: yfinance, VnEX
# - Databases

def generate_sample_stock_data(num_stocks=20, num_days=252*2, seed=42):
    """Generate synthetic daily prices for demonstration purposes.

    Every stock starts at 100 and compounds independent, normally
    distributed simple daily returns (mean 0.0001, std 0.02).

    Args:
        num_stocks: Number of synthetic FDI tickers (one column each).
        num_days: Number of trading days (rows); 252*2 is roughly two years.
        seed: Non-negative integer base seed. Each stock draws from its own
            generator seeded with ``[seed, stock_number]``, so a stock's
            path is reproducible across processes and does not change when
            ``num_stocks`` changes.

    Returns:
        DataFrame with a 'Date' column (business days ending today, at
        midnight) followed by one price column per stock, named
        'FDI_Stock_01', 'FDI_Stock_02', ...

    Raises:
        ValueError: If ``num_days`` is smaller than 1.
    """
    if num_days < 1:
        raise ValueError("num_days must be at least 1")

    initial_price = 100
    drift = 0.0001
    volatility = 0.02

    # Business-day frequency so rows really are trading days; normalize()
    # drops the time-of-day that "now" would otherwise stamp on every date.
    dates = pd.date_range(
        end=pd.Timestamp.now().normalize(), periods=num_days, freq='B'
    )
    stocks = [f'FDI_Stock_{i:02d}' for i in range(1, num_stocks + 1)]

    data_dict = {'Date': dates}
    for stock_number, stock in enumerate(stocks, start=1):
        # A dedicated generator per stock: the built-in hash() of a str is
        # salted per process, so it cannot serve as a reproducible seed, and
        # a local generator leaves the global NumPy random state untouched.
        rng = np.random.default_rng([seed, stock_number])
        daily_returns = rng.normal(drift, volatility, size=num_days - 1)

        # Day 0 is the initial price; later days compound (1 + return).
        growth = np.concatenate(([1.0], np.cumprod(1.0 + daily_returns)))
        data_dict[stock] = initial_price * growth

    return pd.DataFrame(data_dict)

# Build a small demo dataset and show its shape, first rows and schema.
stock_data = generate_sample_stock_data(num_stocks=15, num_days=252*2)
print(f"Data shape: {stock_data.shape}")
print("\nFirst few rows:")
print(stock_data.head())
print("\nData info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line.
stock_data.info()


## 3. Feature Statistics

In [None]:
import pandas as pd
import numpy as np

# Reuse the frame from the loading cell when it exists; otherwise read the
# CSV so this cell can run on its own.
try:
    values
except NameError:
    values = pd.read_csv('../data/processed/values.csv', index_col=[0, 1])

print("\nFeature Statistics:")
print("-" * 70)

# One row per feature, restricted to the four headline statistics.
summary_columns = ['mean', 'std', 'min', 'max']
stats = values.describe().transpose()
print(stats.loc[:, summary_columns].round(4))

## 4. Feature Distributions

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path

# Ensure data is loaded so this cell can run without the loading cell.
if 'values' not in globals():
    values = pd.read_csv('../data/processed/values.csv', index_col=[0, 1])

# Size the grid from the actual number of feature columns instead of assuming
# exactly nine: more columns would index past a fixed 3x3 grid, fewer would
# leave empty panels behind.
feature_names = list(values.columns)
n_grid_cols = 3
n_grid_rows = max(1, -(-len(feature_names) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(
    n_grid_rows, n_grid_cols, figsize=(15, 4 * n_grid_rows), squeeze=False
)
axes = axes.flatten()

# One histogram per feature; NaNs are dropped before binning.
for idx, col in enumerate(feature_names):
    axes[idx].hist(values[col].dropna(), bins=50, edgecolor='black', alpha=0.7)
    axes[idx].set_title(f'{col} Distribution')
    axes[idx].set_xlabel('Value')
    axes[idx].set_ylabel('Frequency')

# Hide any panels the last grid row does not need.
for spare_ax in axes[len(feature_names):]:
    spare_ax.set_visible(False)

plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
output_path = Path('../data/analysis/feature_distributions.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path, dpi=100)
plt.show()

print("✓ Distribution plots saved")

## 5. Stock-Wise Analysis

In [None]:
import pandas as pd

# Reuse the frame from the loading cell when it exists; otherwise read the
# CSV so this cell can run on its own.
try:
    values
except NameError:
    values = pd.read_csv('../data/processed/values.csv', index_col=[0, 1])

# Group once by ticker and reuse the grouping for every aggregate below.
by_symbol = values.groupby(level='Symbol')

# Per-stock summary table (computed here but not printed by this cell).
stock_stats = by_symbol[['Close', 'DailyLogReturn', 'RSI']].agg(['mean', 'std', 'min', 'max'])

print("\nTop 10 Stocks by Average Price:")
average_close = by_symbol['Close'].mean()
print(average_close.sort_values(ascending=False).head(10))

print("\nTop 10 Stocks by Daily Return Volatility:")
return_volatility = by_symbol['DailyLogReturn'].std()
print(return_volatility.sort_values(ascending=False).head(10))

## 6. Adjacency Matrix Analysis

In [None]:
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path

# Ensure adjacency is loaded so this cell can run without the loading cell.
if 'adj' not in globals():
    adj = np.load('../data/processed/adj.npy')

# Adjacency matrix statistics.
# The non-zero count covers every matrix entry, so it includes any diagonal
# entries and counts both (i, j) and (j, i) of a symmetric matrix.
n_nonzero = np.count_nonzero(adj)
print("\n[ADJACENCY MATRIX ANALYSIS]")
print(f"Shape: {adj.shape}")
print(f"Non-zero edges: {n_nonzero}")
print(f"Density: {n_nonzero / (adj.shape[0] * adj.shape[1]):.6f}")
print(f"Symmetric: {np.allclose(adj, adj.T)}")

# Degree distribution, computed as row sums.
# NOTE(review): if adj holds correlation weights rather than 0/1 flags, these
# are summed weights, and int() below truncates them - confirm the encoding.
degrees = adj.sum(axis=1)
print("\nDegree Distribution:")
print(f"  Mean degree: {degrees.mean():.2f}")
print(f"  Max degree: {int(degrees.max())}")
print(f"  Min degree: {int(degrees.min())}")

# Visualize adjacency matrix as a heatmap.
fig, ax = plt.subplots(figsize=(10, 8))
im = ax.imshow(adj, cmap='viridis', aspect='auto')
ax.set_title('Stock Adjacency Matrix (Correlation-based)')
ax.set_xlabel('Stock Index')
ax.set_ylabel('Stock Index')
plt.colorbar(im, ax=ax, label='Correlation')

# savefig does not create missing directories, so make sure the target exists.
output_path = Path('../data/analysis/adjacency_matrix.png')
output_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(output_path, dpi=100, bbox_inches='tight')
plt.show()

print("\n✓ Adjacency matrix visualization saved")

## 7. Time Series Visualization

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

from pathlib import Path

# Ensure data is loaded so this cell can run without the loading cell.
if 'values' not in globals():
    values = pd.read_csv('../data/processed/values.csv', index_col=[0, 1])

# Plot key stocks over time.
top_stocks = ['VNM', 'SAB', 'VIC', 'VHM', 'HPG']

# Only plot tickers that exist in the dataset; a missing one would otherwise
# raise KeyError and abort the whole figure.
available_symbols = set(values.index.get_level_values('Symbol'))
plot_stocks = [s for s in top_stocks if s in available_symbols]
missing_stocks = [s for s in top_stocks if s not in available_symbols]
if missing_stocks:
    print(f"⚠ Skipping symbols not present in the dataset: {', '.join(missing_stocks)}")

if plot_stocks:
    # One row per plotted stock (2.4 in each, i.e. 14x12 for five stocks);
    # squeeze=False keeps `axes` indexable even with a single row.
    fig, axes = plt.subplots(
        len(plot_stocks), 1, figsize=(14, 2.4 * len(plot_stocks)), squeeze=False
    )
    axes = axes[:, 0]

    for idx, stock in enumerate(plot_stocks):
        # Select by level name so the lookup does not depend on 'Symbol'
        # being the first index level. A separate name is used so the
        # `stock_data` frame from the sample-data cell is not overwritten.
        symbol_prices = values.xs(stock, level='Symbol')
        axes[idx].plot(pd.to_datetime(symbol_prices.index), symbol_prices['Close'], linewidth=1.5, label='Close Price')
        axes[idx].set_title(f'{stock} - Closing Price')
        axes[idx].set_ylabel('Price (VND)')
        axes[idx].grid(True, alpha=0.3)
        axes[idx].legend()

    axes[-1].set_xlabel('Date')
    plt.tight_layout()

    # savefig does not create missing directories, so make sure it exists.
    output_path = Path('../data/analysis/timeseries_topstocks.png')
    output_path.parent.mkdir(parents=True, exist_ok=True)
    plt.savefig(output_path, dpi=100)
    plt.show()

    print("✓ Time series visualization saved")
else:
    print("⚠ None of the requested symbols are in the dataset; nothing plotted")

## 8. Data Export for Analysis

In [None]:
# Create feature matrices for analysis
import os
import pandas as pd

# Reuse the frame from the loading cell when it exists; otherwise read the
# CSV so this cell can run on its own.
try:
    values
except NameError:
    values = pd.read_csv('../data/processed/values.csv', index_col=[0, 1])

output_dir = '../data/features'
os.makedirs(output_dir, exist_ok=True)

# Write one wide CSV per exported feature. Unstacking the 'Symbol' level
# turns each ticker into a column and leaves the remaining index level as rows.
export_features = ('Close', 'DailyLogReturn', 'RSI', 'MACD')
for feature in export_features:
    feature_matrix = values[feature].unstack(level='Symbol')
    output_file = f'{output_dir}/{feature.lower()}_matrix.csv'
    feature_matrix.to_csv(output_file)
    print(f"✓ Saved {feature} matrix: {feature_matrix.shape}")

# Persist the distinct tickers, in order of first appearance in the index.
ticker_list = values.index.get_level_values('Symbol').unique()
ticker_df = pd.DataFrame({'ticker': ticker_list})
ticker_df.to_csv(f'{output_dir}/tickers.csv', index=False)
print(f"\n✓ Saved {len(ticker_list)} tickers to features/tickers.csv")

closing_rule = "=" * 70
print("\n" + closing_rule)
print("Data preparation complete! Ready for analysis.")
print(closing_rule)