# Prepare Time-Series Data

This notebook prepares the time-series data for training the Temporal Fusion Transformer model.

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from glob import glob

# Notebook-wide plotting defaults: seaborn grid style and a larger default canvas
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8)})

## 1. Load Technical Features

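First, let's load the technical features from the batch features directory. If none are available, the cell falls back to the raw tick files and, as a last resort, to the processed training data file.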

In [None]:
def _load_parquet_files(files):
    """Read every parquet file in ``files`` and concatenate the results.

    A file that fails to load is reported and skipped, so a single unreadable
    file does not abort the whole load.

    Args:
        files: Iterable of parquet file paths.

    Returns:
        One DataFrame with a fresh 0..n-1 index, or ``None`` when no file
        could be read.
    """
    frames = []
    for file in files:
        try:
            frames.append(pd.read_parquet(file))
        except Exception as e:
            print(f"Error loading {file}: {e}")
    return pd.concat(frames, ignore_index=True) if frames else None


# Find all technical feature files.
# glob() returns paths in filesystem order, so sort them to make the
# concatenation order reproducible between runs and machines.
technical_files = sorted(glob('../data/features/batch/technical/**/*.parquet', recursive=True))
print(f"Found {len(technical_files)} technical feature files")

df = None

if technical_files:
    # Load technical features
    df = _load_parquet_files(technical_files)
    if df is None:
        print("No data loaded from technical feature files")
    else:
        print(f"Loaded {len(df)} records from technical features")
else:
    # If no files found, try loading from raw data
    raw_files = sorted(glob('../data/raw/ticks/**/*.parquet', recursive=True))
    print(f"Found {len(raw_files)} raw data files")

    if not raw_files:
        print("No raw data files found")
    else:
        df = _load_parquet_files(raw_files)
        if df is None:
            print("No data loaded from raw files")
        else:
            print(f"Loaded {len(df)} records from raw data")

# Try loading from processed data if no data loaded yet
if df is None:
    processed_file = '../data/processed/training_data.parquet'
    if os.path.exists(processed_file):
        df = pd.read_parquet(processed_file)
        print(f"Loaded {len(df)} records from processed data")
    else:
        print("No processed data file found")

# Check if data was loaded; every later cell needs `df`, so stop here otherwise
if df is None:
    raise ValueError("No data could be loaded. Please check the data paths.")

In [None]:
# Preview the top of the loaded frame (five rows, the head() default)
df.head(n=5)

In [None]:
# Summarise each column: its dtype first, then how many nulls it holds
column_dtypes = df.dtypes
null_counts = df.isna().sum()

print("Data types:", column_dtypes, sep="\n")
print("\nMissing values:", null_counts, sep="\n")

## 2. Prepare Time-Series Data

Now, let's prepare the data for the Temporal Fusion Transformer model by adding a time index.

In [None]:
# Both columns are used unconditionally below, so fail fast with a clear
# message instead of guarding only the datetime conversion.
missing_columns = [col for col in ('symbol', 'timestamp') if col not in df.columns]
if missing_columns:
    raise KeyError(f"Cannot build time_idx; missing required columns: {missing_columns}")

# Ensure timestamp is datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])

# A missing timestamp cannot be mapped to an integer index; report it here
# rather than through an opaque integer-cast error further down.
if df['timestamp'].isna().any():
    raise ValueError("Cannot build time_idx; 'timestamp' contains missing values")

# Sort by symbol and timestamp
df = df.sort_values(['symbol', 'timestamp'])

# Add time_idx (whole minutes since the earliest timestamp in the data).
# Floor-dividing by a one-minute Timedelta stays in integer arithmetic,
# avoiding the float round trip through total_seconds().
min_timestamp = df['timestamp'].min()
df['time_idx'] = ((df['timestamp'] - min_timestamp) // pd.Timedelta(minutes=1)).astype('int64')

print(f"Time index range: {df['time_idx'].min()} to {df['time_idx'].max()}")

In [None]:
# Histogram of time_idx to see how the records spread over the time axis
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(df['time_idx'], bins=50)
ax.set_title('Distribution of time_idx')
ax.set_xlabel('time_idx (minutes since start)')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

In [None]:
# Per-symbol record count and time_idx span
for symbol in df['symbol'].unique():
    symbol_df = df.loc[df['symbol'] == symbol]
    symbol_idx = symbol_df['time_idx']
    first_idx, last_idx = symbol_idx.min(), symbol_idx.max()
    print(f"Symbol {symbol}: {len(symbol_df)} records, time_idx from {first_idx} to {last_idx}")

## 3. Select Features for Time-Series Model

Let's select the features we want to use for the time-series model.

In [None]:
# Show every column name currently present in the loaded frame
available_features = df.columns.tolist()
print("Available features:", available_features, sep="\n")

In [None]:
# Select features for the time-series model
required_columns = ['symbol', 'timestamp', 'time_idx', 'close']

# Later cells use the required columns unconditionally, so stop here with a
# clear message rather than silently dropping one of them.
missing_required = [col for col in required_columns if col not in df.columns]
if missing_required:
    raise KeyError(f"Missing required columns for the time-series model: {missing_required}")

# Technical indicators (optional: kept only when present in the data)
technical_columns = [
    'open', 'high', 'low', 'volume',
    'ma_5', 'ma_15', 'ma_60',
    'rsi_14'
]

# Add volatility features if available
volatility_columns = [col for col in df.columns if 'volatility' in col]

# Add MACD features if available
macd_columns = [col for col in df.columns if 'macd' in col]

# Add Bollinger Bands features if available
bb_columns = [col for col in df.columns if 'bb_' in col]

# Combine all selected features
candidate_columns = required_columns + technical_columns + volatility_columns + macd_columns + bb_columns

# Keep only columns that exist in the DataFrame, and keep each one once.
# A name matching several substring filters above (e.g. both 'bb_' and
# 'volatility') would otherwise be selected twice and produce duplicate
# column labels. dict.fromkeys de-duplicates while preserving order.
selected_columns = list(dict.fromkeys(col for col in candidate_columns if col in df.columns))

# Create a new DataFrame with selected features
ts_df = df[selected_columns].copy()

print(f"Selected {len(selected_columns)} features for the time-series model")
print(f"Selected features: {selected_columns}")

In [None]:
# Count nulls per selected column and list only the columns that have any
missing_values = ts_df.isna().sum()
print("Missing values in selected features:")
print(missing_values.loc[missing_values.gt(0)])

In [None]:
# Fill missing values
# For time-series data, forward fill is often a good choice. Filling is done
# within each symbol so values never leak from one symbol into another.
#
# GroupBy.ffill()/bfill() are used instead of
# groupby(...).apply(lambda x: x.fillna(method=...)): the `method` argument of
# fillna is deprecated, and on recent pandas the apply() result carries
# 'symbol' as an index level as well as a column, which makes the second
# groupby('symbol') ambiguous.
fill_columns = [col for col in ts_df.columns if col != 'symbol']

# Forward fill within each symbol (the result keeps the original row index)
ts_df[fill_columns] = ts_df.groupby('symbol')[fill_columns].ffill()

# Fill any remaining missing values (leading gaps) with backward fill
ts_df[fill_columns] = ts_df.groupby('symbol')[fill_columns].bfill()

# Check if there are still missing values
missing_values = ts_df.isnull().sum()
print("Missing values after filling:")
print(missing_values[missing_values > 0])

In [None]:
# Anything the group-wise fills could not resolve is replaced with zero
ts_df = ts_df.fillna(value=0)

# Sanity check: the frame must be completely free of nulls from here on
remaining_nulls = int(ts_df.isna().to_numpy().sum())
assert remaining_nulls == 0, "There are still missing values in the data"

## 4. Save Prepared Data

Now, let's save the prepared data for training the Temporal Fusion Transformer model.

In [None]:
# Destination for the prepared frame
output_dir = '../data/features/batch'
output_path = f"{output_dir}/technical_with_timeidx.parquet"

# Make sure the target directory exists, then write without the row index
os.makedirs(output_dir, exist_ok=True)
ts_df.to_parquet(output_path, index=False)

print(f"Saved prepared time-series data to {output_path}")

## 5. Visualize Time-Series Data

Let's visualize the time-series data to better understand it.

In [None]:
# Plot close price for each symbol
plt.figure(figsize=(14, 8))

for symbol in ts_df['symbol'].unique():
    symbol_df = ts_df[ts_df['symbol'] == symbol]
    plt.plot(symbol_df['timestamp'], symbol_df['close'], label=symbol)

plt.title('Close Price by Symbol')
plt.xlabel('Timestamp')
plt.ylabel('Close Price')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Plot technical indicators for a specific symbol
symbol = ts_df['symbol'].unique()[0]  # Choose the first symbol
symbol_df = ts_df[ts_df['symbol'] == symbol]

plt.figure(figsize=(14, 12))

# Plot 1: Close price and moving averages
plt.subplot(3, 1, 1)
plt.plot(symbol_df['timestamp'], symbol_df['close'], label='Close')
# The moving averages are optional features; draw whichever ones are present
for window in (5, 15, 60):
    ma_column = f'ma_{window}'
    if ma_column in symbol_df.columns:
        plt.plot(symbol_df['timestamp'], symbol_df[ma_column], label=f'MA({window})')
plt.title(f'{symbol} - Close Price and Moving Averages')
plt.xlabel('Timestamp')
plt.ylabel('Price')
plt.legend()
plt.grid(True)

# Plot 2: RSI
if 'rsi_14' in symbol_df.columns:
    plt.subplot(3, 1, 2)
    plt.plot(symbol_df['timestamp'], symbol_df['rsi_14'])
    # Reference lines at the 70 and 30 RSI levels
    plt.axhline(y=70, color='r', linestyle='--', alpha=0.5)
    plt.axhline(y=30, color='g', linestyle='--', alpha=0.5)
    plt.title(f'{symbol} - RSI(14)')
    plt.xlabel('Timestamp')
    plt.ylabel('RSI')
    plt.grid(True)

# Plot 3: Volume
# 'volume' is an optional feature like the indicators above, so guard it the
# same way instead of raising KeyError when the column was not selected.
if 'volume' in symbol_df.columns:
    plt.subplot(3, 1, 3)
    plt.bar(symbol_df['timestamp'], symbol_df['volume'], alpha=0.7)
    plt.title(f'{symbol} - Volume')
    plt.xlabel('Timestamp')
    plt.ylabel('Volume')
    plt.grid(True)

plt.tight_layout()
plt.show()

## 6. Summary

We have successfully prepared the time-series data for training the Temporal Fusion Transformer model. The prepared data includes:

1. Time index (minutes since start)
2. Technical indicators (moving averages, RSI, etc.)
3. Price data (open, high, low, close)
4. Volume data

The data has been saved to `data/features/batch/technical_with_timeidx.parquet` and is ready for training the TFT model.