In [1]:
# Unpack the results archive into /content/data (-q suppresses unzip's per-file listing).
!unzip -q /content/results.zip -d /content/data

In [5]:
import os
import re
import ast
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModel
from datetime import datetime, timedelta

# === Device Setup ===
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# === FinBERT Model ===
# Tokenizer and encoder come from the same checkpoint, so name it once.
FINBERT_CHECKPOINT = "yiyanghkust/finbert-tone"
tokenizer = AutoTokenizer.from_pretrained(FINBERT_CHECKPOINT)
finbert_model = AutoModel.from_pretrained(FINBERT_CHECKPOINT)
finbert_model = finbert_model.to(device)
# Inference only: switch off dropout and other train-time behaviour.
finbert_model.eval()

# === Define 1D Convolution Network for Financial Metrics ===
class FinancialConvNet(nn.Module):
    """Per-metric temporal 1D convolutions over stock channels.

    Each metric owns its own ``Conv1d`` whose input channels are the stocks
    and whose convolution axis is time. The per-metric feature maps are
    concatenated day by day and projected to a 384-dimensional vector.

    Args:
        in_channels: Number of metrics (one convolution is created per metric).
        kernel_size: Temporal kernel width; padding is ``kernel_size // 2``.
        hidden_dim: Output channels of every per-metric convolution.
        stocks_count: Number of stocks (input channels of every convolution).
    """

    def __init__(self, in_channels=5, kernel_size=3, hidden_dim=64, stocks_count=5):
        super().__init__()

        self.in_channels = in_channels
        self.hidden_dim = hidden_dim
        self.stocks_count = stocks_count

        # Separate convolution layer for each metric
        self.conv_layers = nn.ModuleList([
            nn.Conv1d(
                in_channels=stocks_count,     # Each stock is a channel
                out_channels=hidden_dim,      # Hidden dimension per metric
                kernel_size=kernel_size,
                padding=kernel_size // 2      # "Same" length for odd kernels
            ) for _ in range(in_channels)
        ])

        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

        # Dimension reducer after convolution
        self.metrics_dim_reducer = nn.Linear(hidden_dim * in_channels, 384)

    def forward(self, x):
        """Encode a window of metrics.

        Args:
            x: Tensor of shape ``[batch_size, seq_length, metrics, stocks]``.

        Returns:
            Tensor of shape ``[batch_size, seq_length, 384]``.

        Raises:
            ValueError: If the metrics axis does not match ``in_channels``.
        """
        _, seq_len, metrics, _ = x.shape
        if metrics != self.in_channels:
            raise ValueError(
                f"Expected {self.in_channels} metrics on axis 2, got {metrics}"
            )

        # [batch_size, metrics, stocks, seq_length]: time becomes the conv axis.
        per_metric = x.permute(0, 2, 3, 1)

        # Each convolution already yields features for every day, so run it
        # exactly once per metric instead of once per (day, metric) pair.
        # The slice keeps only the first seq_len positions, which is a no-op
        # for odd kernels and drops the extra position an even kernel adds.
        conv_outputs = [
            self.relu(conv(per_metric[:, metric_idx]))[:, :, :seq_len]
            for metric_idx, conv in enumerate(self.conv_layers)
        ]

        # [batch_size, metrics*hidden_dim, seq_length] -> [batch_size, seq_length, metrics*hidden_dim]
        combined = torch.cat(conv_outputs, dim=1).permute(0, 2, 1)

        # nn.Linear acts on the last axis, so all days are reduced in one call.
        # Shape: [batch_size, seq_length, 384]
        return self.tanh(self.metrics_dim_reducer(combined))

# === News Processing Module ===
class NewsProcessor(nn.Module):
    """Attention-pool the articles of each day and project the pooled vector.

    Args:
        input_dim: Size of each article embedding.
        output_dim: Size of the per-day feature vector that is returned.
    """

    def __init__(self, input_dim=768, output_dim=512):
        super().__init__()

        # One scalar score per article; softmax over the articles of a day
        # turns the scores into weights for a weighted sum.
        self.attention = nn.Linear(input_dim, 1)
        self.feature_projection = nn.Linear(input_dim, output_dim)
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Pool and project article embeddings.

        Args:
            x: Tensor of shape ``[batch_size, seq_length, num_articles, embed_dim]``.

        Returns:
            Tensor of shape ``[batch_size, seq_length, output_dim]``.

        Raises:
            ValueError: If ``x`` is not 4-dimensional.
        """
        if x.dim() != 4:
            raise ValueError(
                f"Expected a 4-D tensor [batch, days, articles, embed], got {x.dim()}-D"
            )

        # nn.Linear works on the last axis, so every day is scored at once.
        # Softmax runs over the article axis (dim=2), i.e. independently per day.
        # Shape: [batch_size, seq_length, num_articles, 1]
        attn_weights = F.softmax(self.attention(x), dim=2)

        # Weighted sum over the articles of each day.
        # Shape: [batch_size, seq_length, embed_dim]
        weighted_features = torch.sum(x * attn_weights, dim=2)

        # Shape: [batch_size, seq_length, output_dim]
        return self.tanh(self.feature_projection(weighted_features))

# === Combined Model with Preserved Temporal Information ===
class CombinedFinancialModel(nn.Module):
    """Fuse per-day news and metrics features and predict the next day's metrics.

    Args:
        finbert_dim: Size of each article embedding fed to the news branch.
        metrics_count: Number of metrics per stock in the metrics branch.
        stocks_count: Number of stocks.
        output_dim: Size of the fused per-day feature vector.
        pred_metrics: Number of metrics predicted per stock.
        seq_len: Number of input days. The predictor consumes the flattened
            ``seq_len * output_dim`` features, so inputs must have exactly
            this many days. Defaults to 9 (the previously hard-coded value).
    """

    def __init__(self, finbert_dim=768, metrics_count=5, stocks_count=5, output_dim=312,
                 pred_metrics=5, seq_len=9):
        super().__init__()

        self.seq_len = seq_len

        # News processing
        self.news_processor = NewsProcessor(input_dim=finbert_dim, output_dim=512)

        # Metrics processing
        self.conv_net = FinancialConvNet(in_channels=metrics_count, kernel_size=3, stocks_count=stocks_count)

        # Combined processing: 512 news features + 384 metrics features per day
        self.concat_reducer = nn.Linear(512 + 384, output_dim)
        self.tanh = nn.Tanh()

        # Prediction layer for the day that follows the input window
        self.predictor = nn.Linear(output_dim * seq_len, pred_metrics * stocks_count)

    def forward(self, news_embeddings, metrics_data):
        """Run both branches and the prediction head.

        Args:
            news_embeddings: ``[batch_size, seq_len, num_articles, finbert_dim]``.
            metrics_data: ``[batch_size, seq_len, metrics_count, stocks_count]``.

        Returns:
            dict with
              * ``'temporal_features'``: ``[batch_size, seq_len, output_dim]``
              * ``'predictions'``: ``[batch_size, pred_metrics * stocks_count]``

        Raises:
            ValueError: If either input does not have ``seq_len`` days.
        """
        # The predictor's input width is tied to seq_len, so fail early with
        # a readable message rather than a matmul shape error further down.
        if news_embeddings.shape[1] != self.seq_len or metrics_data.shape[1] != self.seq_len:
            raise ValueError(
                f"Expected {self.seq_len} days, got {news_embeddings.shape[1]} (news) "
                f"and {metrics_data.shape[1]} (metrics)"
            )

        # Output shape: [batch_size, seq_len, 512]
        news_features = self.news_processor(news_embeddings)

        # Output shape: [batch_size, seq_len, 384]
        metrics_features = self.conv_net(metrics_data)

        # Concatenate along feature dimension
        # Shape: [batch_size, seq_len, 512+384]
        combined = torch.cat([news_features, metrics_features], dim=2)

        # Shape: [batch_size, seq_len, output_dim]
        temporal_features = self.tanh(self.concat_reducer(combined))

        # Flatten the day axis so the predictor sees the whole window
        # Shape: [batch_size, seq_len*output_dim]
        flat_features = temporal_features.reshape(temporal_features.shape[0], -1)

        # Shape: [batch_size, pred_metrics*stocks_count]
        predictions = self.predictor(flat_features)

        return {
            'temporal_features': temporal_features,  # Intermediate features preserving temporal structure
            'predictions': predictions  # Predictions for the day after the window
        }

# === Data Processing Functions ===
def find_nearest_date(df, target_date, max_days=7):
    """Return the entry of ``df.index`` closest to ``target_date``.

    An exact match is returned as-is. Otherwise the search is limited to
    index entries within ``max_days`` calendar days on either side of
    ``target_date``; ``None`` is returned when that window is empty.
    When two entries are equally close, the one that appears first in the
    index wins.
    """
    index = df.index
    if target_date in index:
        return target_date

    window = pd.Timedelta(days=max_days)
    earliest = target_date - window
    latest = target_date + window
    in_window = (index >= earliest) & (index <= latest)
    candidates = index[in_window]

    if candidates.empty:
        return None

    # Absolute distance to the target, in seconds, for every candidate.
    distances = np.abs((candidates - target_date).total_seconds())
    return candidates[np.argmin(distances)]

def process_article_folder(example_path):
    """Embed the first 9 day-files of an example folder with FinBERT.

    Every ``.txt`` file holds one day of news whose articles are separated by
    ``--- Article N ---`` markers. Exactly 10 articles are embedded per day:
    a random subset when more are present, empty-string padding when fewer.

    Args:
        example_path: Directory containing the per-day ``.txt`` files.

    Returns:
        float32 tensor of shape ``[num_days, 10, hidden]`` located on ``device``,
        holding the first-token ([CLS]) embedding of every article.
        ``num_days`` is at most 9.

    Raises:
        ValueError: If the folder contains no ``.txt`` files.
    """
    # Local stdlib import keeps this block self-contained.
    from contextlib import nullcontext

    day_files = sorted(f for f in os.listdir(example_path) if f.endswith(".txt"))[:9]
    if not day_files:
        raise ValueError(f"No .txt day files found in {example_path}")

    # fp16 autocast is only meaningful on CUDA. Requesting device_type='cuda'
    # on a CPU-only machine is wrong for the selected device, so mixed
    # precision is used only when the model actually runs on the GPU.
    use_cuda_amp = device.type == 'cuda'
    daily_embeddings = []

    for day_file in day_files:
        day_path = os.path.join(example_path, day_file)

        with open(day_path, 'r', encoding='utf-8') as f:
            raw_text = f.read()

        articles = [a.strip() for a in re.split(r'--- Article \d+ ---', raw_text) if a.strip()]

        if len(articles) > 10:
            # Select 10 random articles (uses the global NumPy RNG)
            articles = np.random.choice(articles, 10, replace=False).tolist()
        elif len(articles) < 10:
            # Pad with empty strings if fewer than 10 articles
            articles = articles + [''] * (10 - len(articles))

        # Batch processing
        inputs = tokenizer(
            articles,
            padding='max_length',
            truncation=True,
            max_length=512,
            return_tensors='pt'
        ).to(device)

        if use_cuda_amp:
            amp_context = torch.amp.autocast(device_type='cuda', dtype=torch.float16)
        else:
            amp_context = nullcontext()

        with torch.no_grad(), amp_context:
            outputs = finbert_model(**inputs)

        # First-token ([CLS]) embedding of every article, shape [10, hidden].
        # .float() guarantees float32 regardless of what autocast produced,
        # matching the float32 weights of the downstream model.
        cls_embeddings = outputs.last_hidden_state[:, 0, :].float()
        daily_embeddings.append(cls_embeddings)

    return torch.stack(daily_embeddings)  # Shape: [num_days, 10, hidden]

def _metrics_for_date(df, date, stocks, metrics):
    """Return ``[[metric values] for each stock]`` at ``date``, or None on a missing row/column."""
    try:
        return [[df.loc[date, (metric, stock)] for metric in metrics] for stock in stocks]
    except KeyError:
        return None


def extract_raw_metrics(df, start_date, stocks, metrics, include_target=False):
    """Extract 9 days of metrics and, optionally, the following day as target.

    Calendar days ``start_date .. start_date + 8`` are each mapped to the
    nearest date present in ``df.index`` (see ``find_nearest_date``).

    Args:
        df: Price frame indexed by date with ``(metric, stock)`` columns.
        start_date: First calendar day of the window.
        stocks: Stock symbols, in output order.
        metrics: Metric names, in output order.
        include_target: Also extract the target day when True.

    Returns:
        ``(input_tensor, target_tensor, target_date)`` where
          * ``input_tensor`` is ``[9, len(metrics), len(stocks)]`` float32,
          * ``target_tensor`` is the flattened ``[len(stocks) * len(metrics)]``
            float32 target (stock-major order) or None,
          * ``target_date`` is the index date used for the target or None.
        ``(None, None, None)`` is returned when any input day cannot be resolved.
    """
    days_to_extract = 10 if include_target else 9
    date_range = [start_date + timedelta(days=i) for i in range(days_to_extract)]
    valid_dates = [find_nearest_date(df, d) for d in date_range]

    metrics_data = []
    for date in valid_dates[:9]:  # First 9 days are input features
        if date is None:
            return None, None, None

        daily_data = _metrics_for_date(df, date, stocks, metrics)
        if daily_data is None:
            return None, None, None

        # [stocks, metrics] -> [metrics, stocks]
        metrics_data.append(torch.tensor(daily_data, dtype=torch.float32).T)

    input_tensor = torch.stack(metrics_data)  # Shape: (9, metrics, stocks)

    if not include_target:
        return input_tensor, None, None

    target_date = valid_dates[9]
    last_input_date = valid_dates[8]

    # Nearest-date snapping can map the target onto a day that is already an
    # input (e.g. day 9 is a Sunday snapped forward to Monday and day 10 is
    # that same Monday, or day 10 is a Saturday snapped back to the Friday
    # used as day 9). That leaks the label into the features, so in that case
    # use the first index date strictly after the last input day instead.
    if target_date is not None and target_date <= last_input_date:
        horizon = last_input_date + pd.Timedelta(days=7)
        later_dates = df.index[(df.index > last_input_date) & (df.index <= horizon)]
        target_date = later_dates.min() if len(later_dates) else None

    if target_date is None:
        return input_tensor, None, None

    target_data = _metrics_for_date(df, target_date, stocks, metrics)
    if target_data is None:
        return input_tensor, None, None

    target_tensor = torch.tensor(target_data, dtype=torch.float32).flatten()

    return input_tensor, target_tensor, target_date

# === Generate Dataset with Dates ===
def generate_dataset_with_dates(df, input_dir, output_dir, split, stocks=None, metrics=None):
    """Build and save the news / metrics / target tensors of one split.

    Every sub-folder of ``input_dir`` is one example. Its first ``.txt`` file
    name (``YYYY-MM-DD.txt``) gives the start date of the window.

    Args:
        df: Price frame indexed by date with ``(metric, stock)`` columns.
        input_dir: Directory holding one folder per example.
        output_dir: Directory where tensors and the dates CSV are written.
        split: Split name, used as file-name prefix and in log messages.
        stocks: Stock symbols to extract; defaults to the five tickers below.
        metrics: Metric names to extract; defaults to OHLCV.

    Returns:
        ``(news, metrics, targets, dates_df)``, or four ``None`` values when
        no example could be processed.
    """
    os.makedirs(output_dir, exist_ok=True)

    if stocks is None:
        stocks = ['AAPL', 'AMZN', 'GOOGL', 'META', 'NFLX']
    if metrics is None:
        metrics = ['Open', 'High', 'Low', 'Close', 'Volume']

    example_folders = sorted(os.listdir(input_dir))
    all_news_tensors = []
    all_metrics_tensors = []
    all_targets = []
    all_dates = []  # One record per kept example
    skipped = 0

    for folder in tqdm(example_folders, desc=f"Processing {split}"):
        example_path = os.path.join(input_dir, folder)
        if not os.path.isdir(example_path):
            continue

        try:
            txt_files = sorted(f for f in os.listdir(example_path) if f.endswith(".txt"))
            if len(txt_files) < 9:
                skipped += 1
                continue

            start_date_str = txt_files[0].replace(".txt", "")
            start_date = datetime.strptime(start_date_str, "%Y-%m-%d")

            # Cheap check first: only pay for the FinBERT pass when the
            # metrics window and its target are actually available.
            metrics_tensor, target_tensor, target_date = extract_raw_metrics(
                df, start_date, stocks, metrics, include_target=True
            )

            if metrics_tensor is None or target_tensor is None or target_date is None:
                skipped += 1
                continue

            # Process news
            news_tensor = process_article_folder(example_path)

            # Collect data
            all_news_tensors.append(news_tensor)
            all_metrics_tensors.append(metrics_tensor)
            all_targets.append(target_tensor)
            all_dates.append({
                'start_date': start_date.strftime("%Y-%m-%d"),
                'target_date': target_date.strftime("%Y-%m-%d"),
                'example_id': folder
            })

        except Exception as e:
            # Best effort: one broken example must not abort the whole split.
            print(f"Failed processing {folder}: {str(e)}")
            skipped += 1
            continue

    if skipped:
        print(f"Skipped {skipped} {split} examples (incomplete or failed)")

    # Save dataset if we have examples
    if all_news_tensors and all_metrics_tensors and all_targets:
        combined_news = torch.stack(all_news_tensors)
        combined_metrics = torch.stack(all_metrics_tensors)
        combined_targets = torch.stack(all_targets)

        # Save tensors
        torch.save(combined_news.cpu(), os.path.join(output_dir, f"{split}_news.pt"))
        torch.save(combined_metrics.cpu(), os.path.join(output_dir, f"{split}_metrics.pt"))
        torch.save(combined_targets.cpu(), os.path.join(output_dir, f"{split}_targets.pt"))

        # Save date information
        dates_df = pd.DataFrame(all_dates)
        dates_df.to_csv(os.path.join(output_dir, f"{split}_dates.csv"), index=False)

        print(f"Saved {split} dataset with {len(all_dates)} examples")
        return combined_news, combined_metrics, combined_targets, dates_df

    return None, None, None, None

# === Generate Embeddings Function ===
def generate_and_save_embeddings(model, news_tensors, metrics_tensors, dates_df, output_dir, split,
                                 max_batch=16):
    """Compute the model's per-day fused features and save them to disk.

    Args:
        model: Module whose forward returns a dict with a
            ``'temporal_features'`` entry.
        news_tensors: News embeddings, first axis = examples.
        metrics_tensors: Metrics windows, first axis = examples.
        dates_df: Not used by this function; kept so existing callers that
            pass it keep working.
        output_dir: Directory for ``{split}_embeddings_9x312.pt``.
        split: Split name used in the file name and log message.
        max_batch: Examples per forward pass; lower it if GPU memory is tight.

    Returns:
        CPU tensor with the concatenated ``temporal_features`` of all examples.

    Raises:
        ValueError: On empty input, mismatched example counts, or
            ``max_batch < 1``.
    """
    num_examples = news_tensors.shape[0]
    if num_examples == 0:
        raise ValueError(f"No examples to embed for split '{split}'")
    if metrics_tensors.shape[0] != num_examples:
        raise ValueError(
            f"news ({num_examples}) and metrics ({metrics_tensors.shape[0]}) "
            f"example counts differ for split '{split}'"
        )
    if max_batch < 1:
        raise ValueError(f"max_batch must be >= 1, got {max_batch}")

    os.makedirs(output_dir, exist_ok=True)
    model.eval()

    all_embeddings = []

    with torch.no_grad():
        for start in range(0, num_examples, max_batch):
            news_batch = news_tensors[start:start + max_batch].to(device)
            metrics_batch = metrics_tensors[start:start + max_batch].to(device)

            # Per-day features, taken before the model flattens the day axis
            outputs = model(news_batch, metrics_batch)
            all_embeddings.append(outputs['temporal_features'].cpu())

    # Concatenate all batches along the example axis
    combined_embeddings = torch.cat(all_embeddings, dim=0)

    # Save the embeddings
    embedding_path = os.path.join(output_dir, f"{split}_embeddings_9x312.pt")
    torch.save(combined_embeddings, embedding_path)

    print(f"Saved {split} embeddings with shape {combined_embeddings.shape} to {embedding_path}")

    return combined_embeddings

# === Main Execution Function ===
def _load_price_frame(csv_path):
    """Read the price CSV into a date-indexed frame with (metric, stock) columns."""
    df = pd.read_csv(csv_path, index_col=0)

    # Unparseable dates become NaT and their rows are dropped.
    parsed_index = pd.to_datetime(df.index, errors='coerce')
    bad_rows = int(parsed_index.isna().sum())
    df.index = parsed_index
    if bad_rows:
        print(f"Dropping {bad_rows} rows with unparseable dates")
        df = df[df.index.notna()]

    # Headers look like "('Open', 'AAPL')". Anything else is kept as
    # (name, '') so every entry is a 2-tuple; a bare string would be split
    # into characters by MultiIndex.from_tuples.
    column_pairs = []
    for col in df.columns:
        match = re.match(r"\('([^']+)', '([^']+)'\)", col)
        column_pairs.append(match.groups() if match else (col, ''))

    df.columns = pd.MultiIndex.from_tuples(column_pairs)
    return df


def main(seed=42):
    """Build the datasets of all splits and save their fused embeddings.

    Args:
        seed: Seed for NumPy and PyTorch so that article sampling and the
            model initialisation are reproducible. Pass None to leave the
            global RNG state untouched.

    Raises:
        ValueError: If the CSV has no usable rows or lacks required columns.
    """
    # Set paths
    csv_path = "/content/cleaned_stock_prices.csv"
    data_dir = "/content/data/processed_dataset_v2"
    output_dir = "/content/data/preprocessed"
    os.makedirs(output_dir, exist_ok=True)

    if seed is not None:
        np.random.seed(seed)
        torch.manual_seed(seed)

    # Load CSV with multi-index columns
    print("Loading stock price data...")
    df = _load_price_frame(csv_path)
    if df.empty:
        raise ValueError(f"No usable rows in {csv_path}")

    # Check date range
    print(f"CSV date range: {df.index.min().date()} → {df.index.max().date()}")

    # Define constants
    stocks = ['AAPL', 'AMZN', 'GOOGL', 'META', 'NFLX']
    metrics = ['Open', 'High', 'Low', 'Close', 'Volume']

    # === Column Verification ===
    expected_cols = [(m, s) for m in metrics for s in stocks]
    missing_cols = [col for col in expected_cols if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing critical columns: {missing_cols}")
    print("✅ All required stock columns are present.")

    # Initialize model
    # NOTE(review): no checkpoint is loaded here, so the embeddings saved
    # below come from freshly initialised weights — confirm this is intended.
    combined_model = CombinedFinancialModel().to(device)
    print("Model initialized.")

    # Process all splits and generate datasets with dates
    splits = ["train", "val", "test"]
    all_data = {}
    for split in splits:
        print(f"\nProcessing {split} split...")
        input_dir = os.path.join(data_dir, split)

        # Generate dataset with dates
        news_tensors, metrics_tensors, targets, dates_df = generate_dataset_with_dates(
            df, input_dir, output_dir, split
        )

        if news_tensors is None:
            continue

        # Generate and save embeddings
        print(f"Generating embeddings for {split}...")
        embeddings = generate_and_save_embeddings(
            combined_model, news_tensors, metrics_tensors, dates_df, output_dir, split
        )

        all_data[split] = {
            'news': news_tensors,
            'metrics': metrics_tensors,
            'targets': targets,
            'dates': dates_df,
            'embeddings': embeddings
        }

    # Print summary
    print("\n=== Dataset Summary ===")
    for split, data in all_data.items():
        print(f"{split.capitalize()} split:")
        print(f"  • Examples: {data['news'].shape[0]}")
        print(f"  • News tensor: {data['news'].shape}")
        print(f"  • Metrics tensor: {data['metrics'].shape}")
        print(f"  • Targets tensor: {data['targets'].shape}")
        print(f"  • Embeddings tensor: {data['embeddings'].shape}")

    empty_splits = [split for split in splits if split not in all_data]
    if empty_splits:
        print(f"No examples were produced for: {empty_splits}")

    print("\n✅ Processing complete. Generated [batch×9×312] embeddings for all splits.")

# Entry point: run the whole preprocessing pipeline when this code is executed
# directly (as a script, or as a notebook cell, where __name__ is "__main__").
if __name__ == "__main__":
    main()

Using device: cuda
Loading stock price data...
CSV date range: 2018-01-02 → 2018-05-30
✅ All required stock columns are present.
Model initialized.

Processing train split...


Processing train: 100%|██████████| 345/345 [03:48<00:00,  1.51it/s]


Saved train dataset with 345 examples
Generating embeddings for train...
Saved train embeddings with shape torch.Size([345, 9, 312]) to /content/data/preprocessed/train_embeddings_9x312.pt

Processing val split...


Processing val: 100%|██████████| 44/44 [00:28<00:00,  1.53it/s]


Saved val dataset with 44 examples
Generating embeddings for val...
Saved val embeddings with shape torch.Size([44, 9, 312]) to /content/data/preprocessed/val_embeddings_9x312.pt

Processing test split...


Processing test: 100%|██████████| 43/43 [00:28<00:00,  1.53it/s]

Saved test dataset with 43 examples
Generating embeddings for test...
Saved test embeddings with shape torch.Size([43, 9, 312]) to /content/data/preprocessed/test_embeddings_9x312.pt

=== Dataset Summary ===
Train split:
  • Examples: 345
  • News tensor: torch.Size([345, 9, 10, 768])
  • Metrics tensor: torch.Size([345, 9, 5, 5])
  • Targets tensor: torch.Size([345, 25])
  • Embeddings tensor: torch.Size([345, 9, 312])
Val split:
  • Examples: 44
  • News tensor: torch.Size([44, 9, 10, 768])
  • Metrics tensor: torch.Size([44, 9, 5, 5])
  • Targets tensor: torch.Size([44, 25])
  • Embeddings tensor: torch.Size([44, 9, 312])
Test split:
  • Examples: 43
  • News tensor: torch.Size([43, 9, 10, 768])
  • Metrics tensor: torch.Size([43, 9, 5, 5])
  • Targets tensor: torch.Size([43, 25])
  • Embeddings tensor: torch.Size([43, 9, 312])

✅ Processing complete. Generated [batch×9×312] embeddings for all splits.



