In [None]:
import os
import re
import ast
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer, AutoModel
from datetime import datetime, timedelta
from sklearn.decomposition import PCA

# === Device Setup ===
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# === FinBERT Model ===
# Tokenizer and encoder are loaded from the same checkpoint. The encoder is
# moved to `device` and put in eval mode before any embeddings are computed.
tokenizer = AutoTokenizer.from_pretrained("yiyanghkust/finbert-tone")
finbert_model = AutoModel.from_pretrained("yiyanghkust/finbert-tone")
finbert_model = finbert_model.to(device)
finbert_model.eval()

# === PCA Dimensionality Reduction ===
class PCAReducer:
    """Reduce the last dimension of feature arrays with a scikit-learn PCA.

    The reducer must be fitted with :meth:`fit` before :meth:`transform` can
    be used; ``is_fitted`` records whether that has happened.

    Args:
        input_dim: Size of the trailing feature dimension expected on input.
        output_dim: Number of principal components kept (trailing dimension
            of the transformed output).
    """

    def __init__(self, input_dim=768, output_dim=400):
        self.pca = PCA(n_components=output_dim)
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.is_fitted = False

    @staticmethod
    def _to_numpy(data):
        """Return ``data`` as a NumPy array (tensors are detached and moved to CPU)."""
        if isinstance(data, torch.Tensor):
            # detach() so tensors that require grad can still be converted.
            return data.detach().cpu().numpy()
        return np.asarray(data)

    def fit(self, data):
        """Fit the PCA on ``data`` of shape ``[N, input_dim]``.

        Args:
            data: A torch tensor or array-like of samples.
        """
        print(f"Fitting PCA to reduce dimensions from {self.input_dim} to {self.output_dim}")
        self.pca.fit(self._to_numpy(data))
        self.is_fitted = True
        print(f"PCA fitted, explained variance ratio sum: {sum(self.pca.explained_variance_ratio_):.4f}")

    def transform(self, data):
        """Project ``data`` of shape ``[..., input_dim]`` to ``[..., output_dim]``.

        Args:
            data: A torch tensor or NumPy array whose last dimension equals
                ``input_dim``.

        Returns:
            A torch tensor with the same leading dimensions as ``data``. For
            tensor input the result stays on the input's device and, for
            floating-point input, keeps the input's dtype. For non-tensor
            input the result is moved to the module-level ``device``.

        Raises:
            ValueError: If the reducer has not been fitted, or if the trailing
                dimension of ``data`` is not ``input_dim``.
        """
        if not self.is_fitted:
            raise ValueError("PCA must be fitted before transforming data")

        original_shape = tuple(data.shape)
        # Without this check a mismatched trailing dimension would be silently
        # regrouped into rows of `input_dim` values by the reshape below.
        if not original_shape or original_shape[-1] != self.input_dim:
            raise ValueError(
                f"Expected last dimension {self.input_dim}, got shape {original_shape}"
            )

        # Collapse all leading dimensions so PCA sees a 2D [N, input_dim] array.
        data_2d = data.reshape(-1, self.input_dim)

        if isinstance(data, torch.Tensor):
            reduced_np = self.pca.transform(data_2d.detach().cpu().numpy())
            reduced = torch.from_numpy(reduced_np).to(data.device)
            if data.is_floating_point():
                # Keep the caller's precision regardless of what sklearn returns.
                reduced = reduced.to(data.dtype)
        else:
            reduced = torch.from_numpy(self.pca.transform(data_2d)).to(device)

        # Restore the leading dimensions, swapping in the reduced feature size.
        return reduced.reshape(original_shape[:-1] + (self.output_dim,))

# === Define LSTM Network for Financial Metrics ===
class FinancialLSTMNet(nn.Module):
    """Encode per-day financial metrics with one LSTM per metric.

    Each metric has its own single-layer LSTM that reads the sequence of
    per-stock values for that metric. The per-metric hidden states are
    concatenated for every time step and projected to 384 features with a
    linear layer followed by tanh.

    Args:
        in_channels: Number of financial metrics (one LSTM is created per metric).
        kernel_size: Unused; kept so the constructor signature stays
            compatible with existing callers.
        hidden_dim: Hidden size of each LSTM.
        stocks_count: Number of stocks (input feature size of each LSTM).
    """

    def __init__(self, in_channels=6, kernel_size=3, hidden_dim=64, stocks_count=5):
        super().__init__()

        self.in_channels = in_channels    # Number of financial metrics
        self.hidden_dim = hidden_dim      # Hidden dimensions in LSTM
        self.stocks_count = stocks_count  # Number of stocks

        # Separate LSTM layer for each metric; each stock is an input feature.
        self.lstm_layers = nn.ModuleList([
            nn.LSTM(
                input_size=stocks_count,
                hidden_size=hidden_dim,
                batch_first=True,   # Expect [batch, seq, features]
                num_layers=1,
            ) for _ in range(in_channels)
        ])

        # `relu` is not used in forward(); the attribute is kept for compatibility.
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

        # Projects the concatenated per-metric hidden states to 384 features.
        self.metrics_dim_reducer = nn.Linear(hidden_dim * in_channels, 384)

    def forward(self, x):
        """Encode a batch of metric sequences.

        Args:
            x: Tensor of shape ``[batch_size, seq_length, metrics, stocks]``.

        Returns:
            Tensor of shape ``[batch_size, seq_length, 384]``.

        Raises:
            ValueError: If ``x`` is not 4-D or its metric/stock dimensions do
                not match the sizes the module was built with.
        """
        if x.dim() != 4:
            raise ValueError(
                f"Expected input of shape [batch, seq, metrics, stocks], got {tuple(x.shape)}"
            )
        _, _, metrics, stocks = x.shape
        if metrics != self.in_channels:
            raise ValueError(f"Expected {self.in_channels} metrics, got {metrics}")
        if stocks != self.stocks_count:
            raise ValueError(f"Expected {self.stocks_count} stocks, got {stocks}")

        # Run every LSTM exactly once over the full sequence. The LSTM output
        # already holds the hidden state for every time step, so there is no
        # need to re-run it for each day.
        per_metric_outputs = [
            lstm(x[:, :, metric_idx, :])[0]  # [batch, seq, hidden_dim]
            for metric_idx, lstm in enumerate(self.lstm_layers)
        ]

        # [batch, seq, metrics * hidden_dim]
        combined = torch.cat(per_metric_outputs, dim=2)

        # The linear layer acts on the last dimension, so all days are
        # projected in a single call: [batch, seq, 384].
        return self.tanh(self.metrics_dim_reducer(combined))

# === Modified News Processor with Memory-Efficient Processing ===
class MemoryEfficientNewsProcessor(nn.Module):
    """Collapse the token dimension of news embeddings with learned attention.

    A single linear layer scores every token; the scores are softmax-normalised
    over the tokens of a day and used for a weighted sum of the token vectors,
    followed by tanh.

    Args:
        input_dim: Feature size of each token vector.
    """

    def __init__(self, input_dim=400):
        super().__init__()
        # One scalar attention score per token.
        self.token_attention = nn.Linear(input_dim, 1)
        self.tanh = nn.Tanh()

    def forward(self, x):
        """Pool tokens for every day.

        Args:
            x: Tensor of shape ``[batch_size, seq_length, tokens, features]``.

        Returns:
            Tensor of shape ``[batch_size, seq_length, features]``.
        """
        # Unpacking requires a 4-D input, as before.
        _, n_days, _, _ = x.shape

        pooled_per_day = []
        # One day at a time keeps the intermediate tensors small.
        for day_idx in range(n_days):
            tokens = x[:, day_idx]                          # [batch, tokens, features]
            scores = self.token_attention(tokens)           # [batch, tokens, 1]
            weights = torch.softmax(scores, dim=1)          # normalise over tokens
            weighted_sum = (tokens * weights).sum(dim=1)    # [batch, features]
            pooled_per_day.append(self.tanh(weighted_sum))

        # [batch, seq_length, features]
        return torch.stack(pooled_per_day, dim=1)

# === Modified Combined Model with PCA Reduction ===
class CombinedFinancialModel(nn.Module):
    """Fuse news embeddings and financial metrics into per-day features and a prediction.

    News branch: PCA reduction of the embeddings -> mean over articles ->
    token attention pooling -> linear + tanh to 128 features.
    Metrics branch: ``FinancialLSTMNet`` producing 384 features per day.
    Both branches are concatenated per day, projected to ``output_dim`` with
    linear + tanh, flattened over the days and fed to a linear predictor.

    Args:
        pca_dim: Number of PCA components kept from the 768-d embeddings.
        metrics_count: Number of financial metrics per stock.
        stocks_count: Number of stocks.
        output_dim: Size of the fused per-day feature vector.
        pred_metrics: Number of metrics predicted per stock.
    """

    def __init__(self, pca_dim=400, metrics_count=6, stocks_count=5, output_dim=312, pred_metrics=6):
        super().__init__()

        # PCA reducer for the embeddings. It is a plain Python object, not an
        # nn.Module, and is fitted lazily on the first forward() call.
        self.pca_reducer = PCAReducer(input_dim=768, output_dim=pca_dim)

        # News branch: attention pooling over tokens, then pca_dim -> 128.
        self.news_processor = MemoryEfficientNewsProcessor(input_dim=pca_dim)
        self.news_dim_reducer = nn.Linear(pca_dim, 128)
        self.tanh = nn.Tanh()

        # Metrics branch (LSTM based; kernel_size is accepted but unused there).
        self.lstm_net = FinancialLSTMNet(in_channels=metrics_count, kernel_size=3, stocks_count=stocks_count)

        # Fusion: 128 news features + 384 metric features -> output_dim.
        self.concat_reducer = nn.Linear(128 + 384, output_dim)

        # Predictor over the flattened features; the factor 9 fixes the
        # sequence length this model accepts.
        self.predictor = nn.Linear(output_dim * 9, pred_metrics * stocks_count)

    def forward(self, news_embeddings, metrics_data):
        """Compute fused per-day features and predictions.

        Args:
            news_embeddings: Tensor of shape
                ``[batch, seq_len, articles, tokens, features]``.
            metrics_data: Tensor passed unchanged to the metrics LSTM branch
                (``[batch, seq_len, metrics, stocks]``).

        Returns:
            Dict with ``'temporal_features'`` (``[batch, seq_len, output_dim]``)
            and ``'predictions'`` (``[batch, pred_metrics * stocks_count]``).
        """
        batch_size, seq_len, num_articles, num_tokens, features = news_embeddings.shape
        print(f"Input shapes - news: {news_embeddings.shape}, metrics: {metrics_data.shape}")

        reducer = self.pca_reducer

        # Step 1: fit the PCA once, on a random sample of at most 100000 token
        # vectors taken from the first batch that reaches the model.
        if not reducer.is_fitted:
            token_vectors = news_embeddings.reshape(-1, features)
            n_vectors = token_vectors.shape[0]
            picked = torch.randperm(n_vectors)[:min(100000, n_vectors)]
            reducer.fit(token_vectors[picked])

        # Reduce the embeddings two examples at a time to limit peak memory.
        chunk_size = 2
        reduced_news = torch.cat(
            [
                reducer.transform(news_embeddings[start:start + chunk_size])
                for start in range(0, batch_size, chunk_size)
            ],
            dim=0,
        )  # -> [batch, seq_len, articles, tokens, pca_dim]

        # Step 2: average over the articles dimension.
        reduced_news = reduced_news.mean(dim=2)  # -> [batch, seq_len, tokens, pca_dim]
        print(f"After PCA and article averaging: {reduced_news.shape}")

        # Step 3: attention-weighted sum over tokens.
        news_features = self.news_processor(reduced_news)  # -> [batch, seq_len, pca_dim]
        print(f"After weighted sum: {news_features.shape}")

        # Step 4: pca_dim -> 128.
        news_features = self.tanh(self.news_dim_reducer(news_features))
        print(f"News features final shape: {news_features.shape}")

        # Metrics branch.
        metrics_features = self.lstm_net(metrics_data)  # -> [batch, seq_len, 384]
        print(f"Metrics features shape: {metrics_features.shape}")

        # Fuse both branches along the feature dimension.
        combined = torch.cat([news_features, metrics_features], dim=2)  # -> [batch, seq_len, 512]
        print(f"Combined shape: {combined.shape}")

        temporal_features = self.tanh(self.concat_reducer(combined))  # -> [batch, seq_len, output_dim]
        print(f"Temporal features shape: {temporal_features.shape}")

        # Flatten the days and predict.
        flat_features = temporal_features.reshape(temporal_features.shape[0], -1)
        predictions = self.predictor(flat_features)

        return {
            'temporal_features': temporal_features,
            'predictions': predictions
        }
# === Modified process_article_folder function with memory optimization ===
def process_article_folder(example_path, max_tokens=128, pca_reducer=None):
    """Embed up to 9 day files of news articles with FinBERT.

    Every ``.txt`` file in ``example_path`` (sorted by name, first 9 only) is
    split into articles on ``--- Article N ---`` markers. Each day is forced
    to exactly 10 articles (truncated or padded with empty articles). Each
    non-empty article is tokenized, reduced to at most ``max_tokens`` tokens,
    run through FinBERT, optionally PCA-reduced, and zero-padded to
    ``max_tokens`` rows. Empty articles become all-zero matrices.

    Args:
        example_path: Directory holding one ``.txt`` file per day.
        max_tokens: Number of token rows kept per article.
        pca_reducer: Optional reducer; it is only applied when it is fitted.

    Returns:
        Tensor of shape ``[days, 10, max_tokens, feature_dim]`` on ``device``,
        where ``feature_dim`` is ``pca_reducer.output_dim`` when a fitted
        reducer is given and 768 otherwise.

    Raises:
        ValueError: If ``example_path`` contains no ``.txt`` files.
    """
    # Resolve the PCA decision once; it cannot change while this function runs.
    use_pca = pca_reducer is not None and pca_reducer.is_fitted
    feature_dim = pca_reducer.output_dim if use_pca else 768
    article_splitter = re.compile(r'--- Article \d+ ---')

    day_files = sorted(f for f in os.listdir(example_path) if f.endswith(".txt"))[:9]
    if not day_files:
        # torch.stack([]) below would otherwise fail with an opaque error.
        raise ValueError(f"No .txt day files found in {example_path}")

    daily_embeddings = []
    for day_file in day_files:
        with open(os.path.join(example_path, day_file), 'r', encoding='utf-8') as f:
            raw_text = f.read()

        articles = [a.strip() for a in article_splitter.split(raw_text) if a.strip()]

        # Ensure exactly 10 articles: truncate, then pad with empty strings.
        articles = articles[:10]
        articles += [''] * (10 - len(articles))

        article_embeddings = []
        for article in articles:
            if not article:
                # Placeholder for a missing article.
                article_embeddings.append(
                    torch.zeros(max_tokens, feature_dim, device=device)
                )
                continue

            inputs = tokenizer(
                article,
                padding='max_length',
                truncation=True,
                max_length=512,
                return_tensors='pt'
            ).to(device)

            # Positions of real (non-padding) tokens. squeeze(-1) keeps this
            # 1-D even if only a single position is valid; a bare squeeze()
            # would collapse it to 0-d and break len() below.
            attention_mask = inputs['attention_mask'][0]
            valid_positions = attention_mask.nonzero().squeeze(-1)

            if len(valid_positions) <= max_tokens:
                selected_indices = valid_positions
            else:
                # Keep every `step`-th token, then cap at max_tokens.
                # NOTE(review): because of the cap, tokens near the end of the
                # article are dropped whenever the count is not a multiple of
                # max_tokens (step == 1 is plain truncation) — confirm this
                # is the intended sampling.
                step = len(valid_positions) // max_tokens
                selected_indices = valid_positions[::step][:max_tokens]

            # Feed only the selected tokens to FinBERT; all of them are real
            # tokens, so the attention mask is all ones (integer dtype, on the
            # same device as the ids).
            selected_input_ids = inputs['input_ids'][0][selected_indices].unsqueeze(0)
            selected_inputs = {
                'input_ids': selected_input_ids,
                'attention_mask': torch.ones_like(selected_input_ids)
            }

            with torch.no_grad():
                outputs = finbert_model(**selected_inputs)
                sequence_output = outputs.last_hidden_state[0]  # [selected_tokens, 768]

                if use_pca:
                    sequence_output = pca_reducer.transform(sequence_output)

                # Zero-pad short articles so every article has max_tokens rows.
                missing_rows = max_tokens - sequence_output.shape[0]
                if missing_rows > 0:
                    # new_zeros matches the dtype and device of the embeddings.
                    padding = sequence_output.new_zeros(missing_rows, feature_dim)
                    sequence_output = torch.cat([sequence_output, padding], dim=0)

                article_embeddings.append(sequence_output)

        # Stack all articles for this day: [10, max_tokens, feature_dim]
        daily_embeddings.append(torch.stack(article_embeddings))

    # Stack all days: [days, 10, max_tokens, feature_dim]
    return torch.stack(daily_embeddings)

# === Data Processing Functions ===
def find_nearest_date(df, target_date, max_days=7):
    """Return the index date of ``df`` closest to ``target_date``.

    An exact match is returned as is. Otherwise the closest index entry
    within ``max_days`` calendar days before or after ``target_date`` is
    returned; on a tie the entry that comes first in the index wins.
    Returns ``None`` when the window contains no index entry.
    """
    index = df.index
    if target_date in index:
        return target_date

    window = pd.Timedelta(days=max_days)
    in_window = (index >= target_date - window) & (index <= target_date + window)
    nearby = index[in_window]
    if nearby.empty:
        return None

    # Absolute distance of every candidate from the target, in seconds.
    gaps = np.abs((nearby - target_date).total_seconds())
    return nearby[np.argmin(gaps)]

def extract_raw_metrics(df, start_date, stocks, metrics, include_target=False):
    """Extract 9 days of metrics and, optionally, a following day as target.

    The 9 (or 10) consecutive calendar days starting at ``start_date`` are
    each mapped to an index date of ``df`` with ``find_nearest_date``. Because
    of that mapping, calendar days without data resolve to a neighbouring
    index date, so the same date can appear more than once in the input.

    Args:
        df: DataFrame indexed by date with ``(metric, stock)`` columns.
        start_date: First calendar day of the window.
        stocks: Stock symbols (second column level).
        metrics: Metric names (first column level).
        include_target: When true, also extract a target for the day after
            the 9-day input window.

    Returns:
        ``(input_tensor, target_tensor)``.
        ``input_tensor`` has shape ``(9, len(metrics), len(stocks))``.
        ``target_tensor`` is a flat tensor in stock-major order (all metrics
        of the first stock, then the second, ...), or ``None`` when no target
        was requested or none could be resolved. ``(None, None)`` is returned
        when any input day cannot be resolved or a column is missing.
    """
    days_to_extract = 10 if include_target else 9
    date_range = [start_date + timedelta(days=i) for i in range(days_to_extract)]
    valid_dates = [find_nearest_date(df, d) for d in date_range]

    def _stock_major_values(date):
        # One row per stock, one column per metric.
        return [[df.loc[date, (metric, stock)] for metric in metrics] for stock in stocks]

    input_dates = valid_dates[:9]  # First 9 days are input features
    metrics_data = []
    for date in input_dates:
        if date is None:
            return None, None
        try:
            daily_data = _stock_major_values(date)
        except KeyError:
            return None, None
        # Transpose to (metrics, stocks).
        metrics_data.append(torch.tensor(daily_data, dtype=torch.float32).T)

    input_tensor = torch.stack(metrics_data)  # Shape: (9, metrics, stocks)

    if not include_target:
        return input_tensor, None

    target_date = valid_dates[9]
    if target_date is None:
        return input_tensor, None

    # The nearest-date lookup can resolve the target day to the same date as
    # the last input day (or an earlier one), e.g. when day 10 falls on a
    # weekend. That would make the target a copy of an input row, so move on
    # to the first index date strictly after the last input date, staying
    # inside the 7-day window that find_nearest_date searches by default.
    last_input_date = input_dates[-1]
    if target_date <= last_input_date:
        window_end = date_range[9] + timedelta(days=7)
        later_dates = df.index[(df.index > last_input_date) & (df.index <= window_end)]
        if later_dates.empty:
            return input_tensor, None
        target_date = later_dates.min()

    try:
        target_data = _stock_major_values(target_date)
    except KeyError:
        return input_tensor, None

    return input_tensor, torch.tensor(target_data, dtype=torch.float32).flatten()

# === Generate Dataset with Memory Efficiency ===
def generate_dataset_with_dates(df, input_dir, output_dir, split, pca_reducer=None):
    """Build and save the news / metrics / target tensors for one split.

    Every sub-directory of ``input_dir`` is one example. An example is kept
    when it has at least 9 ``.txt`` day files (the first file name, parsed as
    ``%Y-%m-%d``, is the start date) and both its metrics and its target can
    be extracted from ``df``. Examples that raise are reported and skipped.

    Args:
        df: Price DataFrame passed to ``extract_raw_metrics``.
        input_dir: Directory containing one folder per example.
        output_dir: Directory the ``{split}_news.pt``, ``{split}_metrics.pt``
            and ``{split}_targets.pt`` files are written to (created if needed).
        split: Split name used in file names and progress output.
        pca_reducer: Optional reducer forwarded to ``process_article_folder``.

    Returns:
        ``(news, metrics, targets)`` as stacked CPU tensors, or
        ``(None, None, None)`` when no example was usable.
    """
    os.makedirs(output_dir, exist_ok=True)

    stocks = ['AAPL', 'AMZN', 'GOOGL', 'META', 'NFLX']
    metrics = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']

    all_news_tensors = []
    all_metrics_tensors = []
    all_targets = []
    valid_examples = []

    for folder in tqdm(sorted(os.listdir(input_dir)), desc=f"Processing {split}"):
        example_path = os.path.join(input_dir, folder)
        if not os.path.isdir(example_path):
            continue

        try:
            txt_files = sorted(f for f in os.listdir(example_path) if f.endswith(".txt"))
            if len(txt_files) < 9:
                continue

            start_date = datetime.strptime(txt_files[0].replace(".txt", ""), "%Y-%m-%d")

            # Check the cheap metrics/target lookup first so the expensive
            # FinBERT pass is skipped for examples that would be discarded.
            metrics_tensor, target_tensor = extract_raw_metrics(
                df, start_date, stocks, metrics, include_target=True
            )
            if metrics_tensor is None or target_tensor is None:
                continue

            # Move the embeddings off the accelerator right away; keeping
            # every example's tensor there until the end makes device memory
            # grow with the size of the split.
            news_tensor = process_article_folder(example_path, pca_reducer=pca_reducer).cpu()

            all_news_tensors.append(news_tensor)
            all_metrics_tensors.append(metrics_tensor)
            all_targets.append(target_tensor)
            valid_examples.append(folder)

            # Release cached GPU blocks that are no longer referenced.
            torch.cuda.empty_cache()

        except Exception as e:
            # Best effort: one broken example must not abort the whole split.
            print(f"Failed processing {folder}: {str(e)}")
            continue

    # The four lists are filled together, so one emptiness check is enough.
    if not valid_examples:
        return None, None, None

    combined_news = torch.stack(all_news_tensors)
    combined_metrics = torch.stack(all_metrics_tensors)
    combined_targets = torch.stack(all_targets)

    # Save tensors
    torch.save(combined_news.cpu(), os.path.join(output_dir, f"{split}_news.pt"))
    torch.save(combined_metrics.cpu(), os.path.join(output_dir, f"{split}_metrics.pt"))
    torch.save(combined_targets.cpu(), os.path.join(output_dir, f"{split}_targets.pt"))

    print(f"Saved {split} dataset with {len(valid_examples)} examples")
    return combined_news, combined_metrics, combined_targets

# === Memory-Efficient Embeddings Generation ===
def generate_and_save_embeddings(model, news_tensors, metrics_tensors, output_dir, split, targets=None):
    """Run ``model`` over the dataset in small batches and save its temporal features.

    Args:
        model: Module called as ``model(news_batch, metrics_batch)`` that
            returns a dict with a ``'temporal_features'`` entry.
        news_tensors: News tensor, batched along dimension 0.
        metrics_tensors: Metrics tensor, batched along dimension 0.
        output_dir: Directory the ``{split}_embeddings_9x312.pt`` file is written to.
        split: Split name used in the file name and in progress output.
        targets: Accepted for call compatibility; not used here.

    Returns:
        The concatenated temporal features of all batches, on the CPU.
    """
    model.eval()
    total = news_tensors.shape[0]
    step = 4  # Small batches keep GPU memory usage low

    collected = []

    with torch.no_grad():
        for start in range(0, total, step):
            stop = min(start + step, total)
            print(f"Processing batch {start} to {stop} of {total}")

            news_batch = news_tensors[start:stop].to(device)
            metrics_batch = metrics_tensors[start:stop].to(device)

            outputs = model(news_batch, metrics_batch)
            temporal_features = outputs['temporal_features']

            # Keep only a CPU copy of the features between iterations.
            collected.append(temporal_features.cpu())

            # Drop the device tensors before the next batch is loaded.
            del news_batch, metrics_batch, outputs, temporal_features
            torch.cuda.empty_cache()

    combined_embeddings = torch.cat(collected, dim=0)

    embedding_path = os.path.join(output_dir, f"{split}_embeddings_9x312.pt")
    torch.save(combined_embeddings, embedding_path)

    print(f"Saved {split} embeddings with shape {combined_embeddings.shape} to {embedding_path}")

    return combined_embeddings

# === Updated Main Function ===
def main():
    """Load the price CSV, build the dataset for every split and save model embeddings.

    Steps: read the two-level-header CSV, turn its first column into a date
    index, optionally filter on a business-day flag, normalise column casing,
    report missing columns, then for each of the train/val/test splits build
    the tensors and run the combined model to store its temporal features.
    """
    # Set paths
    csv_path = "/kaggle/input/stock-prices-full/stock_prices_complete.csv"
    data_dir = "/kaggle/input/processed-data/processed_dataset_v2"
    output_dir = "data/preprocessed"
    os.makedirs(output_dir, exist_ok=True)

    # This reducer is handed to the dataset builder but is never fitted in
    # this function, so process_article_folder keeps the 768-d features. The
    # model owns a separate reducer that it fits on its first forward pass.
    pca_reducer = PCAReducer(input_dim=768, output_dim=400)

    print("Loading stock price data...")
    df = pd.read_csv(csv_path, header=[0, 1])

    print("CSV structure sample:")
    print(df.head(3))

    # Use the first column as the date index when its top-level label looks
    # like an index column.
    index_like_labels = ['price', 'ticker', 'date', 'time', 'datetime']
    leading_label = df.columns[0][0]
    if leading_label.lower() in index_like_labels:
        df.index = pd.to_datetime(df.iloc[:, 0], errors='coerce')
        df = df.iloc[:, 1:]
        df.index.name = 'Date'

    # Rows whose date could not be parsed are dropped.
    df = df.loc[~pd.isna(df.index)]

    # Optional business-day flag: filter on it, then remove the column.
    business_day_col = ('is_business_day', '')
    if business_day_col in df.columns:
        print("Filtering for business days only...")
        df = df[df[business_day_col] == True]
        df = df.drop(columns=[business_day_col])

    metrics = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
    stocks = ['AAPL', 'AMZN', 'GOOGL', 'META', 'NFLX']
    expected_cols = [(m, s) for m in metrics for s in stocks]

    # Report missing columns and repair those that only differ in letter case.
    for metric, stock in expected_cols:
        if (metric, stock) in df.columns:
            continue
        print(f"Warning: Column ({metric}, {stock}) not found in dataframe")

        for col_metric, col_stock in df.columns:
            if col_metric.lower() == metric.lower() and col_stock.lower() == stock.lower():
                print(f"Found column with different case: ({col_metric}, {col_stock})")
                df = df.rename(columns={(col_metric, col_stock): (metric, stock)})
                break

    print(f"CSV date range: {df.index.min().date()} → {df.index.max().date()}")
    print(f"Number of trading days: {len(df)}")

    # === Column Verification ===
    missing_cols = [col for col in expected_cols if col not in df.columns]
    if not missing_cols:
        print("✅ All required stock columns are present.")
    else:
        print(f"Warning: Missing {len(missing_cols)} columns out of {len(expected_cols)} expected")
        print(f"First few missing columns: {missing_cols[:5]}")
        print("Available columns sample:", df.columns[:10].tolist())

    combined_model = CombinedFinancialModel(pca_dim=400).to(device)
    print("Model initialized with PCA reduction.")

    # Build the dataset and the embeddings split by split.
    all_data = {}
    for split in ["train", "val", "test"]:
        print(f"\nProcessing {split} split...")
        input_dir = os.path.join(data_dir, split)

        if not os.path.exists(input_dir):
            print(f"Warning: Directory {input_dir} does not exist. Skipping {split} split.")
            continue

        news_tensors, metrics_tensors, targets = generate_dataset_with_dates(
            df, input_dir, output_dir, split, pca_reducer=pca_reducer
        )
        if news_tensors is None:
            # No usable example in this split.
            continue

        all_data[split] = {
            'news': news_tensors,
            'metrics': metrics_tensors,
            'targets': targets
        }

        print(f"Generating embeddings for {split}...")
        all_data[split]['embeddings'] = generate_and_save_embeddings(
            combined_model, news_tensors, metrics_tensors, output_dir, split, targets
        )

    print("\n=== Dataset Summary ===")
    for split, data in all_data.items():
        print(f"{split.capitalize()} split:")
        print(f"  • Examples: {data['news'].shape[0]}")
        print(f"  • News tensor: {data['news'].shape}")
        print(f"  • Metrics tensor: {data['metrics'].shape}")
        print(f"  • Targets tensor: {data['targets'].shape}")
        print(f"  • Embeddings tensor: {data['embeddings'].shape}")

    print("\n✅ Processing complete. Generated [batch×9×312] embeddings for all splits.")

if __name__ == "__main__":
    main()

Using device: cuda


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

2025-04-17 22:05:56.206936: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744927556.664540      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744927556.794380      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Loading stock price data...
CSV structure sample:
        Price  Adj Close                                                \
       Ticker       AAPL       AMZN      GOOGL        META        NFLX   
0  2018-01-02  40.479839  59.450500  53.405170  180.568954  201.070007   
1  2018-01-03  40.472782  60.209999  54.316319  183.803741  205.050003   
2  2018-01-04  40.660774  60.479500  54.527306  183.465317  205.630005   

       Close                                    ...       Open             \
        AAPL       AMZN      GOOGL        META  ...       AMZN      GOOGL   
0  43.064999  59.450500  53.660500  181.419998  ...  58.599998  52.651001   
1  43.057499  60.209999  54.576000  184.669998  ...  59.415001  53.696499   
2  43.257500  60.479500  54.787998  184.330002  ...  60.250000  54.854500   

                                Volume                                      \
         META        NFLX         AAPL        AMZN       GOOGL        META   
0  177.679993  196.100006  102223600.


Processing train:   0%|          | 0/345 [00:00<?, ?it/s][A
Processing train:   0%|          | 1/345 [00:02<13:40,  2.39s/it][A
Processing train:   1%|          | 2/345 [00:03<09:17,  1.63s/it][A
Processing train:   1%|          | 3/345 [00:04<07:44,  1.36s/it][A
Processing train:   1%|          | 4/345 [00:05<07:01,  1.24s/it][A
Processing train:   1%|▏         | 5/345 [00:06<06:36,  1.16s/it][A
Processing train:   2%|▏         | 6/345 [00:07<06:58,  1.24s/it][A
Processing train:   2%|▏         | 7/345 [00:09<07:07,  1.26s/it][A
Processing train:   2%|▏         | 8/345 [00:10<06:44,  1.20s/it][A
Processing train:   3%|▎         | 9/345 [00:11<06:28,  1.16s/it][A
Processing train:   3%|▎         | 10/345 [00:12<06:18,  1.13s/it][A
Processing train:   3%|▎         | 11/345 [00:13<06:15,  1.13s/it][A
Processing train:   3%|▎         | 12/345 [00:14<06:16,  1.13s/it][A
Processing train:   4%|▍         | 13/345 [00:15<06:11,  1.12s/it][A
Processing train:   4%|▍         | 14

Saved train dataset with 345 examples
Generating embeddings for train...
Processing batch 0 to 4 of 345
Input shapes - news: torch.Size([4, 9, 10, 128, 768]), metrics: torch.Size([4, 9, 6, 5])
Fitting PCA to reduce dimensions from 768 to 400
PCA fitted, explained variance ratio sum: 0.9124
After PCA and article averaging: torch.Size([4, 9, 128, 400])
After weighted sum: torch.Size([4, 9, 400])
News features final shape: torch.Size([4, 9, 128])
Metrics features shape: torch.Size([4, 9, 384])
Combined shape: torch.Size([4, 9, 512])
Temporal features shape: torch.Size([4, 9, 312])
Processing batch 4 to 8 of 345
Input shapes - news: torch.Size([4, 9, 10, 128, 768]), metrics: torch.Size([4, 9, 6, 5])
After PCA and article averaging: torch.Size([4, 9, 128, 400])
After weighted sum: torch.Size([4, 9, 400])
News features final shape: torch.Size([4, 9, 128])
Metrics features shape: torch.Size([4, 9, 384])
Combined shape: torch.Size([4, 9, 512])
Temporal features shape: torch.Size([4, 9, 312])
P

Processing val: 100%|██████████| 44/44 [00:47<00:00,  1.09s/it]


Saved val dataset with 44 examples
Generating embeddings for val...
Processing batch 0 to 4 of 44
Input shapes - news: torch.Size([4, 9, 10, 128, 768]), metrics: torch.Size([4, 9, 6, 5])
After PCA and article averaging: torch.Size([4, 9, 128, 400])
After weighted sum: torch.Size([4, 9, 400])
News features final shape: torch.Size([4, 9, 128])
Metrics features shape: torch.Size([4, 9, 384])
Combined shape: torch.Size([4, 9, 512])
Temporal features shape: torch.Size([4, 9, 312])
Processing batch 4 to 8 of 44
Input shapes - news: torch.Size([4, 9, 10, 128, 768]), metrics: torch.Size([4, 9, 6, 5])
After PCA and article averaging: torch.Size([4, 9, 128, 400])
After weighted sum: torch.Size([4, 9, 400])
News features final shape: torch.Size([4, 9, 128])
Metrics features shape: torch.Size([4, 9, 384])
Combined shape: torch.Size([4, 9, 512])
Temporal features shape: torch.Size([4, 9, 312])
Processing batch 8 to 12 of 44
Input shapes - news: torch.Size([4, 9, 10, 128, 768]), metrics: torch.Size(

Processing test: 100%|██████████| 43/43 [00:47<00:00,  1.10s/it]


Saved test dataset with 43 examples
Generating embeddings for test...
Processing batch 0 to 4 of 43
Input shapes - news: torch.Size([4, 9, 10, 128, 768]), metrics: torch.Size([4, 9, 6, 5])
After PCA and article averaging: torch.Size([4, 9, 128, 400])
After weighted sum: torch.Size([4, 9, 400])
News features final shape: torch.Size([4, 9, 128])
Metrics features shape: torch.Size([4, 9, 384])
Combined shape: torch.Size([4, 9, 512])
Temporal features shape: torch.Size([4, 9, 312])
Processing batch 4 to 8 of 43
Input shapes - news: torch.Size([4, 9, 10, 128, 768]), metrics: torch.Size([4, 9, 6, 5])
After PCA and article averaging: torch.Size([4, 9, 128, 400])
After weighted sum: torch.Size([4, 9, 400])
News features final shape: torch.Size([4, 9, 128])
Metrics features shape: torch.Size([4, 9, 384])
Combined shape: torch.Size([4, 9, 512])
Temporal features shape: torch.Size([4, 9, 312])
Processing batch 8 to 12 of 43
Input shapes - news: torch.Size([4, 9, 10, 128, 768]), metrics: torch.Siz