In [None]:
!pip install -r requirements.txt

In [None]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
import torchmetrics
from typing import Optional, List, Dict
import torch.nn.functional as F
import torch.nn as nn
import torch
import torch.nn as nn
from torch.nn import TransformerEncoderLayer, MultiheadAttention

class PricePredictionLightning(pl.LightningModule):
    """Transformer + MLP regressor producing one strictly positive value per sample.

    The layer sizes below fix the expected input layout to ``(batch, 3, 5000)``:
    ``BatchNorm1d(3)`` normalises the 3 channels and ``LayerNorm([5000])`` the
    trailing axis. The three channels are then fed to the transformer as a
    length-3 sequence of 5000-dimensional tokens, averaged, and regressed by
    the MLP head.

    Args:
        hparams: Optional dict / Namespace of hyperparameters to record on the
            module. ``None`` (the default) records nothing.
    """

    def __init__(self, hparams=None):
        super().__init__()
        # save_hyperparameters() rejects a bare None, so only record
        # hyperparameters when the caller actually supplied some.
        if hparams is not None:
            self.save_hyperparameters(hparams)

        # Input normalisation. forward() works on 3-D tensors, so the channel
        # normalisation has to be BatchNorm1d (BatchNorm2d requires 4-D input).
        self.norm = nn.Sequential(
            nn.BatchNorm1d(3),
            nn.LayerNorm([5000])
        )

        encoder_layer = TransformerEncoderLayer(
            d_model=5000,
            nhead=8,
            dim_feedforward=128,
            dropout=0.1,
            activation='gelu'
        )
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=2)

        # Regression head: 5000 -> 128 -> 64 -> 1
        self.mlp = nn.Sequential(
            nn.Linear(5000, 128),
            nn.SiLU(),
            nn.Dropout(0.1),
            nn.Linear(128, 64),
            nn.SiLU(),
            nn.Dropout(0.1),
            nn.Linear(64, 1)
        )

        # One metric object per stage so their internal states never mix.
        self.train_mse = torchmetrics.MeanSquaredError()
        self.val_mse = torchmetrics.MeanSquaredError()
        self.test_mse = torchmetrics.MeanSquaredError()

        self._initialize_weights()

    def _initialize_weights(self):
        """Xavier-initialise every ``nn.Linear`` weight and zero its bias."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Map ``x`` of shape ``(batch, 3, 5000)`` to predictions of shape ``(batch, 1)``.

        The output passes through softplus, so predictions are always > 0.
        """
        x = self.norm(x)            # (batch, 3, 5000)
        # The encoder layer is sequence-first (batch_first=False) and needs
        # d_model=5000 on the last axis, so only batch and channel axes swap.
        x = x.permute(1, 0, 2)      # (3, batch, 5000)
        x = self.transformer(x)
        x = x.mean(dim=0)           # average over the 3 tokens -> (batch, 5000)
        x = self.mlp(x)             # (batch, 1)
        return F.softplus(x)

    def _loss_and_mse(self, batch, metric):
        """Run one batch through the model; return ``(mse_loss, metric_value)``.

        Args:
            batch: ``(x, y)`` pair as produced by the dataloader.
            metric: The stage-specific ``torchmetrics.MeanSquaredError``.
        """
        x, y = batch
        y_hat = self(x)
        # Targets shaped (batch,) would silently broadcast against the
        # (batch, 1) predictions inside mse_loss; align the shapes explicitly.
        y = y.reshape(y_hat.shape)
        loss = F.mse_loss(y_hat, y)
        mse = metric(y_hat, y)
        return loss, mse

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss/MSE for one batch."""
        loss, mse = self._loss_and_mse(batch, self.train_mse)

        self.log('train_loss', loss, prog_bar=True)
        self.log('train_mse', mse, prog_bar=True)
        # Epoch-level mean; replaces the training_epoch_end hook, which was
        # removed in Lightning 2.0.
        self.log('train_epoch_loss', loss, on_step=False, on_epoch=True)

        return {'loss': loss}

    def validation_step(self, batch, batch_idx):
        """Compute and log the validation loss/MSE for one batch."""
        loss, mse = self._loss_and_mse(batch, self.val_mse)

        # 'val_loss' is the key monitored by the LR scheduler configured below.
        self.log('val_loss', loss, prog_bar=True)
        self.log('val_mse', mse, prog_bar=True)
        # Epoch-level mean; replaces the removed validation_epoch_end hook.
        self.log('val_epoch_loss', loss, on_step=False, on_epoch=True)

        return {'loss': loss}

    def test_step(self, batch, batch_idx):
        """Compute and log the test loss/MSE for one batch."""
        loss, mse = self._loss_and_mse(batch, self.test_mse)

        self.log('test_loss', loss)
        self.log('test_mse', mse)

        return {'loss': loss}

    def configure_optimizers(self):
        """Return AdamW plus a ReduceLROnPlateau scheduler driven by ``val_loss``."""
        optimizer = torch.optim.AdamW(
            self.parameters(),
            lr=1e-4,
            weight_decay=0.01
        )

        scheduler = {
            'scheduler': torch.optim.lr_scheduler.ReduceLROnPlateau(
                optimizer,
                mode='min',
                factor=0.5,
                patience=5,
                min_lr=1e-6
            ),
            'monitor': 'val_loss',
            'interval': 'epoch',
            'frequency': 1
        }

        return [optimizer], [scheduler]

def train_model(model, train_dataloader, val_dataloader, test_dataloader=None,
                max_epochs=100, gpus=0, checkpoint_dir='./checkpoints'):
    """Fit ``model`` with checkpointing, early stopping and TensorBoard logging.

    Args:
        model: The LightningModule to train. It must log ``val_loss``, which
            both callbacks monitor.
        train_dataloader: Dataloader used for training.
        val_dataloader: Dataloader used for validation.
        test_dataloader: Optional dataloader; when given, ``trainer.test`` is
            run after fitting.
        max_epochs: Upper bound on training epochs.
        gpus: GPU selection (count or device list). A falsy value (``0`` /
            ``None``) trains on CPU.
        checkpoint_dir: Directory where the top-3 checkpoints are written.

    Returns:
        The ``pl.Trainer`` that was used.
    """
    # Imported here so the function carries its own dependency on the callback.
    from pytorch_lightning.callbacks import TQDMProgressBar

    # Keep the three best checkpoints by validation loss.
    checkpoint_callback = ModelCheckpoint(
        monitor='val_loss',
        mode='min',
        save_top_k=3,
        verbose=True,
        dirpath=checkpoint_dir
    )

    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=10,
        mode='min'
    )

    logger = TensorBoardLogger('logs', name='price_prediction')

    use_gpu = bool(gpus)

    # accelerator/devices replace the `gpus` Trainer flag, and TQDMProgressBar
    # replaces `progress_bar_refresh_rate`; both old flags are gone from
    # current Lightning releases.
    trainer = pl.Trainer(
        max_epochs=max_epochs,
        accelerator='gpu' if use_gpu else 'cpu',
        devices=gpus if use_gpu else 1,
        callbacks=[checkpoint_callback, early_stopping, TQDMProgressBar(refresh_rate=30)],
        logger=logger,
        gradient_clip_val=1.0,
        accumulate_grad_batches=2,
        # fp16 mixed precision is a GPU feature; fall back to fp32 on CPU.
        precision=16 if use_gpu else 32,
    )

    # Train the model
    trainer.fit(model, train_dataloader, val_dataloaders=val_dataloader)

    # Optional test evaluation. Compare against None: truthiness would call
    # len() on the loader. The loader is passed positionally because the
    # keyword was renamed from `test_dataloaders` to `dataloaders`.
    if test_dataloader is not None:
        trainer.test(model, test_dataloader)

    return trainer

    

In [None]:
import yfinance as yf
tickers = "AAPL NVDA MSFT GOOG AMZN GOOGL META AVGO TSLA TSM LLY WMT V JPM MA XOM UNH ORCL COST PG JNJ NFLX ABBV HD NVO BABA SAP BAC KO TMUS ASML CVX CRM TM MRK CSCO PM AZN ABT IBM WFC NVS MCD LIN GE SHEL PEP ACN HSBC TMO T PLTR"

# One day of 1-minute bars starting 2025-03-10. The window is given as an
# explicit start/end pair instead of combining `start` with `period`, so the
# requested range does not depend on how yfinance resolves that combination.
# NOTE(review): Yahoo appears to serve 1-minute bars only for recent dates, so
# this hard-coded day will go stale - confirm and update the dates when re-running.
intraday_1m = yf.download(tickers, start="2025-03-10", end="2025-03-11", interval='1m', repair=True, auto_adjust=True)

# Keep the result in a variable and preview a few rows rather than dumping it all.
intraday_1m.head()




In [None]:
# Download one year of daily bars for a small ticker subset and export it to Excel.
import pandas as pd
import numpy as np

tickers = "AAPL NVDA MSFT GOOG"#  AMZN GOOGL META AVGO TSLA TSM LLY WMT V JPM MA XOM UNH ORCL COST PG JNJ NFLX ABBV HD NVO BABA SAP BAC KO TMUS ASML CVX CRM TM MRK CSCO PM AZN ABT IBM WFC NVS MCD LIN GE SHEL PEP ACN HSBC TMO T PLTR"
# NOTE(review): `dat` is only referenced by the commented-out alternative below.
dat = yf.Tickers(tickers)

# NOTE(review): `cache` is not used anywhere in this cell.
from yfinance import cache
# Daily bars from 2024-01-01 up to 2025-01-01 for every ticker in `tickers`.
# NOTE(review): no `group_by` is passed, so the column layout is yfinance's
# default - confirm which column level holds the ticker before indexing by ticker.
# NOTE(review): one calendar year of daily bars is presumably only a few hundred
# rows, while the reshaping cell keeps only tickers with >= 1000 rows - verify.
history = yf.download(tickers, start="2024-01-01", end="2025-01-01", interval="1d")
# Alternative kept for reference: one month of 1-minute bars grouped by ticker.
# history = dat.history(period="1mo", interval="1m", group_by='ticker')
# Replace the index with formatted strings (the index no longer holds datetimes afterwards).
history.index = history.index.strftime('%Y-%m-%d %H:%M:%S')

# Persist the downloaded table next to the notebook.
history.to_excel("history.xlsx")
history

In [None]:
# Build a {ticker: array of shape (4, 1000)} mapping from the downloaded history.
reshaped_data = {}

# Rows kept per ticker; tickers with fewer rows are skipped.
n_rows = 1000
# Feature rows of each output array, in this order.
feature_columns = ['Open', 'High', 'Low', 'Volume']

for ticker in tickers.split():
    # yf.download puts the price field on the outer column level by default and
    # the ticker on the inner one, so history[ticker] only works for frames
    # grouped by ticker. Support both layouts.
    if history.columns.nlevels > 1 and ticker not in history.columns.get_level_values(0):
        ticker_data = history.xs(ticker, axis=1, level=1)
    else:
        ticker_data = history[ticker]

    if len(ticker_data) >= n_rows:
        ticker_data = ticker_data.iloc[:n_rows]

        # Transpose (rows, features) -> (features, rows).
        reshaped_data[ticker] = ticker_data[feature_columns].to_numpy().T
    else:
        print(f"Not enough data points for {ticker}")

def create_dataset(data):
    """Convert the per-ticker arrays in ``data`` into tensors for training.

    Not implemented yet. The definition previously had no body, which is a
    SyntaxError and stopped the whole cell from running; it now parses and
    fails loudly when called.

    Args:
        data: Mapping of ticker symbol to a NumPy array of price/volume rows.

    Raises:
        NotImplementedError: Always, until the conversion is written.
    """
    # TODO: build the feature (and target) tensors from `data`.
    raise NotImplementedError("create_dataset is not implemented yet")

In [None]:
from torch.utils.data import DataLoader, TensorDataset, random_split

# Build the dataset once and carve it into disjoint subsets. Using the very
# same data for train/val/test makes validation and test scores meaningless.
# NOTE(review): the model's step methods unpack `x, y = batch`, while a
# TensorDataset built from a single tensor yields 1-tuples - verify that
# create_dataset supplies targets as well.
full_dataset = TensorDataset(create_dataset(reshaped_data))

# 80/10/10 split with at least one sample in each held-out subset.
n_total = len(full_dataset)
n_val = max(1, n_total // 10)
n_test = max(1, n_total // 10)
n_train = n_total - n_val - n_test
if n_train < 1:
    raise ValueError(f"Need at least 3 samples to split, got {n_total}")

# Fixed seed so the split is identical on every Restart & Run All.
# NOTE(review): a random split can still leak across time for price series;
# consider a chronological split once the sample layout is settled.
train_dataset, val_dataset, test_dataset = random_split(
    full_dataset,
    [n_train, n_val, n_test],
    generator=torch.Generator().manual_seed(42),
)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)
test_loader = DataLoader(test_dataset, batch_size=32)

model = PricePredictionLightning()
trainer = train_model(
    model=model,
    train_dataloader=train_loader,
    val_dataloader=val_loader,
    test_dataloader=test_loader,
    max_epochs=100,
    gpus=1 if torch.cuda.is_available() else 0
)

In [None]:
!pip install yahooquery

In [None]:
from yahooquery import Ticker
import pickle

import datetime
import os

tickers = "AVGO TSLA TSM LLY WMT V JPM MA XOM UNH ORCL COST PG JNJ NFLX ABBV HD NVO BABA SAP BAC KO TMUS ASML CVX CRM TM MRK CSCO PM AZN ABT IBM WFC NVS MCD LIN GE SHEL PEP ACN HSBC TMO T PLTR".split(" ")

# Bar size requested for every month.
# NOTE(review): Yahoo appears to serve 1-minute bars only for roughly the last
# month, so requests for 2011-2024 will presumably come back without data -
# confirm, and switch to a coarser interval (e.g. '1d') for older history.
interval = '1m'

# Download one pickle per ticker and calendar month into stock_data/<ticker>/.
for ticker_label in tickers:
    ticker = Ticker(ticker_label, asynchronous=True)
    os.makedirs(f'stock_data/{ticker_label}', exist_ok=True)
    for year in range(2011, 2025):
        for month in range(1, 13):
            out_path = f'stock_data/{ticker_label}/{year}_{month}.pkl'
            # Skip months already on disk so the cell can be re-run or resumed
            # cheaply; delete stock_data/ to force a fresh download.
            if os.path.exists(out_path):
                continue

            # Request exactly this calendar month via an explicit start/end
            # pair rather than combining `start` with `period`, so the window
            # matches the file name.
            startdate = datetime.date(year, month, 1)
            enddate = datetime.date(year + month // 12, month % 12 + 1, 1)
            df = ticker.history(
                start=startdate.strftime("%Y-%m-%d"),
                end=enddate.strftime("%Y-%m-%d"),
                interval=interval,
            )

            # Only persist real, non-empty tables: anything without `.empty`
            # (e.g. an error payload) or an empty frame is reported and skipped.
            if getattr(df, 'empty', True):
                print(f"No data for {ticker_label} {year}-{month:02d}: {df!r}")
                continue

            with open(out_path, 'wb') as f:
                pickle.dump(df, f)