In [1]:
import pandas as pd
df_merged = pd.read_csv('../../datasets/df_merged.csv').iloc[:,1:]

In [2]:
df_merged['symbol'].unique()

array(['A', 'AAPL', 'ADI', 'ADM', 'AIG', 'ALL', 'AMZN', 'ARE', 'AXP',
       'BA', 'BAC', 'BEN', 'BX', 'C', 'CAG', 'CAT', 'CI', 'CMA', 'CME',
       'CMS', 'COF', 'COO', 'COST', 'CRM', 'CSCO', 'CVS', 'CVX', 'D',
       'DD', 'DIS', 'ED', 'EFX', 'ES', 'ETR', 'F', 'FCX', 'FMC', 'GE',
       'GIS', 'GLW', 'GM', 'GOOG', 'GS', 'HAS', 'HD', 'HES', 'HIG', 'IBM',
       'IP', 'ISRG', 'IT', 'JPM', 'K', 'KEY', 'L', 'LVS', 'MA', 'MAA',
       'MCD', 'MDT', 'MGM', 'MSFT', 'NI', 'O', 'ORCL', 'PFE', 'PM', 'RL',
       'SJM', 'SO', 'T', 'TXN', 'UPS', 'USB', 'V', 'VLO', 'VZ', 'WAB',
       'WFC', 'WMT', 'ABBV', 'ABT', 'AMAT', 'BLK', 'CHTR', 'CMG', 'DAL',
       'DE', 'FDX', 'GILD', 'GOOGL', 'GWW', 'HAL', 'HON', 'HPE', 'HPQ',
       'JBHT', 'JNJ', 'MAR', 'MRK', 'MS', 'NFLX', 'NOW', 'PNC', 'QCOM',
       'SBUX', 'SLB', 'TAP', 'TMO', 'TSLA', 'UAL', 'UNH', 'XOM', 'AEE',
       'AMD', 'BMY', 'CB', 'CCL', 'DG', 'EBAY', 'ETSY', 'FAST', 'FIS',
       'INTC', 'INTU', 'IRM', 'KEYS', 'KR', 'LIN', 'LLY', 'LMT', '

In [3]:
columns_without_embeddings = df_merged.columns[~df_merged.columns.str.contains('embedding')]
columns_without_embeddings

Index(['index', 'date', 'symbol', 'adj close', 'close', 'high', 'low', 'open',
       'volume', 'lag_1', 'lag_2', 'lag_3', 'weekly_return', '5_day_ma',
       '20_day_ma', '5_day_volatility', 'momentum', 'macd', 'macd_signal',
       'macd_histogram', 'week_of_year', 'month'],
      dtype='object')

In [4]:
columns_title_embeddings = df_merged.columns[df_merged.columns.str.contains('title_embedding')]
columns_body_embeddings = df_merged.columns[df_merged.columns.str.contains('body_embedding')]
columns_title_embeddings, columns_body_embeddings

(Index(['title_embedding_0', 'title_embedding_1', 'title_embedding_2',
        'title_embedding_3', 'title_embedding_4', 'title_embedding_5',
        'title_embedding_6', 'title_embedding_7', 'title_embedding_8',
        'title_embedding_9',
        ...
        'title_embedding_374', 'title_embedding_375', 'title_embedding_376',
        'title_embedding_377', 'title_embedding_378', 'title_embedding_379',
        'title_embedding_380', 'title_embedding_381', 'title_embedding_382',
        'title_embedding_383'],
       dtype='object', length=384),
 Index(['body_embedding_0', 'body_embedding_1', 'body_embedding_2',
        'body_embedding_3', 'body_embedding_4', 'body_embedding_5',
        'body_embedding_6', 'body_embedding_7', 'body_embedding_8',
        'body_embedding_9',
        ...
        'body_embedding_374', 'body_embedding_375', 'body_embedding_376',
        'body_embedding_377', 'body_embedding_378', 'body_embedding_379',
        'body_embedding_380', 'body_embedding_381', 'bo

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import numpy as np
from typing import List
from sklearn.metrics import r2_score
import plotly 
from plotly.subplots import make_subplots
import plotly.graph_objects
from IPython.display import clear_output
# import wandb

# Initialize wandb
# wandb.init(project="stock-predictor", entity="your_wandb_username")

# Класс модели с LSTM
class StockPredictor(nn.Module):
    def __init__(self, input_dim, hidden_dim, num_layers, output_dim, dropout_prob=0.5):
        super(StockPredictor, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_dim)
        self.dropout = nn.Dropout(dropout_prob)
        self.batch_norm1 = nn.BatchNorm1d(128)
        self.batch_norm2 = nn.BatchNorm1d(64)
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim

    def forward(self, x):
        # Изменим форму входного тензора
        x = x.unsqueeze(1)  # Добавляем размерность для sequence_length, чтобы стало (batch_size, sequence_length, input_dim)
        
        # Инициализация скрытых состояний
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).to(x.device)
        
        # LSTM
        out, _ = self.lstm(x, (h0, c0))
        
        # Получение только последнего выходного состояния
        out = out[:, -1, :]
        
        # Полносвязные слои с нормализацией и дроп-аутом
        out = self.fc1(out)
        out = self.batch_norm1(out)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)
        out = self.batch_norm2(out)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.fc3(out)
        return out

def plot_losses(train_losses: List[float], val_losses: List[float]):
    """
    Plot loss and perplexity of train and validation samples using plotly
    :param train_losses: list of train losses at each epoch
    :param val_losses: list of validation losses at each epoch
    """
    clear_output()
    fig = make_subplots(rows=1, cols=1)
    fig.add_trace(plotly.graph_objects.Scatter(y=train_losses, mode='lines', name='train'), row=1, col=1)
    fig.add_trace(plotly.graph_objects.Scatter(y=val_losses, mode='lines', name='validation'), row=1, col=1)
    fig.update_layout(title='Losses', xaxis_title='Epoch', yaxis_title='Loss')
    fig.show()
# Метод обучения
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=10):
    train_losses = []
    val_losses = []
    for epoch in range(num_epochs):
        model.train()
        running_loss = 0.0
        for inputs, targets in tqdm(train_loader):
            inputs = inputs.to(model.device)
            targets = targets.to(model.device)
            
            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs.squeeze(), targets)
            
            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item() * inputs.size(0)
        
        epoch_loss = running_loss / len(train_loader.dataset)
        train_losses.append(epoch_loss)
        # Validation
        model.eval()
        val_loss = 0.0
        val_mae = 0.0
        val_r2 = 0.0
        with torch.no_grad():
            for inputs, targets in tqdm(val_loader):
                inputs = inputs.to(model.device)
                targets = targets.to(model.device)
                
                outputs = model(inputs)
                loss = criterion(outputs.squeeze(), targets)
                val_loss += loss.item() * inputs.size(0)
                val_mae += torch.nn.functional.l1_loss(outputs.squeeze(), targets, reduction='sum').item()

                val_r2 += r2_score(outputs.squeeze(), targets).item() * inputs.size(0)

        val_loss /= len(val_loader.dataset)
        val_mae /= len(val_loader.dataset)
        val_r2 /= len(val_loader.dataset)

        val_losses.append(val_loss)
        print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {epoch_loss:.4f}, Validation Loss: {val_loss:.4f}')

        # plot
        plot_losses(train_losses, val_losses)
        # Log metrics to wandb
        # wandb.log({"epoch": epoch+1, "training_loss": epoch_loss, "validation_loss": val_loss})
    
    print(f"Training complete. Final training loss: {train_losses[-1]}, final validation loss: {val_losses[-1]}")
    print(f"Final validation MAE: {val_mae}, final validation R2: {val_r2}")
    return model

# Пример данных
class StockDataset(Dataset):
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets
    
    def __len__(self):
        return len(self.targets)
    
    def __getitem__(self, idx):
        return self.features[idx], self.targets[idx]

In [6]:
columns_to_test = columns_without_embeddings.to_list() + columns_title_embeddings.to_list()
columns_to_test = [item for item in columns_to_test if item not in ['index', 'date', 'title', 'body', 'symbol']]

In [8]:
# normalize weekly_return
def normalize(data, mean, std, eps=1e-6):
    return (data - mean) / (std + eps)
def denormalize(data, mean, std):
    return data * std + mean

targets = df_merged['weekly_return'].values
targets = torch.from_numpy(targets).to(dtype=torch.float)
combined_features = df_merged[columns_to_test].values
combined_features = torch.from_numpy(combined_features).to(dtype=torch.float)

targets_mean = targets.mean()
targets_std = targets.std()
targets_normalized = normalize(targets, targets_mean, targets_std)
features_mean, features_std = combined_features.mean(axis=0), combined_features.std(axis=0)
features_normalized = normalize(combined_features, features_mean, features_std)


In [20]:
df_merged[columns_to_test].head()

Unnamed: 0,adj close,close,high,low,open,volume,lag_1,lag_2,lag_3,weekly_return,...,title_embedding_374,title_embedding_375,title_embedding_376,title_embedding_377,title_embedding_378,title_embedding_379,title_embedding_380,title_embedding_381,title_embedding_382,title_embedding_383
0,24.517937,27.174536,27.360516,26.90987,27.181688,3521842.0,26.938484,27.918455,27.982834,-0.076792,...,-0.004133,0.008691,-0.042434,-0.002215,-0.043523,0.006658,0.038139,-0.136477,-0.011128,0.057885
1,17.295727,20.429644,20.5175,20.261786,20.404642,276536400.0,20.384644,20.789286,20.631071,-0.026183,...,-0.016463,-0.029878,-0.049738,-0.050536,-0.057034,0.036203,0.049257,-0.144588,0.0048,0.051053
2,28.117611,36.630001,37.07,36.23,37.0,2434700.0,36.959999,37.73,37.32,-0.036053,...,-0.024606,-0.001802,-0.047134,0.003014,-0.075411,0.04401,0.091576,-0.203208,-0.010621,0.068538
3,20.902025,28.629999,29.0,28.549999,28.780001,4786800.0,28.790001,29.280001,29.379999,-0.050415,...,-0.018906,0.022677,-0.068706,-0.022873,-0.106041,-0.014542,0.013619,-0.174847,-0.051576,0.097726
4,24.34798,30.799999,31.24,30.370001,31.02,13756800.0,30.629999,31.440001,31.040001,-0.035692,...,-0.006847,-0.006129,-0.050931,-0.134466,-0.052448,-0.029073,-0.009584,-0.111398,0.087133,0.026309


In [9]:
train_features, val_features, train_targets, val_targets = train_test_split(features_normalized, targets_normalized, test_size=0.2, random_state=42)
train_dataset = StockDataset(train_features, train_targets)
val_dataset = StockDataset(val_features, val_targets)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [17]:
input_dim = train_features.size(1)
hidden_dim = 128
num_layers = 2
output_dim = 1
model = StockPredictor(input_dim, hidden_dim, num_layers, output_dim)

model.load_state_dict(torch.load('../models/dl_average_pooling.pt', map_location=torch.device('cpu'), weights_only=True))

<All keys matched successfully>

In [18]:
model.eval()

StockPredictor(
  (lstm): LSTM(403, 128, num_layers=2, batch_first=True)
  (fc1): Linear(in_features=128, out_features=128, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (batch_norm1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_norm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [137]:

model = StockPredictor(input_dim, hidden_dim, num_layers, output_dim)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.device = device

train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=50)

Training complete. Final training loss: 0.06956687331503272, final validation loss: 0.02506735268079823
Final validation MAE: 0.09179490159158309, final validation R2: 0.0


StockPredictor(
  (lstm): LSTM(403, 128, num_layers=2, batch_first=True)
  (fc1): Linear(in_features=128, out_features=128, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (batch_norm1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (batch_norm2): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [150]:
# calculate mse mae r2
model.eval()
test_loss = 0.0
test_mae = 0.0
test_r2 = 0.0
with torch.no_grad():
    val_features = val_features.to(device)
    val_targets = val_targets.to(device)
    outputs = model(val_features)
    denormalized_outputs = denormalize(outputs.squeeze(), targets_mean, targets_std)
    denormalized_targets = denormalize(val_targets.squeeze(), targets_mean, targets_std)
    loss = criterion(denormalized_outputs, denormalized_targets)
    test_loss += loss.item() * val_features.size(0)
    print('mse', loss.item())
    test_mae += torch.nn.functional.l1_loss(denormalized_outputs, denormalized_targets).item()
    print('mae', test_mae)
    test_r2 += r2_score(denormalized_outputs.cpu().numpy(), denormalized_targets.cpu().numpy())
    # calculate that true and predicted values both bigger or less than 0
    our_metric = ((denormalized_outputs > 0) == (denormalized_targets > 0)).sum() / len(denormalized_targets)
    print('r2-score:', test_r2)
    print('Have same sign:', our_metric.item())
    print('Example of predcitions:', denormalized_outputs[:10].tolist())
    print('True values:', denormalized_targets[:10].tolist())

mse 8.228811202570796e-05
mae 0.005259359255433083
r2-score: 0.9681331409876758
Have same sign: 0.971124529838562
Example of predcitions: [-0.07886821776628494, -0.018322942778468132, 0.023021450266242027, -0.18347500264644623, 0.004382627084851265, -0.016448847949504852, -0.007681734394282103, 0.04562047868967056, 0.06402567774057388, 0.008089784532785416]
True values: [-0.0856386348605156, -0.02415516786277294, 0.02146715112030506, -0.18781840801239014, 0.006663176231086254, -0.028289422392845154, -0.010420558974146843, 0.0445161834359169, 0.062368981540203094, 0.010641969740390778]


In [2]:
import pandas as pd
historical_data_df = pd.read_csv('../../datasets/sp500_historical_data.csv')
title_embeddings_df = pd.read_csv('../../datasets/all-MiniLM-L6-v2-embedding-news-title.csv')


In [154]:
import ast
title_embeddings_df['date'] = pd.to_datetime(title_embeddings_df['created'])
title_embeddings_df['stocks'] = title_embeddings_df['stocks'].apply(ast.literal_eval)
title_embeddings_df = title_embeddings_df.explode('stocks').reset_index(drop=True)
title_embeddings_df['date'] = title_embeddings_df['date'].dt.date
title_embeddings_df['stocks'] = title_embeddings_df['stocks'].apply(lambda x: x['name'])

In [160]:
historical_data_df['date'] = pd.to_datetime(historical_data_df['Date'])
historical_data_df['date'] = historical_data_df['date'].dt.date
historical_data_df.head()

Unnamed: 0,Date,Symbol,Adj Close,Close,High,Low,Open,Volume,date
0,2010-01-04,A,20.154915,22.389128,22.625179,22.267525,22.453505,3815561.0,2010-01-04
1,2010-01-05,A,19.935982,22.145924,22.331903,22.002861,22.324751,4186031.0,2010-01-05
2,2010-01-06,A,19.865147,22.06724,22.174536,22.002861,22.06724,3243779.0,2010-01-06
3,2010-01-07,A,19.839396,22.038628,22.04578,21.816881,22.017166,3095172.0,2010-01-07
4,2010-01-08,A,19.832952,22.031473,22.06724,21.745352,21.917025,3733918.0,2010-01-08


In [161]:
historical_data_df.drop('Date', inplace=True, axis=1)
title_embeddings_df.drop('created', inplace=True, axis=1)
title_embeddings_df.head()

Unnamed: 0,stocks,title_embedding_0,title_embedding_1,title_embedding_2,title_embedding_3,title_embedding_4,title_embedding_5,title_embedding_6,title_embedding_7,title_embedding_8,...,title_embedding_375,title_embedding_376,title_embedding_377,title_embedding_378,title_embedding_379,title_embedding_380,title_embedding_381,title_embedding_382,title_embedding_383,date
0,CIT,0.027524,-0.028871,-0.060342,0.053199,0.046198,-0.037971,0.135296,-0.022271,0.029455,...,-0.019922,-0.055372,0.011424,-0.06769,-0.015036,0.037755,-0.089164,-0.005867,-0.016719,2009-07-27
1,CME,0.027524,-0.028871,-0.060342,0.053199,0.046198,-0.037971,0.135296,-0.022271,0.029455,...,-0.019922,-0.055372,0.011424,-0.06769,-0.015036,0.037755,-0.089164,-0.005867,-0.016719,2009-07-27
2,ISRG,0.027524,-0.028871,-0.060342,0.053199,0.046198,-0.037971,0.135296,-0.022271,0.029455,...,-0.019922,-0.055372,0.011424,-0.06769,-0.015036,0.037755,-0.089164,-0.005867,-0.016719,2009-07-27
3,MSFT,0.027524,-0.028871,-0.060342,0.053199,0.046198,-0.037971,0.135296,-0.022271,0.029455,...,-0.019922,-0.055372,0.011424,-0.06769,-0.015036,0.037755,-0.089164,-0.005867,-0.016719,2009-07-27
4,AAPL,0.02189,-0.027628,0.035083,0.050099,0.067462,0.026903,0.029803,0.023311,0.047869,...,-0.030781,-0.083482,-0.012103,-0.065227,0.015232,-0.005367,-0.121527,-0.053555,0.00127,2009-07-27


In [162]:
historical_data_df.head()


Unnamed: 0,Symbol,Adj Close,Close,High,Low,Open,Volume,date
0,A,20.154915,22.389128,22.625179,22.267525,22.453505,3815561.0,2010-01-04
1,A,19.935982,22.145924,22.331903,22.002861,22.324751,4186031.0,2010-01-05
2,A,19.865147,22.06724,22.174536,22.002861,22.06724,3243779.0,2010-01-06
3,A,19.839396,22.038628,22.04578,21.816881,22.017166,3095172.0,2010-01-07
4,A,19.832952,22.031473,22.06724,21.745352,21.917025,3733918.0,2010-01-08


In [3]:
import pandas as pd
import pandas as pd
from ta.trend import MACD

def apply_features(group):
    group.index = pd.to_datetime(group.index)

    # Compute lag features, moving averages, etc., for the group
    for lag in range(1, 4):
        group[f'lag_{lag}'] = group['Close'].shift(lag)
    group['weekly_return'] = group['Close'].pct_change(5)
    group['5_day_MA'] = group['Close'].rolling(window=5).mean()
    group['20_day_MA'] = group['Close'].rolling(window=20).mean()
    group['5_day_volatility'] = group['Close'].rolling(window=5).std()
    group['momentum'] = group['Close'] - group['Close'].shift(1)
    
    # MACD, ensuring you handle NaNs as per your strategy

    macd = MACD(close=group['Close'], window_slow=26, window_fast=12, window_sign=9)
    group['MACD'] = macd.macd()
    group['MACD_signal'] = macd.macd_signal()
    group['MACD_histogram'] = macd.macd_diff()

    # Adjusting for multi-stock data: adding week_of_year and month
    group['week_of_year'] = group.index.isocalendar().week
    group['month'] = group.index.month
    
    return group.dropna()  # Optionally drop NaNs

df_stocks = historical_data_df.groupby('Symbol').apply(apply_features)
df_stocks.index = df_stocks.index.droplevel()
df_stocks.reset_index(inplace=True)
df_stocks.drop('index', inplace=True, axis=1)
df_stocks.head()

  df_stocks = historical_data_df.groupby('Symbol').apply(apply_features)


Unnamed: 0,Date,Symbol,Adj Close,Close,High,Low,Open,Volume,lag_1,lag_2,...,weekly_return,5_day_MA,20_day_MA,5_day_volatility,momentum,MACD,MACD_signal,MACD_histogram,week_of_year,month
0,2010-02-22,A,20.058334,22.281832,22.381973,22.06724,22.381973,4038123.0,22.317596,22.160229,...,0.039026,22.061516,21.189199,0.28556,-0.035765,0.115873,-0.086697,0.20257,1,1
1,2010-02-23,A,19.865147,22.06724,22.288984,21.838341,22.238913,4366373.0,22.281832,22.317596,...,0.019835,22.147353,21.23927,0.165868,-0.214592,0.13978,-0.041401,0.181181,1,1
2,2010-02-24,A,20.109842,22.339056,22.346209,22.074392,22.153076,3945855.0,22.06724,22.281832,...,0.019589,22.233191,21.308655,0.115716,0.271816,0.178601,0.002599,0.176002,1,1
3,2010-02-25,A,20.019684,22.238913,22.267525,21.623749,22.060085,4778504.0,22.339056,22.06724,...,0.003551,22.248927,21.377325,0.108434,-0.100143,0.198992,0.041878,0.157114,1,1
4,2010-02-26,A,20.25794,22.503576,22.546495,22.160229,22.296137,4678127.0,22.238913,22.339056,...,0.008333,22.286123,21.476395,0.158307,0.264664,0.233813,0.080265,0.153548,1,1


In [176]:
# max pooling of embedding by date and stock. So we need to group by date and stock and take max (by module) of each dimension of embedding
# from tqdm import tqdm
# from tqdm.notebook import tqdm  # for notebooks
# from tqdm import tqdm
# from tqdm.gui import tqdm as tqdm_gui
# tqdm.pandas(ncols=50)
from tqdm import tqdm
# from pandarallel import pandarallel

import pandas as pd
# Create new pandas methods which use tqdm progress
# (can use tqdm_gui, optional kwargs, etc.)
tqdm.pandas()
# pandarallel.initialize(progress_bar=True)
def signed_abs_max(series):
    abs_max_idx = series.abs().idxmax()
    return series[abs_max_idx]


# aggregation = {}
# agg_func = lambda x: x.abs().max()
# for title_emb in title_embeddings_columns:
#     aggregation[title_emb] = agg_func
# title_embeddings_df_grouped = title_embeddings_df.groupby(['date', 'stocks'])[title_embeddings_columns].progress_apply(*aggregation)
# title_embeddings_df_grouped.reset_index(inplace=True)
# title_embeddings_df_grouped.head()
# resulted_df = None
# for name, group in tqdm(title_embeddings_df.groupby(['date', 'stocks'])):
#     group = group[title_embeddings_columns].abs().max()
#     group = group.to_frame().T
#     group['date'] = name[0]
#     group['stocks'] = name[1]
#     if resulted_df is None:
#         resulted_df = group
#     else:
#         resulted_df = pd.concat([resulted_df, group])
# title_embeddings_df_grouped = title_embeddings_df.groupby(['date', 'stocks'])[title_embeddings_columns].max()

# Group by 'date' and 'stocks'
grouped = title_embeddings_df.groupby(['date', 'stocks'])

max_df = grouped.progress_apply(lambda group: group.iloc[:, 1:-1].apply(signed_abs_max))
max_df.reset_index(inplace=True)
print(max_df)

100%|██████████| 344182/344182 [53:38<00:00, 106.93it/s] 


              date stocks  title_embedding_0  title_embedding_1  \
0       2009-07-27   AAPL          -0.041291          -0.027628   
1       2009-07-27   AMZN          -0.019739          -0.070451   
2       2009-07-27   BIDU          -0.030594          -0.065926   
3       2009-07-27    CIT           0.027524          -0.028871   
4       2009-07-27    CME           0.027524          -0.028871   
...            ...    ...                ...                ...   
344177  2023-10-30    XPO          -0.058720          -0.046279   
344178  2023-10-30    XRX           0.021734           0.014859   
344179  2023-10-30    YUM           0.021734           0.014859   
344180  2023-10-30   YUMC           0.021734           0.014859   
344181  2023-10-30     ZI          -0.058720          -0.020790   

        title_embedding_2  title_embedding_3  title_embedding_4  \
0                0.035083           0.088277           0.067462   
1               -0.061547           0.132563           0.0598

In [178]:
max_df.to_csv('../../datasets/max_title_embeddings.csv', index=False)

In [179]:
df_merged = pd.merge(df_stocks, max_df, left_on=['date', 'Symbol'], right_on=['date', 'stocks'], how='left')
df_merged.drop(['stocks', 'date'], inplace=True, axis=1)
df_merged.head()

Unnamed: 0,Symbol,Adj Close,Close,High,Low,Open,Volume,lag_1,lag_2,lag_3,...,title_embedding_374,title_embedding_375,title_embedding_376,title_embedding_377,title_embedding_378,title_embedding_379,title_embedding_380,title_embedding_381,title_embedding_382,title_embedding_383
0,A,20.058334,22.281832,22.381973,22.06724,22.381973,4038123.0,22.317596,22.160229,21.90987,...,,,,,,,,,,
1,A,19.865147,22.06724,22.288984,21.838341,22.238913,4366373.0,22.281832,22.317596,22.160229,...,,,,,,,,,,
2,A,20.109842,22.339056,22.346209,22.074392,22.153076,3945855.0,22.06724,22.281832,22.317596,...,,,,,,,,,,
3,A,20.019684,22.238913,22.267525,21.623749,22.060085,4778504.0,22.339056,22.06724,22.281832,...,,,,,,,,,,
4,A,20.25794,22.503576,22.546495,22.160229,22.296137,4678127.0,22.238913,22.339056,22.06724,...,,,,,,,,,,


In [180]:
df_merged.fillna(0, inplace=True)

In [182]:
df_merged.to_csv('../../datasets/max_title_embeddings_merged.csv', index=False)


In [1]:
import pandas as pd
df_merged = pd.read_csv('../../datasets/average_title_embeddings_merged.csv')

In [7]:
def normalize(data, mean, std, eps=1e-6):
    return (data - mean) / (std + eps)
def denormalize(data, mean, std):
    return data * std + mean

targets = df_merged['weekly_return'].values
combined_features = df_merged.drop(['weekly_return', 'Symbol'], axis=1).values
targets = torch.from_numpy(targets).to(dtype=torch.float)
targets_mean = targets.mean()
targets_std = targets.std()
targets_normalized = normalize(targets, targets_mean, targets_std)
features_mean, features_std = combined_features.mean(axis=0), combined_features.std(axis=0)
features_normalized = normalize(combined_features, features_mean, features_std)

KeyError: "['Symbol'] not found in axis"

In [None]:
del df_merged
del combined_features
del targets

In [None]:
train_features, val_features, train_targets, val_targets = train_test_split(features_normalized, targets_normalized, test_size=0.2, random_state=42)
train_dataset = StockDataset(train_features, train_targets)
val_dataset = StockDataset(val_features, val_targets)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

KeyboardInterrupt: 

In [9]:
model = StockPredictor(input_dim, hidden_dim, num_layers, output_dim)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.device = device

train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=50)

In [None]:
# calculate mse mae r2
model.eval()
test_loss = 0.0
test_mae = 0.0
test_r2 = 0.0
with torch.no_grad():
    val_features = val_features.to(device)
    val_targets = val_targets.to(device)
    outputs = model(val_features)
    denormalized_outputs = denormalize(outputs.squeeze(), targets_mean, targets_std)
    denormalized_targets = denormalize(val_targets.squeeze(), targets_mean, targets_std)
    loss = criterion(denormalized_outputs, denormalized_targets)
    test_loss += loss.item() * val_features.size(0)
    print('mse', loss.item())
    test_mae += torch.nn.functional.l1_loss(denormalized_outputs, denormalized_targets).item()
    print('mae', test_mae)
    test_r2 += r2_score(denormalized_outputs.cpu().numpy(), denormalized_targets.cpu().numpy())
    # calculate that true and predicted values both bigger or less than 0
    our_metric = ((denormalized_outputs > 0) == (denormalized_targets > 0)).sum() / len(denormalized_targets)
    print('r2-score:', test_r2)
    print('Have same sign:', our_metric.item())
    print('Example of predcitions:', denormalized_outputs[:10].tolist())
    print('True values:', denormalized_targets[:10].tolist())