In [1]:
!pip install apimoex pandas-market-calendars

Collecting apimoex
  Downloading apimoex-1.4.0-py3-none-any.whl.metadata (5.4 kB)
Collecting pandas-market-calendars
  Downloading pandas_market_calendars-5.1.0-py3-none-any.whl.metadata (9.6 kB)
Collecting exchange-calendars>=3.3 (from pandas-market-calendars)
  Downloading exchange_calendars-4.10.1-py3-none-any.whl.metadata (37 kB)
Collecting pyluach (from exchange-calendars>=3.3->pandas-market-calendars)
  Downloading pyluach-2.2.0-py3-none-any.whl.metadata (4.3 kB)
Collecting korean_lunar_calendar (from exchange-calendars>=3.3->pandas-market-calendars)
  Downloading korean_lunar_calendar-0.3.1-py3-none-any.whl.metadata (2.8 kB)
Downloading apimoex-1.4.0-py3-none-any.whl (11 kB)
Downloading pandas_market_calendars-5.1.0-py3-none-any.whl (123 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m123.9/123.9 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading exchange_calendars-4.10.1-py3-none-any.whl (200 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import datetime
from datetime import datetime as extra_datetime
import time
import pandas as pd

import requests
import apimoex
import time
import pandas_market_calendars as mcal
import matplotlib.pyplot as plt
import numpy as np

from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')



In [3]:
tickers = [
    "SBER",
    "TGLD",   # Сбербанк
    "GAZP",   # Газпром
    "LKOH",   # Лукойл
    "PIKK",   # Роснефть
    "SNGS",   # Сургутнефтегаз
    "CHMF",   # Северсталь

]

In [4]:
def parse_tickers(companies, start_date, end_date):
    '''
    companies: list, список тикеров 
    start_date: str, Дата вида ГГГГ-ММ-ДД
    end_date: Дата вида ГГГГ-ММ-ДД
    
    '''
    # Парсятся данные в основном режиме торгов T+2
    board = 'TQBR'
    dfs = []
    with requests.Session() as session:
        for ticker in tqdm(companies, desc = 'Processing russian stock', total = len(companies)):
            
            
            data = apimoex.get_board_history(session, ticker, board=board,
                                            start = start_date, end = end_date)

            if data == []:
                print(f"Для акции {ticker} нет данных")
                continue
            
            data = pd.DataFrame(data)[["TRADEDATE", "CLOSE"]]
            data["TRADEDATE"] = pd.to_datetime(data["TRADEDATE"])
            data.set_index("TRADEDATE", inplace = True)
            data.columns = [f"close_{ticker}"]
            
            dfs.append(data)
    return pd.concat(dfs, axis = 1)

In [5]:
df = parse_tickers(tickers, "2023-05-05", "2025-05-07")
df.ffill(inplace = True)

Processing russian stock:   0%|          | 0/7 [00:00<?, ?it/s]

Для акции TGLD нет данных


In [6]:
df = df.reset_index()

In [7]:
df["day"] = df["TRADEDATE"].map(lambda x: x.dayofweek)

In [None]:
df_tgld = pd.read_csv("/kaggle/input/tinkoff-pulse-parsing/TGLD_comments_final.csv")
df_sngs = pd.read_csv("/kaggle/input/tinkoff-pulse-parsing/SNGS_comments_final.csv")
df_lkoh = pd.read_csv("/kaggle/input/tinkoff-pulse-parsing/LKOH_comments_final.csv")
df_gazp = pd.read_csv("/kaggle/input/tinkoff-pulse-parsing/GAZP_comments_final.csv")
df_chmf = pd.read_csv("/kaggle/input/tinkoff-pulse-parsing/CHMF_comments_final.csv")
df_sber = pd.read_csv("/kaggle/input/tinkoff-pulse-parsing/SBER_comments_final.csv")
df_pikk = pd.read_csv("/kaggle/input/tinkoff-pulse-parsing/PIKK_comments_final.csv")

In [None]:
def align_comments_with_prices(df_comments, df_prices, price_col, l=20):
    '''
    Каждому комментарию в момент T сопоставляет цену p_T, если T < 18:50:00
    и p_T+1 иначе. Для выходных сопоставляется цена закрытия в пн. 

    Каждой строке добавляет фичи от пользователя (агрегированные)
    Добавляет историю цен, т.е. l лагов
    '''
    df_prices = df_prices.copy()
    df_comments = df_comments.copy()
    
    df_prices['TRADEDATE'] = pd.to_datetime(df_prices['TRADEDATE'])
    df_comments['inserted'] = pd.to_datetime(df_comments['inserted'])

    tradedates = sorted(df_prices['TRADEDATE'].unique())

    def match_tradedate(row):
        inserted = row['inserted']
        weekday = inserted.weekday()
        date = inserted.date()
        time = inserted.time()
        if weekday >= 5:
            for d in tradedates:
                if d > pd.Timestamp(date):
                    return d
            return None
        border_time = pd.to_timedelta('18:50:00')
        if pd.to_timedelta(str(time)) < border_time:
            for d in tradedates:
                if d == pd.Timestamp(date):
                    return d
            for d in tradedates:
                if d > pd.Timestamp(date):
                    return d
            return None
        else:
            for d in tradedates:
                if d > pd.Timestamp(date):
                    return d
            return None

    df_comments['TRADEDATE'] = df_comments.apply(match_tradedate, axis=1)
    
    comment_counts = (
        df_comments.groupby('TRADEDATE')
        .size()
        .rename('total_posts')
        .reset_index()
    )
    df_comments = pd.merge(df_comments, comment_counts, on='TRADEDATE', how='left')

    # === Лаги цен ===
    for i in range(1, l + 1):
        lag_col = f"{price_col}_lag{i}"
        df_prices[lag_col] = df_prices[price_col].shift(i)

    lag_cols = [f"{price_col}_lag{i}" for i in range(1, l + 1)]
    price_cols = [price_col] + lag_cols

    df_merged = pd.merge(df_comments, df_prices[['TRADEDATE'] + price_cols], on='TRADEDATE', how='left')

    df_merged["hour"] = df_merged["inserted"].map(lambda x: x.hour)
    
    # Средние реакции на комментарии пользователя
    mean_reactions = (
        df_merged.groupby('nickname')['reactions']
        .mean()
        .rename('mean_reactions_per_user')
        .reset_index()
    )
    df_merged = pd.merge(df_merged, mean_reactions, on='nickname', how='left')

    # Переименование столбцов цен
    rename_dict = {price_col: "close"}
    for i in range(1, l + 1):
        rename_dict[f"{price_col}_lag{i}"] = f"close_lag{i}"
    df_merged = df_merged.rename(columns=rename_dict)

    return df_merged

df_gazp_aligned = align_comments_with_prices(df_gazp, df, 'close_GAZP', l = 20).drop(['Unnamed: 0.1', 'Unnamed: 0', 'id', 'inserted', 'nickname', 'text', 'TRADEDATE'], axis = 1)
df_sngs_aligned = align_comments_with_prices(df_sngs, df, 'close_SNGS', l = 20).drop(['Unnamed: 0.1', 'Unnamed: 0', 'id', 'inserted', 'nickname', 'text', 'TRADEDATE'], axis = 1)
df_pikk_aligned = align_comments_with_prices(df_pikk, df, 'close_PIKK', l = 20).drop(['Unnamed: 0.1', 'Unnamed: 0', 'id', 'inserted', 'nickname', 'text', 'TRADEDATE'], axis = 1)
df_chmf_aligned = align_comments_with_prices(df_chmf, df, 'close_CHMF', l = 20).drop(['Unnamed: 0.1', 'Unnamed: 0', 'id', 'inserted', 'nickname', 'text', 'TRADEDATE'], axis = 1)
df_lkoh_aligned = align_comments_with_prices(df_lkoh, df, 'close_LKOH', l = 20).drop(['Unnamed: 0.1', 'Unnamed: 0', 'id', 'inserted', 'nickname', 'text', 'TRADEDATE'], axis = 1)
df_sber_aligned = align_comments_with_prices(df_sber, df, 'close_CHMF', l = 20).drop(['Unnamed: 0.1', 'Unnamed: 0', 'id', 'inserted', 'nickname', 'text', 'TRADEDATE'], axis = 1)


# PatchTST without sentiment (pretrain)

In [None]:
pip install transformers torch --upgrade

In [None]:
import torch
from transformers import PatchTSTForPrediction, PatchTSTConfig
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

def preprocess_data(df, price_col, time_col, seq_len = 20, prediction_length = 1, test_size = 0.2):
    df = df.copy().sort_values(time_col)
    values = df[price_col].diff().dropna().values.astype(np.float32) # ключевой аспект - переход от цен к доходностям! r_t = (p_t - p_t-1) / p_t-1

    # масштабирование только по обучающей выборке (для стабилизации обучения)
    n = int(len(values[seq_len:]) * test_size)
    mean = values[:n].mean()
    std = values[:n].std()

    values = (values - mean) / std
    

    X, y = [], []
    for i in range(len(values) - seq_len - prediction_length + 1):
        X.append(values[i:i+seq_len].reshape(-1, 1))
        y.append(values[i+seq_len:i+seq_len+prediction_length]) 
    
    X = np.stack(X)  # (samples, seq_len, 1)
    y = np.stack(y).reshape(-1, prediction_length)  # (samples, pred_len)

    X = torch.tensor(X, dtype = torch.float32)
    y = torch.tensor(y, dtype = torch.float32)
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=test_size, random_state=42, shuffle = False)


    return X_train, X_val, y_train, y_val, mean, std

X_train, X_val, y_train, y_val, mean, std = preprocess_data(df, "close_GAZP", "TRADEDATE")

In [10]:
from torch.utils.data import TensorDataset, DataLoader

BATCH_SIZE = 32

train_ds = TensorDataset(X_train, y_train)
val_ds = TensorDataset(X_val, y_val)
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=False)
val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False)


In [11]:
train_ds[0][0].shape

torch.Size([20, 1])

In [None]:
import torch
import torch.optim as optim
from tqdm.notebook import tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error

def pretrain_model(model, train_dl, val_dl, num_epochs, lr = 1e-3):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    loss_fn = torch.nn.MSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    val_loss_min = np.inf
    
    epoch_train_losses = []
    epoch_val_losses = []
    best_model_state = None

    for epoch in tqdm(range(num_epochs), desc="Epochs"):
        model.train()
        train_losses = []
        for xb, yb in train_dl:
            xb, yb = xb.to(device), yb.to(device)
            outputs = model(xb)
            preds = outputs.prediction_outputs.squeeze(-1)
            loss = loss_fn(preds, yb)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            train_losses.append(loss.item())
        train_loss_mean = np.mean(train_losses)
        epoch_train_losses.append(train_loss_mean)
        
        # Валидация
        model.eval()
        all_preds = []
        all_targets = []
        with torch.no_grad():
            for xb, yb in val_dl:
                xb, yb = xb.to(device), yb.to(device)
                outputs = model(xb)
                preds = outputs.prediction_outputs.squeeze(-1).cpu().numpy()
                all_preds.append(preds)
                all_targets.append(yb.cpu().numpy())
        y_pred = np.concatenate(all_preds).ravel()
        y_true = np.concatenate(all_targets).ravel()
        val_loss = mean_squared_error(y_true, y_pred)
        epoch_val_losses.append(val_loss)
        
        if (epoch % 10 == 0):
            tqdm.write(f"Epoch {epoch+1}: Train loss = {train_loss_mean:.5f}, Val loss = {val_loss:.5f}")
        
        # Сохраняем модель, если val_loss < минимума val_loss
        if val_loss < val_loss_min:
            val_loss_min = val_loss
            best_model_state = {k: v.cpu() for k, v in model.state_dict().items()}
            torch.save(best_model_state, "best_model.pth")
            tqdm.write(f"Best model saved at epoch {epoch+1} (val_loss < minimal val_loss)")

    # Графики на каждой эпохе
    plt.figure(figsize=(8, 5))
    plt.plot(epoch_train_losses, label="Train Loss")
    plt.plot(epoch_val_losses, label="Validation Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss (MSE)")
    plt.title("Train & Validation Loss")
    plt.legend()
    plt.grid(True)
    plt.show()
    
    return epoch_train_losses, epoch_val_losses


In [109]:
seq_len = 20  

config = PatchTSTConfig(
    context_length=seq_len,
    prediction_length=1,
    patch_len=10,        
    d_model=32, # размерность скрытого пространства
    n_heads=20,
    n_layers=100,
    target_dim=1,
    channels=1,
    head_dropout=0.1
)

model = PatchTSTForPrediction(config)
model.train()


PatchTSTForPrediction(
  (model): PatchTSTModel(
    (scaler): PatchTSTScaler(
      (scaler): PatchTSTStdScaler()
    )
    (patchifier): PatchTSTPatchify()
    (masking): Identity()
    (encoder): PatchTSTEncoder(
      (embedder): PatchTSTEmbedding(
        (input_embedding): Linear(in_features=1, out_features=32, bias=True)
      )
      (positional_encoder): PatchTSTPositionalEncoding(
        (positional_dropout): Identity()
      )
      (layers): ModuleList(
        (0-2): 3 x PatchTSTEncoderLayer(
          (self_attn): PatchTSTAttention(
            (k_proj): Linear(in_features=32, out_features=32, bias=True)
            (v_proj): Linear(in_features=32, out_features=32, bias=True)
            (q_proj): Linear(in_features=32, out_features=32, bias=True)
            (out_proj): Linear(in_features=32, out_features=32, bias=True)
          )
          (dropout_path1): Identity()
          (norm_sublayer1): PatchTSTBatchNorm(
            (batchnorm): BatchNorm1d(32, eps=1e-05, mom

In [None]:
train_loss, val_loss = pretrain_model(model, train_dl, val_dl, 100, 1e-3)

Epochs:   0%|          | 0/100 [00:00<?, ?it/s]

Epoch 1: Train loss = 1.52930, Val loss = 8.77559
Best model saved at epoch 1 (val_loss < minimal val_loss)


In [None]:
model = PatchTSTForPrediction(config)
state_dict = torch.load('/kaggle/working/best_model.pth', map_location='cpu')
model.load_state_dict(state_dict)

In [None]:
device = "cuda" if torch.cuda.is_available() else "cpu"
model.eval()
all_preds = []
all_targets = []
with torch.no_grad():
    for xb, yb in val_dl:
        xb, yb = xb.to(device), yb.to(device)
        outputs = model(xb)
        preds = outputs.prediction_outputs.squeeze(-1).squeeze(-1).cpu().numpy()
        all_preds.append(preds)
        all_targets.append(yb.cpu().numpy())
        
y_pred = np.concatenate(all_preds)
y_true = np.concatenate(all_targets)

print(f"MAE: {mean_absolute_error(y_true, y_pred):.4f}")
print(f"MSE: {mean_squared_error(y_true, y_pred):.4f}")
print(f"MAPE: {mean_absolute_percentage_error(y_true, y_pred):.4f}")
naive_preds = np.zeros_like(y_true)
print(f"MAE naive: {mean_absolute_error(y_true, naive_preds):.4f}")
print(f"MSE naive: {mean_squared_error(y_true, naive_preds):.4f}")
print(f"MAPE naive: {mean_absolute_percentage_error(y_true, naive_preds):.4f}")


In [None]:
plt.plot(y_pred, label = "Forecaster predictions")
plt.plot(y_true, label = "True series")
plt.plot(naive_preds, label = "Naive")

plt.legend()
plt.grid(True);