# Main

In [28]:
import datetime
import os
import re
import time
import warnings
from collections import Counter
from random import uniform, randint
from urllib.parse import quote_plus

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from sqlalchemy import create_engine
from tqdm import tqdm

warnings.filterwarnings('ignore')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from natasha import (Segmenter, MorphVocab, NewsEmbedding, NewsMorphTagger,
                     NewsNERTagger, Doc)

from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import (ConfusionMatrixDisplay, confusion_matrix,
                             f1_score, classification_report)

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense
from keras.models import Sequential
import tensorflow as tf
import tensorflow_addons as tfa
import keras

In [2]:
# Connection settings are read from environment variables so that real
# credentials never have to be written into the notebook. The '...'
# placeholders are kept as fallbacks for variables that are not set.
username = os.environ.get('DB_USERNAME', '...')
password = os.environ.get('DB_PASSWORD', '...')
host = os.environ.get('DB_HOST', '...')
port = os.environ.get('DB_PORT', '...')
database = os.environ.get('DB_NAME', '...')
# Percent-encode the user name and password: characters such as '@', ':' or
# '/' would otherwise be parsed as delimiters of the connection URL.
conn_string = (f'postgresql://{quote_plus(username)}:{quote_plus(password)}'
               f'@{host}:{port}/{database}')

In [5]:
# Load the whole news/ticker table into a DataFrame.
db = create_engine(conn_string)
# The context manager closes the connection even when the query raises.
with db.connect() as conn:
    # NOTE(review): SQLAlchemy documents autocommit through
    # execution_options(isolation_level="AUTOCOMMIT"); this attribute
    # assignment may have no effect on a read-only query -- confirm.
    conn.autocommit = True
    companies_df = pd.read_sql_query('SELECT * FROM data.companies_tickets;', conn)

print(companies_df.shape)
companies_df.head()

(7542, 9)


Unnamed: 0,website,section,url,header,body,tags,body_length,datetime,company
0,РИА,Экономика,https://ria.ru/20230103/banki-1842907991.html,Эксперт рассказал о развитии исламского банкин...,Порядка 10-15 филиалов исламского банкинга м...,"['Экономика', 'Россия', 'Сбербанк России']",1759,2023-01-03 10:16:00,SBER
1,РИА,Экономика,https://ria.ru/20230103/gazprom-1842940413.html,"Экспорт ""Газпрома"" снизился до минимума с конц...","Объём экспорта ""Газпрома"" в прошедшем году с...","['Экономика', 'Газпром', 'Россия']",1299,2023-01-03 14:22:00,GAZP
2,РИА,Экономика,https://ria.ru/20230104/shelf-1843063413.html,Ямальской шельфовой компании предоставили два ...,Правительство РФ предоставило Ямальской шель...,"['Экономика', 'Карское море', 'Россия']",1719,2023-01-04 16:36:00,ROSN
3,Интерфакс,ЭКОНОМИКА,https://www.interfax.ru/business/879947,"""Соллерс"" возобновил после новогодних каникул ...","- Российская автомобилестроительная группа ""...","['Соллерс', 'Соллерс Алабуга', 'Татарстан']",2243,2023-01-09 12:28:00,SVAV
4,Kommersant,Фондовый рынок,https://www.kommersant.ru/doc/5757640,Нестабильный рост,Минувший год запомнится инвесторам как время с...,,12414,2023-01-06 10:02:00,SBER


# Get data for different deltas

In [6]:
# Order the rows chronologically so that the GET requests are sent in time
# order. The index is rebuilt because the fetch loop addresses rows by the
# labels 0..n-1; without the reset those labels still follow the pre-sort order.
companies_df = companies_df.sort_values(by=['datetime']).reset_index(drop=True)

In [17]:
# MOEX ISS endpoint: 1-minute candles for one ticker between `from` and `till`.
companies_string = 'http://iss.moex.com/iss/engines/stock/markets/shares/securities/{}/candles.json?from={}&till={}&interval=1'
companies_df['price_release'] = np.nan
# Horizons, in minutes after publication, at which the price is sampled.
time_deltas = [5, 10, 15, 30, 45, 60, 75, 90]

# (connect, read) timeouts in seconds; identical for every request.
request_timeout = (300, 120)
# Requests that raised or returned an unexpected payload (reported at the end).
failed_requests = 0

# A single session reuses the HTTP connection instead of reopening it per row.
with requests.Session() as session:
    for delta_minutes in tqdm(time_deltas, total=len(time_deltas)):
        lag_col = f'price_lag_{delta_minutes}'
        companies_df[lag_col] = np.nan

        # Walk the index labels in the frame's current (sorted) row order; a
        # positional range() would follow the labels' original order instead.
        for row_label in companies_df.index:
            start = companies_df.at[row_label, 'datetime']
            end = start + datetime.timedelta(minutes=delta_minutes)
            company = companies_df.at[row_label, 'company']
            try:
                response = session.get(
                    companies_string.format(company, start, end),
                    timeout=request_timeout)
                response.raise_for_status()
                candles = response.json()['candles']['data']
                # First field of the first / last candle in the window
                # (presumably the open price -- confirm against the
                # response's `columns` list).
                first_value = candles[0][0]
                last_value = candles[-1][0]
            except IndexError:
                # No candles in the window: leave NaN.
                continue
            except (requests.RequestException, ValueError, KeyError):
                # Network error, non-JSON body or unexpected payload: skip the
                # row instead of aborting the whole multi-hour run.
                failed_requests += 1
                continue
            companies_df.loc[row_label, 'price_release'] = first_value
            companies_df.loc[row_label, lag_col] = last_value

print(f'Requests that failed or returned malformed data: {failed_requests}')

In [35]:
# Keep only the rows for which a release-time price was obtained, then cache
# the result on disk so the slow download does not have to be repeated.
companies_df = companies_df.dropna(subset=['price_release'])
companies_df.to_csv('deltas_test.csv', index=False)

In [66]:
# Reload the cached prices from disk.
companies_df = pd.read_csv('deltas_test.csv')

# 'tags' is removed before dropna(), so only the remaining columns decide
# whether a row is discarded. reset_index() keeps the previous row labels in
# an 'index' column.
companies_df = (
    companies_df
    .drop(columns=['tags'])
    .dropna()
    .reset_index()
)

# Text preprocessing

## Setup

In [70]:
# Extra Russian function words (pronoun forms, particles, adverbs) appended
# to NLTK's Russian stop-word list below.
additional_stopwords = [
    'которых', 'которые', 'твой', 'которой', 'которого', 'сих', 'ком', 'свой',
    'твоя', 'этими', 'слишком', 'нами', 'всему', 'будь', 'саму', 'чаще',
    'ваше', 'сами', 'наш', 'затем', 'самих', 'наши', 'ту', 'каждое', 'мочь',
    'весь', 'этим', 'наша', 'своих', 'оба', 'который', 'зато', 'те', 'этих',
    'вся', 'ваш', 'такая', 'теми', 'ею', 'которая', 'нередко', 'каждая',
    'также', 'чему', 'собой', 'самими', 'нем', 'вами', 'ими', 'откуда',
    'такие', 'тому', 'та', 'очень', 'сама', 'нему', 'алло', 'оно', 'этому',
    'кому', 'тобой', 'таки', 'твоё', 'каждые', 'твои', 'нею', 'самим', 'ваши',
    'ваша', 'кем', 'мои', 'однако', 'сразу', 'свое', 'ними', 'всё', 'неё',
    'тех', 'хотя', 'всем', 'тобою', 'тебе', 'одной', 'другие', 'само', 'эта',
    'самой', 'моё', 'своей', 'такое', 'всею', 'будут', 'своего', 'кого',
    'свои', 'мог', 'нам', 'особенно', 'её', 'самому', 'наше', 'кроме',
    'вообще', 'вон', 'мною', 'никто', 'это'
]

# Combined list consulted by del_stopwords(): NLTK's list plus the extras.
stop_words = stopwords.words('russian') + additional_stopwords

In [71]:
# Natasha pipeline components, created once and shared by lemmatize().
# Both taggers are built on the same news embedding.
emb = NewsEmbedding()

segmenter = Segmenter()
morph_vocab = MorphVocab()

morph_tagger = NewsMorphTagger(emb)
ner_tagger = NewsNERTagger(emb)

In [72]:
# Pre-compiled patterns used by prepare_text(). Raw strings keep the regex
# escapes (\(, \s) from being read as invalid Python string escapes.
del_n = re.compile(r'\n')
del_tags = re.compile(r'<[^>]*>')
del_brackets = re.compile(r'\([^)]*\)')
# 'ё' lies outside the а-я code-point range, so it is listed explicitly;
# otherwise words such as 'всё' or 'её' lose a letter.
clean_text = re.compile(r'[^а-яёa-z\s]')
del_spaces = re.compile(r'\s{2,}')


def prepare_text(text):
    """Normalise raw article text to lowercase letters separated by spaces.

    Steps, in order: cast to ``str`` and lowercase, turn newlines into
    spaces, drop ``<...>`` tags, drop ``(...)`` parenthesised fragments,
    delete every character that is not a Cyrillic/Latin lowercase letter or
    whitespace, and collapse runs of whitespace into one space.

    Args:
        text: Any object; it is converted with ``str()`` first.

    Returns:
        The cleaned string. Leading/trailing spaces are not stripped.
    """
    text = del_n.sub(' ', str(text).lower())
    text = del_tags.sub('', text)
    text = del_brackets.sub('', text)
    res_text = clean_text.sub('', text)
    return del_spaces.sub(' ', res_text)


def del_stopwords(text):
    """Remove stop words from ``text``.

    The text is split with ``word_tokenize`` and every token found in the
    module-level ``stop_words`` list is dropped. Dropped tokens leave no
    trace: the survivors are joined by exactly one space, so the result
    contains no runs of blanks.

    Args:
        text: String to filter.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # A set gives constant-time membership tests instead of a list scan
    # for every token.
    stop_set = frozenset(stop_words)
    return ' '.join(
        token for token in word_tokenize(text) if token not in stop_set)


def lemmatize(text):
    """Return ``text`` with every token replaced by its lemma.

    The text is segmented and morphologically tagged with the module-level
    Natasha components, each token is lemmatized, and the lemmas are joined
    by single spaces.

    Named-entity tagging and span normalisation are intentionally not run:
    only token lemmas are returned, so NER results would never be used.

    Args:
        text: String to lemmatize.

    Returns:
        Space-separated lemmas, in token order.
    """
    doc = Doc(text)
    doc.segment(segmenter)
    # Morphological tags must be present before tokens can be lemmatized.
    doc.tag_morph(morph_tagger)
    for token in doc.tokens:
        token.lemmatize(morph_vocab)
    return ' '.join(token.lemma for token in doc.tokens)

In [73]:
def preprocess_and_save(df, save_name):
    """Clean every article body and save the frame with a ``text_clear`` column.

    Each value of ``df.body`` goes through ``prepare_text``,
    ``del_stopwords`` and ``lemmatize``. The result is written to
    ``deltas_test_{save_name}.csv``; ``df`` itself is not modified.

    Args:
        df: DataFrame with a ``body`` column.
        save_name: Suffix used in the output file name.

    Returns:
        None. The only effect is the CSV file.
    """
    txt_lst = []

    for text in tqdm(df.body):
        text = prepare_text(text)
        text = del_stopwords(text)
        text = lemmatize(text)
        txt_lst.append(text)

    # assign() attaches the texts positionally, so rows stay paired with
    # their text even when df does not carry a 0..n-1 index (an index-aligned
    # concat would produce misaligned rows filled with NaN in that case).
    df_safe = df.assign(text_clear=txt_lst)

    # Save locally file with preprocessed texts (for convenience)
    df_safe.to_csv(f'deltas_test_{save_name}.csv', index=False)

In [74]:
def define_classes(df, main_col, class_col, p=0.05):
    """Label rows as 0 / 1 / 2 according to the tails of ``main_col``.

    Rows at or below the ``p`` quantile get class 0, rows at or above the
    ``1 - p`` quantile get class 2, every other row gets class 1. The class
    counts are printed.

    Args:
        df: Input frame; it is copied, not modified.
        main_col: Column whose quantiles define the classes.
        class_col: Name of the column that receives the labels.
        p: Tail probability used for both cut-offs.

    Returns:
        A copy of ``df`` with ``class_col`` added.
    """
    labelled = df.copy()
    lower_cut, upper_cut = (labelled[main_col].quantile(q) for q in (p, 1 - p))

    # Everything starts in the middle class; the tails are relabelled after.
    # The upper tail is written last, so it wins if both conditions hold.
    labelled[class_col] = 1
    labelled.loc[labelled[main_col].le(lower_cut), class_col] = 0
    labelled.loc[labelled[main_col].ge(upper_cut), class_col] = 2

    print(labelled[class_col].value_counts())

    return labelled

## Preprocessing

In [75]:
preprocess_and_save(companies_df, 'comp')

100%|██████████████████████████████████████████████| 6934/6934 [09:20<00:00, 12.38it/s]


In [80]:
# Reload the preprocessed texts and run the basic cleaning once more over the
# lemmatized column, trimming surrounding whitespace.
df_comp = pd.read_csv('deltas_test_comp.csv')

df_comp['text_clear'] = [
    prepare_text(lemmatized).strip() for lemmatized in df_comp.text_clear
]

# Models testing with different deltas

## Setup

In [82]:
# For every horizon: absolute price change, change in percent of the release
# price, and a 3-class label built from the 5% tails of the percent change.
for delta in time_deltas:
    diff_col = f'price_diff_{delta}'
    percent_col = f'price_diff_percent_{delta}'

    df_comp[diff_col] = df_comp[f'price_lag_{delta}'] - df_comp['price_release']
    df_comp[percent_col] = df_comp[diff_col] / df_comp['price_release'] * 100

    df_comp = define_classes(df_comp, percent_col, f'price_diff_cat_{delta}', 0.05)

price_diff_cat_5
1    6240
0     347
2     347
Name: count, dtype: int64
price_diff_cat_10
1    6240
0     347
2     347
Name: count, dtype: int64
price_diff_cat_15
1    6240
2     347
0     347
Name: count, dtype: int64
price_diff_cat_30
1    6240
2     347
0     347
Name: count, dtype: int64
price_diff_cat_45
1    6240
2     347
0     347
Name: count, dtype: int64
price_diff_cat_60
1    6240
0     347
2     347
Name: count, dtype: int64
price_diff_cat_75
1    6240
2     347
0     347
Name: count, dtype: int64
price_diff_cat_90
1    6240
2     347
0     347
Name: count, dtype: int64


In [111]:
def model_my_lstm_fit_predict(y, X, epochs=20):
    """Train an Embedding -> LSTM -> softmax text classifier and return it.

    The data is split 75/25 (stratified, ``random_state=42``). The tokenizer
    is fitted on the training texts only, and the 25% hold-out is passed to
    Keras as validation data. Despite the name, no separate prediction step
    is run: the fitted model is the only result.

    Args:
        y: Class labels, one per text.
        X: Texts (strings) to classify.
        epochs: Number of training epochs.

    Returns:
        The fitted ``keras`` ``Sequential`` model.
    """
    texts = X.copy()
    labels = y.copy()

    X_train, X_test, y_train, y_test = train_test_split(texts,
                                                        labels,
                                                        random_state=42,
                                                        test_size=0.25,
                                                        stratify=labels)

    max_words = 10000  # max number of words to use in the vocabulary
    max_len = 500  # max length of each text (in terms of number of words)
    embedding_dim = 500  # dimension of word embeddings
    lstm_units = 32  # number of units in the LSTM layer

    # Sorted unique labels. They fix the column order of the one-hot targets
    # and the position -> class mapping used for the class weights.
    classes = np.unique(labels)
    num_classes = len(classes)  # number of classes

    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(X_train)

    sequences_train = tokenizer.texts_to_sequences(X_train)
    sequences_test = tokenizer.texts_to_sequences(X_test)

    X_train = pad_sequences(sequences_train, maxlen=max_len)
    X_test = pad_sequences(sequences_test, maxlen=max_len)

    def _one_hot(target):
        # Encode against the full class list so that train and test targets
        # always have the same columns in the same order, even if a class is
        # absent from one of the splits.
        return (pd.get_dummies(target)
                .reindex(columns=classes, fill_value=0)
                .to_numpy(dtype='float32'))

    y_train = _one_hot(y_train)
    y_test = _one_hot(y_test)

    keras.backend.clear_session()

    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=max_len))
    model.add(LSTM(lstm_units))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer=tf.keras.optimizers.Adam(clipvalue=0.7),
                  metrics=[tfa.metrics.F1Score(num_classes=num_classes)])

    # NOTE(review): LearningRateScheduler calls its schedule once per epoch,
    # so decay_steps=100 is presumably counted in epochs here and the rate
    # barely decays over a 20-epoch run -- confirm this is intended.
    scheduler = keras.callbacks.LearningRateScheduler(
        tf.keras.optimizers.schedules.ExponentialDecay(
            initial_learning_rate=6e-5, decay_rate=0.85, decay_steps=100))

    # 'balanced' weights, keyed by output position (same order as `classes`).
    weights = dict(
        enumerate(
            compute_class_weight(class_weight='balanced',
                                 classes=classes,
                                 y=labels)))

    model.fit(X_train,
              y_train,
              batch_size=32,
              epochs=epochs,
              callbacks=[scheduler],
              class_weight=weights,
              validation_data=(X_test, y_test))

    return model

## time_delta==5

In [112]:
# Fit the LSTM on the 5-minute labels; the returned model is the cell output.
model_my_lstm_fit_predict(df_comp['price_diff_cat_5'], df_comp['text_clear'])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.engine.sequential.Sequential at 0x285b3e50bd0>

## time_delta==10

In [113]:
# Fit the LSTM on the 10-minute labels; the returned model is the cell output.
model_my_lstm_fit_predict(df_comp['price_diff_cat_10'], df_comp['text_clear'])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.engine.sequential.Sequential at 0x285d70f7e90>

## time_delta==15

In [114]:
# Fit the LSTM on the 15-minute labels; the returned model is the cell output.
model_my_lstm_fit_predict(df_comp['price_diff_cat_15'], df_comp['text_clear'])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.engine.sequential.Sequential at 0x28604398290>

## time_delta==30

In [115]:
# Fit the LSTM on the 30-minute labels; the returned model is the cell output.
model_my_lstm_fit_predict(df_comp['price_diff_cat_30'], df_comp['text_clear'])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.engine.sequential.Sequential at 0x285ee14fa50>

## time_delta==45

In [116]:
# Fit the LSTM on the 45-minute labels; the returned model is the cell output.
model_my_lstm_fit_predict(df_comp['price_diff_cat_45'], df_comp['text_clear'])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.engine.sequential.Sequential at 0x2860ada1fd0>

## time_delta==60

In [117]:
# Fit the LSTM on the 60-minute labels; the returned model is the cell output.
model_my_lstm_fit_predict(df_comp['price_diff_cat_60'], df_comp['text_clear'])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.engine.sequential.Sequential at 0x286060da290>

## time_delta==75

In [118]:
# Fit the LSTM on the 75-minute labels; the returned model is the cell output.
model_my_lstm_fit_predict(df_comp['price_diff_cat_75'], df_comp['text_clear'])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.engine.sequential.Sequential at 0x28611ba3510>

## time_delta==90

In [119]:
# Fit the LSTM on the 90-minute labels; the returned model is the cell output.
model_my_lstm_fit_predict(df_comp['price_diff_cat_90'], df_comp['text_clear'])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.engine.sequential.Sequential at 0x285eef06950>

# Result

> **Conclusion:** As we can see, there is not much difference between the various time deltas, so we will settle on **time_delta==30**, as advised in the paper dedicated to this question.