# IMPORT

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from torch.utils.data import TensorDataset, DataLoader
from torch import nn
import torch
from torch.amp import autocast, GradScaler
from transformers import AutoTokenizer, LongformerConfig, LongformerForSequenceClassification, BertForSequenceClassification, BertConfig, BertModel
from tqdm import tqdm
import natasha
from IPython.display import display, clear_output
import psycopg2
import psycopg2.extras

import os
import sys
from dotenv import load_dotenv
sys.path.append(os.getcwd()[:-7])
load_dotenv()
DB_HOST = os.environ.get("DB_HOST")
DB_PORT = os.environ.get("DB_PORT")
DB_USER = os.environ.get("DB_USER")
DB_PASS = os.environ.get("DB_PASS")
DB_NAME = os.environ.get("DB_NAME")


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [2]:
def torch_accuracy(pred:torch.Tensor, true:torch.Tensor):
    return true.argmax(dim=1).eq(pred.argmax(dim=1)).sum().item()

In [3]:
def news_clean(text: str, segmenter:natasha.Segmenter, word: str = None):
    sentenses = []
    doc = natasha.Doc(text)
    doc.segment(segmenter)

    for sent in doc.sents:
        for w in word:
            if w in sent.text:
                sentenses.append(sent.text)
                break

    sentenses = ' '.join(sentenses)
    return sentenses

In [4]:
query = f"SELECT * FROM ticker_describe"
with psycopg2.connect(dbname = DB_NAME, user = DB_USER, password = DB_PASS, host = DB_HOST, port = DB_PORT) as conn:
    with conn.cursor() as cur:
        cur.execute(query)
        ticker_describe = pd.DataFrame(cur.fetchall(), columns=["ticker", "sector", "name", "describe"])

In [5]:
batch_size = 24

# MODEL

In [48]:
model_name = "matvej-melikhov/ruBERT-finetuned-lenta"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = BertConfig.from_pretrained(model_name)
model.max_position_embeddings = 1024
model.num_labels = 3
model.output_hidden_states = True
model = BertForSequenceClassification(model)
model_base = BertForSequenceClassification.from_pretrained(model_name, num_labels=3)

for name, param in model_base.named_parameters():
    if 'position_embeddings' not in name:
        model.state_dict()[name].copy_(param.data)

model.bert.embeddings.position_embeddings.weight = torch.nn.Parameter(model_base.bert.embeddings.position_embeddings.weight.repeat([2, 1]))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at matvej-melikhov/ruBERT-finetuned-lenta and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# final_2.csv

In [11]:
final_2 = pd.read_csv('data/final_2.csv').drop(columns=["Unnamed: 0", "idx", "channel_url", "views", "forwards", "entities"]).rename(columns={"tickers":"ticker"})
final_2

Unnamed: 0,date,message,ticker,sentiment,label
0,2021-11-26 13:42:34+00:00,А акции Новатэка так вообще смогли закрыть утр...,новатэк,Positive,1
1,2021-09-07 07:22:01+00:00,Объем торгов за первые 10 мин наглядно демонст...,газпром,Positive,1
2,2021-08-27 20:59:11+00:00,"Итак, на этой неделе акции Газпрома наконец-то...",газпром,Positive,1
3,2021-08-24 08:04:25+00:00,​​Не один раз в этом году на всех уровнях я то...,газпром,Positive,1
4,2021-08-20 09:37:02+00:00,Сегодня состоится последний визит в Москву Анг...,газпром,Positive,1
...,...,...,...,...,...
2491,2022-01-14 14:03:01+00:00,Красноречивые обороты сегодня в Сбере. 72 млрд...,газпром,Neutral,0
2492,2022-01-17 19:56:42+00:00,​​По Сберу сформировался очень сильный уровень...,сбер,Neutral,0
2493,2022-01-18 19:19:44+00:00,"​​Чем дальше в лес, тем больше дров! Сегодня о...",сбер,Negative,2
2494,2022-02-25 08:34:58+00:00,На протяжении многих лет не один раз на рынке ...,сбер,Negative,2


In [14]:
cte = final_2[final_2["ticker"].apply(lambda row: len(row.split(" "))) > 1]
for row in cte.itertuples(index=False, name=None):
    for ticker in row[2].split(" "):
        final_2.loc[len(final_2)] = [row[0], row[1], ticker, row[3], row[4]]
    
final_2.drop(index=cte.index, inplace=True)
final_2.reset_index(inplace=True, drop=True)

In [15]:
final_2.ticker.unique()

array(['новатэк', 'газпром', 'сбер', 'яндекс', 'лукойл', 'роснефть',
       'магнит', 'северсталь', 'x5', 'нлмк', 'татнефть', 'норникель',
       'фосагро', 'полиметалл', 'мтс', 'тинькофф', 'озон', 'втб',
       'алроса', 'ростелеком', 'система', 'лента', 'уралкалий', 'россети',
       'полюс'], dtype=object)

In [10]:
def rename_ticker(ticker):
    norm = {
        "новатэк":"NVTK", "тинькофф":"T",
        "газпром":"GAZP", "озон":"OZON",
        "сбер":"SBER", "втб":"VTBR",
        "яндекс":"YDEX", "алроса":"ALRS",
        "лукойл":"LKOH", "ростелеком":"RTKM",
        "роснефть":"ROSN", "система":"AFKS",
        "магнит":"MGNT", "лента":"LENT",
        "северсталь":"CHMF", "уралкалий":"URKAS",
        "x5":'X5', "россети":"MSRS",
        "нлмк":"NLMK", "полюс":"PLZL",
        "татнефть":"TATN", "мтс":"MTSS",
        "норникель":"GMKN", "полиметалл":"POLY",
        "фосагро":"PHOR"
    }
    return norm.get(ticker)

In [17]:
final_2["ticker"] = final_2["ticker"].apply(rename_ticker)

In [18]:
final_2 = pd.merge(final_2, ticker_describe, how="left")

In [19]:
final_2

Unnamed: 0,date,message,ticker,sentiment,label,sector,name,describe
0,2021-11-26 13:42:34+00:00,А акции Новатэка так вообще смогли закрыть утр...,NVTK,Positive,1,Энергетика,НОВАТЭК,НОВАТЭК — крупнейший независимый производитель...
1,2021-09-07 07:22:01+00:00,Объем торгов за первые 10 мин наглядно демонст...,GAZP,Positive,1,Энергетика,Газпром,«Газпром» — глобальная энергетическая компания...
2,2021-08-27 20:59:11+00:00,"Итак, на этой неделе акции Газпрома наконец-то...",GAZP,Positive,1,Энергетика,Газпром,«Газпром» — глобальная энергетическая компания...
3,2021-08-24 08:04:25+00:00,​​Не один раз в этом году на всех уровнях я то...,GAZP,Positive,1,Энергетика,Газпром,«Газпром» — глобальная энергетическая компания...
4,2021-08-20 09:37:02+00:00,Сегодня состоится последний визит в Москву Анг...,GAZP,Positive,1,Энергетика,Газпром,«Газпром» — глобальная энергетическая компания...
...,...,...,...,...,...,...,...,...
2575,2019-09-27 10:15:06+00:00,Новый глава Башкирии Радий Хабиров затевает во...,TATN,Negative,2,Энергетика,Татнефть,Татнефть — одна из крупнейших российских нефтя...
2576,2020-12-25 06:00:06+00:00,Основные экономически события за 24 часа: ПАО ...,VTBR,Neutral,0,Финансовый сектор,ВТБ,Банк ВТБ — второй банк по величине активов в Р...
2577,2020-12-25 06:00:06+00:00,Основные экономически события за 24 часа: ПАО ...,MTSS,Neutral,0,Телекоммуникации,МТС,МТС — цифровая экосистема и ведущая компания в...
2578,2021-09-14 18:59:13+00:00,Акции главных американских нефтяных мейджеров ...,LKOH,Negative,2,Энергетика,ЛУКОЙЛ,ЛУКОЙЛ — одна из крупнейших нефтегазовых компа...


In [20]:
for index in range(len(final_2)):
    t = final_2['ticker'][index]
    n = final_2['name'][index]
    d = final_2['describe'][index]
    s = final_2['sector'][index]
    m = final_2['message'][index]
    final_2.loc[index, "message"] = f'[{t}, {n}, {s}][SEP]{m}'

In [22]:
final_2["message"][0]

'[NVTK, НОВАТЭК, Энергетика][SEP]А акции Новатэка так вообще смогли закрыть утренний гэп на открытии. Не видно совсем паники в акциях российских "газовиков".'

In [25]:
tokens_model_embed = tokenizer(final_2['message'].tolist(), padding=True, return_tensors='pt')

text_code = torch.unsqueeze(tokens_model_embed['input_ids'], dim=1)
masked = torch.unsqueeze(tokens_model_embed['attention_mask'], dim=1)
Y_sep = torch.from_numpy(final_2['label'].to_numpy()).long()
X_sep = torch.cat((text_code, masked), dim=1)

INDEX = final_2.label.value_counts().min()

torch.manual_seed(12312)
y_negative_index = (Y_sep == 2).nonzero().reshape((-1))
y_negative_index_rand = torch.randperm(y_negative_index.shape[0])
y_negative_index = y_negative_index[y_negative_index_rand][:INDEX]
torch.manual_seed(12312)
y_positive_index = (Y_sep == 1).nonzero().reshape((-1))
y_positive_index_rand = torch.randperm(y_positive_index.shape[0])
y_positive_index = y_positive_index[y_positive_index_rand][:INDEX]
torch.manual_seed(12312)
y_neutral_index = (Y_sep == 0).nonzero().reshape((-1))
y_neutral_index_rand = torch.randperm(y_neutral_index.shape[0])
y_neutral_index = y_neutral_index[y_neutral_index_rand][:INDEX]

X_sep = X_sep[torch.cat((y_negative_index, y_positive_index, y_neutral_index))]
Y_sep = Y_sep[torch.cat((y_negative_index, y_positive_index, y_neutral_index))]

x_train_sep, x_test_sep, y_train_sep, y_test_sep = train_test_split(X_sep, Y_sep, test_size=0.15, random_state=12312, shuffle=True)

loader_train_final_2 = DataLoader(TensorDataset(x_train_sep, y_train_sep), batch_size=batch_size)
loader_test_final_2 = DataLoader(TensorDataset(x_test_sep, y_test_sep), batch_size=batch_size)

# weight_ce_cep = 1 / (y_train_sep.sum(dim=0) / y_train_sep.sum(dim=0).sum()) / (1 / (y_train_sep.sum(dim=0) / y_train_sep.sum(dim=0).sum())).sum()
# weight_ce_cep = torch.tensor([0.3, 0.4, 0.4])

In [32]:
model = model.to(device)
model.gradient_checkpointing_enable()
lr = 5e-5
weight_decay = 1e-2
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
losser = nn.CrossEntropyLoss()
scaler = torch.amp.GradScaler(device = device)

In [33]:
epochs = 2

for epoch in range(epochs):
    stat_loss_train = 0
    train_accuracy = 0
    stat_loss_test = 0
    test_accuracy = 0

    for x, y in tqdm(loader_train_final_2):
        optimizer.zero_grad()
        text, mask, target = x[:, 0, :].to(device), x[:, 1, :].to(device), y.to(device)
        with autocast(device_type="cuda", dtype=torch.float16):
            pred = model(text, attention_mask=mask).logits
            loss = losser(pred, target)
        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
        train_accuracy += pred.argmax(dim=1).eq(target).sum().item()
        stat_loss_train += loss.item()

    for x, y in tqdm(loader_test_final_2):
        text, mask, target = x[:, 0, :].to(device), x[:, 1, :].to(device), y.to(device)
        with torch.no_grad(), autocast(device_type="cuda", dtype=torch.float16):
            pred = model(text, attention_mask=mask).logits
            loss = losser(pred, target)
            scaler.scale(loss)
        stat_loss_test += loss.item()   
        test_accuracy += pred.argmax(dim=1).eq(target).sum().item()

    train_accuracy = train_accuracy / len(loader_train_final_2.dataset)
    test_accuracy = test_accuracy / len(loader_test_final_2.dataset)  

    print(f'Epoch: {epoch+1}', 
            f'train loss:{stat_loss_train}', 
            f'test loss:{stat_loss_test}', 
            f'train accuracy: {round(train_accuracy, 4)}', 
            f'test accuracy: {round(test_accuracy, 4)}', 
            f'koef_train_test: {round(stat_loss_train / stat_loss_test, 4)}',
            sep=', ')
    
    torch.save(model, f'{model_name}/222_mt5_fin2_{round(test_accuracy, 4)}_{round(train_accuracy, 4)}_epoch_{epoch}')

100%|██████████| 58/58 [01:51<00:00,  1.92s/it]
100%|██████████| 11/11 [00:04<00:00,  2.61it/s]


Epoch: 1, train loss:44.75699362158775, test loss:7.419869422912598, train accuracy: 0.6683, test accuracy: 0.7184, koef_train_test: 6.032


100%|██████████| 58/58 [01:54<00:00,  1.98s/it]
100%|██████████| 11/11 [00:04<00:00,  2.59it/s]


Epoch: 2, train loss:30.127362623810768, test loss:7.244628369808197, train accuracy: 0.7924, test accuracy: 0.7469, koef_train_test: 4.1586


# mt-cleaned-labeled5.csv

In [7]:
mt_cleandet_labeled5 = pd.read_csv("data/mt-cleaned-labeled5.csv")#.drop(columns=["idx", "channel_url", "views", "forwards"])
mt_cleandet_labeled5["label"] = mt_cleandet_labeled5["label"].apply(lambda row: 2 if row == -1 else row)
mt_cleandet_labeled5.drop_duplicates(inplace=True)
mt_cleandet_labeled5.dropna(inplace=True)
mt_cleandet_labeled5.reset_index(inplace=True, drop=True)
mt_cleandet_labeled5

Unnamed: 0,idx,channel_url,date,message,views,forwards,label
0,1,https://t.me/markettwits,2022-04-06 11:52:33+00:00,Япония снимет ограничения на въезд для 106 стр...,5117.0,1.0,1
1,31,https://t.me/markettwits,2022-04-06 09:54:15+00:00,Китай - госкомпании выкупают активы проблемных...,35066.0,1.0,1
2,108,https://t.me/markettwits,2022-04-06 04:40:10+00:00,ВОЗ сообщила о снижении на 43% смертности от к...,39175.0,38.0,1
3,149,https://t.me/markettwits,2022-04-05 16:06:32+00:00,"iRemit (партнер Ripple, использует Ripple ODL)...",54555.0,47.0,1
4,164,https://t.me/markettwits,2022-04-05 14:48:52+00:00,сд Позитив рекомендовал дивиденды 1кв 2022г = ...,52335.0,123.0,1
...,...,...,...,...,...,...,...
88322,94039,https://t.me/markettwits,2018-01-18 06:57:04+00:00,НЕФТЬ - DUMB MONEY - COT data Хедж-фонды на би...,1056.0,0.0,2
88323,94406,https://t.me/markettwits,2018-01-11 05:06:39+00:00,КРИПТО - РЕГУЛИРОВАНИЕ - Ю.КОРЕЯ Reuters пишет...,859.0,0.0,2
88324,94430,https://t.me/markettwits,2018-01-10 15:37:45+00:00,КРИПТО - МАЙНИНГ - КИТАЙ CHINA QUIETLY ORDERS ...,5.0,0.0,2
88325,94436,https://t.me/markettwits,2018-01-10 14:54:11+00:00,КИТАЙ - США - БОНДЫ GROSS: RECENT EVIDENCE SHO...,954.0,0.0,2


In [12]:
mins_sample = mt_cleandet_labeled5.label.value_counts().min()
mt_cleandet_labeled5 = mt_cleandet_labeled5.groupby(by='label', group_keys=False).apply(lambda x: x.sample(mins_sample, random_state=12312))

In [13]:
tokens_model_embed_1 = tokenizer(mt_cleandet_labeled5['message'].tolist(), padding=True, return_tensors='pt', max_length=1536)

text_code = torch.unsqueeze(tokens_model_embed_1['input_ids'], dim=1)
masked = torch.unsqueeze(tokens_model_embed_1['attention_mask'], dim=1)
Y_news = torch.from_numpy(mt_cleandet_labeled5[['label']].to_numpy()).long().reshape(-1)
X_news = torch.cat((text_code, masked), dim=1)

x_train_news, x_test_news, y_train_news, y_test_news = train_test_split(X_news, Y_news, test_size=0.15, random_state=12312, shuffle=True)

loader_train_mt_cleandet_labeled5 = DataLoader(TensorDataset(x_train_news, y_train_news), batch_size=batch_size)
loader_test_mt_cleandet_labeled5 = DataLoader(TensorDataset(x_test_news, y_test_news), batch_size=batch_size)



In [14]:
model = model.to(device)
model.gradient_checkpointing_enable()
lr = 3e-5
weight_decay = 1e-2
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
losser = nn.CrossEntropyLoss()
scaler = torch.amp.GradScaler(device = device)

In [19]:
epochs = 4

for epoch in range(epochs):
    stat_loss_train = 0
    train_accuracy = 0
    stat_loss_test = 0
    test_accuracy = 0

    for x, y in tqdm(loader_train_mt_cleandet_labeled5):
        optimizer.zero_grad()
        text, mask, target = x[:, 0, :].to(device), x[:, 1, :].to(device), y.to(device)
        with autocast(device_type="cuda", dtype=torch.float16):
            pred = model(text, attention_mask=mask).logits
            loss = losser(pred, target)
        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()
        train_accuracy += pred.argmax(dim=1).eq(target).sum().item()
        stat_loss_train += loss.item()

    for x, y in tqdm(loader_test_mt_cleandet_labeled5):
        text, mask, target = x[:, 0, :].to(device), x[:, 1, :].to(device), y.to(device)
        with torch.no_grad(), autocast(device_type="cuda", dtype=torch.float16):
            pred = model(text, attention_mask=mask).logits
            loss = losser(pred, target)
        scaler.scale(loss)
        stat_loss_test += loss.item()   
        test_accuracy += pred.argmax(dim=1).eq(target).sum().item()

    train_accuracy = train_accuracy / len(loader_train_mt_cleandet_labeled5.dataset)
    test_accuracy = test_accuracy / len(loader_test_mt_cleandet_labeled5.dataset)  

    print(f'Epoch: {epoch+1}', 
            f'train loss:{stat_loss_train}', 
            f'test loss:{stat_loss_test}', 
            f'train accuracy: {round(train_accuracy, 4)}', 
            f'test accuracy: {round(test_accuracy, 4)}', 
            f'koef_train_test: {round(stat_loss_train / stat_loss_test, 4)}',
            sep=', ')
    
    torch.save(model, f'{model_name}/mt5_context-100_{round(test_accuracy, 4)}_{round(train_accuracy, 4)}_epoch_{epoch}')

100%|██████████| 1716/1716 [24:00<00:00,  1.19it/s]
100%|██████████| 303/303 [00:57<00:00,  5.27it/s]


Epoch: 1, train loss:1123.221481487155, test loss:167.85613371431828, train accuracy: 0.7197, test accuracy: 0.7744, koef_train_test: 6.6916


100%|██████████| 1716/1716 [22:43<00:00,  1.26it/s]
100%|██████████| 303/303 [00:52<00:00,  5.76it/s]


Epoch: 2, train loss:783.0999687686563, test loss:168.4883097782731, train accuracy: 0.8172, test accuracy: 0.7895, koef_train_test: 4.6478


  1%|          | 16/1716 [00:13<23:47,  1.19it/s]


KeyboardInterrupt: 