In [16]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch import nn
import re
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification
from sklearn.metrics import f1_score
from torch.optim.lr_scheduler import ExponentialLR

In [2]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [42]:
data = pd.read_csv('sentiment_text.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,MessageID,ChannelID,issuerid,SentimentScore,DateAdded,DatePosted,MessageText,IsForward
0,0,241407,1203560567,153,2,2023-05-12 19:03:20,2023-05-12 19:02:42,⚠️🇷🇺#SELG #дивиденд сд Селигдар: дивиденды 20...,False
1,1,33684,1136626166,230,4,2023-02-03 20:56:29,2023-02-03 16:46:34,Ozon продолжает развивать специализированные ф...,False
2,2,10090,1063908560,118,4,2023-06-02 19:18:37,2023-06-02 18:50:00,​Фокусы продолжаются🔥Акции и инвестиции 📈ВТБ ...,False
3,3,10090,1063908560,220,5,2023-06-02 19:18:37,2023-06-02 18:50:00,​Фокусы продолжаются🔥Акции и инвестиции 📈ВТБ ...,False
4,4,9826,1063908560,89,2,2023-04-24 17:51:38,2023-04-24 13:54:00,​​Windfall Tax — налог на сверхприбыль. Какие ...,False


In [44]:
data = data[['issuerid', 'SentimentScore', 'MessageText']]
data.head()

Unnamed: 0,issuerid,SentimentScore,MessageText
0,153,2,⚠️🇷🇺#SELG #дивиденд сд Селигдар: дивиденды 20...
1,230,4,Ozon продолжает развивать специализированные ф...
2,118,4,​Фокусы продолжаются🔥Акции и инвестиции 📈ВТБ ...
3,220,5,​Фокусы продолжаются🔥Акции и инвестиции 📈ВТБ ...
4,89,2,​​Windfall Tax — налог на сверхприбыль. Какие ...


In [46]:
issuers = pd.read_excel('issuers.xlsx')
issuers.head()

Unnamed: 0.1,Unnamed: 0,issuerid,EMITENT_FULL_NAME,datetrackstart,datetrackend,BGTicker,OtherTicker
0,0,1,"""Акционерный коммерческий банк ""Держава"" публи...",2021-06-02 12:47:55.100,,,
1,1,2,"""МОСКОВСКИЙ КРЕДИТНЫЙ БАНК"" (публичное акционе...",2021-06-02 12:47:55.100,,CBOM RX,
2,2,3,"""Российский акционерный коммерческий дорожный ...",2021-06-02 12:47:55.100,,,
3,3,4,"Акционерная компания ""АЛРОСА"" (публичное акцио...",2021-06-02 12:47:55.100,,ALRS RX,
4,4,5,"Акционерный Коммерческий банк ""АВАНГАРД"" - пуб...",2021-06-02 12:47:55.100,,,


In [47]:
data = data.merge(issuers[['issuerid', 'EMITENT_FULL_NAME']], on='issuerid', how='left')
data.head()

Unnamed: 0,issuerid,SentimentScore,MessageText,EMITENT_FULL_NAME
0,153,2,⚠️🇷🇺#SELG #дивиденд сд Селигдар: дивиденды 20...,"Публичное акционерное общество ""Селигдар"""
1,230,4,Ozon продолжает развивать специализированные ф...,Озон Холдингс ПиЭлСи (эмитент депозитарных рас...
2,118,4,​Фокусы продолжаются🔥Акции и инвестиции 📈ВТБ ...,"Публичное акционерное общество ""Новороссийский..."
3,220,5,​Фокусы продолжаются🔥Акции и инвестиции 📈ВТБ ...,Globaltrans Investment PLC (Глобалтранс Инвест...
4,89,2,​​Windfall Tax — налог на сверхприбыль. Какие ...,"Публичное акционерное общество ""Магнит"""


In [48]:
names = pd.read_pickle('company_names.pickle')
names.head()

Unnamed: 0,issuerid,l_syns
0,1,"[Держава, DERZP, DERZ, DERZHAVA, ""Акционерный ..."
1,2,"[CBOM RX, ""МОСКОВСКИЙ КРЕДИТНЫЙ БАНК"" (публичн..."
2,3,"[РДБанк, roads Bank, Российский акционерный ко..."
3,4,"[ALRS RX, alrosa, Акционерная компания ""АЛРОСА..."
4,5,"[AVANGARD, Авангард, AVAN, Акционерный Коммерч..."


In [49]:
names['l_syns'] = names['l_syns'].apply(lambda x: ', '.join(x))
names.head()

Unnamed: 0,issuerid,l_syns
0,1,"Держава, DERZP, DERZ, DERZHAVA, ""Акционерный к..."
1,2,"CBOM RX, ""МОСКОВСКИЙ КРЕДИТНЫЙ БАНК"" (публично..."
2,3,"РДБанк, roads Bank, Российский акционерный ком..."
3,4,"ALRS RX, alrosa, Акционерная компания ""АЛРОСА""..."
4,5,"AVANGARD, Авангард, AVAN, Акционерный Коммерче..."


In [50]:
df_names = pd.merge(data, names, on="issuerid", how="left")

In [51]:
df_names.head()

Unnamed: 0,issuerid,SentimentScore,MessageText,EMITENT_FULL_NAME,l_syns
0,153,2,⚠️🇷🇺#SELG #дивиденд сд Селигдар: дивиденды 20...,"Публичное акционерное общество ""Селигдар""","селигдар, SELG, Публичное акционерное общество..."
1,230,4,Ozon продолжает развивать специализированные ф...,Озон Холдингс ПиЭлСи (эмитент депозитарных рас...,"OZON RX, Озон Холдингс ПиЭлСи (эмитент депозит..."
2,118,4,​Фокусы продолжаются🔥Акции и инвестиции 📈ВТБ ...,"Публичное акционерное общество ""Новороссийский...","Публичное акционерное общество ""Новороссийский..."
3,220,5,​Фокусы продолжаются🔥Акции и инвестиции 📈ВТБ ...,Globaltrans Investment PLC (Глобалтранс Инвест...,Globaltrans Investment PLC (Глобалтранс Инвест...
4,89,2,​​Windfall Tax — налог на сверхприбыль. Какие ...,"Публичное акционерное общество ""Магнит""","Magnit , ПАО Магнит, Публичное акционерное общ..."


In [52]:
df_names.isnull().sum()

issuerid             0
SentimentScore       0
MessageText          1
EMITENT_FULL_NAME    3
l_syns               3
dtype: int64

In [53]:
df_names.dropna(inplace=True)

In [55]:
remove_question = lambda text: re.sub(r'\?{2,}', '', text)
df_names['MessageText'] = df_names['MessageText'].apply(remove_question)
remove_u200b = lambda text: re.sub(r'\u200b', '', text)
df_names['MessageText'] = df_names['MessageText'].apply(remove_u200b)
remove_backslash = lambda text: re.sub(r'\\[^ ]*', '', text)
df_names['MessageText'] = df_names['MessageText'].apply(remove_backslash)
remove_at = lambda text: re.sub(r'\@\S{2,}', '', text)
df_names['MessageText'] = df_names['MessageText'].apply(remove_at)
remove_site = lambda text: re.sub(r'http\S{2,}', '', text)
df_names['MessageText'] = df_names['MessageText'].apply(remove_site)
remove_space = lambda text: re.sub(r'\s{2,}', ' ', text)
df_names['MessageText'] = df_names['MessageText'].apply(remove_space)
remove_quotation = lambda text: re.sub(r'\"{2,}', '"', text)
df_names['MessageText'] = df_names['MessageText'].apply(remove_quotation)

In [56]:
data = df_names

In [57]:
data['SentimentScore'].value_counts()/data.shape[0]

SentimentScore
4    0.414755
3    0.378029
2    0.101669
5    0.081099
0    0.017555
1    0.006893
Name: count, dtype: float64

In [58]:
data['SentimentScore'].value_counts()

SentimentScore
4    3851
3    3510
2     944
5     753
0     163
1      64
Name: count, dtype: int64

In [59]:
data.loc[data['SentimentScore']==0, 'SentimentScore'] = 1
data4 = data.loc[data['SentimentScore']==4].sample(1500)
data3 = data.loc[data['SentimentScore']==3].sample(1500)
otherdata = data.loc[(data['SentimentScore']<3)|(data['SentimentScore']==5)]
newdata = pd.concat([otherdata, data3, data4], axis=0)

In [60]:
newdata.shape

(4924, 5)

In [62]:
newdata.head()

Unnamed: 0,issuerid,SentimentScore,MessageText,EMITENT_FULL_NAME,l_syns
0,153,2,⚠️🇷🇺#SELG #дивиденд сд Селигдар: дивиденды 202...,"Публичное акционерное общество ""Селигдар""","селигдар, SELG, Публичное акционерное общество..."
3,220,5,Фокусы продолжаются🔥Акции и инвестиции 📈ВТБ +5...,Globaltrans Investment PLC (Глобалтранс Инвест...,Globaltrans Investment PLC (Глобалтранс Инвест...
4,89,2,Windfall Tax — налог на сверхприбыль. Какие ко...,"Публичное акционерное общество ""Магнит""","Magnit , ПАО Магнит, Публичное акционерное общ..."
5,127,2,Windfall Tax — налог на сверхприбыль. Какие ко...,"Публичное акционерное общество ""Полюс""","Публичное акционерное общество ""Полюс"", PLZL R..."
6,150,2,Windfall Tax — налог на сверхприбыль. Какие ко...,"Публичное акционерное общество ""Сбербанк России""","Публичное акционерное общество ""Сбербанк Росси..."


In [63]:
newdata['SentimentScore'].value_counts()

SentimentScore
3    1500
4    1500
2     944
5     753
1     227
Name: count, dtype: int64

In [64]:
newdata['SentimentScore'] -= 1

In [66]:
newdata = newdata.drop(columns=['EMITENT_FULL_NAME'])

In [67]:
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased-conversational')

In [68]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 12
VALID_BATCH_SIZE = 10
EPOCHS = 2
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10

In [69]:
def make_prompt(text, name, tokenizer):
    prompt = ['[CLS]']
    prompt.extend(tokenizer.tokenize(name, is_split_into_words=True))
    prompt.append('[SEP]')
    prompt.extend(tokenizer.tokenize(text, is_split_into_words=True))
    return prompt

In [80]:
class dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # step 1: tokenize (and adapt corresponding labels)
        text = self.data.MessageText[index]
        company_name = self.data.l_syns[index]
        tokenized_prompt = make_prompt(text, company_name, self.tokenizer)
        label = self.data.SentimentScore[index]

        # step 3: truncating/padding
        maxlen = self.max_len

        if (len(tokenized_prompt) > maxlen):
          # truncate
          tokenized_prompt = tokenized_prompt[:maxlen]
        else:
          # pad
          tokenized_prompt = tokenized_prompt + ['[PAD]' for _ in range(maxlen - len(tokenized_prompt))]

        # step 4: obtain the attention mask
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_prompt]

        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_prompt)

        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              #'targets': torch.tensor(label_ids, dtype=torch.long)
              'target': np.array(label)
        }

    def __len__(self):
        return self.len

In [81]:
train_size = 0.7
train_dataset = newdata.sample(frac=train_size, random_state=77)
test_dataset = newdata.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(newdata.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (4924, 4)
TRAIN Dataset: (3447, 4)
TEST Dataset: (1477, 4)


In [82]:
training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

In [83]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                }

training_loader = DataLoader(training_set, shuffle=True, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [84]:
model = BertForSequenceClassification.from_pretrained('DeepPavlov/rubert-base-cased',
                                                      num_labels=5)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [85]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss().cuda()

In [86]:
from tqdm import tqdm

def train(model, train_dataloader, val_dataloader, learning_rate, epochs):
    model.train()
    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0
            for idx, batch in tqdm(enumerate(training_loader)):

                train_label = batch['target'].to(device)
                mask = batch['mask'].to(device)
                input_id = batch['ids'].to(device)

                output = model(input_id, mask, labels=train_label)
                loss, tr_logits = output.loss, output.logits
                #batch_loss = criterion(output, train_label.long())
                total_loss_train += loss.item()

                acc = (tr_logits.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                loss.backward()
                optimizer.step()

            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for idx, val_batch in tqdm(enumerate(val_dataloader)):

                    val_label = val_batch['target'].to(device)
                    mask = val_batch['mask'].to(device)
                    input_id = val_batch['ids'].to(device)

                    output = model(input_id, mask, labels=val_label)

                    #batch_loss = criterion(output, val_label.long())
                    loss, tr_logits = output.loss, output.logits
                    total_loss_val += loss.item()

                    acc = (tr_logits.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc

            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_dataset): .3f} \
                | Train Accuracy: {total_acc_train / len(train_dataset): .3f} \
                | Val Loss: {total_loss_val / len(test_dataset): .3f} \
                | Val Accuracy: {total_acc_val / len(test_dataset): .3f}')

In [87]:
len(training_loader)

288

In [88]:
len(testing_loader)

148

In [89]:
train(model, training_loader, testing_loader, learning_rate=LEARNING_RATE, epochs=5)

288it [05:28,  1.14s/it]
148it [00:55,  2.67it/s]


Epochs: 1 | Train Loss:  0.118                 | Train Accuracy:  0.357                 | Val Loss:  0.132                 | Val Accuracy:  0.410


288it [05:34,  1.16s/it]
148it [00:55,  2.66it/s]


Epochs: 2 | Train Loss:  0.102                 | Train Accuracy:  0.481                 | Val Loss:  0.122                 | Val Accuracy:  0.483


288it [05:34,  1.16s/it]
148it [00:55,  2.67it/s]


Epochs: 3 | Train Loss:  0.085                 | Train Accuracy:  0.587                 | Val Loss:  0.114                 | Val Accuracy:  0.522


288it [05:35,  1.16s/it]
148it [00:55,  2.68it/s]


Epochs: 4 | Train Loss:  0.069                 | Train Accuracy:  0.677                 | Val Loss:  0.114                 | Val Accuracy:  0.549


288it [05:34,  1.16s/it]
148it [00:56,  2.64it/s]

Epochs: 5 | Train Loss:  0.053                 | Train Accuracy:  0.770                 | Val Loss:  0.121                 | Val Accuracy:  0.534





In [39]:
torch.save(model, 'Bert_5')