In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch import nn

from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification
from sklearn.metrics import f1_score
from torch.optim.lr_scheduler import ExponentialLR

In [2]:
from torch import cuda

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


In [3]:
data = pd.read_csv('sent_clean.csv', encoding='utf-8')
data.head(2)

Unnamed: 0,MessageID,ChannelID,issuerid,SentimentScore,DateAdded,DatePosted,MessageText,IsForward,text_id,MessageTextClean
0,241407,1203560567,153,2,2023-05-12 19:03:20,2023-05-12 19:02:42,‚ö†Ô∏èüá∑üá∫#SELG #–¥–∏–≤–∏–¥–µ–Ω–¥ —Å–¥ –°–µ–ª–∏–≥–¥–∞—Ä: –¥–∏–≤–∏–¥–µ–Ω–¥—ã 20...,False,0,‚ö†Ô∏èüá∑üá∫#SELG #–¥–∏–≤–∏–¥–µ–Ω–¥ —Å–¥ –°–µ–ª–∏–≥–¥–∞—Ä: –¥–∏–≤–∏–¥–µ–Ω–¥—ã 202...
1,33684,1136626166,230,4,2023-02-03 20:56:29,2023-02-03 16:46:34,Ozon –ø—Ä–æ–¥–æ–ª–∂–∞–µ—Ç —Ä–∞–∑–≤–∏–≤–∞—Ç—å —Å–ø–µ—Ü–∏–∞–ª–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω—ã–µ —Ñ...,False,1,Ozon –ø—Ä–æ–¥–æ–ª–∂–∞–µ—Ç —Ä–∞–∑–≤–∏–≤–∞—Ç—å —Å–ø–µ—Ü–∏–∞–ª–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω—ã–µ —Ñ...


In [4]:
data = data[['issuerid', 'SentimentScore', 'MessageTextClean']]
data.head()

Unnamed: 0,issuerid,SentimentScore,MessageTextClean
0,153,2,‚ö†Ô∏èüá∑üá∫#SELG #–¥–∏–≤–∏–¥–µ–Ω–¥ —Å–¥ –°–µ–ª–∏–≥–¥–∞—Ä: –¥–∏–≤–∏–¥–µ–Ω–¥—ã 202...
1,230,4,Ozon –ø—Ä–æ–¥–æ–ª–∂–∞–µ—Ç —Ä–∞–∑–≤–∏–≤–∞—Ç—å —Å–ø–µ—Ü–∏–∞–ª–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω—ã–µ —Ñ...
2,118,4,–§–æ–∫—É—Å—ã –ø—Ä–æ–¥–æ–ª–∂–∞—é—Ç—Å—èüî•–ê–∫—Ü–∏–∏ –∏ –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏ üìà–í–¢–ë +5...
3,220,5,–§–æ–∫—É—Å—ã –ø—Ä–æ–¥–æ–ª–∂–∞—é—Ç—Å—èüî•–ê–∫—Ü–∏–∏ –∏ –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏ üìà–í–¢–ë +5...
4,89,2,Windfall Tax ‚Äî –Ω–∞–ª–æ–≥ –Ω–∞ —Å–≤–µ—Ä—Ö–ø—Ä–∏–±—ã–ª—å. –ö–∞–∫–∏–µ –∫–æ...


In [5]:
issuers = pd.read_excel('issuers.xlsx')
issuers.head(2)

Unnamed: 0.1,Unnamed: 0,issuerid,EMITENT_FULL_NAME,datetrackstart,datetrackend,BGTicker,OtherTicker
0,0,1,"""–ê–∫—Ü–∏–æ–Ω–µ—Ä–Ω—ã–π –∫–æ–º–º–µ—Ä—á–µ—Å–∫–∏–π –±–∞–Ω–∫ ""–î–µ—Ä–∂–∞–≤–∞"" –ø—É–±–ª–∏...",2021-06-02 12:47:55.100,,,
1,1,2,"""–ú–û–°–ö–û–í–°–ö–ò–ô –ö–†–ï–î–ò–¢–ù–´–ô –ë–ê–ù–ö"" (–ø—É–±–ª–∏—á–Ω–æ–µ –∞–∫—Ü–∏–æ–Ω–µ...",2021-06-02 12:47:55.100,,CBOM RX,


In [6]:
# Attach each issuer's full name to the message rows. A left join keeps every
# message row, including ones whose issuerid has no match in `issuers`.
data = pd.merge(
    data,
    issuers[['issuerid', 'EMITENT_FULL_NAME']],
    how='left',
    on='issuerid',
)
data.head()

Unnamed: 0,issuerid,SentimentScore,MessageTextClean,EMITENT_FULL_NAME
0,153,2,‚ö†Ô∏èüá∑üá∫#SELG #–¥–∏–≤–∏–¥–µ–Ω–¥ —Å–¥ –°–µ–ª–∏–≥–¥–∞—Ä: –¥–∏–≤–∏–¥–µ–Ω–¥—ã 202...,"–ü—É–±–ª–∏—á–Ω–æ–µ –∞–∫—Ü–∏–æ–Ω–µ—Ä–Ω–æ–µ –æ–±—â–µ—Å—Ç–≤–æ ""–°–µ–ª–∏–≥–¥–∞—Ä"""
1,230,4,Ozon –ø—Ä–æ–¥–æ–ª–∂–∞–µ—Ç —Ä–∞–∑–≤–∏–≤–∞—Ç—å —Å–ø–µ—Ü–∏–∞–ª–∏–∑–∏—Ä–æ–≤–∞–Ω–Ω—ã–µ —Ñ...,–û–∑–æ–Ω –•–æ–ª–¥–∏–Ω–≥—Å –ü–∏–≠–ª–°–∏ (—ç–º–∏—Ç–µ–Ω—Ç –¥–µ–ø–æ–∑–∏—Ç–∞—Ä–Ω—ã—Ö —Ä–∞—Å...
2,118,4,–§–æ–∫—É—Å—ã –ø—Ä–æ–¥–æ–ª–∂–∞—é—Ç—Å—èüî•–ê–∫—Ü–∏–∏ –∏ –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏ üìà–í–¢–ë +5...,"–ü—É–±–ª–∏—á–Ω–æ–µ –∞–∫—Ü–∏–æ–Ω–µ—Ä–Ω–æ–µ –æ–±—â–µ—Å—Ç–≤–æ ""–ù–æ–≤–æ—Ä–æ—Å—Å–∏–π—Å–∫–∏–π..."
3,220,5,–§–æ–∫—É—Å—ã –ø—Ä–æ–¥–æ–ª–∂–∞—é—Ç—Å—èüî•–ê–∫—Ü–∏–∏ –∏ –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏ üìà–í–¢–ë +5...,Globaltrans Investment PLC (–ì–ª–æ–±–∞–ª—Ç—Ä–∞–Ω—Å –ò–Ω–≤–µ—Å—Ç...
4,89,2,Windfall Tax ‚Äî –Ω–∞–ª–æ–≥ –Ω–∞ —Å–≤–µ—Ä—Ö–ø—Ä–∏–±—ã–ª—å. –ö–∞–∫–∏–µ –∫–æ...,"–ü—É–±–ª–∏—á–Ω–æ–µ –∞–∫—Ü–∏–æ–Ω–µ—Ä–Ω–æ–µ –æ–±—â–µ—Å—Ç–≤–æ ""–ú–∞–≥–Ω–∏—Ç"""


In [7]:
data['SentimentScore'].value_counts()/data.shape[0]

SentimentScore
4    0.414899
3    0.377974
2    0.101626
5    0.081064
0    0.017548
1    0.006890
Name: count, dtype: float64

In [8]:
data['SentimentScore'].value_counts()

SentimentScore
4    3854
3    3511
2     944
5     753
0     163
1      64
Name: count, dtype: int64

In [9]:
# Rebalance the label distribution before fine-tuning.
# Label 0 is folded into label 1: both are rare (163 and 64 rows in the counts above).
data.loc[data['SentimentScore'] == 0, 'SentimentScore'] = 1

# Down-sample the two dominant labels to 1500 rows each. The draw is seeded so
# that re-running the notebook selects the same rows; the later train/test split
# is seeded with 77 as well, so the whole pipeline becomes reproducible.
data4 = data.loc[data['SentimentScore'] == 4].sample(1500, random_state=77)
data3 = data.loc[data['SentimentScore'] == 3].sample(1500, random_state=77)

# Labels 1, 2 and 5 are kept in full.
otherdata = data.loc[(data['SentimentScore'] < 3) | (data['SentimentScore'] == 5)]
newdata = pd.concat([otherdata, data3, data4], axis=0)

In [10]:
newdata.shape

(4924, 4)

In [11]:
newdata.isnull().sum()

issuerid             0
SentimentScore       0
MessageTextClean     0
EMITENT_FULL_NAME    0
dtype: int64

In [12]:
newdata.head()

Unnamed: 0,issuerid,SentimentScore,MessageTextClean,EMITENT_FULL_NAME
0,153,2,‚ö†Ô∏èüá∑üá∫#SELG #–¥–∏–≤–∏–¥–µ–Ω–¥ —Å–¥ –°–µ–ª–∏–≥–¥–∞—Ä: –¥–∏–≤–∏–¥–µ–Ω–¥—ã 202...,"–ü—É–±–ª–∏—á–Ω–æ–µ –∞–∫—Ü–∏–æ–Ω–µ—Ä–Ω–æ–µ –æ–±—â–µ—Å—Ç–≤–æ ""–°–µ–ª–∏–≥–¥–∞—Ä"""
3,220,5,–§–æ–∫—É—Å—ã –ø—Ä–æ–¥–æ–ª–∂–∞—é—Ç—Å—èüî•–ê–∫—Ü–∏–∏ –∏ –∏–Ω–≤–µ—Å—Ç–∏—Ü–∏–∏ üìà–í–¢–ë +5...,Globaltrans Investment PLC (–ì–ª–æ–±–∞–ª—Ç—Ä–∞–Ω—Å –ò–Ω–≤–µ—Å—Ç...
4,89,2,Windfall Tax ‚Äî –Ω–∞–ª–æ–≥ –Ω–∞ —Å–≤–µ—Ä—Ö–ø—Ä–∏–±—ã–ª—å. –ö–∞–∫–∏–µ –∫–æ...,"–ü—É–±–ª–∏—á–Ω–æ–µ –∞–∫—Ü–∏–æ–Ω–µ—Ä–Ω–æ–µ –æ–±—â–µ—Å—Ç–≤–æ ""–ú–∞–≥–Ω–∏—Ç"""
5,127,2,Windfall Tax ‚Äî –Ω–∞–ª–æ–≥ –Ω–∞ —Å–≤–µ—Ä—Ö–ø—Ä–∏–±—ã–ª—å. –ö–∞–∫–∏–µ –∫–æ...,"–ü—É–±–ª–∏—á–Ω–æ–µ –∞–∫—Ü–∏–æ–Ω–µ—Ä–Ω–æ–µ –æ–±—â–µ—Å—Ç–≤–æ ""–ü–æ–ª—é—Å"""
6,150,2,Windfall Tax ‚Äî –Ω–∞–ª–æ–≥ –Ω–∞ —Å–≤–µ—Ä—Ö–ø—Ä–∏–±—ã–ª—å. –ö–∞–∫–∏–µ –∫–æ...,"–ü—É–±–ª–∏—á–Ω–æ–µ –∞–∫—Ü–∏–æ–Ω–µ—Ä–Ω–æ–µ –æ–±—â–µ—Å—Ç–≤–æ ""–°–±–µ—Ä–±–∞–Ω–∫ –†–æ—Å—Å–∏–∏"""


In [13]:
newdata['SentimentScore'].value_counts()

SentimentScore
4    1500
3    1500
2     944
5     753
1     227
Name: count, dtype: int64

In [14]:
# Shift the labels from 1..5 down to 0..4 so they can be used as class indices.
# NOTE: this cell is not idempotent — running it a second time shifts the labels again.
newdata['SentimentScore'] = newdata['SentimentScore'] - 1

In [15]:
# Drop rows with any missing value, modifying `newdata` in place.
# The isnull() check a few cells earlier reported zero missing values in every
# column, so on this data the call acts only as a safety net.
newdata.dropna(inplace=True)

In [16]:
# The tokenizer must come from the same checkpoint as the model it feeds: the
# classifier in this notebook is initialised from 'DeepPavlov/rubert-base-cased',
# so the vocabulary is loaded from that checkpoint too. Loading the
# '-conversational' vocabulary instead would produce token ids that do not
# correspond to the rows of the model's embedding table.
tokenizer = BertTokenizer.from_pretrained('DeepPavlov/rubert-base-cased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [17]:
# Hyper-parameters for tokenisation and training.
MAX_LEN = 512  # fixed token length per example; matches the 512 position embeddings in the model printout
TRAIN_BATCH_SIZE = 12  # batch size of the training DataLoader
VALID_BATCH_SIZE = 10  # batch size of the test DataLoader
EPOCHS = 2  # NOTE(review): not referenced in the visible cells — the train(...) call passes epochs=5 literally
LEARNING_RATE = 1e-05  # learning rate given to the Adam optimizer and passed to train(...)
MAX_GRAD_NORM = 10  # NOTE(review): not referenced in the visible cells — no gradient clipping is applied

In [18]:
'''text = newdata.loc[1, 'MessageTextClean']
text'''

"text = newdata.loc[1, 'MessageTextClean']\ntext"

In [19]:
'''name = newdata.loc[0, 'EMITENT_FULL_NAME']
name'''

"name = newdata.loc[0, 'EMITENT_FULL_NAME']\nname"

In [20]:
def make_prompt(text, name, tokenizer):
    """Build the token sequence ``[CLS] <name tokens> [SEP] <text tokens>``.

    Args:
        text: raw message text (a plain string).
        name: raw issuer name (a plain string).
        tokenizer: object whose ``tokenize(str)`` returns a list of tokens.

    Returns:
        list of str: the special tokens and sub-word tokens in model input order.
    """
    # Both inputs are raw strings, so they are tokenized as such. The previous
    # ``is_split_into_words=True`` flag declares pre-split word lists, which does
    # not match these inputs and is rejected by tokenizers that validate it.
    # NOTE(review): the sequence has no closing [SEP] after the text and no
    # segment ids are produced — kept as-is so the input format is unchanged.
    name_tokens = tokenizer.tokenize(name)
    text_tokens = tokenizer.tokenize(text)
    return ['[CLS]', *name_tokens, '[SEP]', *text_tokens]

In [21]:
class dataset(Dataset):
    """Map-style dataset turning (message, issuer name, label) rows into BERT inputs.

    Each item is the token sequence built by ``make_prompt``, truncated or
    padded with ``[PAD]`` to exactly ``max_len`` tokens.

    Args:
        dataframe: frame with the columns ``MessageTextClean``,
            ``EMITENT_FULL_NAME`` and ``SentimentScore``.
        tokenizer: object providing ``tokenize`` (used by ``make_prompt``) and
            ``convert_tokens_to_ids``.
        max_len: fixed number of tokens per example.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return the encoded example at position ``index``.

        Returns:
            dict with ``'ids'`` and ``'mask'`` (long tensors of length
            ``max_len``) and ``'target'`` (the label as a 0-d numpy array).
        """
        # step 1: read the row by POSITION. Samplers hand out positions
        # 0..len-1, which matches label-based lookup only when the frame has a
        # fresh RangeIndex; .iloc works for any index.
        text = self.data['MessageTextClean'].iloc[index]
        company_name = self.data['EMITENT_FULL_NAME'].iloc[index]
        label = self.data['SentimentScore'].iloc[index]

        # step 2: tokenize into "[CLS] name [SEP] text"
        tokenized_prompt = make_prompt(text, company_name, self.tokenizer)

        # step 3: truncate to max_len (dropping the tail of the text) or pad up to it
        maxlen = self.max_len
        if len(tokenized_prompt) > maxlen:
            tokenized_prompt = tokenized_prompt[:maxlen]
        else:
            tokenized_prompt = tokenized_prompt + ['[PAD]'] * (maxlen - len(tokenized_prompt))

        # step 4: attention mask — 1 for real tokens, 0 for padding
        attn_mask = [1 if tok != '[PAD]' else 0 for tok in tokenized_prompt]

        # step 5: convert tokens to input ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_prompt)

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(attn_mask, dtype=torch.long),
            'target': np.array(label)
        }

    def __len__(self):
        return self.len

In [22]:
train_size = 0.7

# Seeded random draw for the training rows; every row that was not drawn
# becomes test data. Both frames get a fresh 0..n-1 index afterwards.
train_dataset = newdata.sample(frac=train_size, random_state=77)
test_dataset = newdata.drop(index=train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print(f"FULL Dataset: {newdata.shape}")
print(f"TRAIN Dataset: {train_dataset.shape}")
print(f"TEST Dataset: {test_dataset.shape}")

# Wrap both frames in the tokenising Dataset class.
training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (4924, 4)
TRAIN Dataset: (3447, 4)
TEST Dataset: (1477, 4)


In [23]:
# Wrap the train/test frames in the tokenising Dataset class.
# NOTE(review): the same two objects are already built at the end of the
# previous cell, so this cell is redundant.
training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

In [24]:
# Batch sizes come from the hyper-parameter cell.
train_params = dict(batch_size=TRAIN_BATCH_SIZE)
test_params = dict(batch_size=VALID_BATCH_SIZE)

# Only the training loader shuffles; the test loader keeps the frame order.
training_loader = DataLoader(training_set, shuffle=True, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [25]:
'''class BertClassifier(nn.Module):

    def __init__(self, dropout=0.5):

        super(BertClassifier, self).__init__()

        model = BertForSequenceClassification.from_pretrained('DeepPavlov/rubert-base-cased')
        self.bert = model.bert
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 5)
        self.softmax = nn.Softmax()

    def forward(self, input_id, mask):

        pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        dropout_output = self.dropout(pooled_output[0][:,0,:])
        linear_output = self.linear(dropout_output)
        final_layer = self.softmax(linear_output)

        return final_layer'''

"class BertClassifier(nn.Module):\n\n    def __init__(self, dropout=0.5):\n\n        super(BertClassifier, self).__init__()\n\n        model = BertForSequenceClassification.from_pretrained('DeepPavlov/rubert-base-cased')\n        self.bert = model.bert\n        self.dropout = nn.Dropout(dropout)\n        self.linear = nn.Linear(768, 5)\n        self.softmax = nn.Softmax()\n\n    def forward(self, input_id, mask):\n\n        pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)\n        dropout_output = self.dropout(pooled_output[0][:,0,:])\n        linear_output = self.linear(dropout_output)\n        final_layer = self.softmax(linear_output)\n\n        return final_layer"

In [26]:
# Pre-trained RuBERT encoder with a 5-way classification head; the head's
# weights are newly initialised (see the loader message below).
model = BertForSequenceClassification.from_pretrained(
    'DeepPavlov/rubert-base-cased',
    num_labels=5,
)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [27]:
'''model = BertClassifier()
model.to(device)'''

'model = BertClassifier()\nmodel.to(device)'

In [28]:
# Adam over all model parameters with the learning rate from the config cell.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
# Place the loss on the selected device instead of calling .cuda(), which
# raises on CPU-only machines even though `device` falls back to 'cpu'.
criterion = nn.CrossEntropyLoss().to(device)

In [29]:
'''from tqdm import tqdm

def train(model, train_dataloader, val_dataloader, learning_rate, epochs):

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for idx, batch in tqdm(enumerate(training_loader)):

                train_label = batch['target'].to(device)
                mask = batch['mask'].to(device)
                input_id = batch['ids'].to(device)

                output = model(input_id, mask)

                batch_loss = criterion(output, train_label.long())
                total_loss_train += batch_loss.item()

                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()

            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for idx, val_batch in tqdm(enumerate(val_dataloader)):

                    val_label = val_batch['target'].to(device)
                    mask = val_batch['mask'].to(device)
                    input_id = val_batch['ids'].to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.long())
                    total_loss_val += batch_loss.item()

                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc

            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_dataset): .3f} \
                | Train Accuracy: {total_acc_train / len(train_dataset): .3f} \
                | Val Loss: {total_loss_val / len(test_dataset): .3f} \
                | Val Accuracy: {total_acc_val / len(test_dataset): .3f}')'''

"from tqdm import tqdm\n\ndef train(model, train_dataloader, val_dataloader, learning_rate, epochs):\n    \n    for epoch_num in range(epochs):\n        \n            total_acc_train = 0\n            total_loss_train = 0\n\n            for idx, batch in tqdm(enumerate(training_loader)):\n\n                train_label = batch['target'].to(device)\n                mask = batch['mask'].to(device)\n                input_id = batch['ids'].to(device)\n\n                output = model(input_id, mask)\n                \n                batch_loss = criterion(output, train_label.long())\n                total_loss_train += batch_loss.item()\n                \n                acc = (output.argmax(dim=1) == train_label).sum().item()\n                total_acc_train += acc\n\n                model.zero_grad()\n                batch_loss.backward()\n                optimizer.step()\n            \n            total_acc_val = 0\n            total_loss_val = 0\n\n            with torch.no_grad():\n\n 

In [30]:
from tqdm import tqdm

def train(model, train_dataloader, val_dataloader, learning_rate, epochs, optimizer=None):
    """Fine-tune ``model`` and report loss/accuracy on both loaders after every epoch.

    Args:
        model: module called as ``model(ids, mask, labels=...)`` whose output
            exposes ``.loss`` (mean loss over the batch) and ``.logits``.
        train_dataloader: iterable of batches with ``'ids'``, ``'mask'`` and
            ``'target'`` tensors, used for the optimisation steps.
        val_dataloader: iterable of batches in the same format, used for evaluation.
        learning_rate: learning rate of the Adam optimizer created when
            ``optimizer`` is not supplied.
        epochs: number of passes over ``train_dataloader``.
        optimizer: optional ready-made optimizer, e.g. to keep its state
            across several ``train`` calls.

    Returns:
        list of dict: one record per epoch with the keys ``epoch``,
        ``train_loss``, ``train_acc``, ``val_loss`` and ``val_acc``
        (losses are per-sample averages). The model is left in eval mode.
    """
    # Work on whatever device the model's weights already live on.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    if optimizer is None:
        optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

    history = []
    for epoch_num in range(epochs):
        # Validation switches to eval mode, so training mode is re-enabled every epoch.
        model.train()
        total_acc_train = 0
        total_loss_train = 0.0
        seen_train = 0

        for batch in tqdm(train_dataloader):
            train_label = batch['target'].to(device)
            mask = batch['mask'].to(device)
            input_id = batch['ids'].to(device)

            output = model(input_id, mask, labels=train_label)
            loss, tr_logits = output.loss, output.logits

            # The batch loss is a mean, so weight it by the batch size to get
            # a true per-sample average even when the last batch is smaller.
            batch_size = train_label.size(0)
            total_loss_train += loss.item() * batch_size
            total_acc_train += (tr_logits.argmax(dim=1) == train_label).sum().item()
            seen_train += batch_size

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Disable dropout for evaluation.
        model.eval()
        total_acc_val = 0
        total_loss_val = 0.0
        seen_val = 0

        with torch.no_grad():
            for val_batch in tqdm(val_dataloader):
                val_label = val_batch['target'].to(device)
                mask = val_batch['mask'].to(device)
                input_id = val_batch['ids'].to(device)

                output = model(input_id, mask, labels=val_label)
                loss, val_logits = output.loss, output.logits

                batch_size = val_label.size(0)
                total_loss_val += loss.item() * batch_size
                total_acc_val += (val_logits.argmax(dim=1) == val_label).sum().item()
                seen_val += batch_size

        # Guard against empty loaders when averaging.
        train_loss = total_loss_train / max(seen_train, 1)
        train_acc = total_acc_train / max(seen_train, 1)
        val_loss = total_loss_val / max(seen_val, 1)
        val_acc = total_acc_val / max(seen_val, 1)
        history.append({
            'epoch': epoch_num + 1,
            'train_loss': train_loss,
            'train_acc': train_acc,
            'val_loss': val_loss,
            'val_acc': val_acc,
        })

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {train_loss: .3f} \
                | Train Accuracy: {train_acc: .3f} \
                | Val Loss: {val_loss: .3f} \
                | Val Accuracy: {val_acc: .3f}')

    return history

In [31]:
len(training_loader)

288

In [32]:
len(testing_loader)

148

In [None]:
# Fine-tune the classifier on the training loader and evaluate on the test loader.
# NOTE(review): epochs=5 is passed literally; the EPOCHS constant (2) from the
# hyper-parameter cell is not used here.
train(model, training_loader, testing_loader, learning_rate=LEARNING_RATE, epochs=5)

15it [00:16,  1.14s/it]

In [None]:
# 4 - 0.54

In [36]:
# Serialise the whole model object to the file 'TSAbaseline'.
# NOTE(review): saving the full module pickles the object, so loading it later
# needs the same class definitions to be importable. Saving model.state_dict()
# or using model.save_pretrained(...) is usually more portable — confirm what
# downstream loaders expect before changing the format.
torch.save(model, 'TSAbaseline')