### Imports

In [64]:
import os
import random
import re
from os import getenv

import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from pymorphy3 import MorphAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from gensim.models import KeyedVectors
from huggingface_hub import hf_hub_download

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
import torch
from torch.nn import CrossEntropyLoss
from torch.optim import AdamW, Adam

from sklearn.metrics import accuracy_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset, random_split

from tqdm import tqdm

from dotenv import load_dotenv

nltk.download("stopwords")

load_dotenv()

HF_TOKEN = getenv("HF_TOKEN")

random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alexandra\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<torch._C.Generator at 0x1c3d48a69b0>

### Loading data

In [65]:
data = pd.read_excel('../ru_data_test.xlsx')
data.head()

Unnamed: 0,sentence,object,tonality
0,Схему обнаружили аналитики департамента Digita...,Digital Risk Protection,positive
1,Лидеры ВОГ не избавлялись от непрофильных акти...,ВОГ,negative
2,Полиция задержала президента общества глухих С...,ВОГ,negative
3,Американская технологическая компания Google о...,Google,positive
4,"В январе в Microsoft заявили, что киберпреступ...",Microsoft,neutral


In [66]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 111 entries, 0 to 110
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  111 non-null    object
 1   object    111 non-null    object
 2   tonality  111 non-null    object
dtypes: object(3)
memory usage: 2.7+ KB


In [67]:
data.describe()

Unnamed: 0,sentence,object,tonality
count,111,111,111
unique,111,73,3
top,Схему обнаружили аналитики департамента Digita...,Positive Technologies,positive
freq,1,15,56


In [68]:
data['tonality'].unique()

array(['positive', 'negative', 'neutral'], dtype=object)

### Preprocessing

In [69]:
morph = MorphAnalyzer()
stop_words = set(stopwords.words("russian"))

In [71]:
def preprocess_text(sentence, object_name):
    pattern = rf'\b{re.escape(object_name)}\b'

    sentence = re.sub(pattern, '[COMPANY]', sentence, flags=re.IGNORECASE)
    sentence = sentence.lower()
    sentence = re.sub(r'[^\w\s]', '', sentence)

    tokens = sentence.split()
    tokens = [morph.parse(word)[0].normal_form for word in tokens if word not in stop_words]
    
    processed_sentence = ' '.join(tokens).replace("company", "[COMPANY]")
    return processed_sentence

In [72]:
data["processed_sentence"] = data.apply(lambda row: preprocess_text(row["sentence"], row["object"]), axis=1)

In [73]:
sentiment_mapping = {"positive": 0, "neutral": 1, "negative": 2}
data["tonality_numeric"] = data["tonality"].map(sentiment_mapping)

In [74]:
data.head()

Unnamed: 0,sentence,object,tonality,processed_sentence,tonality_numeric
0,Схему обнаружили аналитики департамента Digita...,Digital Risk Protection,positive,схема обнаружить аналитик департамент [COMPANY...,0
1,Лидеры ВОГ не избавлялись от непрофильных акти...,ВОГ,negative,лидер [COMPANY] избавляться непрофильный актив...,2
2,Полиция задержала президента общества глухих С...,ВОГ,negative,полиция задержать президент общество глухой ст...,2
3,Американская технологическая компания Google о...,Google,positive,американский технологический компания [COMPANY...,0
4,"В январе в Microsoft заявили, что киберпреступ...",Microsoft,neutral,январь [COMPANY] заявить киберпреступник получ...,1


В Sentence заменил название компании из object на [COMPANY], чтобы более явно указать точку для анализа, удалил незначемые символы и привел все в Lowercase. Для значений тональности сделал численные аналоги

### Model

Настроим 3 модели и выберем лучшую из них.

Выбирать будем из логистической регрессии, LSTM и RuBERT

Каждая из них требует разного количества ресурсов и предоставит ответы разной точности. Одним из критериев выбора будет мощность моего компьютера, но предполагаю, что RuBert даст более точный результат.

#### Logistic Regression с TF-IDF векторизацией

In [75]:
X_train, X_test, y_train, y_test = train_test_split(data["processed_sentence"], data["tonality_numeric"], test_size=0.2, random_state=42)

In [76]:
tfidf_vectorizer = TfidfVectorizer(max_features=100)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [77]:
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_tfidf, y_train)

In [78]:
y_pred_log_reg = log_reg.predict(X_test_tfidf)
accuracy_log_reg = accuracy_score(y_test, y_pred_log_reg)
f1_log_reg = f1_score(y_test, y_pred_log_reg, average='weighted')

print(f"Logistic Regression - Accuracy: {accuracy_log_reg}, F1 Score: {f1_log_reg}")

Logistic Regression - Accuracy: 0.43478260869565216, F1 Score: 0.35253801670593277


достаточно низкие значения точности. либо модель слабовата, либо данных мало для нее

с простейшей логистической регрессией не прокатило)

#### LSTM

In [79]:
max_words = 10000
max_len = 100

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [80]:
X_train_padded = pad_sequences(X_train_seq, maxlen=max_len)
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len)

In [81]:
lstm_model = Sequential([
    Embedding(max_words, 128),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(3, activation='softmax')  # 3 класса: positive, neutral, negative
])

In [82]:
lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_train_padded, y_train, epochs=10, batch_size=16, validation_split=0.2)


Epoch 1/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 129ms/step - accuracy: 0.4106 - loss: 1.0918 - val_accuracy: 0.4444 - val_loss: 1.0932
Epoch 2/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.5642 - loss: 1.0384 - val_accuracy: 0.4444 - val_loss: 1.1033
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.5894 - loss: 0.9335 - val_accuracy: 0.4444 - val_loss: 1.2011
Epoch 4/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step - accuracy: 0.5668 - loss: 0.8651 - val_accuracy: 0.4444 - val_loss: 1.2088
Epoch 5/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.5468 - loss: 0.8691 - val_accuracy: 0.4444 - val_loss: 1.1933
Epoch 6/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.6613 - loss: 0.7794 - val_accuracy: 0.4444 - val_loss: 1.2130
Epoch 7/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x1c3870727b0>

In [83]:
y_pred_lstm = np.argmax(lstm_model.predict(X_test_padded), axis=-1)
accuracy_lstm = accuracy_score(y_test, y_pred_lstm)
f1_lstm = f1_score(y_test, y_pred_lstm, average='weighted')

print(f"LSTM - Accuracy: {accuracy_lstm}, F1 Score: {f1_lstm}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 234ms/step
LSTM - Accuracy: 0.43478260869565216, F1 Score: 0.306919121084619


здесь можем заметить, что при обучении валидационная точность почти не меняется. Вероятно модели не за что "зацепиться"

стоит попробовать веркторизировать признаки

#### RuBERT

In [94]:
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased", token=HF_TOKEN)
special_tokens_dict = {'additional_special_tokens': ['[COMPANY]']}
tokenizer.add_special_tokens(special_tokens_dict)

rubert_model = AutoModelForSequenceClassification.from_pretrained("DeepPavlov/rubert-base-cased", num_labels=3)
rubert_model.resize_token_embeddings(len(tokenizer))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at DeepPavlov/rubert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(119548, 768, padding_idx=0)

In [95]:
def tokenize_data(sentences, labels):
    encodings = tokenizer(sentences, truncation=True, padding=True, max_length=128, return_tensors='pt')
    input_ids = encodings['input_ids']
    attention_mask = encodings['attention_mask']
    labels = torch.tensor(labels)
    return TensorDataset(input_ids, attention_mask, labels)

In [96]:
dataset = tokenize_data(X_train.tolist(), y_train.tolist())
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
test_dataset = tokenize_data(X_test.tolist(), y_test.tolist())

In [97]:
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(class_weights, dtype=torch.float)

In [98]:
optimizer = AdamW(rubert_model.parameters(), lr=1e-5, weight_decay=0.01)
total_steps = len(train_loader) * 10  # 10 эпох
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
criterion = CrossEntropyLoss(weight=class_weights)

In [99]:
rubert_model.train()
for epoch in range(10):
    total_loss = 0
    rubert_model.train()
    for batch in tqdm(train_loader, desc=f"Epoch {epoch + 1}"):
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        outputs = rubert_model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1} Loss: {total_loss / len(train_loader):.4f}")

    rubert_model.eval()
    val_preds, val_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = batch
            outputs = rubert_model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            val_preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            val_labels.extend(labels.cpu().numpy())
    val_accuracy = accuracy_score(val_labels, val_preds)
    print(f"Epoch {epoch + 1} Validation Accuracy: {val_accuracy:.4f}")

Epoch 1: 100%|██████████| 9/9 [00:28<00:00,  3.17s/it]


Epoch 1 Loss: 1.0331
Epoch 1 Validation Accuracy: 0.5000


Epoch 2: 100%|██████████| 9/9 [00:24<00:00,  2.74s/it]


Epoch 2 Loss: 0.9741
Epoch 2 Validation Accuracy: 0.5000


Epoch 3: 100%|██████████| 9/9 [00:26<00:00,  2.90s/it]


Epoch 3 Loss: 0.9264
Epoch 3 Validation Accuracy: 0.5000


Epoch 4: 100%|██████████| 9/9 [00:24<00:00,  2.78s/it]


Epoch 4 Loss: 0.8633
Epoch 4 Validation Accuracy: 0.5000


Epoch 5: 100%|██████████| 9/9 [00:26<00:00,  2.89s/it]


Epoch 5 Loss: 0.7945
Epoch 5 Validation Accuracy: 0.5000


Epoch 6: 100%|██████████| 9/9 [00:24<00:00,  2.76s/it]


Epoch 6 Loss: 0.7227
Epoch 6 Validation Accuracy: 0.5000


Epoch 7: 100%|██████████| 9/9 [00:26<00:00,  2.91s/it]


Epoch 7 Loss: 0.6368
Epoch 7 Validation Accuracy: 0.5000


Epoch 8: 100%|██████████| 9/9 [00:24<00:00,  2.76s/it]


Epoch 8 Loss: 0.6124
Epoch 8 Validation Accuracy: 0.5000


Epoch 9: 100%|██████████| 9/9 [00:25<00:00,  2.80s/it]


Epoch 9 Loss: 0.5313
Epoch 9 Validation Accuracy: 0.5556


Epoch 10: 100%|██████████| 9/9 [00:25<00:00,  2.78s/it]


Epoch 10 Loss: 0.5523
Epoch 10 Validation Accuracy: 0.5556


In [100]:
rubert_model.eval()
preds, true_labels = [], []
with torch.no_grad():
    for batch in test_loader:
        input_ids, attention_mask, labels = batch
        outputs = rubert_model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

In [101]:
accuracy = accuracy_score(true_labels, preds)
print(f"Test Accuracy: {accuracy:.4f}")

Test Accuracy: 0.5217


для руберта уже явно указал на что обращать внимание через специальный токен.

в результате точность выросла, но это все еще не то, что хотелось бы видеть

In [103]:
rubert_model.save_pretrained("./rubert_finetuned")
tokenizer.save_pretrained("./rubert_finetuned")

('./rubert_finetuned\\tokenizer_config.json',
 './rubert_finetuned\\special_tokens_map.json',
 './rubert_finetuned\\vocab.txt',
 './rubert_finetuned\\added_tokens.json',
 './rubert_finetuned\\tokenizer.json')

### LSTM (двунаправленная)

In [104]:
word2vec = KeyedVectors.load_word2vec_format(hf_hub_download(repo_id="Word2vec/wikipedia2vec_ruwiki_20180420_100d", filename="ruwiki_20180420_100d.txt"))

# Здесь можно улучшить качество, скачав файл 300d. Я этого делать не стал т.к. он весит в 2.5 раза больше

In [105]:
max_words = 10000
max_len = 100
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

In [106]:
X_train_padded = pad_sequences(X_train_seq, maxlen=max_len)
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len)

In [107]:
embedding_dim = word2vec.vector_size
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < max_words and word in word2vec:
        embedding_matrix[i] = word2vec[word]

In [108]:
lstm_model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False),
    Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0.3)),
    Dropout(0.3),
    Dense(3, activation='softmax')  # 3 класса: positive, neutral, negative
])

lstm_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])



In [109]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = lstm_model.fit(X_train_padded, y_train, epochs=20, batch_size=16, validation_split=0.2, callbacks=[early_stopping])
# history = lstm_model.fit(X_train_padded, y_train, epochs=20, batch_size=16, validation_split=0.2, callbacks=[early_stopping])

Epoch 1/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 173ms/step - accuracy: 0.3114 - loss: 1.0999 - val_accuracy: 0.4444 - val_loss: 1.1018
Epoch 2/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.4930 - loss: 1.0416 - val_accuracy: 0.4444 - val_loss: 1.1099
Epoch 3/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.5464 - loss: 0.9794 - val_accuracy: 0.4444 - val_loss: 1.1375
Epoch 4/20
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.5460 - loss: 0.9442 - val_accuracy: 0.4444 - val_loss: 1.1752


In [110]:
y_pred_lstm = np.argmax(lstm_model.predict(X_test_padded), axis=-1)
accuracy_lstm = accuracy_score(y_test, y_pred_lstm)
f1_lstm = f1_score(y_test, y_pred_lstm, average='weighted')

print(f"LSTM - Accuracy: {accuracy_lstm}, F1 Score: {f1_lstm}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 429ms/step
LSTM - Accuracy: 0.391304347826087, F1 Score: 0.22010869565217392


удивительно, но значения не меняются, даже если запускать с конфигурацией ранней остановки

из всего этого можно сделать вывод, что 100 строк данных попросту недостаточно для получения нормального результата

из сделанного, оставим модель руберта как самую удачную