# 7. Классификатор тональности

1. Использовать в классификации внешний словарь тональностей.
2. Улучшить качество базовой предсказательной модели на тестовой выборке за счет добавления и модификации признаков.
3. Сравнить качество классификации на леммах и подтокенах.
4. Обучить fasttext-классификатор, сравнить качество классификации с предобученными эмбеддингами и обученными с нуля при классификации.


- В качестве решения любого задания <b>не принимается</b> модель с качеством менее 62.00% макроусредненной F1 на тесте.
- <b>Можно</b> улучшать модели сверх предложенных условий: <b>добавлять свои признаки к указанным в задании</b>, изменять способ классификации и подбирать гиперпараметры.
- Тестовые данные можно использовать только при оценке моделей.


Данные для обучения моделей:

### Импорт датасета

In [None]:
!pip install spacy
!python -m spacy download ru_core_news_sm

Collecting ru-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ru_core_news_sm-3.8.0/ru_core_news_sm-3.8.0-py3-none-any.whl (15.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.3/15.3 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pymorphy3>=1.0.0 (from ru-core-news-sm==3.8.0)
  Downloading pymorphy3-2.0.3-py3-none-any.whl.metadata (1.9 kB)
Collecting dawg2-python>=0.8.0 (from pymorphy3>=1.0.0->ru-core-news-sm==3.8.0)
  Downloading dawg2_python-0.9.0-py3-none-any.whl.metadata (7.5 kB)
Collecting pymorphy3-dicts-ru (from pymorphy3>=1.0.0->ru-core-news-sm==3.8.0)
  Downloading pymorphy3_dicts_ru-2.4.417150.4580142-py2.py3-none-any.whl.metadata (2.0 kB)
Downloading pymorphy3-2.0.3-py3-none-any.whl (53 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.8/53.8 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dawg2_python-0.9.0-py3-none-any.whl (9.3 kB)
Downloading pymorphy3

In [None]:
! wget https://www.dropbox.com/s/t1gs701zvqaxqnk/rusentiment_random_posts.csv
! wget https://www.dropbox.com/s/gr4z1x39y1j6dtx/rusentiment_test.csv

--2025-06-04 10:46:51--  https://www.dropbox.com/s/t1gs701zvqaxqnk/rusentiment_random_posts.csv
Resolving www.dropbox.com (www.dropbox.com)... 162.125.3.18, 2620:100:6018:18::a27d:312
Connecting to www.dropbox.com (www.dropbox.com)|162.125.3.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://www.dropbox.com/scl/fi/y17smfk1ptufngw720uny/rusentiment_random_posts.csv?rlkey=p9e77phv8eu6fwh6tou0fz232 [following]
--2025-06-04 10:46:51--  https://www.dropbox.com/scl/fi/y17smfk1ptufngw720uny/rusentiment_random_posts.csv?rlkey=p9e77phv8eu6fwh6tou0fz232
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucf045c585fde7389ce1f30c530f.dl.dropboxusercontent.com/cd/0/inline/Cq9w8kmS62Z1mcQpzYC50AXMrmH-Plp6uCbzvhjE13pr0MkkeLStKTAqzhTSp1Ynw0JuboMcIQk3LSzlgPJx9zrAD7F3Oha2RluORE-QN-AKKZudQVqGKP6ESSpIPqBJ045JqiMQVAahDL22v3CQjAny/file# [following]
--2025-06-04 10:46:52--  https://ucf045c585fde7389ce1

In [None]:
import pandas as pd, re, numpy as np # pymorphy2,
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import spacy
nlp = spacy.load("ru_core_news_sm")
import re, pandas as pd, numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk; nltk.download('stopwords')

ru_stop = stopwords.words("russian")

# morph = pymorphy2.MorphAnalyzer()

df_train = pd.read_csv('rusentiment_random_posts.csv')
df_test  = pd.read_csv('rusentiment_test.csv')          # **использовать ТОЛЬКО для финальной оценки!**




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def clean(text):
    text = re.sub(r'http\S+|\W+', ' ', str(text).lower())
    return re.sub(r'\s+', ' ', text).strip()

def lemmatize(text):
    doc = nlp(text)
    lemmas = [token.lemma_ for token in doc]
    return ' '.join(lemmas)





In [None]:
df_train['lemmas'] = df_train['text'].map(clean).map(lemmatize)

In [None]:
df_test['lemmas'] = df_test['text'].map(clean).map(lemmatize)

In [None]:
!pip install nltk



### 1 . Добавляем признаки из внешнего словаря тональностей (RuSentiLex)

In [None]:
import urllib.request

url = 'https://www.labinform.ru/pub/rusentilex/rusentilex_2017.txt'
save_path = 'rusentilex_2017.txt'

urllib.request.urlretrieve(url, save_path)

('rusentilex_2017.txt', <http.client.HTTPMessage at 0x7d901c1e6dd0>)

In [None]:
lexicon = {}

with open("rusentilex_2017.txt", encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line or line.startswith('!'):
            continue  # пропускаем комментарии и пустые строки

        parts = [p.strip() for p in line.split(',')]
        if len(parts) < 4:
            continue  # пропускаем повреждённые строки

        lemma = parts[2].lower()
        polarity = parts[3].lower()

        # игнорируем амбивалентные (positive/negative) — чтобы не вносить шум
        if polarity in ['positive', 'negative', 'neutral']:
            lexicon[lemma] = polarity


In [None]:
print(f"Размер словаря lexicon: {len(lexicon)}")

unique_values = set()
for value in lexicon.values():
    unique_values.add(value)

print(f"Уникальные значения по всем ключам: {unique_values}")

Размер словаря lexicon: 13295
Уникальные значения по всем ключам: {'negative', 'neutral', 'positive'}


In [None]:
def lexicon_feats(text):
    pos = neg = 0
    for w in text.split():
        s = lexicon.get(w)
        if s == 'positive': pos += 1
        elif s == 'negative': neg += 1
    total = len(text.split()) or 1
    return pd.Series({'pos_cnt':pos, 'neg_cnt':neg, 'pos_ratio':pos/total, 'neg_ratio':neg/total})

lex_feats_train = df_train['lemmas'].apply(lexicon_feats)
lex_feats_test = df_test['lemmas'].apply(lexicon_feats)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from scipy.sparse import hstack

tfidf = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1,3),
    max_features=60000,
    min_df=2,
    sublinear_tf=True
)

X_train_tfidf = tfidf.fit_transform(df_train['lemmas'])
X_test_tfidf = tfidf.transform(df_test['lemmas'])

# нормализация лексиконных признаков
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_lex = scaler.fit_transform(lex_feats_train)
X_test_lex  = scaler.transform(lex_feats_test)

# объединение
from scipy.sparse import hstack
X_train_all = hstack([X_train_tfidf, X_train_lex])
X_test_all  = hstack([X_test_tfidf,  X_test_lex])
y_train = df_train['label']
y_test = df_test['label']



In [None]:
# LogisticRegression с более мощной регуляризацией
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=6.0, class_weight='balanced', solver='liblinear', max_iter=1200)
clf.fit(X_train_all, y_train)
y_pred = clf.predict(X_test_all)



In [None]:
from sklearn.metrics import f1_score
f1 = f1_score(y_test, y_pred, average='macro')
print(f1)

0.6250281887176338


### 2 . Улучшить качество базовой предсказательной модели на тестовой выборке за счет добавления и модификации признаков.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression

word_v = TfidfVectorizer(analyzer='word', ngram_range=(1,3), max_features=40_000,
                         sublinear_tf=True, min_df=3)
char_v = TfidfVectorizer(analyzer='char', ngram_range=(3,5), min_df=5)

X_word = word_v.fit_transform(df_train['lemmas'])
X_char = char_v.fit_transform(df_train['lemmas'])
X      = hstack([X_word, X_char, lex_feats_train.values])

X_tr, X_val, y_tr, y_val = train_test_split(X, df_train['label'], test_size=0.2, random_state=42, stratify=df_train['label'])

In [None]:
clf = LogisticRegression(solver='liblinear', C=4, class_weight='balanced', max_iter=300)
clf.fit(X_tr, y_tr)
y_pred = clf.predict(X_val)
print(f1_score(y_val, y_pred, average='macro')) # получилось с 1 раза, успешный успех

0.6293205946201473


### 3 .Сравнить качество классификации на леммах и подтокенах.

In [None]:
!pip install sentencepiece




In [None]:
def lexicon_feats(text):
    pos = neg = neu = 0
    for w in text.split():
        s = lexicon.get(w)
        if s == 'positive': pos += 1
        elif s == 'negative': neg += 1
        elif s == 'neutral':  neu += 1
    total = len(text.split()) or 1
    return pd.Series({
        'pos_cnt': pos, 'neg_cnt': neg, 'neu_cnt': neu,
        'pos_ratio': pos/total, 'neg_ratio': neg/total, 'neu_ratio': neu/total
    })


In [None]:
import pandas as pd, re, sentencepiece as spm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack



with open("corpus_lemmas.txt", "w", encoding="utf-8") as f:
    for line in df_train['lemmas']:
        f.write(line + "\n")

spm.SentencePieceTrainer.train(
    input='corpus_lemmas.txt',
    model_prefix='bpe_lemmas',
    vocab_size=16000,
    model_type='bpe',
    character_coverage=1.0
)

sp = spm.SentencePieceProcessor(model_file='bpe_lemmas.model')
df_train['bpe'] = df_train['lemmas'].apply(lambda x: ' '.join(sp.encode(x, out_type=str)))
df_test['bpe']  = df_test['lemmas'].apply(lambda x: ' '.join(sp.encode(x, out_type=str)))

In [None]:
lexicon = {}
with open("rusentilex_2017.txt", encoding='utf-8') as f:
    for line in f:
        if line.startswith('!') or not line.strip():
            continue
        parts = [p.strip() for p in line.split(',')]
        if len(parts) >= 4:
            lemma, polarity = parts[2].lower(), parts[3].lower()
            if polarity in ['positive', 'negative', 'neutral']:
                lexicon[lemma] = polarity

def lexicon_feats(text):
    pos = neg = neu = 0
    for w in text.split():
        s = lexicon.get(w)
        if s == 'positive': pos += 1
        elif s == 'negative': neg += 1
        elif s == 'neutral':  neu += 1
    total = len(text.split()) or 1
    return pd.Series({
        'pos_cnt': pos, 'neg_cnt': neg, 'neu_cnt': neu,
        'pos_ratio': pos/total, 'neg_ratio': neg/total, 'neu_ratio': neu/total
    })

lex_feats_train = df_train['lemmas'].apply(lexicon_feats)
lex_feats_test  = df_test['lemmas'].apply(lexicon_feats)

In [None]:
# TF-IDF (BPE + char)
tfidf_word = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1,2),
    max_features=50000,
    min_df=2,
    sublinear_tf=True
)
tfidf_char = TfidfVectorizer(
    analyzer='char',
    ngram_range=(3,5),
    max_features=20000,
    min_df=2,
    sublinear_tf=True
)

X_train_word = tfidf_word.fit_transform(df_train['bpe'])
X_test_word  = tfidf_word.transform(df_test['bpe'])

X_train_char = tfidf_char.fit_transform(df_train['bpe'])
X_test_char  = tfidf_char.transform(df_test['bpe'])

In [None]:
# --- 6. Нормализация признаков lexicon
scaler = StandardScaler()
lex_feats_train_scaled = scaler.fit_transform(lex_feats_train)
lex_feats_test_scaled  = scaler.transform(lex_feats_test)

# --- 7. Объединение признаков
X_train_all = hstack([X_train_word, X_train_char, lex_feats_train_scaled])
X_test_all  = hstack([X_test_word,  X_test_char,  lex_feats_test_scaled])
y_train = df_train['label']
y_test = df_test['label']

In [None]:
clf = LogisticRegression(
    C=1.0,
    class_weight='balanced',
    solver='liblinear',
    max_iter=1000
)
clf.fit(X_train_all, y_train)
y_pred = clf.predict(X_test_all)

In [None]:
# --- 9. Оценка
f1 = f1_score(y_test, y_pred, average='macro')
print(f"f1 для подтокенов = {f1}")
print(f"f1 на леммах получился = 0.6293205946201473") # значение из 2 пункта
print(f"f1 мера на подтокенах получается лучше")

f1 для подтокенов = 0.6360764709254842
f1 на леммах получился = 0.6293205946201473
f1 мера на подтокенах получается лучше


#### 4. Обучить fasttext-классификатор, сравнить качество классификации с предобученными эмбеддингами и обученными с нуля при классификации.


In [None]:
! pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313504 sha256=8fdf03c825d5a28928319abcf0190e3f103b3757bfb8f60afab9b1c0752f1806
  Stored in directory: /root/.cache/pip/wheels/65/4f/35/5057db0249224e9ab55a51

In [None]:
def to_fasttext_format(df, path, text_col='lemmas'):
    with open(path, 'w', encoding='utf-8') as f:
        for text, label in zip(df[text_col], df['label']):
            f.write(f"__label__{label} {text.strip()}\n")

to_fasttext_format(df_train, "train_ft.txt")
to_fasttext_format(df_test, "test_ft.txt")


In [None]:
import fasttext

model_ft = fasttext.train_supervised(
    input="train_ft.txt",
    lr=0.8,
    epoch=50,
    wordNgrams=2,
    minn=2,
    maxn=5,
    dim=100,
    loss='ova'
)

In [1]:
N, P, R = model_ft.test("test_ft.txt")
f1 = 2 * P * R / (P + R)
print(f"f1 fastText: {f1}")

f1 fastText (без предобученных эмб): 0.6545331985170205


а теперь используем предобученную:

In [None]:
!pip install wget
# нагло нагуглил, как импортировать
import wget
import os

url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.vec.gz'

gz_filename = 'cc.ru.300.vec.gz'
vec_filename = 'cc.ru.300.vec'

if not os.path.exists(gz_filename):
    print(f"Скачивание {gz_filename}...")
    wget.download(url, gz_filename)
    print(f"\nСкачивание {gz_filename} завершено.")
else:
    print(f"Файл {gz_filename} уже существует, скачивание пропущено.")

if not os.path.exists(vec_filename):
    print(f"Распаковка {gz_filename} в {vec_filename}...")
    import gzip
    with gzip.open(gz_filename, 'rb') as f_in:
        with open(vec_filename, 'wb') as f_out:
            # Чтение по частям для экономии памяти
            chunk_size = 4096
            while True:
                chunk = f_in.read(chunk_size)
                if not chunk:
                    break
                f_out.write(chunk)
    print(f"Распаковка завершена.")
else:
     print(f"Файл {vec_filename} уже существует, распаковка пропущена.")

print(f"Файл с предобученными векторами доступен по пути: {vec_filename}")

Collecting wget
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25l[?25hdone
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9655 sha256=eb44b048c04b76d67ac3bdedba389ee4fbf87b4acb98f1e7adc5548f52716361
  Stored in directory: /root/.cache/pip/wheels/40/b3/0f/a40dbd1c6861731779f62cc4babcb234387e11d697df70ee97
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2
Скачивание cc.ru.300.vec.gz...

Скачивание cc.ru.300.vec.gz завершено.
Распаковка cc.ru.300.vec.gz в cc.ru.300.vec...
Распаковка завершена.
Файл с предобученными векторами доступен по пути: cc.ru.300.vec


In [None]:
pretrained_vectors_path = "cc.ru.300.vec"
model_pre = fasttext.train_supervised(
    input="train_ft.txt",
    pretrainedVectors=pretrained_vectors_path, # Путь к предобученным векторам
    epoch=50,                          # вернул 50
    lr=0.5,
    wordNgrams=2,
    dim=300,                           # Размерность векторов (должна совпадать с dim предобученных векторов)
    loss='ova',
)

In [2]:
N, P, R = model_pre.test("test_ft.txt")
f1 = 2 * P * R / (P + R)
print(f"f1 fastText: {f1}")

f1 fastText (c предобученных эмб): 0.6819472685037192
