In [1]:
from corus import load_lenta
import pandas as pd

path = 'lenta-ru-news.csv.gz'
records = load_lenta(path)
# next(records)


Загрузка данных


In [2]:
records = load_lenta(path)
data = []
for record in records:
    if record.topic is None:
        continue
    data.append({
        'title': record.title,
        'text': record.text,
        'topic': record.topic
    })
    if len(data) >= 100_000:
        break
df = pd.DataFrame(data)

Предобработка текста

In [3]:
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')

stop_words = set(stopwords.words('russian'))
df['full_text'] = df['title'] + ' ' + df['text']

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^а-яё\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words and len(t) > 2]
    return ' '.join(tokens)

df['processed_text'] = df['full_text'].apply(preprocess_text)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Rustam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rustam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# Посчитаем частоту каждого топика
topic_counts = df['topic'].value_counts()
print(topic_counts)

# Удалим классы с менее чем 2 экземплярами
valid_topics = topic_counts[topic_counts >= 2].index
df_filtered = df[df['topic'].isin(valid_topics)].copy()

topic
Россия               15151
Мир                  14421
Спорт                10045
Экономика             7682
Интернет и СМИ        6935
Силовые структуры     6925
Бывший СССР           6810
Культура              6578
Наука и техника       5645
Из жизни              4903
Ценности              4480
Дом                   3408
Путешествия           3223
Бизнес                1993
69-я параллель         815
Крым                   661
Культпросвет           307
                        17
Оружие                   1
Name: count, dtype: int64


Разделение данных

In [5]:
from sklearn.model_selection import train_test_split

X = df_filtered['processed_text']
y = df_filtered['topic']

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42
)

In [6]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train, y_train)
y_pred_dummy = dummy.predict(X_val)
print(f'Baseline accuracy: {accuracy_score(y_val, y_pred_dummy):.4f}')

Baseline accuracy: 0.1515


In [7]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Модель с CountVectorizer
pipe_count = Pipeline([
    ('vec', CountVectorizer(max_features=20_000)),
    ('clf', LogisticRegression(max_iter=1000))
])
pipe_count.fit(X_train, y_train)
y_pred_count = pipe_count.predict(X_val)
print(f'CountVectorizer accuracy: {accuracy_score(y_val, y_pred_count):.4f}')

# Модель с TfidfVectorizer
pipe_tfidf = Pipeline([
    ('vec', TfidfVectorizer(max_features=20_000)),
    ('clf', LogisticRegression(max_iter=1000))
])
pipe_tfidf.fit(X_train, y_train)
y_pred_tfidf = pipe_tfidf.predict(X_val)
print(f'TfidfVectorizer accuracy: {accuracy_score(y_val, y_pred_tfidf):.4f}')

CountVectorizer accuracy: 0.8380
TfidfVectorizer accuracy: 0.8363


In [8]:
from sklearn.model_selection import GridSearchCV

params = {
    'vec__ngram_range': [(1, 1), (1, 2)],
    'vec__max_features': [10_000, 20_000, 30_000],
    'clf__C': [0.1, 1, 10]
}

grid = GridSearchCV(pipe_tfidf, params, cv=3, n_jobs=-1)
grid.fit(X_train, y_train)
print(f'Best params: {grid.best_params_}')
print(f'Best CV accuracy: {grid.best_score_:.4f}')

Best params: {'clf__C': 10, 'vec__max_features': 30000, 'vec__ngram_range': (1, 2)}
Best CV accuracy: 0.8454


In [9]:
best_model = grid.best_estimator_
y_test_pred = best_model.predict(X_test)
print(f'Test accuracy: {accuracy_score(y_test, y_test_pred):.4f}')

Test accuracy: 0.8552
