In [2]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

In [5]:
# Every column listed here is read as a plain object (string) column;
# only `date` is parsed into datetimes.
text_columns = ['url', 'title', 'text', 'topic', 'title_clean',
                'text_clean', 'title_tokens', 'text_tokens']

# Load, order rows by date and renumber the index from zero.
df = (
    pd.read_csv(
        '../data/preprocessed.csv',
        parse_dates=['date'],
        dtype=dict.fromkeys(text_columns, object),
    )
    .sort_values('date')
    .reset_index(drop=True)
)

In [6]:
# Discard every row that has a missing value in any column.
df = df.dropna()

In [7]:
# Positional split of the date-ordered frame:
# first 90% -> train, next 5% -> validation, remaining rows -> test.
n_rows = len(df)
train_end, val_end = int(n_rows * 0.9), int(n_rows * 0.95)

train_df, val_df, test_df = (
    df.iloc[start:stop]
    for start, stop in [(None, train_end), (train_end, val_end), (val_end, None)]
)

In [8]:
# Inputs come from the `text_tokens` column, labels from the `topic` column.
X_train, y_train = train_df['text_tokens'], train_df['topic']
X_val, y_val = val_df['text_tokens'], val_df['topic']
X_test, y_test = test_df['text_tokens'], test_df['topic']

In [None]:
# TF-IDF over unigrams and bigrams, vocabulary capped at 20,000 terms.
# The custom token pattern matches tokens of one or more word characters.
tfidf_step = TfidfVectorizer(
    max_features=20000,
    ngram_range=(1, 2),
    token_pattern=r'\b\w+\b',
)

# Logistic regression with the SAGA solver; seeded for repeatable fits.
logreg_step = LogisticRegression(
    solver='saga',
    C=1.0,
    max_iter=1000,
    random_state=42,
    n_jobs=-1,
)

pipeline_logreg = Pipeline(steps=[('tfidf', tfidf_step), ('clf', logreg_step)])

# Fit on the training split; the fitted pipeline is the cell's displayed value.
pipeline_logreg.fit(X_train, y_train)

In [6]:
# Restore the saved TF-IDF + logistic-regression pipeline from disk.
# This rebinds `pipeline_logreg`, replacing any object fitted earlier in the session.
# joblib artifacts are pickle-based: only load files from a trusted source.
logreg_pipeline_path = '../models/tfidf_logreg_pipeline.joblib'
pipeline_logreg = joblib.load(logreg_pipeline_path)

In [7]:
# Score the logistic-regression pipeline on the validation split.
y_pred = pipeline_logreg.predict(X_val)

# classification_report returns a plain string; print it so the table renders
# with real line breaks instead of an escaped single-line repr.
print(classification_report(y_val, y_pred, digits=4))

                   precision    recall  f1-score   support

           Бизнес     0.5440    0.4136    0.4699       926
      Бывший СССР     0.8631    0.8393    0.8510      2471
              Дом     0.8205    0.5850    0.6830      1453
         Из жизни     0.4465    0.7724    0.5659      1393
   Интернет и СМИ     0.7037    0.7331    0.7181      2488
         Культура     0.8319    0.7774    0.8037      2502
              Мир     0.8043    0.7899    0.7971      5370
  Наука и техника     0.8393    0.7734    0.8050      1986
      Путешествия     0.8940    0.5411    0.6742      1216
           Россия     0.6340    0.8269    0.7177      6042
Силовые структуры     0.6860    0.5118    0.5862      2757
            Спорт     0.9622    0.9612    0.9617      3656
         Ценности     0.9823    0.7383    0.8430      1949
        Экономика     0.7609    0.7108    0.7350      2552

         accuracy                         0.7552     36761
        macro avg     0.7695    0.7125    0.7294     3

In [44]:
# Two ad-hoc sentences used as a quick sanity check of the classifier.
# NOTE(review): the pipeline is fitted on the `text_tokens` column, whereas these are
# raw sentences — confirm whether the same preprocessing should be applied first.
texts = [
    "Экономические итоги первого квартала перевыполнили прогнозы.",
    "Новый фильм режиссёра выйдет в прокат этим летом."
]

# Predict one topic label per sentence and display the resulting array.
predicted_topics = pipeline_logreg.predict(texts)
predicted_topics

array(['Россия', 'Культура'], dtype=object)

In [9]:
from sklearn.metrics import accuracy_score, f1_score

# Validation metrics for the logistic-regression baseline.
# `y_pred` must still hold the logistic-regression predictions when this cell runs.
# The F1 key is 'f1_weighted' because that is the averaging actually computed, and
# it is the column the comparison table is sorted and selected on; under any other
# key this row would get NaN in that column and be ignored.
logreg_stats = {
    'model': 'logreg',
    'accuracy': accuracy_score(y_val, y_pred),
    'f1_weighted': f1_score(y_val, y_pred, average='weighted'),
}

In [25]:
# Persisting the logistic-regression pipeline is currently disabled;
# uncomment the line below to overwrite the saved artifact at this path.
# joblib.dump(pipeline_logreg, '../models/tfidf_logreg_pipeline.joblib')

['../models/tfidf_logreg_pipeline.joblib']

In [10]:
# Reuse the TF-IDF step of the logistic-regression pipeline so that every
# candidate classifier sees the same feature space on all three splits.
vect = pipeline_logreg.named_steps['tfidf']
X_train_tfidf, X_val_tfidf, X_test_tfidf = (
    vect.transform(split) for split in (X_train, X_val, X_test)
)

In [11]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Candidate classifiers, keyed by the short name used in the results table.
# The estimators that use randomness (LinearSVC, RandomForest) are seeded with the
# same value as the logistic-regression baseline so the comparison is repeatable.
models = {
    'SVC': LinearSVC(C=1.0, max_iter=10000, random_state=42),
    'NB': MultinomialNB(alpha=0.1),
    'RF': RandomForestClassifier(n_estimators=200, max_depth=20, n_jobs=-1, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

In [None]:
# Start the comparison with the logistic-regression baseline row.
results = [logreg_stats]

for name, model in models.items():
    # Each candidate is trained on the shared TF-IDF features.
    model.fit(X_train_tfidf, y_train)

    # Use a loop-local name so the notebook-level `y_pred` is not silently
    # replaced by each candidate's predictions.
    val_pred = model.predict(X_val_tfidf)

    results.append({
        'model': name,
        'accuracy': accuracy_score(y_val, val_pred),
        'f1_weighted': f1_score(y_val, val_pred, average='weighted'),
    })

# One row per model, best weighted F1 first; the frame is already sorted,
# so it is displayed as-is.
df_results = pd.DataFrame(results).sort_values('f1_weighted', ascending=False)
df_results

In [37]:
# Name of the model in the row with the highest weighted F1 (NaN scores are skipped).
best_row_label = df_results['f1_weighted'].idxmax()
best_model = df_results.loc[best_row_label, 'model']

In [40]:
# The results table also lists the 'logreg' baseline, which is not a key of
# `models`; fall back to the classifier inside `pipeline_logreg` in that case
# instead of raising KeyError.
best_clf = models[best_model] if best_model in models else pipeline_logreg.named_steps['clf']

# Combine the shared TF-IDF step with the selected classifier. Both objects are
# reused as they are; no new fit is performed here.
best_pipeline = Pipeline([
    ('tfidf', vect),
    ('clf', best_clf)
])

In [3]:
# Disabled alternative: restore a previously saved best pipeline from disk
# instead of building it in this session.
# best_pipeline = joblib.load(f'../models/tfidf_svc_pipeline(best).joblib')

In [11]:
# Validation-split predictions of the selected pipeline (rebinds `y_pred`).
y_pred = best_pipeline.predict(X_val)

# Per-class precision / recall / F1 table with four decimal places.
best_report = classification_report(y_val, y_pred, digits=4)
print(best_report)

                   precision    recall  f1-score   support

           Бизнес     0.5071    0.4622    0.4836       926
      Бывший СССР     0.8525    0.8539    0.8532      2471
              Дом     0.8035    0.5712    0.6677      1453
         Из жизни     0.4713    0.7365    0.5748      1393
   Интернет и СМИ     0.6978    0.7436    0.7200      2488
         Культура     0.8329    0.8010    0.8166      2502
              Мир     0.8057    0.7912    0.7984      5370
  Наука и техника     0.8458    0.7679    0.8050      1986
      Путешествия     0.8517    0.5855    0.6940      1216
           Россия     0.6442    0.8130    0.7188      6042
Силовые структуры     0.6696    0.5183    0.5843      2757
            Спорт     0.9608    0.9664    0.9636      3656
         Ценности     0.9708    0.7686    0.8580      1949
        Экономика     0.7714    0.6900    0.7284      2552

         accuracy                         0.7581     36761
        macro avg     0.7632    0.7192    0.7333     3

In [42]:
# The artifact name embeds the lower-cased key of the selected model.
best_pipeline_path = f'../models/tfidf_{best_model.lower()}_pipeline(best).joblib'

# Persist the selected pipeline; joblib.dump returns the list of written file names.
joblib.dump(best_pipeline, best_pipeline_path)

['../models/tfidf_svc_pipeline(best).joblib']