In [4]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV
import joblib

In [7]:
fields = ['text_clean', 'topic']


def load_split(name, data_dir='../data'):
    """Read one dataset split from ``<data_dir>/<name>.csv``.

    Only the columns listed in ``fields`` are loaded, and each of them is
    read with ``dtype=object``.

    Args:
        name: Split name without extension, e.g. ``'train'``.
        data_dir: Directory that holds the CSV files.

    Returns:
        A ``pandas.DataFrame`` with the ``fields`` columns.
    """
    return pd.read_csv(f'{data_dir}/{name}.csv',
                       dtype={column: object for column in fields},
                       usecols=fields)


# Same loader for all three splits, so the read options cannot drift apart.
train_df = load_split('train')
val_df = load_split('val')
test_df = load_split('test')

In [8]:
# Model inputs: the `text_clean` column of each split.
X_train = train_df['text_clean']
X_val = val_df['text_clean']
X_test = test_df['text_clean']

# Targets: the `topic` column of each split.
y_train = train_df['topic']
y_val = val_df['topic']
y_test = test_df['topic']

In [4]:
# Vectorizer settings: unigrams + bigrams, vocabulary capped at 20000 terms,
# and a token pattern that also keeps single-character tokens.
tfidf_params = {
    'max_features': 20000,
    'ngram_range': (1, 2),
    'token_pattern': r'\b\w+\b',
}
vect = TfidfVectorizer(**tfidf_params)

# The vocabulary and IDF weights are learned from the training split only;
# validation and test are projected with the already-fitted vectorizer.
X_train_tfidf = vect.fit_transform(X_train)
X_val_tfidf, X_test_tfidf = (vect.transform(part) for part in (X_val, X_test))

In [5]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# One seed for every estimator that accepts `random_state`, so that a
# re-run of the comparison reproduces the same fitted models and scores.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the short name used in the results table.
models = {
    'LogReg': LogisticRegression(solver='saga', C=1.0, max_iter=1000,
                                 random_state=RANDOM_STATE, n_jobs=-1),
    'SVC': LinearSVC(C=1.0, max_iter=10000, random_state=RANDOM_STATE),
    'NB': MultinomialNB(alpha=0.1),
    # Random forests bootstrap rows and subsample features at random;
    # without a seed the RF row of the leaderboard changes on every run.
    'RF': RandomForestClassifier(n_estimators=200, max_depth=20,
                                 random_state=RANDOM_STATE, n_jobs=-1),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

In [6]:
# Fit every candidate on the training TF-IDF matrix and score it on the
# validation split; one record per model is collected for the leaderboard.
results = []
best_models = {}  # never filled in this cell; kept so the name stays defined

for name, model in models.items():
    print(f'fitting {name}')
    model.fit(X_train_tfidf, y_train)
    y_pred = model.predict(X_val_tfidf)

    acc = accuracy_score(y_val, y_pred)
    f1_macro = f1_score(y_val, y_pred, average='macro')
    f1_weighted = f1_score(y_val, y_pred, average='weighted')

    results.append({
        'model': name,
        'accuracy': acc,
        'f1_macro': f1_macro,
        'f1_weighted': f1_weighted
    })

# Rank by macro-F1, best first. The frame is already sorted here, so it is
# displayed as-is instead of being sorted a second time.
df_results = pd.DataFrame(results).sort_values('f1_macro', ascending=False)
df_results

fitting LogReg
fitting SVC
fitting NB
fitting RF
fitting KNN


Unnamed: 0,model,accuracy,f1_macro
0,LogReg,0.734386,0.667077
1,SVC,0.7284,0.666139
2,NB,0.663367,0.591631
3,RF,0.450309,0.305026
4,KNN,0.147317,0.101493


In [9]:
# Index label of the row with the highest macro-F1, then the model name
# stored in that row.
top_row_label = df_results['f1_macro'].idxmax()
best_model = df_results.loc[top_row_label, 'model']

In [10]:
# Chain the vectorizer with the selected classifier so that raw text can be
# passed straight to `predict`. Neither step is refitted here: the Pipeline
# only wraps the existing `vect` and `models[best_model]` objects.
pipeline_steps = [
    ('tfidf', vect),
    ('clf', models[best_model]),
]
best_pipeline = Pipeline(steps=pipeline_steps)

In [11]:
# Two hand-written sentences used as a quick smoke test of the pipeline on
# raw (unvectorized) text.
economy_sample = "Экономические итоги первого квартала перевыполнили прогнозы."
cinema_sample = "Новый фильм режиссёра выйдет в прокат этим летом."
texts = [economy_sample, cinema_sample]

predicted_topics = best_pipeline.predict(texts)
predicted_topics

array(['Россия', 'Культура'], dtype=object)

In [5]:
# Shortcut for sessions that skip training: load the previously saved
# pipeline from disk. When a pipeline already exists in this session it is
# kept, so a top-to-bottom run evaluates and saves the model it just
# trained instead of replacing it with an older artifact (or failing
# because the artifact has not been written yet).
# NOTE: joblib.load unpickles arbitrary objects; only load files you trust.
if 'best_pipeline' not in globals():
    best_pipeline = joblib.load('../models/tfidf_logreg_pipeline(best).joblib')

In [12]:
# Final check on the test split: the pipeline receives raw text and
# vectorizes it internally before classifying.
y_pred = best_pipeline.predict(X_test)

# Per-class precision / recall / F1, printed with four decimals.
test_report = classification_report(y_test, y_pred, digits=4)
print(test_report)

                   precision    recall  f1-score   support

           Бизнес     0.0000    0.0000    0.0000         4
      Бывший СССР     0.7824    0.7827    0.7825      2609
              Дом     0.8299    0.7625    0.7948       800
         Из жизни     0.5676    0.8189    0.6705      2777
   Интернет и СМИ     0.6220    0.6223    0.6222      2531
         Культура     0.8434    0.7876    0.8146      2298
              Мир     0.7685    0.7931    0.7806      6746
  Наука и техника     0.7655    0.7764    0.7709      3090
      Путешествия     0.9125    0.4121    0.5678      1291
           Россия     0.6415    0.7207    0.6788      6891
Силовые структуры     0.5395    0.3991    0.4588      1729
            Спорт     0.9399    0.9443    0.9421      3359
         Ценности     0.9499    0.5260    0.6770      1405
        Экономика     0.8503    0.7181    0.7786      4065

         accuracy                         0.7344     39595
        macro avg     0.7152    0.6474    0.6671     3

In [17]:
# Persist the whole pipeline (vectorizer + classifier) in one file whose
# name carries the lower-cased key of the selected model.
model_path = f'../models/tfidf_{best_model.lower()}_pipeline(best).joblib'
joblib.dump(best_pipeline, model_path)

['../models/tfidf_logreg_pipeline(best).joblib']