In [1]:
# !pip install gensim

In [2]:
from gensim import models
from gensim.models import Word2Vec
import gzip
from dataclasses import dataclass
from typing import Iterator
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
import nltk
from nltk.corpus import stopwords
import pymorphy2


In [3]:
@dataclass
class Text:
    label: str
    title: str
    text: str


def read_texts(fn: str) -> Iterator[Text]:
    with gzip.open(fn, "rt", encoding="utf-8") as f:
        for line in f:
            yield Text(*line.strip().split("\t"))

texts = list(read_texts("../Data/news.txt.gz"))

In [4]:
def pymorphy2_311_hotfix():
    # https://github.com/pymorphy2/pymorphy2/issues/160#issuecomment-1486657176
    # https://github.com/natasha/yargy/blob/master/yargy/morph.py

    from inspect import getfullargspec
    from pymorphy2.units.base import BaseAnalyzerUnit

    def _get_param_names_311(klass):
        if klass.__init__ is object.__init__:
            return []
        args = getfullargspec(klass.__init__).args
        return sorted(args[1:])

    setattr(BaseAnalyzerUnit, '_get_param_names', _get_param_names_311)

pymorphy2_311_hotfix()

morph = pymorphy2.MorphAnalyzer(lang="ru")

In [5]:
sentenses = []
splitted_sentenses = []
stop_words = set(stopwords.words('russian'))

for text in texts:
    words = nltk.word_tokenize(text=text.text, language="russian")
    filtered_words = [word for word in words if word.lower() not in stop_words]
    lemmatized_words = [morph.parse(word)[0].normal_form for word in filtered_words]

    sentenses.append(' '.join(lemmatized_words[j] for j in range (0, len(lemmatized_words))))
    splitted_sentenses.append(lemmatized_words)


In [6]:
sentenses[0]

'парусный гонка giraglia rolex cup пройти средиземный море 64-й раз. победитель соревнование , проводить 1953 год yacht club italiano , помимо другой приз традиционно получать подарок часы швейцарский бренд rolex . сообщаться пресс-релиз , поступить редакция « лента.ру » среда , 8 май . rolex yacht-master 40 фото : пресс-служба mercury соревнование быть проходить 10 18 июнь . первый этап : ночной переход сан-рть сен-тропа 10-11 июнь ( дистанция 50 морской миля — около 90 километр ) . второй этап : серия прибрежный гонка бухта сен-тропа 11 14 июнь . финальный этап пройти 15 18 июнь : оффшорный гонка маршрут сен-тропа — генуя ( 243 морской миля — 450 километр ) . маршрут проходить скалистый остров джиралья север корсика завершаться генуе.регат , 1997 год проходить поддержка rolex , считаться один самый значительный яхтенный гонка средиземноморье . год ожидаться участие три российский экипаж .'

In [7]:
model = Word2Vec(sentences=splitted_sentenses, workers=8)

In [8]:
model.save("word2vec.model")

In [9]:
def average_word_vectors(sentence, model):
    word_vectors = []
    
    for word in sentence:
        if word in model.wv:
            word_vectors.append(model.wv[word])
    
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

In [10]:
vectors = [average_word_vectors(sentence, model) for sentence in splitted_sentenses]

In [11]:
print(vectors[0])

[-0.06904832 -0.08142804  0.09907641  0.3446452   0.3945781  -0.49021947
  0.4609607   0.6448814  -0.46921492 -0.4363179  -0.0892451  -0.19145203
 -0.01555345  0.23038162  0.20583303  0.45850426  0.3401238   0.17050976
 -0.28324565 -0.5078048   0.33006963  0.07126041 -0.05799299 -0.56073105
  0.1835928   0.01304387 -0.04058496 -0.04948281 -0.2304053  -0.06325565
  0.2952608  -0.04963791  0.17807652 -0.22071345  0.33693838  0.19330445
  0.08691394  0.15333341  0.30263948 -0.5456517   0.33436    -0.17979413
 -0.95310104  0.0820649   0.27382812 -0.00785228  0.07325777 -0.18739973
  0.3857527  -0.05377727 -0.14798917 -0.61622906 -0.08538739  0.17335361
  0.23979409  0.06387627  0.20320457  0.04303995 -0.3415391   0.29902488
 -0.26022607  0.06532786 -0.34965274 -0.08803541 -0.20901525  0.42487594
  0.6711397   0.12268247 -0.09132371  0.39411318 -0.3615868   0.19407414
  0.10699016 -0.05230116 -0.32196024  0.25937325 -0.25245    -0.30965412
  0.2352765  -0.11491367 -0.5600821  -0.29632518 -0

In [12]:
labels = []
for text in texts:
    labels.append(text.label)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(vectors, labels, test_size=0.2, random_state=77)

In [14]:
SVC_model = SVC()  
SVC_model.fit(np.asarray(X_train), np.asarray(y_train))

In [15]:
y_pred = SVC_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8205
              precision    recall  f1-score   support

    business       0.51      0.30      0.38        64
     culture       0.86      0.84      0.85       273
   economics       0.77      0.86      0.81       265
      forces       0.73      0.88      0.80       170
        life       0.77      0.82      0.80       262
       media       0.82      0.77      0.79       295
     science       0.84      0.83      0.84       282
       sport       0.96      0.95      0.95       318
       style       0.85      0.62      0.72        37
      travel       0.86      0.35      0.50        34

    accuracy                           0.82      2000
   macro avg       0.80      0.72      0.74      2000
weighted avg       0.82      0.82      0.82      2000



In [16]:
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(sentenses)
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, labels, test_size=0.2, random_state=77)

In [17]:
SVC_model = SVC()  
SVC_model.fit(X_train_tfidf, np.asarray(y_train))

In [18]:
y_pred = SVC_model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.868
              precision    recall  f1-score   support

    business       0.93      0.20      0.33        64
     culture       0.93      0.92      0.92       273
   economics       0.82      0.92      0.87       265
      forces       0.84      0.82      0.83       170
        life       0.76      0.94      0.84       262
       media       0.89      0.86      0.88       295
     science       0.84      0.89      0.87       282
       sport       0.98      0.97      0.97       318
       style       0.90      0.51      0.66        37
      travel       1.00      0.44      0.61        34

    accuracy                           0.87      2000
   macro avg       0.89      0.75      0.78      2000
weighted avg       0.88      0.87      0.86      2000

