In [159]:
# !pip install gensim

In [160]:
from gensim import models
from gensim.models import Word2Vec
import gzip
from dataclasses import dataclass
from typing import Iterator
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score


In [161]:

@dataclass
class Text:
    label: str
    title: str
    text: str


def read_texts(fn: str) -> Iterator[Text]:
    with gzip.open(fn, "rt", encoding="utf-8") as f:
        for line in f:
            yield Text(*line.strip().split("\t"))

texts = list(read_texts("../Data/news.txt.gz"))

In [162]:
sentenses = []
splitted_sentenses = []
for text in texts:
    sentenses.append(str(text.text))
    splitted_sentenses.append(str(text.text).split())


In [163]:
splitted_sentenses[0]

['Парусная',
 'гонка',
 'Giraglia',
 'Rolex',
 'Cup',
 'пройдет',
 'в',
 'Средиземном',
 'море',
 'в',
 '64-й',
 'раз.',
 'Победители',
 'соревнования,',
 'проводимого',
 'с',
 '1953',
 'года',
 'Yacht',
 'Club',
 'Italiano,',
 'помимо',
 'других',
 'призов',
 'традиционно',
 'получают',
 'в',
 'подарок',
 'часы',
 'от',
 'швейцарского',
 'бренда',
 'Rolex.',
 'Об',
 'этом',
 'сообщается',
 'в',
 'пресс-релизе,',
 'поступившем',
 'в',
 'редакцию',
 '«Ленты.ру»',
 'в',
 'среду,',
 '8',
 'мая.',
 'Rolex',
 'Yacht-Master',
 '40',
 'Фото:',
 'пресс-служба',
 'Mercury',
 'Соревнования',
 'будут',
 'проходить',
 'с',
 '10',
 'по',
 '18',
 'июня.',
 'Первый',
 'этап:',
 'ночной',
 'переход',
 'из',
 'Сан-Ремо',
 'в',
 'Сен-Тропе',
 '10-11',
 'июня',
 '(дистанция',
 '50',
 'морских',
 'миль',
 '—',
 'около',
 '90',
 'километров).',
 'Второй',
 'этап:',
 'серия',
 'прибрежных',
 'гонок',
 'в',
 'бухте',
 'Сен-Тропе',
 'с',
 '11',
 'по',
 '14',
 'июня.',
 'Финальный',
 'этап',
 'пройдет',
 'с',


In [164]:
model = Word2Vec(sentences=splitted_sentenses, vector_size=100, window=5, min_count=5, workers=8)


In [165]:
model.save("word2vec.model")


In [166]:
def average_word_vectors(sentence, model):
    word_vectors = []
    
    for word in sentence:
        if word in model.wv:
            word_vectors.append(model.wv[word])
    
    if word_vectors:  # Если есть векторы слов
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

In [167]:
vectors = [average_word_vectors(sentence, model) for sentence in splitted_sentenses]

In [168]:
print(vectors[0])

[-0.28305933  0.1781377   0.6482528   0.05339202  0.14094466 -0.6575912
  0.1773094   0.750503   -0.04260653 -0.5870149   0.07889684 -0.4774491
  0.16424078  0.11721242 -0.12543549 -0.29640886  0.16696887 -0.4006402
 -0.46330282 -0.8066981   0.27674818  0.28572816  0.05628825 -0.42521837
 -0.17028381  0.5518004  -0.11539298  0.1724218  -0.32792455 -0.01776486
  0.29239812 -0.47888553  0.01594489 -0.5756222  -0.25329664  0.3787091
  0.58829963 -0.8145929  -0.09155973 -0.48030734  0.02259251  0.12485061
 -0.06894988 -0.40472665  0.13122831 -0.01679922 -0.11982724  0.13023798
  0.05684418  0.38985637  0.41506922 -0.47579008 -0.59448653 -0.16051212
 -0.30574295  0.194872    0.5892478   0.01851656 -0.27384531  0.161676
 -0.21030824  0.41349292  0.5681323  -0.28515032  0.09921865  0.3660423
  0.07787875  0.24499524 -0.42196092 -0.19294138  0.26230213  0.24468961
  0.19447999 -0.32646596 -0.05145654 -0.04575399 -0.15079089 -0.10263439
 -0.4060244  -0.12799448 -0.4553816   0.36513677 -0.092810

In [169]:
labels = []
for text in texts:
    labels.append(text.label)

In [170]:
X_train, X_test, y_train, y_test = train_test_split(vectors, labels, test_size=0.2, random_state=77)

In [172]:
SVC_model = SVC()  
SVC_model.fit(np.asarray(X_train), np.asarray(y_train))

In [None]:
y_pred = SVC_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.6405
              precision    recall  f1-score   support

    business       0.00      0.00      0.00        64
     culture       0.64      0.68      0.66       273
   economics       0.59      0.83      0.69       265
      forces       0.55      0.49      0.52       170
        life       0.62      0.63      0.62       262
       media       0.53      0.51      0.52       295
     science       0.59      0.72      0.65       282
       sport       0.95      0.84      0.89       318
       style       0.88      0.19      0.31        37
      travel       0.00      0.00      0.00        34

    accuracy                           0.64      2000
   macro avg       0.53      0.49      0.49      2000
weighted avg       0.62      0.64      0.62      2000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [175]:
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(sentenses)
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(X_tfidf, labels, test_size=0.2, random_state=77)

In [176]:
SVC_model = SVC()  
SVC_model.fit(X_train_tfidf, np.asarray(y_train))

In [179]:
y_pred = SVC_model.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.839
              precision    recall  f1-score   support

    business       1.00      0.08      0.14        64
     culture       0.91      0.89      0.90       273
   economics       0.79      0.92      0.85       265
      forces       0.81      0.72      0.76       170
        life       0.73      0.93      0.82       262
       media       0.85      0.83      0.84       295
     science       0.80      0.89      0.84       282
       sport       0.98      0.96      0.97       318
       style       1.00      0.30      0.46        37
      travel       1.00      0.29      0.45        34

    accuracy                           0.84      2000
   macro avg       0.89      0.68      0.70      2000
weighted avg       0.85      0.84      0.82      2000

