In [1]:
""" Imports """
import re

import nltk

nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import SnowballStemmer

from natasha import Segmenter, MorphVocab, NewsEmbedding, NewsMorphTagger, Doc

from sklearn.base import BaseEstimator, TransformerMixin

from string import punctuation
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd
from IPython.display import display, Markdown

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/kapuchinka/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Инициализация компонентов

In [2]:
segmenter = Segmenter()
morph_vocab = MorphVocab()

emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)

stop_words = list(punctuation)
stop_words.extend(stopwords.words('russian'))
tokenizer = RegexpTokenizer(r'\w+')

snow_stemmer = SnowballStemmer(language='russian')

### Получение датасета из файла

In [3]:
def open_dataset():
    l_dataset = pd.read_csv('./resources/dataset.csv')
    display(Markdown(f"""#### Исходный Датасет"""))
    display(l_dataset)
    return l_dataset

### Токенизация, Лемматизация, Стемминг

In [4]:
def preprocess_texts_natasha(p_texts):
    processed_texts = []
    
    for text in p_texts:
        doc = Doc(text)
        doc.segment(segmenter)
        doc.tag_morph(morph_tagger)
        lemmas = []
        for token in doc.tokens:
            token.lemmatize(morph_vocab)
            stem = snow_stemmer.stem(token.lemma)
            lemmas.append(stem)
        processed_texts.append(" ".join(lemmas))
        
    return processed_texts

### Очистка от стоп-слов

In [5]:
def clean_texts(p_texts):
    text = p_texts.lower()
    text = re.sub(r"[^а-яА-ЯёЁ\s]", " ", text)
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words]
    
    return " ".join(tokens)

### Обучение sklearn

In [6]:
def build_text_pipeline():
    class NatashaPreprocessor(BaseEstimator, TransformerMixin):
        def fit(self, X, y=None):
            return self
        def transform(self, X):
            return [clean_texts(text) for text in preprocess_texts_natasha(X)]
    pipeline = Pipeline([
        ('preprocessing', NatashaPreprocessor()),
        ('vectorizer', TfidfVectorizer())
    ])
    return pipeline

### Разделение выборки на обучающую и тестовую

In [7]:
def get_split_data(param_df, param_text_col, param_label_col, param_test_size=0.33, param_random_state=42):
    return train_test_split(param_df[param_text_col], param_df[param_label_col], test_size=param_test_size, random_state=param_random_state)

In [8]:
def fill_results(param_results, param_pipe, param_approach, param_model_name, param_x_val, param_y_val):
    predicts = param_pipe.predict(param_x_val)
    local_results = param_results
    local_results.loc[f'{param_approach} + {param_model_name}', 'accuracy'] = accuracy_score(param_y_val, predicts) #   Процент правильно классифицированных объектов из общего числа
    local_results.loc[f'{param_approach} + {param_model_name}', 'precision'] = precision_score(param_y_val, predicts, average='weighted', zero_division=0) # Из всех объектов предсказанных как   "положительные", сколько действительно оказались положительными.
    local_results.loc[f'{param_approach} + {param_model_name}', 'recall'] = recall_score(param_y_val, predicts, average='weighted', zero_division=0) # Из всех настоящих положительных объектов, сколько модель правильно нашла.
    local_results.loc[f'{param_approach} + {param_model_name}', 'f1-score'] = f1_score(param_y_val, predicts, average='weighted', zero_division=0) # Гармоническое среднее precision и recall
    return local_results

### Обучение моделей

In [9]:
def train_models(param_x_train, param_y_train, param_x_val, param_y_val, param_vectorizers, param_results_df):
    models = [
        (LinearSVC(random_state=42, max_iter=1000), 'SVM'),
        (LogisticRegression(random_state=42), 'LogReg'),
        (RandomForestClassifier(random_state=42), 'RF')
    ]
    
    for vec, vec_name in param_vectorizers:
        for model, model_name in models:
            pipe = Pipeline([
                ('vectorizer', vec),
                ('model', model)
            ])
            pipe.fit(param_x_train, param_y_train)
            local_results_df = fill_results(param_results_df, pipe, vec_name, model_name, param_x_val, param_y_val)
    return local_results_df

In [10]:
def build_co_occurrence_matrix(param_tokenized_texts, param_window_size=2):
    words = list(set(sum(param_tokenized_texts, [])))
    matrix = pd.DataFrame(np.zeros((len(words), len(words))), index=words, columns=words)
    
    for tokens in param_tokenized_texts:
        for idx, token in enumerate(tokens):
            start = max(0, idx - param_window_size)
            end = min(len(tokens), idx + param_window_size + 1)
            context = tokens[start:idx] + tokens[idx+1:end]
        for word in context:
            matrix.loc[token, word] += 1
    
    return matrix

# Результаты

In [11]:
dataset = open_dataset()

preprocess_result = preprocess_texts_natasha(dataset['text'])
dataset['clean_text'] = dataset['text'].apply(clean_texts)

pipeline = build_text_pipeline()
X_transformed = pipeline.fit_transform(dataset['clean_text'].head(10))
transformed_df = pd.DataFrame(X_transformed.toarray())

display(Markdown(f"""#### Датасет после препроцессинга """))
display(dataset)

X_train, X_val, Y_train, Y_val = get_split_data(dataset, param_text_col='clean_text', param_label_col='category')

display(Markdown(f""" ### Разделённая выборка """))
display(X_train.to_frame())

vectorizers_bow = [
  (CountVectorizer(), 'BoW'),
  (CountVectorizer(ngram_range=(1, 2)), 'BoW ngrams')
]

vectorizers_tfidf = [
  (TfidfVectorizer(), 'TF-IDF'),
  (TfidfVectorizer(ngram_range=(1, 2)), 'TF-IDF ngrams')
]

results_bow = pd.DataFrame(columns=['accuracy', 'precision', 'recall', 'f1-score'])
results_tfidf = pd.DataFrame(columns=['accuracy', 'precision', 'recall', 'f1-score'])

results_bow = train_models(X_train, Y_train, X_val, Y_val, vectorizers_bow, results_bow)
results_tfidf = train_models(X_train, Y_train, X_val, Y_val, vectorizers_tfidf, results_tfidf)

display(Markdown(f""" ### Результаты сравнения """))
display(results_bow)
display(results_tfidf)

tokenized_data = dataset['clean_text'].head(10).apply(lambda x: x.split())
co_matrix = build_co_occurrence_matrix(tokenized_data.tolist())

display(Markdown(f"""### Матрица встречаемости слов """))
display(co_matrix)

co_matrix = pd.DataFrame(X_transformed.toarray())
output_path = "./out/output.csv"
co_matrix.to_csv(output_path, index=False)

display(Markdown(f"""### Результат сохранен в файл: {output_path}"""))

#### Исходный Датасет

Unnamed: 0,oid,category,text
0,365271984,winter_sport,Волшебные фото Виктория Поплавская ЕвгенияМедв...
1,503385563,extreme,Возвращение в подземелье Треша 33 Эйфория тупо...
2,146016084,football,Лучшие чешские вратари – Доминик Доминатор Гаш...
3,933865449,boardgames,Rtokenoid Warhammer40k валрак решил нас подкор...
4,713550145,hockey,Шестеркин затаскивает Рейнджерс в финал Восточ...
...,...,...,...
38735,910636962,autosport,8 битная буря снова накрыла пикселями автомоби...
38736,669736851,autosport,Ира Сидоркова объясняет как сказалась на ее ма...
38737,558919241,tennis,24 я ракетка мира хорват Марин Чилич обыграл и...
38738,776944963,volleyball,Стал известен календарь мужской сборной России...


#### Датасет после препроцессинга 

Unnamed: 0,oid,category,text,clean_text
0,365271984,winter_sport,Волшебные фото Виктория Поплавская ЕвгенияМедв...,волшебные фото виктория поплавская евгениямедв...
1,503385563,extreme,Возвращение в подземелье Треша 33 Эйфория тупо...,возвращение подземелье треша эйфория тупости ж...
2,146016084,football,Лучшие чешские вратари – Доминик Доминатор Гаш...,лучшие чешские вратари доминик доминатор гашек...
3,933865449,boardgames,Rtokenoid Warhammer40k валрак решил нас подкор...,валрак решил подкормить сильно свежими слухами...
4,713550145,hockey,Шестеркин затаскивает Рейнджерс в финал Восточ...,шестеркин затаскивает рейнджерс финал восточно...
...,...,...,...,...
38735,910636962,autosport,8 битная буря снова накрыла пикселями автомоби...,битная буря снова накрыла пикселями автомобиль...
38736,669736851,autosport,Ира Сидоркова объясняет как сказалась на ее ма...,ира сидоркова объясняет сказалась машине резул...
38737,558919241,tennis,24 я ракетка мира хорват Марин Чилич обыграл и...,ракетка мира хорват марин чилич обыграл испанц...
38738,776944963,volleyball,Стал известен календарь мужской сборной России...,стал известен календарь мужской сборной россии...


 ### Разделённая выборка 

Unnamed: 0,clean_text
30973,хара прайс кевин хэйс претенденты приз верност...
1705,легендарных борца аниуар гедуев роман власов п...
9434,выиграй настольную игру место преступления поч...
11358,презентация издательства звезда граниконе года...
38169,роман йоси набрал е очко нхл игре сиэтлом игро...
...,...
6265,публикации бегофоточки кидайте комменарии этом...
11284,универсальная кредитная карта тинькофф платину...
38158,экзотические методы тренировок развитие силы т...
860,нравятся игры объяснение слов ощущение жесты о...




 ### Результаты сравнения 

Unnamed: 0,accuracy,precision,recall,f1-score
BoW + SVM,0.79648,0.799406,0.79648,0.797526
BoW + LogReg,0.800548,0.809893,0.800548,0.803691
BoW + RF,0.762221,0.790656,0.762221,0.76987
BoW ngrams + SVM,0.813297,0.818613,0.813297,0.815082
BoW ngrams + LogReg,0.803285,0.817155,0.803285,0.807652
BoW ngrams + RF,0.760579,0.808073,0.760579,0.77363


Unnamed: 0,accuracy,precision,recall,f1-score
TF-IDF + SVM,0.845287,0.846422,0.845287,0.845541
TF-IDF + LogReg,0.818616,0.829487,0.818616,0.821807
TF-IDF + RF,0.763473,0.783125,0.763473,0.769056
TF-IDF ngrams + SVM,0.852249,0.852895,0.852249,0.851974
TF-IDF ngrams + LogReg,0.815722,0.823186,0.815722,0.817498
TF-IDF ngrams + RF,0.763082,0.796376,0.763082,0.772023


### Матрица встречаемости слов 

Unnamed: 0,харрикейнз,км,кейнс,хоккейное,сегодня,основной,территории,киллтим,поддержку,границе,...,ноябре,атилланских,ценный,проделал,подписал,спорта,ребята,релиз,рандом,гилфорд
харрикейнз,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
км,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
кейнс,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
хоккейное,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
сегодня,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
спорта,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ребята,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
релиз,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
рандом,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Результат сохранен в файл: ./out/output.csv