In [14]:
import pandas as pd
dataset = pd.read_csv('DataSets/train.csv', delimiter=';')

In [15]:
import pymorphy3
from razdel import tokenize
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
russian_stopwords = stopwords.words("russian")

dataset = dataset.drop_duplicates()
dataset = dataset[dataset['Текст инцидента'] != '\'Сообщение без текста']

morph = pymorphy3.MorphAnalyzer()

stopTags = ['PNCT', 'NUMB', 'UNKN', 'LATN', 'ROMN']

def preprocess_text(text):
    tokens = tokenize(text.lower())
    tokens = [morph.parse(token.text)[0].normal_form for token in tokens 
              if morph.parse(token.text)[0].normal_form not in russian_stopwords and 
              not any(tag in morph.parse(token.text)[0].tag for tag in stopTags)]
    text = " ".join(tokens) 
    return text

def get_themes_code():
    themes = dataset['Тема'].unique()
    dict = {themes[i]:i for i in range(0, len(themes))}
    return dict

themes_code = get_themes_code()

def get_group_code():
    themes = dataset['Группа тем'].unique()
    dict = {themes[i]:i for i in range(0, len(themes))}
    return dict

group_code = get_group_code()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\skinn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

data = dataset

x = data['Текст инцидента']
y1 = data['Группа тем']
y1 = np.array(y1.replace(group_code))
y2 = data['Тема']
y2 = np.array(y2.replace(themes_code))

In [9]:
#не запускать
tokenizer = AutoTokenizer.from_pretrained("cointegrated/LaBSE-en-ru")
model = AutoModel.from_pretrained("cointegrated/LaBSE-en-ru")

list = []

for s in x:
    s = preprocess_text(s)
    encoded_input = tokenizer(s, padding=True, truncation=True, max_length=256, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    emb = model_output.pooler_output
    list.append((emb)[0].numpy())

x_embeddings = np.asarray(list)

#сохранить embeddings
np.save(f'embeddings_{len(x_embeddings)}', x_embeddings)

In [17]:
#загрузить embeddings
x_embeddings = np.load('embeddings_22841.npy')

In [18]:
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

x_group_train, x_group_test, y_group_train, y_group_test = train_test_split(x_embeddings, y1, test_size=0.1, random_state=0)

group_model = LinearSVC(dual=False)
group_model.fit(x_group_train, y_group_train)

y_group_pred = group_model.predict(x_group_test)

print('F1_GROUP: ', round(f1_score(y_group_test, y_group_pred, average='weighted'), 3))

F1_GROUP:  0.704


In [19]:
from sklearn.metrics import accuracy_score
print('accuracy_GROUP: ', round(accuracy_score(y_group_test, y_group_pred), 3))

accuracy_GROUP:  0.711


In [27]:
x_theme_train, x_theme_test, y_theme_train, y_theme_test = train_test_split(x_embeddings, y2, test_size=0.1, random_state=0)
x_theme_train = np.hstack((x_theme_train, y_group_train.reshape(-1, 1)))
x_theme_test = np.hstack((x_theme_test, y_group_test.reshape(-1, 1)))

theme_model = LinearSVC(dual=False)
theme_model.fit(x_theme_train, y_theme_train)

y_theme_pred = theme_model.predict(x_theme_test)

print('F1_THEME: ', round(f1_score(y_theme_test, y_theme_pred, average='weighted'), 3))

F1_THEME:  0.424


In [28]:
print('accuracy_GROUP: ', round(accuracy_score(y_theme_test, y_theme_pred), 3))

accuracy_GROUP:  0.426


~~~
22841

SVC
F1_GROUP:  0.717
F1_THEME:  0.512
~~~