In [1]:
import pandas as pd
import numpy as np
# Load the raw training table; the CSV uses ';' as its field separator.
dataset = pd.read_csv('DataSets/train.csv', sep=';')

In [2]:
import pymorphy3
from razdel import tokenize
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available locally (the recorded
# output shows it reports "already up-to-date" when it is already present).
nltk.download("stopwords")
# Russian stop words as returned by NLTK; preprocess_text compares token
# lemmas against this collection.
russian_stopwords = stopwords.words("russian")

# Clean the raw table:
#   1. drop exact duplicate rows;
#   2. drop the placeholder rows that carry no message text;
#   3. drop every message made of at most two space-separated parts.
dataset = dataset.drop_duplicates()
dataset = dataset[dataset['Текст инцидента'] != '\'Сообщение без текста']

# Step 3 is done with a single vectorised mask instead of re-filtering the
# whole frame once per short message (which was quadratic in the row count).
# `.str.split(' ')` splits on the literal space, exactly like `msg.split(' ')`.
# Non-string cells (e.g. NaN) yield NaN here, compare False and are dropped,
# instead of raising AttributeError as the per-row loop did.
part_counts = dataset['Текст инцидента'].str.split(' ').str.len()
dataset = dataset[part_counts > 2]

# Morphological analyser shared by every preprocess_text call.
morph = pymorphy3.MorphAnalyzer()

# Tokens whose tag contains any of these grammemes are discarded.
# NOTE(review): per the pymorphy tag set these should be punctuation, numbers,
# unparsed tokens, Latin-script tokens and Roman numerals - confirm.
stopTags = ['PNCT', 'NUMB', 'UNKN', 'LATN', 'ROMN']

def preprocess_text(text):
    """Lower-case, tokenise and lemmatise ``text``.

    Each token is analysed once and only the first parse returned by the
    analyser is used. A token is dropped when its lemma is in
    ``russian_stopwords`` or when its tag contains any grammeme listed in
    ``stopTags``.

    Args:
        text: the raw message string.

    Returns:
        The surviving lemmas joined by single spaces (an empty string when
        nothing survives).
    """
    lemmas = []
    for token in tokenize(text.lower()):
        # Parse once per token; the result is reused for both the lemma and
        # the tag check instead of re-running the analyser for each of them.
        first_parse = morph.parse(token.text)[0]
        lemma = first_parse.normal_form
        if lemma in russian_stopwords:
            continue
        if any(tag in first_parse.tag for tag in stopTags):
            continue
        lemmas.append(lemma)
    return " ".join(lemmas)

def get_group_code():
    """Build the label encoding for the 'Группа тем' column.

    Reads the module-level ``dataset`` and returns a dict that maps each
    distinct column value to a consecutive integer starting at 0, numbered
    in the order ``unique()`` returns the values.
    """
    unique_groups = dataset['Группа тем'].unique()
    return {name: code for code, name in enumerate(unique_groups)}

group_codes = get_group_code()

def get_themes_code():
    """Build the label encoding for the 'Тема' column.

    Reads the module-level ``dataset`` and returns a dict that maps each
    distinct column value to a consecutive integer starting at 0, numbered
    in the order ``unique()`` returns the values.
    """
    unique_themes = dataset['Тема'].unique()
    return {name: code for code, name in enumerate(unique_themes)}

themes_codes = get_themes_code()

def get_group_vectors(group_codes):
    """Assign one scalar to every group code.

    The scalars are ``n`` evenly spaced values covering ``[-n, n]``, where
    ``n`` is the number of distinct 'Группа тем' values in the module-level
    ``dataset``. The k-th distinct group (in ``unique()`` order) receives the
    k-th scalar.

    Args:
        group_codes: dict mapping a group name to its integer code.

    Returns:
        dict mapping each integer group code to its scalar.
    """
    unique_groups = dataset['Группа тем'].unique()
    group_count = len(unique_groups)
    spaced_values = np.linspace(-group_count, group_count, group_count)
    return {
        group_codes[name]: value
        for name, value in zip(unique_groups, spaced_values)
    }

group_vectors = get_group_vectors(group_codes)

data = dataset

# Model input: the raw message text.
x = data['Текст инцидента']

# Targets: integer codes for the group and for the theme.
# `.map` performs a plain per-value dictionary lookup. Unlike
# `Series.replace(dict)` it does not depend on pandas implicitly downcasting
# an object column to integers afterwards, so y1 / y2 are numeric arrays that
# scikit-learn accepts as class labels. Every value has a code because both
# dicts were built from the same columns' unique values.
y1 = np.array(data['Группа тем'].map(group_codes))
y2 = np.array(data['Тема'].map(themes_codes))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\skinn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Disabled cell: everything below is a single string literal, so none of it
# executes; the notebook only echoes the string back as the cell output.
# The text inside is the code that lemmatises each message in `x` with
# preprocess_text, embeds it with the "cointegrated/LaBSE-en-ru" model
# (taking pooler_output) and saves the stacked array as
# embeddings_<row count>.npy.
# NOTE(review): presumably this produced the .npy file loaded by the next
# cell - confirm before regenerating.
'''
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("cointegrated/LaBSE-en-ru")
model = AutoModel.from_pretrained("cointegrated/LaBSE-en-ru")

list = []

for s in x:
    s = preprocess_text(s)
    encoded_input = tokenizer(s, padding=True, truncation=True, max_length=256, return_tensors='pt')
    with torch.no_grad():
        model_output = model(**encoded_input)
    emb = model_output.pooler_output
    list.append((emb)[0].numpy())

x_embeddings = np.asarray(list)

#сохранить embeddings
np.save(f'embeddings_{len(x_embeddings)}', x_embeddings)
'''

'\nimport torch\nfrom transformers import AutoTokenizer, AutoModel\n\ntokenizer = AutoTokenizer.from_pretrained("cointegrated/LaBSE-en-ru")\nmodel = AutoModel.from_pretrained("cointegrated/LaBSE-en-ru")\n\nlist = []\n\nfor s in x:\n    s = preprocess_text(s)\n    encoded_input = tokenizer(s, padding=True, truncation=True, max_length=256, return_tensors=\'pt\')\n    with torch.no_grad():\n        model_output = model(**encoded_input)\n    emb = model_output.pooler_output\n    list.append((emb)[0].numpy())\n\nx_embeddings = np.asarray(list)\n\n#сохранить embeddings\nnp.save(f\'embeddings_{len(x_embeddings)}\', x_embeddings)\n'

In [4]:
# Load the precomputed embeddings from disk.
# NOTE(review): the rows are assumed to be in the same order as `x` / `y1` /
# `y2`, and 22554 in the file name is presumably the row count the file was
# saved with - confirm it still matches the cleaned dataset.
x_embeddings = np.load('embeddings_22554.npy')

In [5]:
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Split the embeddings and BOTH label arrays in a single call, so the group
# labels, theme labels and embeddings of each partition are guaranteed to
# describe the same rows. Previously this held only because two independent
# calls happened to share the same random_state.
(x_group_train, x_group_test,
 y_group_train, y_group_test,
 y_theme_train, y_theme_test) = train_test_split(
    x_embeddings, y1, y2, test_size=0.1, random_state=5)

# Theme features = embedding + the numeric group code as one extra trailing
# column.
# NOTE(review): the test features use the true group label (y_group_test),
# not a prediction from the group model, so the theme score assumes the group
# is known at inference time - confirm this is intended.
x_theme_train = np.hstack((x_group_train, y_group_train.reshape(-1, 1)))
x_theme_test = np.hstack((x_group_test, y_group_test.reshape(-1, 1)))

In [6]:
# Train the group classifier on the embeddings and score it on the held-out
# split with the support-weighted F1.
group_model = SVC(C=3).fit(x_group_train, y_group_train)
y_group_pred = group_model.predict(x_group_test)

group_f1 = f1_score(y_group_test, y_group_pred, average='weighted')
print('F1_GROUP: ', round(group_f1, 3))

F1_GROUP:  0.746


In [7]:
# Train the theme classifier on the embedding-plus-group-code features and
# score it on the held-out split with the support-weighted F1.
# NOTE(review): x_theme_test carries the true group label as its last column,
# so this score does not reflect errors made by the group model - confirm
# that is the intended evaluation.
theme_model = SVC(C=4).fit(x_theme_train, y_theme_train)
y_theme_pred = theme_model.predict(x_theme_test)

theme_f1 = f1_score(y_theme_test, y_theme_pred, average='weighted')
print('F1_THEME: ', round(theme_f1, 3))

F1_THEME:  0.555


### F1_GROUP:  0.741
### F1_THEME:  0.565