In [93]:
import pandas as pd
#F1: 0.725 with lengths=64 count_data=23000
#F1: 0.639 with lengths=256 count_data=2000

#F1_Y_GROUP:  0.604
#F1_Y_THEME:  0.246
# Load the training table; the CSV uses ';' as its field separator.
dataset = pd.read_csv('DataSets/train.csv', sep=';')

In [94]:
import pymorphy3
from razdel import tokenize
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available, then keep the Russian word list.
nltk.download("stopwords")
russian_stopwords = stopwords.words("russian")

# Drop exact duplicate rows, then rows whose incident text equals the
# placeholder string (note: the placeholder literal starts with an apostrophe).
dataset = (
    dataset
    .drop_duplicates()
    .loc[lambda frame: frame['Текст инцидента'] != '\'Сообщение без текста']
)

# Morphological analyzer used by preprocess_text to lemmatize and tag tokens.
morph = pymorphy3.MorphAnalyzer()

# Tokens whose tag contains any of these grammemes are discarded by preprocess_text.
# NOTE(review): presumably punctuation, numbers, unknown tokens, Latin words and
# Roman numerals -- confirm against the pymorphy grammeme reference.
stopTags = ['PNCT', 'NUMB', 'UNKN', 'LATN', 'ROMN']

def preprocess_text(text):
    """Lower-case, tokenize and lemmatize ``text``, dropping unwanted tokens.

    Each token is analysed once with ``morph.parse`` and the first parse is
    used. The token's normal form is kept unless it is listed in
    ``russian_stopwords`` or the parse's tag contains one of ``stopTags``.

    Args:
        text: Raw text. Non-string values (for example ``None`` or the float
            NaN that pandas uses for missing cells) are treated as empty text.

    Returns:
        The surviving normal forms joined by single spaces; ``""`` when
        nothing survives or the input is not a string.
    """
    if not isinstance(text, str):
        # Missing cells would otherwise fail on ``.lower()``.
        return ""

    lemmas = []
    for token in tokenize(text.lower()):
        # Analyse once per token; the analysis is reused for both the
        # stop-word check and the tag check.
        parsed = morph.parse(token.text)[0]
        lemma = parsed.normal_form
        if lemma in russian_stopwords:
            continue
        if any(tag in parsed.tag for tag in stopTags):
            continue
        lemmas.append(lemma)
    return " ".join(lemmas)

def get_themes_code():
    """Map every distinct value of the 'Тема' column to an integer code.

    Codes are assigned in the order the values are returned by
    ``dataset['Тема'].unique()``, starting at 0.

    Returns:
        dict: ``{theme_value: code}``.
    """
    unique_themes = dataset['Тема'].unique()
    codes = {}
    for position, theme in enumerate(unique_themes):
        codes[theme] = position
    return codes

# Lookup table from each distinct 'Тема' value in `dataset` to its integer code.
themes_code = get_themes_code()

def get_group_code():
    """Map every distinct value of the 'Группа тем' column to an integer code.

    Codes are assigned in the order the values are returned by
    ``dataset['Группа тем'].unique()``, starting at 0.

    Returns:
        dict: ``{group_value: code}``.
    """
    unique_groups = dataset['Группа тем'].unique()
    codes = {}
    for position, group in enumerate(unique_groups):
        codes[group] = position
    return codes

# Lookup table from each distinct 'Группа тем' value in `dataset` to its integer code.
group_code = get_group_code()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\skinn\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [108]:
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

# Work on the first 5000 rows of the cleaned dataset.
data = dataset.iloc[:5000, :]

# Raw incident texts; these are the inputs that get embedded later.
x = data['Текст инцидента']

# Encode both label columns as integer codes.
# Series.map is used instead of Series.replace: replace() on a text column
# relies on pandas silently downcasting the result to integers, which newer
# pandas versions deprecate. The explicit astype keeps the targets integer and
# raises immediately if any label is missing from the lookup table.
y1 = data['Группа тем']
y1 = np.array(y1.map(group_code).astype(np.int64)).reshape(-1, 1)
y2 = data['Тема']
y2 = np.array(y2.map(themes_code).astype(np.int64)).reshape(-1, 1)

# Two-column target matrix: column 0 = group code, column 1 = theme code.
y = np.hstack((y1, y2))

In [109]:
tokenizer = AutoTokenizer.from_pretrained("cointegrated/LaBSE-en-ru")
model = AutoModel.from_pretrained("cointegrated/LaBSE-en-ru")

# One embedding vector per text. The accumulator deliberately is not called
# `list`: binding that name at cell level shadows the builtin for every cell
# executed afterwards in the same kernel.
embeddings = []

for s in x:
    s = preprocess_text(s)
    # A single text is encoded per call, truncated to at most 256 tokens.
    encoded_input = tokenizer(s, padding=True, truncation=True, max_length=256, return_tensors='pt')
    # Inference only: no gradients are needed.
    with torch.no_grad():
        model_output = model(**encoded_input)
    emb = model_output.pooler_output
    # The batch holds exactly one text, so row 0 is its embedding.
    embeddings.append(emb[0].numpy())

# Stack the per-text vectors into one array with a row per text.
x_embeddings = np.asarray(embeddings)

5000

SVC
F1_Y_GROUP:  0.681
F1_Y_THEME:  0.283
F1:  0.482

SVC(kernel='linear', C=1.0)
F1_Y_GROUP:  0.701
F1_Y_THEME:  0.344
F1:  0.522

In [114]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples for evaluation; the seed is fixed for repeatability.
x_train, x_test, y_train, y_test = train_test_split(
    x_embeddings,
    y,
    test_size=0.1,
    random_state=8,
)

# One linear SVC per target column (group, theme).
# Note: this rebinds `model`, which previously held the transformer encoder.
model = MultiOutputClassifier(SVC(kernel='linear', C=1.0))
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

# Weighted F1 per target: column 0 is the group code, column 1 the theme code.
f1_y1 = f1_score(y_test[:, 0], y_pred[:, 0], average='weighted')
f1_y2 = f1_score(y_test[:, 1], y_pred[:, 1], average='weighted')

print('F1_Y_GROUP: ', round(f1_y1, 3))
print('F1_Y_THEME: ', round(f1_y2, 3))
# Overall score: plain average of the two per-target scores.
print('F1: ', round((f1_y1 + f1_y2)/2, 3))

F1_Y_GROUP:  0.701
F1_Y_THEME:  0.344
F1:  0.522
