Так как надо протестировать все модели по несколько раз (без K-Fold, с K-Fold, с K-Fold и SMOTE), можно заранее векторизовать датасет.

In [1]:
import pandas as pd
import os

# Show DataFrames without truncating cell text, the column list or the line width.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

# Every path used below ('data/...', 'models/...', 'vectors/...') and the `src`
# package are resolved against the current working directory, presumably the
# project root one level above this notebook.
# The guard keeps the cell idempotent: re-running it must not climb one more
# directory up, which would break all of those relative paths.
if not os.path.isdir('src'):
    os.chdir('../')

In [2]:
# Load the rule-classified dataset; the path is relative to the current working directory.
df = pd.read_csv('data/processed/rule_classified.csv')

In [3]:
# Keep only the rows that already carry a label in `answer`, renumbered from 0.
df_labeled = df.dropna(subset=['answer']).reset_index(drop=True)

# Class distribution of the labelled subset (shown as the cell output).
df_labeled['answer'].value_counts()

answer
Отказ                                     217
Частично удовлетворено                    138
Обращение рассмотрено                     130
Взыскание обращено                        106
Запрос направлен                           80
Возбуждено исполнительное производство     76
Постановление вынесено                     37
Удовлетворено                              27
Заявления и жалобы рассматриваются         26
Объявлен исполнительный розыск             20
Применены меры для исполнения              19
Запрет действий                            14
Name: count, dtype: int64

In [4]:
from src.classification_utils import split_data
import numpy as np

# split_data is a project helper (src.classification_utils); presumably it returns
# train/test texts and labels in the usual (X_train, X_test, y_train, y_test) order — TODO confirm.
X_train, X_test, y_train, y_test = split_data(df_labeled)
# Registry filled by the cells below: vectorizer name -> (train_path, test_path) of the saved matrices.
vectorizers = {}

In [5]:
from pathlib import Path
import pickle
from scipy.sparse import save_npz

def save_vec_model_pkl(model, model_name):
    """Pickle a vectorization model to ``models/vectorization/{model_name}_model.pkl``.

    Args:
        model: Any picklable object (e.g. a fitted vectorizer).
        model_name: Short name used as the file-name prefix.

    The target directory is created when it is missing (the same way
    ``save_vec_np_arrays`` prepares ``vectors/``), so the call also works on a
    fresh checkout where ``models/vectorization`` does not exist yet.
    """
    model_dir = Path('models/vectorization')
    model_dir.mkdir(parents=True, exist_ok=True)
    with open(model_dir / f'{model_name}_model.pkl', 'wb') as f:
        pickle.dump(model, f)

def save_vec_np_arrays(X_train, X_test, model_name, np_type):
    """Write the train/test matrices of one vectorizer under ``vectors/{model_name}/``.

    Args:
        X_train: Training matrix (scipy sparse for 'npz', array-like for 'npy').
        X_test: Test matrix of the same kind.
        model_name: Name of the sub-folder and file-name suffix.
        np_type: 'npz' (saved with scipy ``save_npz``) or 'npy' (saved with ``np.save``).

    Returns:
        Tuple ``(train_path, test_path)`` of the written files, as strings.

    Raises:
        ValueError: If ``np_type`` is neither 'npz' nor 'npy'. The folder has
            already been created at that point.
    """
    Path(f'vectors/{model_name}').mkdir(parents=True, exist_ok=True)

    # Guard clause: reject unknown formats before choosing a writer.
    if np_type not in ('npz', 'npy'):
        raise ValueError('np_type должен быть "npz" или "npy"')

    # The extension equals the format name, so a single path template covers both cases.
    writer = save_npz if np_type == 'npz' else np.save
    train_path = f'vectors/{model_name}/X_train_{model_name}.{np_type}'
    test_path = f'vectors/{model_name}/X_test_{model_name}.{np_type}'
    writer(train_path, X_train)
    writer(test_path, X_test)

    return train_path, test_path

### TF-IDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import save_npz

# Fit TF-IDF on the training texts only (uni- and bigrams, at most 1000 features),
# then apply the fitted vocabulary to the test texts.
tf_idf_model = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
X_train_tf_idf = tf_idf_model.fit_transform(X_train)
X_test_tf_idf = tf_idf_model.transform(X_test)

# Persist the fitted vectorizer and both matrices (saved in .npz format).
save_vec_model_pkl(tf_idf_model, 'tf_idf')
X_train_tf_idf_path, X_test_tf_idf_path = save_vec_np_arrays(X_train_tf_idf, X_test_tf_idf, 'tf_idf', 'npz')

# Register the saved matrix paths under the vectorizer name.
vectorizers['tf_idf'] = (X_train_tf_idf_path, X_test_tf_idf_path)

### Hashing Vectorizer

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer

# HashingVectorizer needs no fit step: transform() is applied directly to both splits,
# and only the resulting matrices are saved (the model itself is not pickled here).
hashvec_model = HashingVectorizer(n_features=1000, ngram_range=(1, 2))
X_train_hashvec = hashvec_model.transform(X_train)
X_test_hashvec = hashvec_model.transform(X_test)

X_train_hashvec_path, X_test_hashvec_path = save_vec_np_arrays(X_train_hashvec, X_test_hashvec, 'hashvec', 'npz')

# Register the saved matrix paths under the vectorizer name.
vectorizers['hashvec'] = (X_train_hashvec_path, X_test_hashvec_path)

### Word2Vec

In [8]:
from gensim.models import Word2Vec

# Train Word2Vec on the whitespace-tokenized training texts; min_count=1 keeps
# every token in the vocabulary.
# NOTE(review): no seed is passed, so the vectors may differ between runs — confirm whether that matters.
word2vec_model = Word2Vec(sentences=[text.split() for text in X_train], 
                     vector_size=100, window=5, min_count=1)

def text_to_word2vec(texts, model, vector_size=100):
    """Embed each text as the mean of the vectors of its known words.

    Args:
        texts: Iterable of strings; each is tokenized with ``str.split()``.
        model: Object exposing word vectors as ``model.wv`` (membership test
            and ``[]`` lookup). Words missing from ``model.wv`` are skipped.
        vector_size: Fallback dimensionality, used only when ``model.wv`` does
            not expose ``vector_size``.

    Returns:
        2-D ``np.ndarray`` with one row per text. Texts without any known word
        get a zero row; empty ``texts`` yields an array of shape ``(0, dim)``.
    """
    # Size the zero rows by the model itself, so they can never disagree
    # with the averaged rows and make the result ragged.
    dim = getattr(model.wv, 'vector_size', vector_size)
    vectors = []
    for text in texts:
        word_vectors = [model.wv[w] for w in text.split() if w in model.wv]
        if word_vectors:
            vectors.append(np.mean(word_vectors, axis=0))
        else:
            # float32 zero rows never widen the dtype of the averaged rows
            # (a float64 zero row would upcast the whole matrix).
            vectors.append(np.zeros(dim, dtype=np.float32))
    if not vectors:
        # Keep the result 2-D even when there is nothing to embed.
        return np.empty((0, dim), dtype=np.float32)
    return np.array(vectors)

# Average the word vectors of every text; the fallback size is read from the
# model instead of repeating the literal used when the model was created.
X_train_word2vec = text_to_word2vec(X_train, word2vec_model, word2vec_model.wv.vector_size)
X_test_word2vec = text_to_word2vec(X_test, word2vec_model, word2vec_model.wv.vector_size)

# Persist the trained model too (as is done for TF-IDF and Doc2Vec), so that
# new texts can later be embedded with exactly the same vectors.
save_vec_model_pkl(word2vec_model, 'word2vec')
X_train_word2vec_path, X_test_word2vec_path = save_vec_np_arrays(X_train_word2vec, X_test_word2vec, 'word2vec', 'npy')

# Register the saved matrix paths under the vectorizer name.
vectorizers['word2vec'] = (X_train_word2vec_path, X_test_word2vec_path)

Можно было бы обучить модель отдельно (вариант skip-gram ниже), но датасет для этого слишком маленький, поэтому ячейка ниже отключена.

In [10]:
%%script cmd /c ""
from gensim.models import Word2Vec

# Word2Vec trained on the training texts (skip-gram: sg=1)
tokenized_train = [text.split() for text in X_train]

# min_count=2 drops tokens that occur only once in the training texts.
w2v_model = Word2Vec(
    sentences=tokenized_train,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    epochs=10,
    sg=1
)

def text_to_w2v(texts, model):
    """Embed each text as the mean of the vectors of its known words.

    Args:
        texts: Iterable of strings; each is tokenized with ``str.split()``.
        model: Object exposing word vectors as ``model.wv`` (membership test
            and ``[]`` lookup). Words missing from ``model.wv`` are skipped.

    Returns:
        2-D ``np.ndarray`` with one row per text. Texts without any known word
        get a zero row; empty ``texts`` yields an array of shape ``(0, dim)``.
    """
    # Take the zero-row size from the model; 100 is only the fallback for
    # objects that do not expose `vector_size`.
    dim = getattr(model.wv, 'vector_size', 100)
    vectors = []
    for text in texts:
        word_vectors = [model.wv[w] for w in text.split() if w in model.wv]
        if word_vectors:
            vectors.append(np.mean(word_vectors, axis=0))
        else:
            # float32 zero rows never widen the dtype of the averaged rows.
            vectors.append(np.zeros(dim, dtype=np.float32))
    if not vectors:
        # Keep the result 2-D even when there is nothing to embed.
        return np.empty((0, dim), dtype=np.float32)
    return np.array(vectors)

X_train_w2v_fitted = text_to_w2v(X_train, w2v_model)
X_test_w2v_fitted = text_to_w2v(X_test, w2v_model)

# Create exactly the folder the files are written to: np.save does not create
# directories, and no `create_vectors_folder` helper is defined in this notebook.
Path('vectors/w2v').mkdir(parents=True, exist_ok=True)

np.save('vectors/w2v/X_train_w2v_fitted.npy', X_train_w2v_fitted)
np.save('vectors/w2v/X_test_w2v_fitted.npy', X_test_w2v_fitted)

# NOTE(review): the key is spelled 'word2vec_fitten' (compare 'fasttext_fitted');
# it is kept unchanged in case something already looks it up — confirm and rename.
vectorizers['word2vec_fitten'] = ('vectors/w2v/X_train_w2v_fitted.npy', 'vectors/w2v/X_test_w2v_fitted.npy')

### Doc2Vec

In [9]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Each training text becomes a TaggedDocument tagged with its position in X_train (as a string).
tagged_train = [TaggedDocument(words=text.split(), tags=[str(i)]) for i, text in enumerate(X_train)]

# Build the vocabulary and train explicitly for the configured number of epochs.
# NOTE(review): no seed is passed, so training may not be reproducible between runs — confirm.
doc2vec_model = Doc2Vec(vector_size=100, min_count=1, epochs=40)
doc2vec_model.build_vocab(tagged_train)
doc2vec_model.train(tagged_train, total_examples=doc2vec_model.corpus_count, epochs=doc2vec_model.epochs)

def text_to_doc2vec(texts, model):
    """Infer one document vector per text.

    Args:
        texts: Iterable of strings; each is tokenized with ``str.split()``.
        model: Object providing ``infer_vector(list_of_tokens)``.

    Returns:
        ``np.ndarray`` built from the inferred vectors, in input order.
    """
    return np.array([model.infer_vector(doc.split()) for doc in texts])

# Both splits are embedded through infer_vector (via text_to_doc2vec), including the training texts.
X_train_doc2vec = text_to_doc2vec(X_train, doc2vec_model)
X_test_doc2vec = text_to_doc2vec(X_test, doc2vec_model)

# Persist the trained model and both dense matrices (.npy).
save_vec_model_pkl(doc2vec_model, 'doc2vec')
X_train_doc2vec_path, X_test_doc2vec_path = save_vec_np_arrays(X_train_doc2vec, X_test_doc2vec, 'doc2vec', 'npy')

# Register the saved matrix paths under the vectorizer name.
vectorizers['doc2vec'] = (X_train_doc2vec_path, X_test_doc2vec_path)

### FastText предобученный

In [None]:
from gensim.models.fasttext import load_facebook_model

# Download cc.ru.300.bin from https://fasttext.cc/docs/en/crawl-vectors.html
# and put it into models/vectorization/ before running this cell.
fasttext_model = load_facebook_model('models/vectorization/cc.ru.300.bin')

def text_to_fasttext(texts, model, vector_size=100):
    """Embed each text as the mean of its FastText word vectors.

    Args:
        texts: Iterable of strings; each is tokenized with ``str.split()``.
        model: Object exposing word vectors as ``model.wv[word]``. Every token
            is looked up directly, without a membership test.
        vector_size: Fallback dimensionality, used only when ``model.wv`` does
            not expose ``vector_size``.

    Returns:
        2-D ``np.ndarray`` with one row per text. Texts without tokens get a
        zero row; empty ``texts`` yields an array of shape ``(0, dim)``.
    """
    # Size the zero rows by the model's real dimensionality. With the
    # pretrained cc.ru.300 vectors (300-d per the fastText docs) a zero row of
    # the default size 100 would make the rows ragged and break np.array().
    dim = getattr(model.wv, 'vector_size', vector_size)
    vectors = []
    for text in texts:
        word_vectors = [model.wv[w] for w in text.split()]
        if word_vectors:
            vectors.append(np.mean(word_vectors, axis=0))
        else:
            # float32 zero rows never widen the dtype of the averaged rows.
            vectors.append(np.zeros(dim, dtype=np.float32))
    if not vectors:
        # Keep the result 2-D even when there is nothing to embed.
        return np.empty((0, dim), dtype=np.float32)
    return np.array(vectors)

# Pass the dimensionality of the loaded model rather than a literal 100:
# the zero rows used for texts without tokens must have the same length as the
# averaged word vectors, otherwise the rows cannot be stacked into one matrix.
X_train_fasttext = text_to_fasttext(X_train, fasttext_model, fasttext_model.wv.vector_size)
X_test_fasttext = text_to_fasttext(X_test, fasttext_model, fasttext_model.wv.vector_size)

X_train_fasttext_path, X_test_fasttext_path = save_vec_np_arrays(X_train_fasttext, X_test_fasttext, 'fasttext', 'npy')

# Register the saved matrix paths under the vectorizer name.
vectorizers['fasttext'] = (X_train_fasttext_path, X_test_fasttext_path)

Можно было бы обучить собственную модель FastText, но датасет для этого слишком маленький, поэтому ячейка ниже отключена.

In [13]:
%%script cmd /c ""
from gensim.models import FastText

# FastText trained on the training texts (skip-gram: sg=1)
tokenized_train = [text.split() for text in X_train]

# min_count=2 drops tokens that occur only once in the training texts.
ft_model = FastText(
    sentences=tokenized_train,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    epochs=10,
    sg=1
)

def text_to_fasttext(texts, model, vector_size=100):
    """Embed each text as the mean of its FastText word vectors.

    Args:
        texts: Iterable of strings; each is tokenized with ``str.split()``.
        model: Object exposing word vectors as ``model.wv[word]``. Every token
            is looked up directly, without a membership test.
        vector_size: Fallback dimensionality, used only when ``model.wv`` does
            not expose ``vector_size``.

    Returns:
        2-D ``np.ndarray`` with one row per text. Texts without tokens get a
        zero row; empty ``texts`` yields an array of shape ``(0, dim)``.
    """
    # Size the zero rows by the model itself, so a `vector_size` argument that
    # disagrees with the model cannot produce rows of different length.
    dim = getattr(model.wv, 'vector_size', vector_size)
    vectors = []
    for text in texts:
        word_vectors = [model.wv[w] for w in text.split()]
        if word_vectors:
            vectors.append(np.mean(word_vectors, axis=0))
        else:
            # float32 zero rows never widen the dtype of the averaged rows.
            vectors.append(np.zeros(dim, dtype=np.float32))
    if not vectors:
        # Keep the result 2-D even when there is nothing to embed.
        return np.empty((0, dim), dtype=np.float32)
    return np.array(vectors)

X_train_ft_fitted = text_to_fasttext(X_train, ft_model, 100)
X_test_ft_fitted = text_to_fasttext(X_test, ft_model, 100)

# Create the target folder explicitly: np.save does not create directories,
# and no `create_vectors_folder` helper is defined in this notebook.
Path('vectors/ft').mkdir(parents=True, exist_ok=True)

np.save('vectors/ft/X_train_ft_fitted.npy', X_train_ft_fitted)
np.save('vectors/ft/X_test_ft_fitted.npy', X_test_ft_fitted)
# ft_model.save('fasttext_fitted.model')

# Register the saved matrix paths under the vectorizer name.
vectorizers['fasttext_fitted'] = ('vectors/ft/X_train_ft_fitted.npy', 'vectors/ft/X_test_ft_fitted.npy')

### Navec (GloVe)

Моделей новее, чем RusVectores и Navec (2020 год), нет. Navec показывает лучшие метрики при меньшем размере.

In [None]:
from navec import Navec

# Download the archive from https://github.com/natasha/navec
# and put it into models/vectorization/ before running this cell.
navec_path = 'models/vectorization/navec_hudlit_v1_12B_500K_300d_100q.tar'
navec = Navec.load(navec_path)

def text_to_navec(texts, navec_model):
    """Embed each text as the mean of the Navec vectors of its known words.

    Args:
        texts: Iterable of strings; each is tokenized with ``str.split()``.
        navec_model: Object supporting ``word in navec_model`` and
            ``navec_model[word]``. Unknown words are skipped.

    Returns:
        2-D ``np.ndarray`` with one row per text. Texts without any known word
        get a zero row; empty ``texts`` yields an array of shape ``(0, dim)``.
    """
    def _dim():
        # Resolved lazily, only when a zero row is actually needed.
        # `dims` is the attribute this function has always read and is kept as
        # the first choice; `pq.dim` is the fallback.
        # NOTE(review): navec appears to expose the size as `pq.dim` rather
        # than `dims` — confirm against the installed version.
        dim = getattr(navec_model, 'dims', None)
        return navec_model.pq.dim if dim is None else dim

    vectors = []
    for text in texts:
        word_vectors = [navec_model[w] for w in text.split() if w in navec_model]
        if word_vectors:
            vectors.append(np.mean(word_vectors, axis=0))
        else:
            # float32 zero rows never widen the dtype of the averaged rows.
            vectors.append(np.zeros(_dim(), dtype=np.float32))
    if not vectors:
        # Keep the result 2-D even when there is nothing to embed.
        return np.empty((0, _dim()), dtype=np.float32)
    return np.array(vectors)

# Average the Navec vectors of the known words of every text.
X_train_navec = text_to_navec(X_train, navec)
X_test_navec = text_to_navec(X_test, navec)

X_train_navec_path, X_test_navec_path = save_vec_np_arrays(X_train_navec, X_test_navec, 'navec', 'npy')

# Note: the registry key is 'glove' although the files are stored under 'navec'.
vectorizers['glove'] = (X_train_navec_path, X_test_navec_path)

### ruBERT (проблема: нет популярной uncased-модели)

In [11]:
from sentence_transformers import SentenceTransformer

# Load the checkpoint through sentence-transformers and encode the raw texts directly
# (no manual tokenization or averaging in this cell).
rusbert_model = SentenceTransformer('sergeyzh/rubert-mini-uncased')

# The splits are materialized as plain lists before being passed to encode().
X_train_rusbert = rusbert_model.encode(list(X_train), show_progress_bar=True)
X_test_rusbert = rusbert_model.encode(list(X_test), show_progress_bar=True)

# Note: the variables are named *_rusbert, while the folder and the registry key are 'sbert'.
X_train_rusbert_path, X_test_rusbert_path = save_vec_np_arrays(X_train_rusbert, X_test_rusbert, 'sbert', 'npy')

vectorizers['sbert'] = (X_train_rusbert_path, X_test_rusbert_path)

Batches:   0%|          | 0/20 [00:00<?, ?it/s]

Batches:   0%|          | 0/9 [00:00<?, ?it/s]

### Сохранение

In [None]:
import pickle

# Persist the registry (vectorizer name -> saved matrix paths) for later notebooks.
with open('vectors/vectorizers.pkl', 'wb') as registry_file:
    pickle.dump(vectorizers, registry_file)