## Решение на основе нейронных сетей (word2vec)

In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle
from collections import Counter

In [2]:
paths = {
    'MODELS':'models',
    'LR':os.path.join('models','LogisticRegression'),
    'FASTTEXT':os.path.join('models','fasttext'),
    'VECTOR':os.path.join('models','vector'),
    'DATA':'data',
    'TRAIN':os.path.join('data','train'),
    'TEST':os.path.join('data','test'),
    'UTILS':'utils',
}

### Загрузка предобработанных данных

In [None]:
df_gensim = pd.read_csv(os.path.join(paths['DATA'],'gensim.csv'), index_col=0).dropna()

In [3]:
df_cleaner = pd.read_csv(os.path.join(paths['DATA'],'cleaner.csv'), index_col=0).dropna()

### Формирование тестовой и обучающей выборки с учетом несбалансированности классов

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
X_train_gensim, X_test_gensim, y_train_gensim, y_test_gensim = train_test_split(df_gensim.text, 
                                                                                df_gensim.topic,  
                                                                                test_size=0.2, 
                                                                                stratify=df_gensim.topic,
                                                                                random_state=11)

In [5]:
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(df_cleaner.text,
                                                                            df_cleaner.topic,
                                                                            test_size=0.2, 
                                                                            stratify=df_cleaner.topic, 
                                                                            random_state=11)

In [6]:
from gensim.utils import simple_preprocess
result = []
for text in tqdm(X_train_clean.values):
    result.append(text.split(' '))


100%|███████████████████████████████████████████████████████████████████████| 592874/592874 [00:31<00:00, 19055.31it/s]


In [7]:
output_len = len(np.unique(y_train_clean))

#### word2vec

In [8]:
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api 

In [12]:
russuia_w2v = api.load('word2vec-ruscorpora-300')

In [13]:
from collections import defaultdict
import re
def get_vocab(words:list, vectors:list)->dict:
    dct = defaultdict(str)
    print(len(words),len(vectors))
    for word, vector in zip(words,vectors):
        key = re.sub('_[A-Z]*','',word)
        dct[key] = vector
    return dct
    

In [14]:
dct_words = get_vocab(russuia_w2v.index_to_key,russuia_w2v.vectors)
dct_idx = {name:idx for idx, name  in enumerate(dct_words)}

184973 184973


In [15]:
np.asarray([1,2,3])

array([1, 2, 3])

In [16]:
with open(os.path.join(paths['UTILS'],'dct_words.bin'),'wb') as file:
    pickle.dump(dct_words,file)

In [18]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding
from sklearn.preprocessing import OneHotEncoder

In [19]:
def tokenizer(text: str, dct_idx:dict)->np.array:
    result = []
    for word in text.split():
        value = dct_idx.get(word, None)
        if value is not None:
            result.append(value)
    return np.asarray(result)
            
    

In [20]:
def get_w2v_sentence(text:str, dct_vocab:dict):
    result = []
    for word in text.split()[-100:]:
        try:
            vector = dct_vocab[word]
            if not isinstance(vector,str):
                result.append(vector)
        except KeyError :
            result.append([])
            
    return np.asarray(result, dtype=object).flatten()
    # return np.array(result).flatten()
        

In [21]:
embending_matrix = np.zeros(shape=(len(dct_words),300))
for idx, word in tqdm(enumerate(dct_words)):
    embed_vector = dct_words.get(word,None)
    if embed_vector is not None:
        embending_matrix[idx] = embed_vector

183533it [00:00, 195170.64it/s]


In [33]:
y_train_1 = {name:i for i, name in enumerate(np.unique(y_train_clean))}
y_train_2 = {str(idx):name for name, idx in y_train_1.items()}
y_train = np.asarray([y_train_1[item] for item in y_train_clean])
y_test = np.asarray([y_train_1[item] for item in y_test_clean])
y_train = keras.utils.to_categorical(y_train).astype(np.int16)
y_test = keras.utils.to_categorical(y_test).astype(np.int16)

In [25]:
X_train = []
for text in tqdm(X_train_clean):
    vector_sentence = tokenizer(text, dct_idx)
    X_train.append(vector_sentence)
X_train = keras.preprocessing.sequence.pad_sequences(X_train,maxlen=100)

100%|████████████████████████████████████████████████████████████████████████| 592874/592874 [01:57<00:00, 5034.94it/s]


In [26]:
X_test = []
for text in tqdm(X_test_clean):
    vector_sentence = tokenizer(text, dct_idx)
    X_test.append(vector_sentence)
X_test = keras.preprocessing.sequence.pad_sequences(X_test,maxlen=100)

100%|███████████████████████████████████████████████████████████████████████| 148219/148219 [00:10<00:00, 14745.96it/s]


In [27]:
VOCAB_SIZE = len(dct_words)
VECTOR_SIZE =  russuia_w2v.vector_size
MAX_LENGTH = 100
OUTPUT_LEN = y_train.shape[1]

In [28]:
embedding_layers = Embedding(input_dim=VOCAB_SIZE,
                             output_dim=VECTOR_SIZE,
                             input_length=MAX_LENGTH,
                             embeddings_initializer=keras.initializers.Constant(embending_matrix))

In [45]:
X_train.shape

(592874, 100)

In [48]:
modelGRU = keras.models.Sequential()
modelGRU.add(embedding_layers)
modelGRU.add(keras.layers.GRU(1024))
modelGRU.add(keras.layers.Dense(OUTPUT_LEN, activation='softmax'))
modelGRU.compile(
    optimizer=keras.optimizers.RMSprop(),
    loss=keras.losses.CategoricalCrossentropy(),
    metrics=keras.metrics.CategoricalAccuracy(),
)

In [49]:
modelGRU.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 300)          55059900  
                                                                 
 gru_2 (GRU)                 (None, 1024)              4073472   
                                                                 
 dense_2 (Dense)             (None, 15)                15375     
                                                                 
Total params: 59,148,747
Trainable params: 59,148,747
Non-trainable params: 0
_________________________________________________________________


In [50]:
ch_point = tf.keras.callbacks.ModelCheckpoint(
    os.path.join(paths['MODELS'],'Word_vect','GRU_2.hdf5'),
    monitor='val_categorical_accuracy',
    save_best_only=True,
    save_weights_only=False,
    mode='max',
    save_freq='epoch',
)
erly_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_categorical_accuracy',
    min_delta=0.01,
    patience=2,
    mode='max',
)

In [None]:
modelGRU.fit(
    x=X_train,
    y=y_train, 
    epochs=5, 
    batch_size=64,
    validation_data=(X_test,y_test),
    callbacks=[ch_point,erly_stop])

In [38]:
predict = modelGRU.predict(X_test)



In [39]:
np.argmax(predict, axis=1)

array([5, 2, 5, ..., 9, 6, 7], dtype=int64)

In [40]:
np.argmax(y_test, axis=1)

array([5, 2, 5, ..., 7, 6, 7], dtype=int64)

In [41]:
from sklearn.metrics import classification_report

In [42]:
y_true = np.asarray([y_train_2[str(idx)] for idx in np.argmax(y_test, axis=1)])

In [43]:
y_predict = np.asarray([y_train_2[str(idx)] for idx in np.argmax(predict, axis=1)])

In [44]:
print(classification_report(y_true,y_predict))

                   precision    recall  f1-score   support

           Бизнес       0.57      0.46      0.51      1480
      Бывший_СССР       0.78      0.72      0.75      7094
              Дом       0.84      0.75      0.79      4347
         Из_жизни       0.75      0.47      0.58      5521
   Интернет_и_СМИ       0.77      0.69      0.73      8933
         Культура       0.85      0.89      0.87     10759
              Мир       0.81      0.82      0.81     27324
  Наука_и_техника       0.81      0.85      0.83     10627
      Путешествия       0.66      0.63      0.65      1281
           Россия       0.77      0.84      0.81     32088
Силовые_структуры       0.68      0.50      0.58      3919
            Спорт       0.96      0.96      0.96     12883
          Украина       0.85      0.83      0.84      4504
         Ценности       0.89      0.84      0.86      1553
        Экономика       0.80      0.87      0.83     15906

         accuracy                           0.81    14