## Решение на основе нейронных сетей (elmo)

In [1]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
import pickle

In [2]:
paths = {
    'MODELS':'models',
    'LR':os.path.join('models','LogisticRegression'),
    'FASTTEXT':os.path.join('models','fasttext'),
    'VECTOR':os.path.join('models','vector'),
    'DATA':'data',
    'TRAIN':os.path.join('data','train'),
    'TEST':os.path.join('data','test'),
    'UTILS':'utils',
}

### Загрузка предобработанных данных

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
!cp 'drive/MyDrive/Colab Notebooks/text/cleaner.csv' data

In [18]:
df_cleaner = pd.read_csv(os.path.join(paths['DATA'],'cleaner.csv'), index_col=0, encoding='utf-8').dropna()

### Формирование тестовой и обучающей выборки с учетом несбалансированности классов

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
X_train_clean, X_test_clean, y_train_clean, y_test_clean = train_test_split(df_cleaner.text,
                                                                            df_cleaner.topic,
                                                                            test_size=0.2, 
                                                                            stratify=df_cleaner.topic, 
                                                                            random_state=11)

#### word2vec

In [21]:
output_len = len(np.unique(y_train_clean))

In [22]:
import gensim.downloader as api 

In [23]:
russuia_w2v = api.load('word2vec-ruscorpora-300')

In [24]:
from collections import defaultdict
import re
def get_vocab(words:list, vectors:list)->dict:
    dct = defaultdict(str)
    print(len(words),len(vectors))
    for word, vector in zip(words,vectors):
        key = re.sub('_[A-Z]*','',word)
        dct[key] = vector
    return dct
    

In [25]:
dct_words = get_vocab(russuia_w2v.index2word,russuia_w2v.vectors)
dct_idx = {name:idx for idx, name  in enumerate(dct_words)}

184973 184973


In [26]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding
from sklearn.preprocessing import OneHotEncoder

In [27]:
def tokenizer(text: str, dct_idx:dict)->np.array:
    result = []
    for word in text.split():
        value = dct_idx.get(word, None)
        if value is not None:
            result.append(value)
    return np.asarray(result)

In [28]:
def get_w2v_sentence(text:str, dct_vocab:dict):
    result = []
    for word in text.split()[-100:]:
        try:
            vector = dct_vocab[word]
            if not isinstance(vector,str):
                result.append(vector)
        except KeyError :
            result.append([])
            
    return np.asarray(result, dtype=object).flatten()

In [29]:
embending_matrix = np.zeros(shape=(len(dct_words),300))
for idx, word in tqdm(enumerate(dct_words)):
    embed_vector = dct_words.get(word,None)
    if embed_vector is not None:
        embending_matrix[idx] = embed_vector
with open('matrix',"wb") as file:
  pickle.dump(embending_matrix,file)

183533it [00:00, 652642.55it/s]


In [30]:
y_train_1 = {name:i for i, name in enumerate(np.unique(y_train_clean))}
y_train_2 = {str(idx):name for name, idx in y_train_1.items()}
y_train = np.asarray([y_train_1[item] for item in y_train_clean])
y_test = np.asarray([y_train_1[item] for item in y_test_clean])
y_train = keras.utils.to_categorical(y_train).astype(np.int16)
y_test = keras.utils.to_categorical(y_test).astype(np.int16)

In [19]:
X_train = []
for text in tqdm(X_train_clean):
    vector_sentence = tokenizer(text, dct_idx)
    X_train.append(vector_sentence)
X_train = keras.preprocessing.sequence.pad_sequences(X_train,maxlen=100)

100%|██████████| 592874/592874 [00:29<00:00, 19899.34it/s]


In [20]:
X_test = []
for text in tqdm(X_test_clean):
    vector_sentence = tokenizer(text, dct_idx)
    X_test.append(vector_sentence)
X_test = keras.preprocessing.sequence.pad_sequences(X_test,maxlen=100)

100%|██████████| 148219/148219 [00:07<00:00, 19984.24it/s]


In [21]:
VOCAB_SIZE = len(dct_words)
VECTOR_SIZE =  russuia_w2v.vector_size
MAX_LENGTH = 100
OUTPUT_LEN = y_train.shape[1]
vars = tuple([VOCAB_SIZE,VECTOR_SIZE,MAX_LENGTH,OUTPUT_LEN])
with open('vars','wb') as file:
  pickle.dump(vars,file)

In [4]:
with open('x_train','rb') as file:
  X_train = pickle.load(file)
with open('x_test','rb') as file:
  X_test = pickle.load(file)
with open('y_train','rb') as file:
  y_train = pickle.load(file)
with open('y_test','rb') as file:
  y_test = pickle.load(file)
with open('vars','rb') as file:
  vars = pickle.load(file)
with open('matrix',"rb") as file:
  embending_matrix = pickle.load(file)
VOCAB_SIZE, VECTOR_SIZE, MAX_LENGTH, OUTPUT_LEN = vars

In [5]:
embedding_layers = Embedding(input_dim=VOCAB_SIZE,
                             output_dim=VECTOR_SIZE,
                             input_length=MAX_LENGTH,
                             embeddings_initializer=keras.initializers.Constant(embending_matrix))

In [6]:
modelGRU = keras.models.Sequential()
modelGRU.add(embedding_layers)
modelGRU.add(keras.layers.GRU(1024))
modelGRU.add(keras.layers.Dense(OUTPUT_LEN, activation='softmax'))
modelGRU.compile(
    optimizer=keras.optimizers.RMSprop(),
    loss=keras.losses.CategoricalCrossentropy(),
    metrics=keras.metrics.CategoricalAccuracy(),
)

In [25]:
with open('x_train','wb') as file:
  pickle.dump(X_train,file)
with open('x_test','wb') as file:
  pickle.dump(X_test,file)
with open('y_train','wb') as file:
  pickle.dump(y_train,file)
with open('y_test','wb') as file:
  pickle.dump(y_test,file) 

In [7]:
modelGRU.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 300)          55059900  
                                                                 
 gru (GRU)                   (None, 1024)              4073472   
                                                                 
 dense (Dense)               (None, 15)                15375     
                                                                 
Total params: 59,148,747
Trainable params: 59,148,747
Non-trainable params: 0
_________________________________________________________________


In [8]:
ch_point = tf.keras.callbacks.ModelCheckpoint(
    os.path.join(paths['MODELS'],'Word_vect','GRU_2.hdf5'),
    monitor='val_categorical_accuracy',
    save_best_only=True,
    save_weights_only=False,
    mode='max',
    save_freq='epoch',
)
erly_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_categorical_accuracy',
    min_delta=0.01,
    patience=2,
    mode='max',
)

In [9]:
modelGRU.fit(
    x=X_train,
    y=y_train, 
    epochs=5, 
    batch_size=512,
    validation_data=(X_test,y_test),
    callbacks=[ch_point,erly_stop])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5


<keras.callbacks.History at 0x7fe9ca4b76d0>

In [14]:
!cp 'models/Word_vect/GRU_2.hdf5' 'drive/MyDrive/Colab Notebooks/text'

In [15]:
from sklearn.metrics import classification_report

In [16]:
predict = modelGRU.predict(X_test)

In [31]:
y_true = np.asarray([y_train_2[str(idx)] for idx in np.argmax(y_test, axis=1)])
y_predict = np.asarray([y_train_2[str(idx)] for idx in np.argmax(predict, axis=1)])
print(classification_report(y_true,y_predict))

                   precision    recall  f1-score   support

           Бизнес       0.65      0.44      0.53      1480
      Бывший_СССР       0.81      0.71      0.76      7094
              Дом       0.81      0.81      0.81      4347
         Из_жизни       0.65      0.58      0.62      5521
   Интернет_и_СМИ       0.78      0.68      0.73      8933
         Культура       0.86      0.90      0.88     10759
              Мир       0.79      0.85      0.82     27324
  Наука_и_техника       0.82      0.84      0.83     10627
      Путешествия       0.73      0.65      0.68      1281
           Россия       0.79      0.84      0.81     32088
Силовые_структуры       0.69      0.53      0.60      3919
            Спорт       0.95      0.97      0.96     12883
          Украина       0.84      0.85      0.85      4504
         Ценности       0.87      0.87      0.87      1553
        Экономика       0.86      0.83      0.84     15906

         accuracy                           0.82    14