## Классификация текстовых документов

In [None]:
%tensorflow_version 2.x
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, MaxPooling1D, Conv1D, GlobalMaxPooling1D, Dropout, LSTM, GRU
from tensorflow.keras import utils
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import utils
from tensorflow.keras.preprocessing.text import tokenizer_from_json
import io
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline 

In [None]:
# Vocabulary size limit: passed to Tokenizer(num_words=...) and used as the
# input dimension of every Embedding layer.
# NOTE(review): the fitted word_index printed further down has 15 entries — confirm 14 is the intended cut-off.
num_words = 14
# Length every text is brought to (the maxlen argument of pad_sequences)
max_news_len = 10
# Number of news classes
# NOTE(review): not referenced by any other visible cell; the models use a single sigmoid output.
nb_classes = 2

## Загружаем данные в память

Читаем данные из файла

In [None]:
# Load the training set; the file is read without a header row, so the two
# columns are named explicitly.
train = pd.read_csv('train.csv', header=None, names=['class', 'text'])

In [None]:
# Preview the first five rows of the training frame.
train.head(5)

Unnamed: 0,class,text
0,1,лабораторная цель вывод
1,1,лабораторной цель
2,1,лабораторной цель
3,1,лабораторная введение цель
4,1,лабораторная введение цель


Выделяем данные для обучения

In [None]:
# Text column used as the model input.
# Missing values are replaced with an empty string before the string cast:
# a bare astype(str) turns NaN into the literal text 'nan', which then ends up
# in the tokenizer vocabulary as if it were a real word.
news = train['text'].fillna('').astype(str)

In [None]:
# Preview the first five training texts.
news.head(5)

0       лабораторная цель вывод
1             лабораторной цель
2             лабораторной цель
3    лабораторная введение цель
4    лабораторная введение цель
Name: text, dtype: object

Выделяем правильные ответы

In [None]:
# Target labels: shift the class ids down by one so they start at 0.
y_train = train['class'].sub(1)

In [None]:
# Preview the first five training labels.
y_train.head(5)

0    0
1    0
2    0
3    0
4    0
Name: class, dtype: int64

## Токенизация текста

Создаем токенизатор Keras

In [None]:
# Create the Keras tokenizer, passing the configured vocabulary limit.
tokenizer = Tokenizer(num_words=num_words)

Обучаем токенизатор

In [None]:
# Build the vocabulary from the training texts.
tokenizer.fit_on_texts(news)

# Persist the fitted tokenizer. to_json() already returns a JSON string; it is
# JSON-encoded once more on write, so a reader has to json.load() the file to
# get that string back.
tokenizer_json = tokenizer.to_json()
with io.open('tokenizer.json', mode='w', encoding='utf-8') as f:
    json.dump(tokenizer_json, f, ensure_ascii=False)

Просматриваем словарь токенизатора

In [None]:
# Show the word -> index mapping learned by the tokenizer.
tokenizer.word_index

{'nan': 15,
 'вариант': 4,
 'введение': 7,
 'вывод': 6,
 'доклад': 8,
 'заключение': 10,
 'использованных': 14,
 'источников': 11,
 'лабораторная': 3,
 'лабораторной': 1,
 'лабораторных': 13,
 'оглавление': 12,
 'содержание': 9,
 'ход': 5,
 'цель': 2}

Преобразуем новости в числовое представление

In [None]:
# Convert every training text into a list of integer token indices.
sequences = tokenizer.texts_to_sequences(news)

Просматриваем новости в числовом представлении

In [None]:
# Show one training text next to its integer encoding.
index = 15
print(news[index], sequences[index], sep='\n')

лабораторной цель
[1, 2]


In [None]:
# Reverse lookup: the word stored under index 11.
tokenizer.index_word[11]

'источников'

Ограничиваем длину новостей

In [None]:
# Bring every sequence to max_news_len entries; in the preview output below,
# shorter sequences are filled with zeros on the left.
x_train = pad_sequences(sequences, maxlen=max_news_len)

In [None]:
# Preview the first five padded training sequences.
x_train[:5]

array([[0, 0, 0, 0, 0, 0, 0, 3, 2, 6],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 2],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 2],
       [0, 0, 0, 0, 0, 0, 0, 3, 7, 2],
       [0, 0, 0, 0, 0, 0, 0, 3, 7, 2]], dtype=int32)

## Сверточная нейронная сеть

In [None]:
# 1D convolutional classifier: embedding -> convolution -> global max pooling
# -> dense hidden layer -> single sigmoid output.
model_cnn = Sequential([
    Embedding(input_dim=num_words, output_dim=32, input_length=max_news_len),
    Conv1D(filters=256, kernel_size=5, padding='valid', activation='relu'),
    GlobalMaxPooling1D(),
    Dense(units=128, activation='relu'),
    Dense(units=1, activation='sigmoid'),
])

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
model_cnn.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model_cnn.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 32)            448       
_________________________________________________________________
conv1d (Conv1D)              (None, 6, 256)            41216     
_________________________________________________________________
global_max_pooling1d (Global (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               32896     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 74,689
Trainable params: 74,689
Non-trainable params: 0
_________________________________________________________________


Создаем callback для сохранения нейронной сети на каждой эпохе, если качество работы на проверочном наборе данных улучшилось. Сеть сохраняется в файл `best_model_cnn.h5`

In [None]:
# Checkpoint for the CNN: the model is written to disk only on epochs where
# validation accuracy improves.
model_cnn_save_path = 'best_model_cnn.h5'
checkpoint_callback_cnn = ModelCheckpoint(
    filepath=model_cnn_save_path,
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
)

In [None]:
# Train the CNN for 10 epochs with a validation_split of 0.1, passing the
# checkpoint callback so the best epoch is saved.
# NOTE(review): confirm train.csv is shuffled so the validation part taken from
# x_train/y_train contains both classes.
history_cnn = model_cnn.fit(x_train, 
                            y_train, 
                            epochs=10,
                            validation_split = 0.1,
                            callbacks=[checkpoint_callback_cnn])

Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.96364, saving model to best_model_cnn.h5
Epoch 2/10

Epoch 00002: val_accuracy did not improve from 0.96364
Epoch 3/10

Epoch 00003: val_accuracy did not improve from 0.96364
Epoch 4/10

Epoch 00004: val_accuracy did not improve from 0.96364
Epoch 5/10

Epoch 00005: val_accuracy improved from 0.96364 to 1.00000, saving model to best_model_cnn.h5
Epoch 6/10

Epoch 00006: val_accuracy did not improve from 1.00000
Epoch 7/10

Epoch 00007: val_accuracy did not improve from 1.00000
Epoch 8/10

Epoch 00008: val_accuracy did not improve from 1.00000
Epoch 9/10

Epoch 00009: val_accuracy did not improve from 1.00000
Epoch 10/10

Epoch 00010: val_accuracy did not improve from 1.00000


## Сеть LSTM

In [None]:
# Recurrent classifier: embedding -> LSTM with 16 units -> single sigmoid output.
model_lstm = Sequential([
    Embedding(input_dim=num_words, output_dim=32, input_length=max_news_len),
    LSTM(units=16),
    Dense(units=1, activation='sigmoid'),
])

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
model_lstm.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model_lstm.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 10, 32)            448       
_________________________________________________________________
lstm (LSTM)                  (None, 16)                3136      
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 3,601
Trainable params: 3,601
Non-trainable params: 0
_________________________________________________________________


Создаем callback для сохранения нейронной сети на каждой эпохе, если качество работы на проверочном наборе данных улучшилось. Сеть сохраняется в файл `best_model_lstm.h5`

In [None]:
# Checkpoint for the LSTM: the model is written to disk only on epochs where
# validation accuracy improves.
model_lstm_save_path = 'best_model_lstm.h5'
checkpoint_callback_lstm = ModelCheckpoint(
    filepath=model_lstm_save_path,
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
)

In [None]:
# Train the LSTM for 10 epochs with a validation_split of 0.1, passing the
# checkpoint callback so the best epoch is saved.
# NOTE(review): confirm train.csv is shuffled so the validation part taken from
# x_train/y_train contains both classes.
history_lstm = model_lstm.fit(x_train, 
                              y_train, 
                              epochs=10,
                              validation_split = 0.1,
                              callbacks=[checkpoint_callback_lstm])

Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.96364, saving model to best_model_lstm.h5
Epoch 2/10

Epoch 00002: val_accuracy did not improve from 0.96364
Epoch 3/10

Epoch 00003: val_accuracy did not improve from 0.96364
Epoch 4/10

Epoch 00004: val_accuracy did not improve from 0.96364
Epoch 5/10

Epoch 00005: val_accuracy did not improve from 0.96364
Epoch 6/10

Epoch 00006: val_accuracy did not improve from 0.96364
Epoch 7/10

Epoch 00007: val_accuracy did not improve from 0.96364
Epoch 8/10

Epoch 00008: val_accuracy did not improve from 0.96364
Epoch 9/10

Epoch 00009: val_accuracy improved from 0.96364 to 1.00000, saving model to best_model_lstm.h5
Epoch 10/10

Epoch 00010: val_accuracy did not improve from 1.00000


## Сеть GRU

In [None]:
# Recurrent classifier: embedding -> GRU with 16 units -> single sigmoid output.
model_gru = Sequential([
    Embedding(input_dim=num_words, output_dim=32, input_length=max_news_len),
    GRU(units=16),
    Dense(units=1, activation='sigmoid'),
])

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
model_gru.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model_gru.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 10, 32)            448       
_________________________________________________________________
gru (GRU)                    (None, 16)                2400      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 2,865
Trainable params: 2,865
Non-trainable params: 0
_________________________________________________________________


Создаем callback для сохранения нейронной сети на каждой эпохе, если качество работы на проверочном наборе данных улучшилось. Сеть сохраняется в файл `best_model_gru.h5`

In [None]:
# Checkpoint for the GRU: the model is written to disk only on epochs where
# validation accuracy improves.
model_gru_save_path = 'best_model_gru.h5'
checkpoint_callback_gru = ModelCheckpoint(
    filepath=model_gru_save_path,
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
)

In [None]:
# Train the GRU for 10 epochs with a validation_split of 0.1, passing the
# checkpoint callback so the best epoch is saved.
history_gru = model_gru.fit(
    x=x_train,
    y=y_train,
    epochs=10,
    validation_split=0.1,
    callbacks=[checkpoint_callback_gru],
)

Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.96364, saving model to best_model_gru.h5
Epoch 2/10

Epoch 00002: val_accuracy did not improve from 0.96364
Epoch 3/10

Epoch 00003: val_accuracy did not improve from 0.96364
Epoch 4/10

Epoch 00004: val_accuracy did not improve from 0.96364
Epoch 5/10

Epoch 00005: val_accuracy did not improve from 0.96364
Epoch 6/10

Epoch 00006: val_accuracy improved from 0.96364 to 1.00000, saving model to best_model_gru.h5
Epoch 7/10

Epoch 00007: val_accuracy did not improve from 1.00000
Epoch 8/10

Epoch 00008: val_accuracy did not improve from 1.00000
Epoch 9/10

Epoch 00009: val_accuracy did not improve from 1.00000
Epoch 10/10

Epoch 00010: val_accuracy did not improve from 1.00000


## Полносвязная сеть («кринж»)

In [None]:
# Plain fully-connected baseline; it is later fitted directly on the padded
# index sequences (x_train), without an embedding layer.
# NOTE(review): the hidden Dense layers are created without an activation
# argument — confirm that this is intended.
model = Sequential()
model.add(Dense(max_news_len))
model.add(Dense(2 * max_news_len))
# Integer division: a layer width must be an int, while max_news_len / 2
# evaluates to a float.
model.add(Dense(max_news_len // 2))
model.add(Dense(1, activation='sigmoid'))

In [None]:
# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric.
model.compile(optimizer='adam', 
              loss='binary_crossentropy', 
              metrics=['accuracy'])

In [None]:
# Checkpoint for the dense baseline: the model is written to disk only on
# epochs where validation accuracy improves.
model_save_path = 'best_model.h5'
checkpoint_callback = ModelCheckpoint(
    filepath=model_save_path,
    monitor='val_accuracy',
    verbose=1,
    save_best_only=True,
)

In [None]:
# Train the dense baseline for 10 epochs with a validation_split of 0.1,
# passing the checkpoint callback so the best epoch is saved.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=10,
    validation_split=0.1,
    callbacks=[checkpoint_callback],
)

Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.85455, saving model to best_model.h5
Epoch 2/10

Epoch 00002: val_accuracy improved from 0.85455 to 0.96364, saving model to best_model.h5
Epoch 3/10

Epoch 00003: val_accuracy did not improve from 0.96364
Epoch 4/10

Epoch 00004: val_accuracy did not improve from 0.96364
Epoch 5/10

Epoch 00005: val_accuracy did not improve from 0.96364
Epoch 6/10

Epoch 00006: val_accuracy did not improve from 0.96364
Epoch 7/10

Epoch 00007: val_accuracy did not improve from 0.96364
Epoch 8/10

Epoch 00008: val_accuracy did not improve from 0.96364
Epoch 9/10

Epoch 00009: val_accuracy did not improve from 0.96364
Epoch 10/10

Epoch 00010: val_accuracy did not improve from 0.96364


## Загружаем набор данных для тестирования

In [None]:
# Load the test set; the file is read without a header row, so the two
# columns are named explicitly.
test = pd.read_csv('test.csv', header=None, names=['class', 'text'])

In [None]:
# Preview the first five rows of the test frame.
test.head(5)

Unnamed: 0,class,text
0,1,лабораторной введение введение ход
1,1,лабораторной введение цель ход вывод
2,1,лабораторной введение цель
3,1,лабораторная вариант цель цель цель лабораторн...
4,1,лабораторной цель ход


Преобразуем новости в числовое представление

Обратите внимание, что нужно использовать токенизатор, обученный на наборе данных train.

In [None]:
# Encode the test texts with the tokenizer fitted on the training data.
# The column is converted to strings first (missing values become empty
# strings), as the training text is, so that NaN cells are not handed to the
# tokenizer as floats.
test_sequences = tokenizer.texts_to_sequences(test['text'].fillna('').astype(str))

In [None]:
# Bring every test sequence to max_news_len entries, the same length used for
# the training data.
x_test = pad_sequences(test_sequences, maxlen=max_news_len)

In [None]:
# Preview the first five padded test sequences.
x_test[:5]

array([[0, 0, 0, 0, 0, 0, 1, 7, 7, 5],
       [0, 0, 0, 0, 0, 1, 7, 2, 5, 6],
       [0, 0, 0, 0, 0, 0, 0, 1, 7, 2],
       [0, 3, 4, 2, 2, 2, 1, 2, 4, 5],
       [0, 0, 0, 0, 0, 0, 0, 1, 2, 5]], dtype=int32)

Правильные ответы

In [None]:
# Test labels: shift the class ids down by one, as done for the training labels.
y_test = test['class'].sub(1)

In [None]:
# Preview the first five test labels.
y_test.head(5)

0    0
1    0
2    0
3    0
4    0
Name: class, dtype: int64

## Оцениваем качество работы сети на тестовом наборе данных

### Одномерная сверточная сеть 

In [None]:
# Restore the weights from the file written by the CNN checkpoint callback.
model_cnn.load_weights(model_cnn_save_path)

In [None]:
# Raw sigmoid outputs of the CNN on the test set. Only a short preview is
# printed instead of dumping the whole array into the notebook.
pred1 = model_cnn.predict(x_test)
print(pred1[:10])

# Evaluate last so the cell output is the returned [loss, accuracy] list,
# matching the evaluation cells of the other models.
model_cnn.evaluate(x_test, y_test, verbose=1)



array([[3.69640924e-02],
       [3.80210043e-03],
       [1.06385089e-02],
       [4.16847179e-04],
       [8.56326078e-04],
       [5.85839711e-03],
       [5.72579801e-01],
       [3.02590802e-03],
       [2.22884351e-03],
       [1.33530854e-03],
       [2.45684059e-03],
       [3.69640924e-02],
       [5.08417422e-03],
       [4.67566191e-04],
       [7.00602448e-03],
       [2.00568559e-03],
       [4.51854803e-03],
       [6.96980627e-03],
       [3.69640924e-02],
       [4.11825888e-02],
       [3.69640924e-02],
       [1.06148515e-02],
       [5.69728017e-01],
       [3.69640924e-02],
       [1.45952462e-03],
       [5.32368779e-01],
       [1.11685600e-03],
       [1.33530854e-03],
       [6.59374744e-02],
       [1.11685600e-03],
       [1.06148515e-02],
       [1.33530854e-03],
       [1.47221722e-02],
       [3.69640924e-02],
       [1.33530854e-03],
       [5.18327415e-01],
       [4.11825888e-02],
       [3.21158580e-02],
       [1.33530854e-03],
       [1.33530854e-03],


### Сеть LSTM

In [None]:
# Restore the weights from the file written by the LSTM checkpoint callback.
model_lstm.load_weights(model_lstm_save_path)

In [None]:
# Evaluate on the test set; the cell output is the returned [loss, accuracy] list.
model_lstm.evaluate(x_test, y_test, verbose=1)



[0.12101692706346512, 0.9444444179534912]

### Сеть GRU

In [None]:
# Restore the weights from the file written by the GRU checkpoint callback.
model_gru.load_weights(model_gru_save_path)

In [None]:
# Evaluate on the test set; the cell output is the returned [loss, accuracy] list.
model_gru.evaluate(x_test, y_test, verbose=1)



[0.13232700526714325, 0.9629629850387573]

### Полносвязная сеть («кринж»)

In [None]:
# Restore the weights from the file written by the dense-baseline checkpoint callback.
model.load_weights(model_save_path)

In [None]:
# Evaluate on the test set; the cell output is the returned [loss, accuracy] list.
model.evaluate(x_test, y_test, verbose=1)



[0.1412203460931778, 0.9629629850387573]

## Полезные ссылки

1. [Определение тональности текстов отзывов на сайте YELP одномерной сверточной нейросетью](https://colab.research.google.com/drive/1KWS-4MKKOIG7UhiCA58ZJcxwSLG5F6Wm).
2. [Определение тональности текстов отзывов на сайте YELP сетью LSTM](https://colab.research.google.com/drive/19olgYyZ4N5fh8RIPtHxkso2N5HD1yg0X).
3. [Использование командной строки Linux в Colab](https://colab.research.google.com/drive/1vFGZ2nDS0ukNGXPL-0avK097afYQILyq).