## Урок 7. Сверточные нейронные сети для анализа текста


### Задание:  
Берем отызывы за лето (из архива с материалами или предыдущего занятия)
1. Учим conv сеть для классификации
2. Рассмотреть 2-а варианта сеточек
 - 2.1 Инициализировать tf.keras.layers.Embedding предобученными векторами взять к примеру с https://rusvectores.org/ru/
 - 2.2 Инициализировать слой tf.keras.layers.Embedding по умолчанию (ну то есть вам ничего не делать с весами)
Сравнить две архитектуры с предобученными весами и когда tf.keras.layers.Embedding обучается сразу со всей сеточкой, что получилось лучше





#### Решение

In [4]:
!pip install pymorphy2
!pip install stop_words

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
import warnings
warnings.filterwarnings("ignore")

from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
from string import punctuation
import re

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalMaxPooling1D, Conv1D, Activation
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
data =  pd.read_excel("/content/отзывы за лето (1).xls")
data.head(5)


Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14
2,5,Отлично все,2017-08-14
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14
4,5,"Очень удобно, работает быстро.",2017-08-14


#### 2.1 Инициализировать tf.keras.layers.Embedding предобученными векторами

Предобработка

In [7]:
df_w2v = data.copy()

In [8]:
# Сокращаем количество классов до 2
df_w2v = df_w2v[df_w2v['Rating'] != 3]
df_w2v['target'] = (df_w2v['Rating'] > 3)*1
df_w2v = df_w2v.drop(['Rating'], axis=1)
print(df_w2v.iloc[0])

Content    It just works!
Date           2017-08-14
target                  1
Name: 0, dtype: object


In [9]:
df_w2v['target'] = df_w2v['target'].astype(int)
df_w2v['target'].value_counts()

1    16724
0     3024
Name: target, dtype: int64

In [10]:
df_train = df_w2v.loc[:4131]
df_val = df_w2v.loc[4132:]

In [11]:
exclude = set(punctuation)
sw = set(get_stop_words("ru"))
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()

    txt = re.sub("\sне", "не", txt)
    
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    txt = [word for word in txt if len(word)>1]
    
    txt = [word for word in txt if word.isalnum()]
    
    return " ".join(txt)

df_train['Content'] = df_train['Content'].apply(preprocess_text)
df_val['Content'] = df_val['Content'].apply(preprocess_text)


Токенизация

In [12]:
train_corpus = " ".join(df_train["Content"])
train_corpus = train_corpus.lower()

In [13]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download("punkt")
from nltk.probability import FreqDist


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [14]:
tokens = word_tokenize(train_corpus)

In [15]:
tokens_filtered = [word for word in tokens if word.isalnum()]

In [16]:
max_words = 2000
max_len = 40

In [17]:
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]


In [18]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}


In [19]:
def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]


In [20]:
x_train = np.asarray([text_to_sequence(text, max_len) for text in df_train["Content"]], dtype=np.int32)
x_val = np.asarray([text_to_sequence(text, max_len) for text in df_val["Content"]], dtype=np.int32)


In [21]:
x_train

array([[   0,    0,    0, ...,  879, 1327, 1328],
       [   0,    0,    0, ...,   13,  531,  386],
       [   0,    0,    0, ...,    0,    0,    5],
       ...,
       [   0,    0,    0, ...,    0,    0,   30],
       [   0,    0,    0, ...,    0,    4,    1],
       [   0,    0,    0, ...,    0,    0,    4]], dtype=int32)

In [22]:
df_train["target"].unique()

array([1, 0])

Создание модели

In [23]:
from tensorflow.keras import utils as np_utils
num_classes = 2
y_train_w2v = np_utils.to_categorical(df_train["target"], num_classes)
y_val_w2v = np_utils.to_categorical(df_val["target"], num_classes)


In [24]:
from gensim.models import Word2Vec


In [25]:
batch_size = 32

In [26]:
modelW2V = Word2Vec(sentences=df_train['Content'].apply(str.split), size=40, window=5, min_count=1, batch_words=batch_size)


In [27]:
vect_idf = TfidfVectorizer()
vect_idf.fit_transform(df_train['Content'])
tfidf = dict(zip(vect_idf.get_feature_names(), vect_idf.idf_))


In [28]:
def get_vect_mean(txt):
    vector_w2v = np.zeros(40)
    n_w2v = 0
    for wrd in txt.split():
        if wrd in modelW2V:
            vector_w2v += modelW2V[wrd]
            n_w2v += 1
    if n_w2v > 0:
        vector_w2v = vector_w2v / n_w2v
    return vector_w2v

In [29]:
from tqdm import tqdm_notebook

arr_vect = []
for txt in tqdm_notebook(df_train['Content']):
    arr_vect.append(get_vect_mean(txt))
    
arr_vect_valid = []
for txt in tqdm_notebook(df_val['Content']):
    arr_vect_valid.append(get_vect_mean(txt))
    
x_train_w2v = np.asarray(arr_vect)    
x_val_w2v = np.asarray(arr_vect_valid)

  0%|          | 0/3950 [00:00<?, ?it/s]

  0%|          | 0/15798 [00:00<?, ?it/s]

In [30]:
model_w2v = tf.keras.Sequential()
model_w2v.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model_w2v.add(Conv1D(128, 3))
model_w2v.add(Activation("relu"))
model_w2v.add(GlobalMaxPooling1D())
model_w2v.add(Dense(10))
model_w2v.add(Activation("relu"))
model_w2v.add(Dense(num_classes))
model_w2v.add(Activation('softmax'))

model_w2v.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 40, 128)           256000    
                                                                 
 conv1d (Conv1D)             (None, 38, 128)           49280     
                                                                 
 activation (Activation)     (None, 38, 128)           0         
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dense (Dense)               (None, 10)                1290      
                                                                 
 activation_1 (Activation)   (None, 10)                0         
                                                        

In [31]:
LEARNING_RATE = 0.0001
optimizer = tf.keras.optimizers.Adam(lr=LEARNING_RATE)

model_w2v.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['AUC'])

tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir='logs/my_model_plus_w2v', 
    write_graph=False, update_freq=100, profile_batch=0)

In [32]:
NUM_EPOCHS = 8
batch_size = 512

In [33]:
%%time

history = model_w2v.fit(
    x_train_w2v, y_train_w2v,
    #batch_size=batch_size,
    epochs=NUM_EPOCHS,
    validation_split=0.1,
    callbacks=[tensorboard_callback])


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
CPU times: user 9.32 s, sys: 1.57 s, total: 10.9 s
Wall time: 20.5 s


In [34]:
loss, accuracy = model_w2v.evaluate(x_train_w2v, y_train_w2v, batch_size=batch_size, verbose=False)
print("Training Loss:  {:.4f}".format(loss))
print("Training Accuracy:  {:.4f}".format(accuracy))
print('\n')
loss, accuracy = model_w2v.evaluate(x_val_w2v, y_val_w2v, batch_size=batch_size, verbose=False)
print("Testing Loss:  {:.4f}".format(loss))
print("Testing Accuracy:  {:.4f}".format(accuracy))


Training Loss:  0.3664
Training Accuracy:  0.8803


Testing Loss:  0.4498
Testing Accuracy:  0.8385


#### 2.2 Модель со слоем tf.keras.layers.Embedding по умолчанию

Предобработка

In [35]:
exclude = set(punctuation)
sw = set(get_stop_words("ru"))
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = "".join(c for c in txt if c not in exclude)
    txt = txt.lower()

    txt = re.sub("\sне", "не", txt)
    
    txt = [morpher.parse(word)[0].normal_form for word in txt.split() if word not in sw]
    txt = [word for word in txt if len(word)>1] # условие "более одного слова в тексте"
    
    return " ".join(txt)

data['text'] = data['Content'].apply(preprocess_text)

In [36]:
data = data[data['Rating'] != 3]
data['target'] = (data['Rating'] > 3)*1


In [37]:
print(data.iloc[0])

Rating                  5
Content    It just works!
Date           2017-08-14
text        it just works
target                  1
Name: 0, dtype: object


In [38]:
df = data.drop(['Content', 'Rating'], axis=1)
print(df.iloc[0])

Date         2017-08-14
text      it just works
target                1
Name: 0, dtype: object


In [39]:
df['target'] = df['target'].astype(int)
df['target'].value_counts()

1    16724
0     3024
Name: target, dtype: int64

In [40]:
df_train, df_val = train_test_split(df, test_size=0.2,random_state=13)

Токенизация

In [41]:
text_corpus_train = df_train['text'].values
text_corpus_val = df_val['text'].values


In [42]:
tokenizer = Tokenizer(num_words=None, 
                     filters='#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                     lower = False, split = ' ')
tokenizer.fit_on_texts(text_corpus_train)

sequences_train = tokenizer.texts_to_sequences(text_corpus_train)
sequences_val = tokenizer.texts_to_sequences(text_corpus_val)

word_count = len(tokenizer.index_word) + 1
training_length = max([len(i.split()) for i in text_corpus_train])

x_train = pad_sequences(sequences_train, maxlen=training_length)
x_val = pad_sequences(sequences_val, maxlen=training_length)

Создание модели

In [43]:
from tensorflow.keras import utils as np_utils
num_classes = 2
y_train = np_utils.to_categorical(df_train["target"], num_classes)
y_val = np_utils.to_categorical(df_val["target"], num_classes)

In [44]:
model = tf.keras.Sequential()
model.add(Embedding(input_dim=word_count, 
                    output_dim=128, 
                    input_length=training_length))
model.add(Conv1D(128, 3))
model.add(Activation("relu"))
model.add(GlobalMaxPooling1D())
model.add(Dense(10))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 131, 128)          1386112   
                                                                 
 conv1d_1 (Conv1D)           (None, 129, 128)          49280     
                                                                 
 activation_3 (Activation)   (None, 129, 128)          0         
                                                                 
 global_max_pooling1d_1 (Glo  (None, 128)              0         
 balMaxPooling1D)                                                
                                                                 
 dense_2 (Dense)             (None, 10)                1290      
                                                                 
 activation_4 (Activation)   (None, 10)                0         
                                                      

In [50]:
LEARNING_RATE = 0.0001
optimizer = tf.keras.optimizers.Adam(lr=LEARNING_RATE)

model.compile(optimizer=optimizer,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

tensorboard_callback = tf.keras.callbacks.TensorBoard(
    log_dir='logs/my_model', 
    write_graph=False, update_freq=100, profile_batch=0)


In [51]:
batch_size=1024
NUM_EPOCHS=8

In [52]:
%%time

history = model.fit(
    x_train, y_train,
    epochs=NUM_EPOCHS,
    validation_data=(x_val,y_val),
    callbacks=[tensorboard_callback])


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
CPU times: user 23.5 s, sys: 2.47 s, total: 26 s
Wall time: 20.5 s


In [53]:
loss, accuracy = model.evaluate(x_train, y_train, batch_size=batch_size, verbose=False)
print("Training Loss:  {:.4f}".format(loss))
print("Training Accuracy:  {:.4f}".format(accuracy))
print('\n')
loss, accuracy = model.evaluate(x_val, y_val, batch_size=batch_size, verbose=False)
print("Testing Loss:  {:.4f}".format(loss))
print("Testing Accuracy:  {:.4f}".format(accuracy))


Training Loss:  0.0548
Training Accuracy:  0.9840


Testing Loss:  0.1973
Testing Accuracy:  0.9177


### Вывод:  

Точность на предобученных эмбедингах ниже