### Урок 5. Сверточные нейронные сети для анализа текста.#

### -- Автор: Шенк Евгений Станиславович

In [1]:
import numpy as np
import pandas as pd
import keras
from sklearn.metrics import roc_auc_score
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D, MaxPooling1D, BatchNormalization, Masking, InputLayer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import TensorBoard
from tensorflow.keras import optimizers
from keras.objectives import categorical_crossentropy
from tensorflow.keras.callbacks import EarlyStopping  
from gensim.models import Word2Vec, FastText

Using TensorFlow backend.


In [2]:
device = tf.device('cpu')  #'cpu' 'gpu'

### Выполнение:   
Максимальный roc_auc_score который удалось получить на валидационной выборке = 0.87.  
Обучение на эмбедингах Word2Vec дает похожий результат, но при обучении эмбедингов (модели M2V) на изначальных твитах (без пред обработки), при обучении на обработанных данных качество сильно падало. FastText отработал чуть хуже W2V.

#### Задание 1. Учим conv сеть для классификации - выбить auc выше 0.95

In [3]:
max_words = 50000
max_len = 50
num_classes = 1

# Training
epochs = 50
batch_size = 64  #64
print_batch_n = 100

In [4]:
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")
df_val = pd.read_csv("../data/val.csv")

### Предобработка

In [5]:
from string import punctuation
from stop_words import get_stop_words
from pymorphy2 import MorphAnalyzer
import re

In [6]:
sw = set(get_stop_words("ru"))
exclude = set(punctuation)
morpher = MorphAnalyzer()

def preprocess_text(txt):
    txt = str(txt)
    txt = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', 'URL', txt) # заменяем URL
    txt = re.sub('@[^\s]+', 'USER', txt) # заменяем username
    txt = "".join(c if c not in exclude else " " for c in txt) # убираем пунктуацию
    txt = txt.lower()
    txt = re.sub("не\s", "не", txt)
    txt = re.sub("\s{2,}", " ", txt) # убираем лишние пробелы
    # Не удаляем стоп-слова, т.к. качество падает (в некоторых твитах все слова оказываются стоп-словами)
    txt = [morpher.parse(word)[0].normal_form for word in txt.split()]  # if word not in sw 
    return " ".join(txt)

df_train['text'] = df_train['text'].apply(preprocess_text)
df_val['text'] = df_val['text'].apply(preprocess_text)
df_test['text'] = df_test['text'].apply(preprocess_text)

In [7]:
#df_train.to_csv("../data/train_2.csv", index=False)
#df_val.to_csv("../data/val_2.csv", index=False)
#df_test.to_csv("../data/test_2.csv", index=False)

In [8]:
#df_train.fillna('', inplace=True)
#df_val.fillna('', inplace=True)
#df_test.fillna('', inplace=True)

train_corpus = " ".join(df_train["text"])
train_corpus = train_corpus.lower()

In [9]:
import nltk
from nltk.tokenize import word_tokenize, TweetTokenizer
# nltk.download("punkt")

# tokens = word_tokenize(train_corpus)
tweet_tokenizer = TweetTokenizer()  # Используем TweetTokenizer, но на качество по-моему не влияет
tokens = tweet_tokenizer.tokenize(train_corpus)

Отфильтруем данные

и соберём в корпус N наиболее частых токенов

In [10]:
tokens_filtered = [word for word in tokens if word.isalnum()]

In [11]:
from nltk.probability import FreqDist
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]

In [12]:
tokens_filtered_top[:10]

['user', 'я', 'и', 'в', 'что', 'rt', 'на', 'а', 'url', 'с']

In [13]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}

### Model

In [14]:
def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0]*(maxlen-len(result))
    return padding + result[-maxlen:]

In [15]:
x_train = np.asarray([text_to_sequence(text, max_len) for text in df_train["text"]], dtype=np.int32)
x_test = np.asarray([text_to_sequence(text, max_len) for text in df_test["text"]], dtype=np.int32)
x_val = np.asarray([text_to_sequence(text, max_len) for text in df_val["text"]], dtype=np.int32)

In [16]:
x_train.shape

(181467, 50)

In [17]:
max_len

50

In [18]:
x_train[1]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     6,     1,
         304,     3,  7078,    16,     4,   572,    32,    13,   254,
          41,    63,     3,   119, 29466])

In [19]:
num_classes = 2
y_train = keras.utils.to_categorical(df_train["class"], num_classes)
y_val = keras.utils.to_categorical(df_val["class"], num_classes)

In [20]:
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=64, input_length=max_len))

model.add(Conv1D(64, 3))  # 128
model.add(Activation("relu"))
model.add(GlobalMaxPool1D())
model.add(Dropout(0.1))
model.add(Dense(32))
model.add(Activation("relu"))
model.add(Dense(16))
model.add(Activation("relu"))
model.add(Dense(num_classes))
model.add(Activation('softmax')) # sigmoid  softmax

In [21]:
adam = optimizers.Adam(lr=0.0001)

In [22]:
model.compile(loss='categorical_crossentropy', # loss='binary_crossentropy'  loss='categorical_crossentropy'
              optimizer=adam,
              metrics=['AUC'])

In [23]:
with device:
    tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
    early_stopping=EarlyStopping(monitor='val_loss')  


    history = model.fit(x_train, y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=1,
                        validation_split=0.2,
                        callbacks=[tensorboard, early_stopping])

Epoch 1/50
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 2/50
Epoch 3/50


In [24]:
with device:
    score = model.evaluate(x_val, y_val, batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score[0])
print('Test accuracy:', score[1])



Test score: 0.46322160959243774
Test accuracy: 0.8642386198043823


In [25]:
with tf.device('cpu'):
    result = model.predict(x_val)

In [26]:
roc_auc_score(y_val, result)

0.8642297362020667

In [27]:
# 0.8642297362020667

### Задание 2. Предобучаем word2vec и его эмбединга инициализируем сетку, как влияет на качество?

In [28]:
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")
df_val = pd.read_csv("../data/val.csv")

In [29]:
num_classes = 2
y_train = keras.utils.to_categorical(df_train["class"], num_classes)
y_val = keras.utils.to_categorical(df_val["class"], num_classes)

In [30]:
sentences = df_train["text"].tolist()

In [31]:
size = 300
modelW2V = Word2Vec(sentences=sentences, size=size, window=5, min_count=1, workers= 32, seed = 34)
#modelW2V = FastText(sentences=sentences, size=300, window=5, min_count=1, workers= 32, seed = 34)

In [32]:
modelW2V.train(sentences=sentences, total_examples=len(df_train["text"]), epochs=20)

(101477133, 294208920)

In [33]:
def sentence_to_vec(model, sentence, size):
    size = size
    vec = np.zeros(size)
    vec_len = 0

    for word in sentence:
        try:  # Бывают слова которых нет в словаре
            vec += model[word].reshape((1, size))[0]
            vec_len += 1
        except KeyError:
            pass

    result = vec / vec_len

    return result    

In [34]:
xtrain_w2v = df_train["text"].apply(lambda x: sentence_to_vec(modelW2V, x, size))
xtrain_w2v = np.array(xtrain_w2v.tolist())
xvalid_w2v = df_val['text'].apply(lambda x: sentence_to_vec(modelW2V, x, size))
xvalid_w2v = np.array(xvalid_w2v.tolist())

  


In [35]:
xtrain_w2v = xtrain_w2v.reshape(len(xtrain_w2v), size, 1)
xvalid_w2v = xvalid_w2v.reshape(len(xvalid_w2v), size, 1)

In [36]:
model_w2v = Sequential()

model_w2v.add(Conv1D(64, 3))  # 128
model_w2v.add(Activation("relu"))
model_w2v.add(GlobalMaxPool1D())
model_w2v.add(Dropout(0.1))
model_w2v.add(Dense(32))
model_w2v.add(Activation("relu"))
model_w2v.add(Dense(16))
model_w2v.add(Activation("relu"))
model_w2v.add(Dense(num_classes))
model_w2v.add(Activation('softmax')) # sigmoid  softmax

In [37]:
adam = optimizers.Adam(lr=0.0001)

In [38]:
model_w2v.compile(loss='categorical_crossentropy', # loss='binary_crossentropy'  loss='categorical_crossentropy'
              optimizer=adam,
              metrics=['AUC'])

In [39]:
with device:
    tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
    early_stopping=EarlyStopping(monitor='val_loss')  


    history = model_w2v.fit(xtrain_w2v, y_train,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=1,
                        validation_split=0.2,
                        callbacks=[tensorboard, early_stopping])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50


In [40]:
with device:
    result_w2v = model_w2v.predict(xvalid_w2v)

In [41]:
roc_auc_score(y_val, result_w2v)

0.829396911861511

In [42]:
# 0.8459531688184458