Берем отызывы за лето (из архива с материалами или предыдущего занятия)

In [49]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
import tensorflow as tf

### Загрузка данных

In [50]:
reviews = pd.read_excel('отзывы за лето (1).xls')

In [51]:
reviews.head()

Unnamed: 0,Rating,Content,Date
0,5,It just works!,2017-08-14
1,4,В целом удобноное приложение...из минусов хотя...,2017-08-14
2,5,Отлично все,2017-08-14
3,5,Стал зависать на 1% работы антивируса. Дальше ...,2017-08-14
4,5,"Очень удобно, работает быстро.",2017-08-14


In [52]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20659 entries, 0 to 20658
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Rating   20659 non-null  int64 
 1   Content  20656 non-null  object
 2   Date     20659 non-null  object
dtypes: int64(1), object(2)
memory usage: 484.3+ KB


In [53]:
reviews.Rating.value_counts()

5    14586
1     2276
4     2138
3      911
2      748
Name: Rating, dtype: int64

In [54]:
reviews = reviews[['Content', 'Rating']]
reviews.columns = ['text', 'target']

In [55]:
reviews.head()

Unnamed: 0,text,target
0,It just works!,5
1,В целом удобноное приложение...из минусов хотя...,4
2,Отлично все,5
3,Стал зависать на 1% работы антивируса. Дальше ...,5
4,"Очень удобно, работает быстро.",5


In [56]:
reviews['target'] = reviews['target'].apply(lambda x: (x - 1))

In [57]:
reviews['target'].value_counts()

4    14586
0     2276
3     2138
2      911
1      748
Name: target, dtype: int64

In [58]:
max_words = 200
max_len = 40
num_classes = 1

### Разделение датасета на train и test

In [59]:
X_train, X_test, y_train, y_test = train_test_split(reviews['text'], reviews['target'], 
                                                   test_size=0.3, stratify=reviews['target'], 
                                                   shuffle=True, random_state=42)

### Предобработка текста

In [60]:
from string import punctuation
from pymorphy2 import MorphAnalyzer
from stop_words import get_stop_words
import re

In [61]:
stopwords = set(get_stop_words('ru'))
exclude = set(punctuation)
morpher = MorphAnalyzer()

In [62]:
def preprocess_data(text):
    text = str(text)
    text = ''.join(c for c in text if c not in exclude)
    text = text.lower()
    text = re.sub('\sне', 'не', text)
    text = ' '.join([morpher.parse(word)[0].normal_form for word in text.split() if word not in stopwords])
    return text

In [63]:
X_train = X_train.apply(preprocess_data)
X_test = X_test.apply(preprocess_data)

In [64]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

train_corpus = ' '.join(X_train)

tokens = word_tokenize(train_corpus)

[nltk_data] Downloading package punkt to /Users/macbook/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [65]:
tokens_filtered = [word for word in tokens if word.isalnum()]

In [66]:
from nltk.probability import FreqDist

dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words - 1)]

In [67]:
vocabulary = {v: k for k, v in dict(enumerate(tokens_filtered_top, 1)).items()}

In [68]:
def text_to_sequence(text, maxlen):
    result = []
    tokens = word_tokenize(text.lower())
    tokens_filtered = [word for word in tokens if word.isalnum()]
    for word in tokens_filtered:
        if word in vocabulary:
            result.append(vocabulary[word])
    padding = [0] * (maxlen - len(result))
    return padding + result[-maxlen:]

In [69]:
X_train = np.asarray([text_to_sequence(text, max_len) for text in X_train], dtype=np.int32)
X_test = np.asarray([text_to_sequence(text, max_len) for text in X_test], dtype=np.int32)

### Учим conv сеть для классификации

2.2 Инициализировать слой tf.keras.layers.Embedding по умолчанию (ну то есть вам ничего не делать с весами)
Сравнить две архитектуры с предобученными весами и когда tf.keras.layers.Embedding обучается сразу со всей сеточкой, что получилось лучше

In [72]:
y_test.value_counts()

4    4376
0     683
3     642
2     273
1     224
Name: target, dtype: int64

In [73]:
num_classes = 5
y_train = tf.keras.utils.to_categorical(y_train, num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes)

In [175]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(tf.keras.layers.Conv1D(128, 3))
model.add(tf.keras.layers.Activation('relu'))
model.add(tf.keras.layers.MaxPooling1D())
model.add(tf.keras.layers.Conv1D(64, 16))
model.add(tf.keras.layers.Activation('relu'))
model.add(tf.keras.layers.MaxPool1D())
model.add(tf.keras.layers.Conv1D(32, 1))
model.add(tf.keras.layers.Activation('relu'))
model.add(tf.keras.layers.MaxPooling1D())
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(32))
model.add(tf.keras.layers.Activation('relu'))
model.add(tf.keras.layers.Dense(16))
model.add(tf.keras.layers.Activation('relu'))
model.add(tf.keras.layers.Dense(5))
model.add(tf.keras.layers.Activation('softmax'))

In [176]:
model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

In [153]:
tensorboard = tf.keras.callbacks.TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss')

In [77]:
epochs = 20
batch_size = 512
print_batch_n = 100

history = model.fit(X_train, y_train, 
                    batch_size=batch_size,
                    epochs=epochs, 
                    verbose=1, 
                    validation_split=0.1, 
#                     callbacks=[tensorboard, early_stopping]
                   )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [78]:
metrics_table = []
metrics_table.append(model.evaluate(X_test, y_test))



2. Рассмотреть 2-а варианта сеточек 

2.1 Инициализировать tf.keras.layers.Embedding предобученными векторами взять к примеру с https://rusvectores.org/ru/

In [79]:
# from zipfile import ZipFile
from gensim.models import KeyedVectors

# with ZipFile('181.zip', 'r') as archive:
#     stream = archive.open('model.model')
#     model = KeyedVectors.load_word2vec_format(stream, binary=True)

In [84]:
kv_model = KeyedVectors.load('181/model.model')

In [165]:
reviews['vecs'] = reviews['text'].apply(lambda x: np.array([kv_model[word].astype(np.float32) for word in str(x).split()]))
reviews['vecs'] = reviews['vecs'].apply(lambda x: x.mean(axis=0).astype('float32'))
# reviews['vecs'] = np.array([i for i in reviews['vecs'].values])

In [166]:
X_train, X_test, y_train, y_test = train_test_split(reviews['vecs'], reviews['target'], 
                                                   test_size=0.3, stratify=reviews['target'], 
                                                   shuffle=True, random_state=42)

In [168]:
X_train = np.array([i for i in X_train.values])

In [170]:
X_test = np.array([i for i in X_test.values])

In [171]:
max_words = 200
max_len = 300
num_classes = 1

In [172]:
num_classes = 5
y_train = tf.keras.utils.to_categorical(y_train, num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes)

In [177]:
epochs = 20
batch_size = 512
print_batch_n = 100

history = model.fit(X_train, y_train, 
                    batch_size=batch_size,
                    epochs=epochs, 
                    verbose=1, 
                    validation_split=0.1, 
                    callbacks=[tensorboard, early_stopping]
                   )

Epoch 1/20


In [178]:
metrics_table.append(model.evaluate(X_test, y_test))



In [179]:
metrics_table = pd.DataFrame(metrics_table, index=['default', 'pretrained'])

In [180]:
metrics_table

Unnamed: 0,0,1
default,0.681999,0.76444
pretrained,0.983523,0.706034
