# Спроба з ручною токінізацією та без шару embedding

Імпортуємо необхідні бібліотеки та задаємо `seed` для генератору випадкових величин

In [1]:
import os
import re
import itertools
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras

In [2]:
tf.random.set_seed(17)

Викачуємо датасет IMDB з Keras, де вже зберігаються речення з токінізованими словами.

In [None]:
start_char = 1
oov_char = 2
index_from = 2
(x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data(
    seed=17,
    start_char=start_char,
    oov_char=oov_char,
    index_from=index_from
)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
[1m17464789/17464789[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


Імпортуємо словник зі словами та відповідними їм токенами, та модифікуємо задля використання для нашого датасету

In [None]:
word_index = keras.datasets.imdb.get_word_index(path="imdb_word_index.json")
min_index = min(word_index.values())
word_index["[START]"] = min_index - index_from
word_index["[OOV]"] = min_index - index_from + 1
word_index = dict(
    (word, index + index_from) for (word, index) in word_index.items()
)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
[1m1641221/1641221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


Додаємо зворотній словник, де кожному значенню токена відповідає своє слово

In [None]:
index_word = dict(
    (index, word) for (word, index) in word_index.items()
)

Заміняємо символ пропуску `"br"` на той, що використовується операційною системою


In [None]:
index_word[word_index["br"]] = os.linesep

Створюємо функцію, що перетворює вектор токенів на слово та перевіряємо її роботу

In [None]:
def vector_2_string(vector, index_word_dict):
    string = " ".join(index_word_dict[i] for i in vector)
    return string

In [None]:
print(vector_2_string(x_train[1], index_word))

[START] i began watching this movie with my girl friend and after 5 minutes i was alone 
 
 i succeed to stay until the end it has been a painful experience 
 
 i liked jean hugues anglade but i think that he needed to eat as us and thus he accepted to play in this movie 
 
 there are only 5 characters and the rest could be called 'art' or something that i couldn't express but that i didn't understand at all 
 
 the only worst movie i saw was crash but i'm pretty sure now that i have enough experience to watch it successfully again 
 
 good luck o


Створюємо функцію, що перетворює строку на вектор

In [None]:
def list_of_words(string):
    string = string.replace(os.linesep, " br ")
    filtered_text = (re.sub("[\W_]+", " ", string)).split(" ")
    if filtered_text[-1] == "":
        filtered_text.pop()
    return filtered_text

def string_2_vector(
    string,
    word_index,
    start_sym="[START]",
    oov=2,
):
    string = string.lower()
    words = [start_sym] + list_of_words(string)
    return [word_index.get(word, oov) for word in words]

Перевіряємо написану функцію на довільному реченні

In [None]:
string_2_vector("Hello,\nSkibidi toilet without artificial intelligence", word_index)

[1, 4824, 9, 2, 3479, 208, 4501, 1662]

Визначаємо функції ручного маскування датасету та вектора

In [None]:
def mask_dataset(list_of_vectors, max_words):
    out_array = np.zeros((len(list_of_vectors), max_words))
    for i in range(max_words):
        max_index = len(list_of_vectors[i]) if len(list_of_vectors[i]) < max_words else max_words
        out_array[i, : max_index] = list_of_vectors[i][ : max_index]
    return out_array
def mask_vector(vector, max_words):
    max_index = len(vector) if len(vector) < max_words else max_words
    return np.array(vector[ : max_index])

Створюємо функцію визначення моделі, що складається з 3-х послідовних двонаправлених LSTM шарів та двох повнозвʼязних шарів

In [None]:
def get_model(input_shape, time_steps):
    model = keras.models.Sequential([
        keras.layers.Input(shape=input_shape),
        keras.layers.Bidirectional(keras.layers.LSTM(time_steps, return_sequences=True)),
        keras.layers.Bidirectional(keras.layers.LSTM(time_steps, return_sequences=True)),
        keras.layers.Bidirectional(keras.layers.LSTM(time_steps)),
        keras.layers.Dense(64, activation='relu'),
        keras.layers.Dropout(0.5),
        keras.layers.Dense(1),
    ])
    model.compile(
        loss=keras.losses.BinaryCrossentropy(from_logits=True),
        metrics=['accuracy']
    )
    return model

Проведемо експеримент залежності якості класифікації від максимальної довжини строки

In [None]:
time_steps_list = [16, 32, 64, 128, 256]
batch_size = 8
epochs = 10

In [None]:
for time_steps in time_steps_list:
    print(f"Time steps: {time_steps}")
    x_train_masked = mask_dataset(x_train, time_steps)[:, :, np.newaxis]
    x_test_masked = mask_dataset(x_test, time_steps)[:, :, np.newaxis]
    model = get_model(x_train_masked.shape[1 : ], time_steps)
    model.fit(
        x_train_masked,
        y_train,
        epochs=epochs,
        validation_data=(x_test_masked, y_test),
        verbose=1
    )
    test_loss, test_acc = model.evaluate(x_test_masked, y_test)

    print(f"Test Loss: {test_loss}")
    print(f"Test Accuracy: {test_acc}\n")

Time steps: 16
Epoch 1/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 57ms/step - accuracy: 0.4969 - loss: 0.6934 - val_accuracy: 0.5000 - val_loss: 0.6931
Epoch 2/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 52ms/step - accuracy: 0.4969 - loss: 0.6934 - val_accuracy: 0.5000 - val_loss: 0.6931
Epoch 3/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 47ms/step - accuracy: 0.4969 - loss: 0.6935 - val_accuracy: 0.5000 - val_loss: 0.6931
Epoch 4/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 46ms/step - accuracy: 0.4969 - loss: 0.6932 - val_accuracy: 0.5000 - val_loss: 0.6931
Epoch 5/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 45ms/step - accuracy: 0.4969 - loss: 0.6933 - val_accuracy: 0.5000 - val_loss: 0.6931
Epoch 6/10
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 47ms/step - accuracy: 0.4969 - loss: 0.6932 - val_accuracy: 0.5000 - val_loss: 0.6932
E

KeyboardInterrupt: 



```
Time steps: 16
782/782 ━━━━━━━━━━━━━━━━━━━━ 9s 11ms/step - accuracy: 0.5019 - loss: 0.6931
Test Loss: 0.6931413412094116
Test Accuracy: 0.5

Time steps: 32
782/782 ━━━━━━━━━━━━━━━━━━━━ 16s 20ms/step - accuracy: 0.5019 - loss: 0.6931
Test Loss: 0.6931492686271667
Test Accuracy: 0.5

Time steps: 64
782/782 ━━━━━━━━━━━━━━━━━━━━ 47s 60ms/step - accuracy: 0.5019 - loss: 0.6933
Test Loss: 0.6931658387184143
Test Accuracy: 0.5

Time steps: 128
782/782 ━━━━━━━━━━━━━━━━━━━━ 293s 374ms/step - accuracy: 0.5038 - loss: 0.6935
Test Loss: 0.6932134628295898
Test Accuracy: 0.5002800226211548
```



При зміні довжини вектору тестова та тренувальна точності залишаються коло 0.5

Спробуємо іншу архітектуру, до якої включені шари енкодеру та ембедінгу.

# Спроба з автоматичною токінізацією та шаром embedding

Імпортуємо модуль з датасетами tensorflow та викачуємо IMDB датасет


In [3]:
import tensorflow_datasets as tfds

In [4]:
dataset, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)
train_dataset, test_dataset = dataset['train'], dataset['test']

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.HY8NK9_1.0.0/imdb_reviews-train.tfrecor…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.HY8NK9_1.0.0/imdb_reviews-test.tfrecord…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/incomplete.HY8NK9_1.0.0/imdb_reviews-unsupervised.…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


Перетасовуємо датасет та завантажуємо у памʼять

In [5]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64

In [6]:
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)

Кодуємо слова відповідним токеном в енкодері

In [7]:
VOCAB_SIZE = 1024
encoder = keras.layers.TextVectorization(max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset.map(lambda text, label: text))

Створюємо нову функцію визначення моделі, де ми можемо налаштовувати кількість відліків, кількість додаткових двонаправлених LSTM шарів, та енкодер

In [8]:
def get_model(time_steps, extra_blstm_layers, encoder):
    model = keras.Sequential()
    model.add(encoder)
    model.add(keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=time_steps,
        mask_zero=True
    ))
    for i in range(extra_blstm_layers):
        model.add(keras.layers.Bidirectional(keras.layers.LSTM(time_steps, return_sequences=True)))
    model.add(keras.layers.Bidirectional(tf.keras.layers.LSTM(time_steps)))
    model.add(keras.layers.Dense(time_steps, activation='relu'))
    model.add(keras.layers.Dense(1))
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
        optimizer=tf.keras.optimizers.Adam(1e-4),
        metrics=['accuracy']
    )
    return model

In [None]:
time_steps_list = [8, 16, 32, 64, 128]
extra_layers_list = [1, 2, 3]

In [None]:
for param in itertools.product(extra_layers_list, time_steps_list):
    print(param)
    model = get_model(param[1], param[0], encoder)
    model.fit(
        train_dataset,
        epochs=10,
        validation_data=test_dataset,
        validation_steps=30
    )
    test_loss, test_acc = model.evaluate(test_dataset)
    print('Test Loss:', test_loss)
    print('Test Accuracy:', test_acc, '\n')

(1, 8)
Epoch 1/10
[1m 37/391[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m13:10[0m 2s/step - accuracy: 0.5039 - loss: 0.6931



```
(1, 8)
Epoch 1/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 804s 2s/step - accuracy: 0.5033 - loss: 0.6931 - val_accuracy: 0.4917 - val_loss: 0.6928
Epoch 2/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 799s 2s/step - accuracy: 0.5020 - loss: 0.6870 - val_accuracy: 0.7417 - val_loss: 0.5148
Epoch 3/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 794s 2s/step - accuracy: 0.7944 - loss: 0.4593 - val_accuracy: 0.8438 - val_loss: 0.3667
Epoch 4/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 789s 2s/step - accuracy: 0.8412 - loss: 0.3684 - val_accuracy: 0.8505 - val_loss: 0.3685
Epoch 5/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 803s 2s/step - accuracy: 0.8488 - loss: 0.3460 - val_accuracy: 0.8484 - val_loss: 0.3425
Epoch 6/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 805s 2s/step - accuracy: 0.8590 - loss: 0.3296 - val_accuracy: 0.8536 - val_loss: 0.3425
Epoch 7/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 798s 2s/step - accuracy: 0.8666 - loss: 0.3169 - val_accuracy: 0.8625 - val_loss: 0.3354
Epoch 8/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 788s 2s/step - accuracy: 0.8692 - loss: 0.3113 - val_accuracy: 0.8630 - val_loss: 0.3240
Epoch 9/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 789s 2s/step - accuracy: 0.8747 - loss: 0.3031 - val_accuracy: 0.8391 - val_loss: 0.3273
Epoch 10/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 815s 2s/step - accuracy: 0.8725 - loss: 0.2984 - val_accuracy: 0.8536 - val_loss: 0.3219
391/391 ━━━━━━━━━━━━━━━━━━━━ 126s 322ms/step - accuracy: 0.8546 - loss: 0.3225
Test Loss: 0.32147249579429626
Test Accuracy: 0.8547199964523315

(1, 16)
Epoch 1/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 898s 2s/step - accuracy: 0.4984 - loss: 0.6917 - val_accuracy: 0.6260 - val_loss: 0.5940
Epoch 2/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 908s 2s/step - accuracy: 0.7717 - loss: 0.4853 - val_accuracy: 0.8349 - val_loss: 0.3783
Epoch 3/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 926s 2s/step - accuracy: 0.8278 - loss: 0.3878 - val_accuracy: 0.8510 - val_loss: 0.3487
Epoch 4/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 924s 2s/step - accuracy: 0.8251 - loss: 0.4118 - val_accuracy: 0.8234 - val_loss: 0.3935
Epoch 5/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 905s 2s/step - accuracy: 0.8442 - loss: 0.3602 - val_accuracy: 0.8505 - val_loss: 0.3465
Epoch 6/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 872s 2s/step - accuracy: 0.8584 - loss: 0.3328 - val_accuracy: 0.8396 - val_loss: 0.3440
Epoch 7/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 856s 2s/step - accuracy: 0.8631 - loss: 0.3263 - val_accuracy: 0.8542 - val_loss: 0.3241
Epoch 8/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 877s 2s/step - accuracy: 0.8672 - loss: 0.3182 - val_accuracy: 0.8536 - val_loss: 0.3258
Epoch 9/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 920s 2s/step - accuracy: 0.8658 - loss: 0.3189 - val_accuracy: 0.8604 - val_loss: 0.3126
Epoch 10/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 907s 2s/step - accuracy: 0.8620 - loss: 0.3253 - val_accuracy: 0.8641 - val_loss: 0.3318
391/391 ━━━━━━━━━━━━━━━━━━━━ 142s 364ms/step - accuracy: 0.8505 - loss: 0.3308
Test Loss: 0.3276016414165497
Test Accuracy: 0.8529199957847595

(1, 32)
Epoch 1/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 1050s 3s/step - accuracy: 0.5238 - loss: 0.6764 - val_accuracy: 0.8094 - val_loss: 0.4422
Epoch 2/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 1046s 3s/step - accuracy: 0.8163 - loss: 0.4019 - val_accuracy: 0.8594 - val_loss: 0.3449
Epoch 3/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 1029s 3s/step - accuracy: 0.8512 - loss: 0.3441 - val_accuracy: 0.8615 - val_loss: 0.3027
Epoch 4/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 1054s 3s/step - accuracy: 0.8553 - loss: 0.3296 - val_accuracy: 0.8396 - val_loss: 0.3491
Epoch 5/10
391/391 ━━━━━━━━━━━━━━━━━━━━ 1032s 3s/step - accuracy: 0.8524 - loss: 0.3299 - val_accuracy: 0.8682 - val_loss: 0.3148
Epoch 6/10
321/391 ━━━━━━━━━━━━━━━━━━━━ 3:03 3s/step - accuracy: 0.8673 - loss: 0.3097
```



Схоже що довжина вхідної строки впливає лише на швидкість навчання. На кінцеву оцінку впливу не помічено. Спробуємо попрацювати з тестовую конфігурацією

In [None]:
model = get_model(1, 32, encoder)
model.fit(
    train_dataset,
    epochs=10,
    validation_data=test_dataset,
    validation_steps=30
)
test_loss, test_acc = model.evaluate(test_dataset)
print('Test Loss:', test_loss)
print('Test Accuracy:', test_acc, '\n')

Epoch 1/10
[1m126/391[0m [32m━━━━━━[0m[37m━━━━━━━━━━━━━━[0m [1m1:55:46[0m 26s/step - accuracy: 0.4990 - loss: 0.6931

Спробуємо з явно позитивним відгуком

In [None]:
sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')
predictions = model.predict(np.array([sample_text]))

Спробуємо з явно негативним відгуком

In [None]:
sample_text = ('The movie is terrible. It is so boring and terribly'
               'filmed that I left the cinema in the middle of it.')
predictions = model.predict(np.array([sample_text]))

Спробуємо з неявно негативним відгуком

In [None]:
sample_text = ('I literally fell asleep in the cinema.')
predictions = model.predict(np.array([sample_text]))

Спробуємо з невизначеним відгуком

In [None]:
sample_text = ('The first code is somewhat contradictive.'
               'The graphics, the animation, and the special effects are incredible!'
               'However, the plot of the film is super boring,'
               'the most boring film I have ever seen in my life.')
predictions = model.predict(np.array([sample_text]))