## Setup: imports, configuration, and data preparation

In [15]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import Input, Dense, LSTM, Dropout, TextVectorization, Embedding, Bidirectional
from tensorflow import keras
import tensorflow as tf

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re

In [2]:
# Notebook-wide configuration. Everything a reader might tune lives here.

# Number of examples per batch; used by make_ds() when batching the datasets.
BATCH_SIZE = 64

# Width of each word vector. Presumably must match the GloVe file below
# (its name ends in "100d") - confirm if a different GloVe file is swapped in.
EMBEDDING_DIM = 100
# Passed to TextVectorization as output_sequence_length (tokens per review).
SEQ_LENGTH    = 600
# Passed to TextVectorization as max_tokens and used as the row count of the
# embedding matrix / Embedding layer.
MAX_TOKENS    = 20_000

# NOTE(review): absolute, machine-specific Windows paths - the notebook will
# not run elsewhere without editing these two lines.
PATH_DATASET = r"D:\BigDataSets\RNN\IMDB_50k.csv"
PATH_GLOVE   = r"D:\BigDataSets\Vectorization\GloVe\glove.6B.100d.txt"

In [3]:
def remove_URL(text):
    """Return ``text`` with every URL stripped out.

    A URL is anything starting with ``http://`` or ``https://``, or with
    ``www.``, up to the next whitespace character. Matches are replaced
    with the empty string, so surrounding whitespace is left untouched.

    Args:
        text: The string to clean.

    Returns:
        A new string with all matched URLs removed.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

def remove_html(text):
    """Return ``text`` with HTML-like tags stripped out.

    Every shortest ``<...>`` span on a single line is replaced with the
    empty string. Because nothing is inserted in its place, text on either
    side of a removed tag ends up directly adjacent.

    Args:
        text: The string to clean.

    Returns:
        A new string with all matched tags removed.
    """
    return re.sub(r'<.*?>', '', text)

# Load the review CSV and strip URLs and HTML tags from the 'review' column.
data_frame = pd.read_csv(PATH_DATASET)
data_frame['review'] = data_frame['review'].apply(remove_URL)
data_frame['review'] = data_frame['review'].apply(remove_html)

# Convert to a 2-D array: column 0 is taken as the review text, column 1 as
# the sentiment label (positional - assumes that CSV column order).
data = data_frame.to_numpy()
texts, labels = data[:, 0], data[:, 1]
# Encode the labels as the *strings* "0" / "1"; make_ds() later parses them to
# int32. Note the mapping: positive -> 0, negative -> 1.
# NOTE(review): `labels` is a basic slice of `data`, so these assignments
# presumably write through to `data` as well - confirm if `data` is reused.
labels[labels=='positive'] = "0"
labels[labels=='negative'] = "1"
print(data.shape, 'Loaded texts', sep='\n')
# print(labels[:15], texts[:1])

# Positional train / validation / test split: 30_000 - 5_000 - 15_000 rows
# (given the 50_000 rows reported above). No shuffling is done here, so the
# split relies on the row order of the CSV.
train_texts, val_texts, test_texts = texts[:30_000], texts[30_000:35_000], texts[35_000:]
train_labels, val_labels, test_labels = labels[:30_000], labels[30_000:35_000], labels[35_000:]

(50000, 2)
Loaded texts


In [4]:
# Layer that turns raw review strings into fixed-length sequences of integer
# token ids: vocabulary limited to MAX_TOKENS entries, one id per word
# (no n-grams), SEQ_LENGTH ids per review.
text_vectorization = TextVectorization(
    max_tokens=MAX_TOKENS,
    output_sequence_length=SEQ_LENGTH,
    output_mode="int", ngrams=None)

# Build the vocabulary from the training texts only, so validation and test
# reviews do not influence it.
text_ds = tf.data.Dataset.from_tensor_slices(train_texts)
# NOTE(review): the literal 64 equals BATCH_SIZE but is not tied to it.
text_vectorization.adapt(text_ds.batch(64))

In [14]:
# Sanity checks on the adapted vectorizer: container type of the training
# texts, learned vocabulary size, and the token ids of one sample review.
print(type(train_texts))
print(text_vectorization.vocabulary_size())
print(text_vectorization(train_texts[10]))
# Uncomment to dump the full vocabulary (very long output):
# print(text_vectorization.get_vocabulary())

<class 'numpy.ndarray'>
20000
tf.Tensor(
[ 4474     2  1155     7    28     5   141  2574    95   116     2  1165
     7   444   185     2 17920     5   265   240    70   757     1    88
     9    13    51  1047     3   176   154    18    14     2    17  7830
    11   149   156     2   621    39 17920   154     1     4   468   382
    19   174   107     4   425     8   395    48    67    46   176   209
    99    18   849    11    40   424     1   808    10    19    56  1260
     6     4 10294    35     7  3567     1   137   712    18   123   361
   647    36   160  1280     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0    

In [9]:
def make_ds(x, y):
    """Build a batched, cached ``tf.data.Dataset`` of (token ids, label) pairs.

    Args:
        x: Raw review texts, one string per example.
        y: Labels as numeric strings (e.g. "0" / "1"), aligned with ``x``.

    Returns:
        A dataset that yields batches of ``BATCH_SIZE`` examples. Each text is
        run through the module-level ``text_vectorization`` layer and each
        label is parsed to ``tf.int32``. The batched result is cached.
    """
    def _encode(text, label):
        # Vectorize the review and convert the string label into an integer.
        token_ids = text_vectorization(text)
        label_id = tf.strings.to_number(label, out_type=tf.int32)
        return token_ids, label_id

    pairs = tf.data.Dataset.from_tensor_slices((x, y))
    encoded = pairs.map(_encode, num_parallel_calls=tf.data.AUTOTUNE)
    return encoded.batch(BATCH_SIZE).cache()

# Wrap each split in a batched, cached tf.data pipeline.
train_ds = make_ds(train_texts, train_labels)
val_ds = make_ds(val_texts, val_labels)
test_ds = make_ds(test_texts, test_labels)

# Peek at a single training batch (token-id tensor, then label tensor).
# The loop variable is named `batch` rather than `data` so it does not
# overwrite the `data` array created when the CSV was loaded; rebinding that
# name would make earlier cells' state misleading on out-of-order re-runs.
for batch in train_ds.take(1):
    print(*batch, sep='\n')
    

tf.Tensor(
[[  28    5    2 ...    0    0    0]
 [   4  374  112 ...    0    0    0]
 [  11  191   10 ...    0    0    0]
 ...
 [ 198   40  277 ...    0    0    0]
 [  38  646    1 ...    0    0    0]
 [1427  105  342 ...    0    0    0]], shape=(64, 600), dtype=int64)
tf.Tensor(
[0 0 0 1 0 0 0 1 1 0 1 1 1 1 0 1 0 1 0 1 0 1 0 1 1 0 0 1 1 0 0 0 1 0 1 1 1
 1 0 1 1 0 1 1 0 0 1 1 0 1 0 0 0 0 1 1 1 1 0 0 1 1 0 1], shape=(64,), dtype=int32)


#### GloVe

In [17]:
# Parse the GloVe text file into a {token: coefficient vector} lookup table.
# Each line is split once on whitespace: the first field is the token, the
# remainder is the list of space-separated coefficients.
embeddings_index = {}
with open(PATH_GLOVE, encoding='utf-8') as glove_file:
    for raw_line in glove_file:
        token, values = raw_line.split(maxsplit=1)
        embeddings_index[token] = np.fromstring(values, "f", sep=" ")
print(f"Found {len(embeddings_index)} word vectors.")

Found 400000 word vectors.


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc3 in position 0: unexpected end of data

In [None]:
# 2
# Build the (MAX_TOKENS, EMBEDDING_DIM) weight matrix for the Embedding layer:
# row i holds the GloVe vector of the i-th vocabulary entry of the vectorizer.
# Entries that have no GloVe vector keep an all-zero row.
vocabulary = text_vectorization.get_vocabulary()
word_index = {word: index for index, word in enumerate(vocabulary)}
embedding_matrix = np.zeros((MAX_TOKENS, EMBEDDING_DIM))
for word, i in word_index.items():
    # Guard against vocabulary indices that would fall outside the matrix.
    # (Fixes two defects: the bound was spelled `max_tokens`, which is not
    # defined - the constant is MAX_TOKENS - and the lookup result was tested
    # outside this guard, so an out-of-range index reused a stale vector.)
    if i >= MAX_TOKENS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:  # Else zeros
        embedding_matrix[i] = embedding_vector
    else:
        print(f"Cant find {word} in GloVe")

In [None]:
# 3
# Embedding layer with MAX_TOKENS rows of EMBEDDING_DIM values, initialised
# from the precomputed `embedding_matrix` and frozen (trainable=False) so the
# pretrained vectors are not updated during training.
# mask_zero=True: token id 0 is presumably the padding id produced by the
# vectorizer (see the trailing zeros in its output) and is masked downstream.
embedding_layer = Embedding(
    MAX_TOKENS, EMBEDDING_DIM,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=False, mask_zero=True)