In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import skipgrams
from tensorflow.keras.utils import to_categorical

import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten
from tensorflow.keras.models import Model
import numpy as np

In [3]:
# Load the raw corpus: `text` becomes a list with one string per line of the
# file (trailing newlines are kept, exactly as readlines() would return them).
with open('reviews_data.txt', mode='r', encoding='utf-8') as corpus_file:
    text = list(corpus_file)

In [4]:
# Display the first line of the corpus as a quick sanity check of the load.
text[0]

'oct nice trendy hotel location not too bad stayed in this hotel for one night as this is fairly new place some of the taxi drivers did not know where it was and or did not want to drive there once have eventually arrived at the hotel was very pleasantly surprised with the decor of the lobby ground floor area it was very stylish and modern found the reception staff geeting me with aloha bit out of place but guess they are briefed to say that to keep up the coroporate image as have starwood preferred guest member was given small gift upon check in it was only couple of fridge magnets in gift box but nevertheless nice gesture my room was nice and roomy there are tea and coffee facilities in each room and you get two complimentary bottles of water plus some toiletries by bliss the location is not great it is at the last metro stop and you then need to take taxi but if you are not planning on going to see the historic sites in beijing then you will be ok chose to have some breakfast in the

In [5]:
# Work on at most the first 100,000 lines of the corpus.
sample = text[:100000]

In [6]:
# Build a vocabulary restricted to words whose corpus frequency lies in
# [MIN_WORD_COUNT, MAX_WORD_COUNT].
TOKENIZER_FILTERS = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
MIN_WORD_COUNT = 10_000  # drop words rarer than this
MAX_WORD_COUNT = 90_000  # drop words more frequent than this

# First pass: count every word in the sample.
tokenizer = Tokenizer(lower=True, oov_token="<OOV>", filters=TOKENIZER_FILTERS)
tokenizer.fit_on_texts(sample)

# Keep only the words inside the frequency band.
word_counts = tokenizer.word_counts
filtered_word_counts = {
    word: count
    for word, count in word_counts.items()
    if MIN_WORD_COUNT <= count <= MAX_WORD_COUNT
}
filtered_tokens = list(filtered_word_counts)

# Second tokenizer: fit it on the filtered word list itself rather than on the
# corpus. Passing num_words=len(filtered_tokens) and re-fitting on `sample`
# would keep the N *most frequent* words (including those above
# MAX_WORD_COUNT) and would leave word_index holding the full vocabulary.
# Fitting on the filtered words gives a word_index that contains exactly
# "<OOV>" (index 1) plus the band-passed words; every other word in a text is
# mapped to the OOV index by texts_to_sequences.
filtered_tokenizer = Tokenizer(lower=True, oov_token="<OOV>", filters=TOKENIZER_FILTERS)
filtered_tokenizer.fit_on_texts([" ".join(filtered_tokens)])

In [7]:
# Word -> integer id mapping taken from the filtered tokenizer; used below to
# size the vocabulary.
word_index = filtered_tokenizer.word_index
# word_index  (uncomment to inspect the mapping)

In [8]:
# Embedding setup
# The table gets one more row than there are entries in word_index
# (presumably because tokenizer ids start at 1, leaving id 0 unused - confirm).
vocab_size = 1 + len(word_index)
embedding_dim = 5

# Each example fed to the layer is a single word id (input_length=1).
embedding_layer = Embedding(
    output_dim=embedding_dim,
    input_dim=vocab_size,
    input_length=1,
)

In [9]:
# Generate skip-grams

# Number of words on each side of the center word that count as context.
skip_window = 5

# No negative sampling: the model trained in this notebook is a softmax
# classifier that predicts the context word id from the center word id, so
# every pair it sees is treated as a true (center, context) example. Randomly
# drawn negative pairs would be learned as if they were real co-occurrences
# (and would multiply the number of pairs held in memory).
num_negative_samples = 0

# Filled in below with the generated [center, context] pairs.
skip_grams = []

def generate_skipgrams(texts, tokenizer, skip_window, num_negative_samples, vocab_size):
    """Yield the positive ``[target, context]`` word-id pairs of every text.

    Each text is tokenized on its own and passed to Keras' ``skipgrams``,
    which returns the pairs together with a parallel list of labels
    (1 = words really co-occur inside the window, 0 = randomly drawn
    negative sample). Only pairs labelled 1 are yielded: the training cell
    uses the second element of each pair as the class to predict, so a
    negative sample leaking through would be trained on as a true context.

    Args:
        texts: iterable of raw text strings.
        tokenizer: object exposing ``texts_to_sequences(list_of_texts)``.
        skip_window: window size forwarded to ``skipgrams``.
        num_negative_samples: negative-sampling ratio forwarded to
            ``skipgrams``; any negatives it produces are filtered out here.
        vocab_size: vocabulary size forwarded to ``skipgrams``.

    Yields:
        ``[target_id, context_id]`` pairs, one at a time.
    """
    for document in texts:
        # texts_to_sequences works on a batch; wrap and unwrap the single text.
        token_ids = tokenizer.texts_to_sequences([document])[0]
        couples, labels = skipgrams(
            token_ids,
            vocabulary_size=vocab_size,
            window_size=skip_window,
            negative_samples=num_negative_samples,
        )
        for couple, label in zip(couples, labels):
            if label == 1:
                yield couple
            
# generate_skipgrams tokenizes each text itself, so the corpus is not
# tokenized up front here: that extra full pass over `sample` produced a
# result that was never used.
skip_grams = list(
    generate_skipgrams(
        sample,
        filtered_tokenizer,
        skip_window,
        num_negative_samples,
        vocab_size,
    )
)

KeyboardInterrupt: 

In [None]:
# Model building

# Input: one word id per example.
input_word = Input(shape=(1,))

# Look the id up in the embedding layer created earlier.
embedding = embedding_layer(input_word)

# Flatten the embedding output so it can feed a Dense layer.
flatten = Flatten()(embedding)

# Output: a softmax distribution with one unit per vocabulary id.
output_word = Dense(units=vocab_size, activation="softmax")(flatten)

# Wire input and output together into the skip-gram model.
model = Model(outputs=output_word, inputs=input_word)

# Adam optimizer, categorical cross-entropy loss.
model.compile(optimizer="adam", loss="categorical_crossentropy")

In [None]:
# Train model

# Split the [center, context] pairs into model inputs and targets.
# Slicing a single int32 array avoids unpacking every pair as a separate
# argument (zip(*skip_grams)); reshape(-1, 2) keeps the column indexing valid
# even when skip_grams is empty.
pair_array = np.asarray(skip_grams, dtype=np.int32).reshape(-1, 2)
pairs = pair_array[:, 0]   # center word ids (model input)
labels = pair_array[:, 1]  # context word ids (class to predict)

# Train on integer class ids instead of one-hot rows: a one-hot matrix would
# need len(skip_grams) * vocab_size floats. The sparse loss computes the same
# cross-entropy directly from the ids, so the model is re-compiled with it
# here, before any training has happened.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')

# Train model for 10 epochs
model.fit(pairs, labels, epochs=10, batch_size=1024)

In [None]:
# One-hot encode the context words
# x_train = []
# y_train = []
# for pair in skip_grams:
#     center_word = pair[0]
#     context_word = pair[1]
#     x_train.append(center_word)
#     y_train.append(to_categorical(context_word, num_classes=vocab_size))


# batch_size = 128
# num_epochs = 10
# steps_per_epoch = len(skip_grams) // batch_size

# for epoch in range(num_epochs):
#     for step in range(steps_per_epoch):
#         batch_skip_grams = skip_grams[step*batch_size:(step+1)*batch_size]
#         x_train = []
#         y_train = []
#         for pair in batch_skip_grams:
#             center_word = pair[0]
#             context_word = pair[1]
#             x_train.append(center_word)
#             y_train.append(to_categorical(context_word, num_classes=vocab_size))
#         model.train_on_batch(x_train, y_train)
