<a href="https://colab.research.google.com/github/AngelLinaStarshine/yelp_review/blob/main/Untitled0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the `datasets` package from PyPI into the notebook runtime.
# NOTE(review): the cells below load data through `tensorflow_datasets`,
# not `datasets` -- confirm this install is actually needed.
!pip install datasets



In [None]:
import io
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_datasets as tfds


# Upper bound on the vocabulary size learned by the vectorizer; it is also
# used as the number of rows of the embedding matrix in get_model().
max_tokens = 10000

# Number of token ids produced per review (see output_sequence_length below).
sequence_length = 100

# Shared text -> integer-id layer. Its vocabulary is learned later, when
# get_batch_data() calls adapt() on the training text.
vectorize_layer = layers.TextVectorization(
    output_mode='int',
    output_sequence_length=sequence_length,
    max_tokens=max_tokens,
)

def get_batch_data(batch_size=10, shuffle_buffer=1000):
    """Load Yelp polarity reviews and return vectorized, batched datasets.

    The module-level ``vectorize_layer`` is adapted on the training text as a
    side effect, then used to turn every review into a sequence of token ids.

    Args:
        batch_size: Number of examples per batch (default 10, the value the
            notebook originally hard-coded).
        shuffle_buffer: Size of the shuffle buffer applied to the training
            split (default 1000, as before).

    Returns:
        A ``(train_batches, test_batches)`` tuple of ``tf.data.Dataset``
        objects yielding ``(token_ids, label)`` batches.
    """
    # The dataset info object was never used, so it is no longer requested.
    train_data, test_data = tfds.load(
        'yelp_polarity_reviews',
        split=(tfds.Split.TRAIN, tfds.Split.TEST),
        as_supervised=True,
    )

    # Learn the vocabulary from the training text only. Feeding adapt() with
    # batches avoids iterating over the reviews one element at a time.
    train_text = train_data.map(lambda text, label: text)
    vectorize_layer.adapt(train_text.batch(1024))

    def vectorize(text, label):
        # Replace the raw review string by its padded/truncated token ids.
        return vectorize_layer(text), label

    autotune = tf.data.AUTOTUNE
    train_data = train_data.map(vectorize, num_parallel_calls=autotune)
    test_data = test_data.map(vectorize, num_parallel_calls=autotune)

    # Only the training split is shuffled; prefetch overlaps input
    # preparation with model execution.
    train_batches = (
        train_data.shuffle(shuffle_buffer)
        .padded_batch(batch_size)
        .prefetch(autotune)
    )
    test_batches = test_data.padded_batch(batch_size).prefetch(autotune)
    return train_batches, test_batches

def get_model(embedding_dim=16):
    """Build and compile a small bag-of-embeddings sentiment classifier.

    Architecture: token embedding -> average over the sequence axis ->
    single sigmoid unit producing the positive-class probability.

    Args:
        embedding_dim: Size of each token embedding vector.

    Returns:
        A compiled ``keras.Sequential`` model (Adam optimizer, binary
        cross-entropy loss, accuracy metric).
    """
    model = keras.Sequential([
        # Declare the input shape explicitly instead of passing the
        # deprecated `input_length` argument to Embedding. Sequential's
        # `layers` list omits the input layer, so `model.layers[0]` is still
        # the Embedding layer.
        keras.Input(shape=(sequence_length,), dtype='int64'),
        layers.Embedding(max_tokens, embedding_dim),
        layers.GlobalAveragePooling1D(),
        layers.Dense(1, activation='sigmoid'),
    ])
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )
    return model

def plot_data(history):
    """Plot training and validation accuracy per epoch.

    Args:
        history: Object whose ``history`` dict holds the ``'accuracy'`` and
            ``'val_accuracy'`` series (as returned by ``model.fit``).
    """
    metrics = history.history
    train_curve = metrics['accuracy']
    val_curve = metrics['val_accuracy']
    # Epochs are numbered from 1 on the x axis.
    epoch_axis = range(1, len(train_curve) + 1)

    _, ax = plt.subplots(figsize=(12, 9))
    ax.plot(epoch_axis, train_curve, 'bo', label='Training acc')
    ax.plot(epoch_axis, val_curve, 'b', label='Validation acc')
    ax.set_title('Training and validation accuracy')
    ax.set_xlabel('Epochs')
    ax.set_ylabel('Accuracy')
    ax.legend(loc='lower right')
    ax.set_ylim((0.5, 1))
    plt.show()

def retrieve_embeddings(model, vocab=None):
    """Export the weights of the model's first layer as TSV files.

    Writes two files in the current working directory:

    * ``vecs.tsv``: one embedding vector per line, values tab-separated.
    * ``meta.tsv``: one label per line, aligned with ``vecs.tsv``.

    Row 0 of the weight matrix is skipped (with ``TextVectorization`` index 0
    is the padding token).

    Args:
        model: Model whose first layer holds the embedding matrix as its
            first weight.
        vocab: Optional sequence mapping token index -> token string (e.g.
            ``vectorize_layer.get_vocabulary()``). When given, the token
            string is used as the label; indices missing from ``vocab`` and
            the default ``None`` fall back to ``Token_<index>``.
    """
    weights = model.layers[0].get_weights()[0]
    # Use the real matrix size rather than a global constant so the export
    # always matches the model that was passed in.
    num_rows = len(weights)

    print("Sample embeddings before saving:")
    # Show up to five rows, never indexing past the end of a small matrix.
    for num in range(1, min(6, num_rows)):
        print(f"Token {num}: {weights[num]}")

    # Context managers guarantee both files are closed even if a write fails.
    with open('vecs.tsv', 'w', encoding='utf-8') as out_vectors, \
            open('meta.tsv', 'w', encoding='utf-8') as out_metadata:
        for num in range(1, num_rows):
            if vocab is not None and num < len(vocab):
                label = vocab[num]
            else:
                label = f"Token_{num}"
            out_metadata.write(f"{label}\n")
            out_vectors.write('\t'.join(str(x) for x in weights[num]) + '\n')


# Build the input pipelines and the model, then train.
train_batches, test_batches = get_batch_data()
model = get_model()
# Validate on the whole test split: limiting validation to 20 batches of 10
# scored only 200 reviews per epoch, which made val_accuracy very noisy.
history = model.fit(train_batches, epochs=10, validation_data=test_batches)


# Write vecs.tsv / meta.tsv to the current working directory.
retrieve_embeddings(model)

# Offer the exported files for download when running inside Google Colab;
# elsewhere the files simply remain on disk.
try:
    from google.colab import files
except ImportError:
    print("google.colab not available; vecs.tsv and meta.tsv were saved locally.")
else:
    files.download('vecs.tsv')
    files.download('meta.tsv')

# Show the accuracy curves recorded during training.
plot_data(history)


Epoch 1/10
[1m56000/56000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m371s[0m 7ms/step - accuracy: 0.8557 - loss: 0.3352 - val_accuracy: 0.8750 - val_loss: 0.2749
Epoch 2/10
[1m56000/56000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m364s[0m 7ms/step - accuracy: 0.8984 - loss: 0.2532 - val_accuracy: 0.8850 - val_loss: 0.2568
Epoch 3/10
[1m56000/56000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m389s[0m 7ms/step - accuracy: 0.9003 - loss: 0.2490 - val_accuracy: 0.8750 - val_loss: 0.3496
Epoch 4/10
[1m56000/56000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m378s[0m 7ms/step - accuracy: 0.9022 - loss: 0.2468 - val_accuracy: 0.9300 - val_loss: 0.2187
Epoch 5/10
[1m56000/56000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m373s[0m 7ms/step - accuracy: 0.9029 - loss: 0.2454 - val_accuracy: 0.9450 - val_loss: 0.1884
Epoch 6/10
[1m56000/56000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m372s[0m 6ms/step - accuracy: 0.9038 - loss: 0.2442 - val_accuracy: 0.9150 - val_loss:

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>