<a href="https://colab.research.google.com/github/AgeCoder/Deep-learning-and-llms/blob/main/Sentiment_Analysis_on_IMDB_Reviews_BOW.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from pathlib import Path

#Tensorflow & Keras related packages
import tensorflow as tf
from tensorflow import keras
from keras import layers

In [None]:
import tensorflow_datasets as tfds

In [None]:
# Carve a validation set out of the official training split (first 80% for
# training, remaining 20% for validation); the official test split is kept
# separate. Examples are later unpacked as (review, label) pairs.
split_spec = ["train[:80%]", "train[80%:]", "test"]
train, val, test = tfds.load(
    name="imdb_reviews",
    split=split_spec,
    as_supervised=True,
)

In [None]:
# Report the number of examples in each split (train, validation, test).
for split_ds in (train, val, test):
    print(len(split_ds))

In [None]:
# Peek at the first two raw examples of the training split.
for example in train.take(2):
    print(example)

In [None]:
# Show a single example with its two components on separate lines.
for review, label in train.take(1):
    print(review, label, sep="\n")

In [None]:
# Group every split into mini-batches of 32 examples.
train_data, val_data, test_data = (
    split_ds.batch(32) for split_ds in (train, val, test)
)

In [None]:
# Inspect the first batch: tensor shapes first, then the first example.
for reviews, labels in train_data.take(1):
    print("Reviews shape", reviews.shape)
    print("Labels shape", labels.shape, "\n")

    # The review is stored as bytes, so decode it for readable output.
    first_review = reviews[0].numpy().decode("utf-8")
    print("First Review:", first_review, "\n")

    first_label = labels[0].numpy()
    print("First Label:", first_label)

In [None]:
# Drop the labels so that only the review text remains in this dataset.
train_data_onlyreviews = train_data.map(
    lambda review_batch, _label_batch: review_batch
)

In [None]:
from tensorflow import keras
from tensorflow.keras.layers import TextVectorization

# Vocabulary size cap; also used as the width of the model input.
max_tokens = 10_000

# Bag-of-words encoder producing one multi-hot vector per review.
text_vectorization = TextVectorization(max_tokens=max_tokens, output_mode="multi_hot")

In [None]:
# Fit the vectorization layer on the label-free training reviews only, so the
# vocabulary is derived from training data and not from validation/test data.
text_vectorization.adapt(train_data_onlyreviews)

In [None]:
# Keep the learned vocabulary; it is reused later as DataFrame column names.
# The bare expression on the last line displays it in the notebook.
vocab = text_vectorization.get_vocabulary()
vocab

In [None]:
# Encode the review text of every split with the adapted layer; the labels
# pass through unchanged.
train_multihot_ugram = train_data.map(
    lambda text, target: (text_vectorization(text), target)
)
val_multihot_ugram = val_data.map(
    lambda text, target: (text_vectorization(text), target)
)
test_multihot_ugram = test_data.map(
    lambda text, target: (text_vectorization(text), target)
)

In [None]:
# Display the Python type of the mapped dataset object.
type(train_multihot_ugram)

In [None]:
# Inspect the first encoded batch: shapes, then the first vector and label.
for reviews, labels in train_multihot_ugram.take(1):
    print("Reviews shape", reviews.shape)
    print("Labels shape", labels.shape, "\n")

    first_vector = reviews[0]
    print("First Review:", first_vector, "\n")

    first_label = labels[0].numpy()
    print("First Label:", first_label)

In [None]:
# Render the first encoded batch as a table with one column per vocabulary
# entry; take(1) yields a single batch, so the loop body runs once.
for batch_vectors, _ in train_multihot_ugram.take(1):
    df = pd.DataFrame(data=batch_vectors.numpy(), columns=vocab)
df

In [None]:
# Seed the Python, NumPy and TensorFlow random generators with the same value.
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(42)

In [None]:
# Width of the single hidden layer.
hidden_units = 16


def model_ugram_dense():
    """Build (without compiling) the dense bag-of-words classifier.

    The network takes a vector of length ``max_tokens``, applies one ReLU
    hidden layer of ``hidden_units`` units followed by 50% dropout, and ends
    with a single sigmoid unit. Both ``max_tokens`` and ``hidden_units`` are
    read from module-level variables at call time.
    """
    bow_input = keras.Input(shape=(max_tokens,))

    hidden = layers.Dense(hidden_units, activation="relu")(bow_input)
    hidden = layers.Dropout(0.5)(hidden)

    prediction = layers.Dense(1, activation="sigmoid")(hidden)

    return keras.Model(inputs=bow_input, outputs=prediction)


In [None]:
# Instantiate the unigram bag-of-words model and print its layer summary.
model = model_ugram_dense()
model.summary()

In [None]:
# Build a fresh model instance for training (this rebinds `model`), and choose
# the file that train_evaluate will hand to the checkpoint callback.
model = model_ugram_dense()
path = Path("./models/model_ugram_dense.keras")

In [None]:
def train_evaluate(model, path, train, val, test, epochs=20, patience=5):
    """Compile, train and evaluate a binary classifier.

    Args:
        model: Keras model to compile and fit; it is modified in place.
        path: File path handed to the checkpoint callback.
        train: Dataset used for fitting.
        val: Dataset used for validation during fitting.
        test: Dataset used for the final evaluation.
        epochs: Maximum number of training epochs (default 20).
        patience: Number of epochs without validation-loss improvement
            tolerated before early stopping (default 5).

    Returns:
        A ``(history, test_accuracy)`` tuple: the object returned by
        ``model.fit`` and the accuracy reported by ``model.evaluate`` on
        ``test``.
    """
    # Both callbacks watch the same quantity so the checkpoint on disk and the
    # weights restored by early stopping are selected by the same criterion.
    monitor = "val_loss"

    checkpoint_cb = keras.callbacks.ModelCheckpoint(
        filepath=path,
        monitor=monitor,
        save_best_only=True,  # keep only the best model seen so far
    )
    earlystop_cb = keras.callbacks.EarlyStopping(
        monitor=monitor,
        patience=patience,
        restore_best_weights=True,
    )

    # Compile the model
    model.compile(optimizer="rmsprop", loss="binary_crossentropy", metrics=["accuracy"])

    # Train the model
    history = model.fit(
        train,
        validation_data=val,
        callbacks=[checkpoint_cb, earlystop_cb],
        epochs=epochs,
    )

    # Evaluate the model; only the accuracy is returned, the loss is discarded.
    _, test_accuracy = model.evaluate(test)

    return (history, test_accuracy)

In [None]:
# Train and evaluate the unigram multi-hot model.
history_ugram_dense, test_accuracy_ugram_dense = train_evaluate(
    model=model,
    path=path,
    train=train_multihot_ugram,
    val=val_multihot_ugram,
    test=test_multihot_ugram,
)

In [None]:
# Report the test accuracy of the unigram multi-hot model.
print (f"Accuracy on the test data set is {test_accuracy_ugram_dense}")

In [None]:
# Run the trained model on the encoded test set; the output layer is a single
# sigmoid unit, so each prediction is a value between 0 and 1.
test_probabilities = model.predict(test_multihot_ugram)
test_probabilities

In [None]:
# Threshold the predictions at 0.5 to turn them into hard 0/1 class labels.
(test_probabilities >= 0.5).astype(int)

In [None]:
# Print the true labels of the first test batch (presumably for comparing
# by eye against the thresholded predictions).
for review, label in test_multihot_ugram.take(1):
    print (label)

In [None]:
# NOTE: this cell rebinds `max_tokens`, `text_vectorization` and `vocab`,
# replacing the multi-hot encoder defined earlier in the notebook.
max_tokens = 10000
text_vectorization = TextVectorization(
    max_tokens=max_tokens,
    output_mode = "tf_idf"
)
# Vectorize the reviews
# Adapt the layer on the training reviews only to index the vocabulary.
text_vectorization.adapt(train_data_onlyreviews)

vocab = text_vectorization.get_vocabulary()

# Reviews are converted into TF-IDF vectors (not multi-hot); labels pass through.
train_tfidf = train_data.map(lambda x,y: (text_vectorization(x),y))
val_tfidf = val_data.map(lambda x,y: (text_vectorization(x),y))
test_tfidf= test_data.map(lambda x,y: (text_vectorization(x),y))

In [None]:
# Inspect the first TF-IDF encoded review and its label.
for reviews, labels in train_tfidf.take(1):
    first_vector = reviews[0]
    print("First Review:", first_vector, "\n")

    first_label = labels[0].numpy()
    print("First Label:", first_label)

In [None]:
# Render the first TF-IDF batch as a table with one column per vocabulary
# entry; take(1) yields a single batch, so the loop body runs once.
for batch_vectors, _ in train_tfidf.take(1):
    df = pd.DataFrame(data=batch_vectors.numpy(), columns=vocab)
df

In [None]:
# Re-seed the Python, NumPy and TensorFlow random generators before building
# the next model.
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(42)

In [None]:
# Width of the single hidden layer.
hidden_units = 16


def model_tfidf_dense():
    """Build (without compiling) the dense classifier for TF-IDF inputs.

    Input vectors have length ``max_tokens``; they go through one ReLU hidden
    layer of ``hidden_units`` units, 50% dropout, and a single sigmoid output
    unit. ``max_tokens`` and ``hidden_units`` are module-level variables read
    at call time.
    """
    tfidf_input = keras.Input(shape=(max_tokens,))

    hidden = layers.Dense(hidden_units, activation="relu")(tfidf_input)
    hidden = layers.Dropout(0.5)(hidden)

    prediction = layers.Dense(1, activation="sigmoid")(hidden)

    return keras.Model(inputs=tfidf_input, outputs=prediction)

In [None]:
# Instantiate the TF-IDF model (this rebinds `model`) and print its summary.
model = model_tfidf_dense()
model.summary()

In [None]:
# Give the TF-IDF model its own checkpoint file. Reusing `path` would write
# to ./models/model_ugram_dense.keras and overwrite the unigram model's
# saved checkpoint.
path_tfidf_dense = Path("./models/model_tfidf_dense.keras")

# Train and evaluate the TF-IDF model.
(history_tfidf_dense, test_accuracy_tfidf_dense) = train_evaluate(
    model,
    path_tfidf_dense,
    train_tfidf,
    val_tfidf,
    test_tfidf,
)

In [None]:
# Report the test accuracy of the TF-IDF model.
print (f"Accuracy on the test data set is {test_accuracy_tfidf_dense}")

