
Sentiment Analysis Model trained on Yelp Dataset ,that is used for:

Source: The dataset is created from Yelp reviews, which are user-generated reviews of businesses (e.g., restaurants, shops, services).

Labels: Each review is labeled as either:

1 (Positive): Indicates a positive sentiment (e.g., a happy customer).

0 (Negative): Indicates a negative sentiment (e.g., an unhappy customer).

Size: The dataset contains 560,000 training samples and 38,000 testing samples, making it a large and robust dataset for sentiment analysis tasks.



In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
import tensorflow_datasets as tfds
import numpy as np

# Load Yelp Polarity dataset
dataset, info = tfds.load('yelp_polarity_reviews', with_info=True, as_supervised=True)
train_data, test_data = dataset['train'], dataset['test']

# Prepare the data
vocab_size = 50000  # Limit vocabulary to top 50,000 words
max_length = 200    # Maximum review length (truncation/padding)

# Tokenizer to convert text to sequences
tokenizer = keras.preprocessing.text.Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts([text.numpy().decode('utf-8') for text, _ in train_data])

# Convert text data to sequences and pad them
def preprocess_dataset(dataset):
    texts, labels = [], []
    for text, label in dataset:
        texts.append(text.numpy().decode('utf-8'))
        labels.append(label.numpy())
    sequences = tokenizer.texts_to_sequences(texts)
    padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')
    return padded_sequences, np.array(labels)

# Preprocess training and testing data
tr_x, tr_y = preprocess_dataset(train_data)
te_x, te_y = preprocess_dataset(test_data)

# Build the model
model = keras.Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),  # Embedding Layer
    Bidirectional(LSTM(32, return_sequences=True, kernel_regularizer=l2(0.001))),  # Bidirectional LSTM
    Dropout(0.5),  # Dropout to prevent overfitting
    GlobalAveragePooling1D(),  # Average pooling over time steps
    Dense(32, activation='relu', kernel_regularizer=l2(0.001)),  # Fully connected layer with L2 reg
    Dropout(0.5),  # Dropout before output layer
    Dense(1, activation='sigmoid')  # Output layer for binary classification
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Add early stopping
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# Train the model
model.fit(tr_x, tr_y, epochs=10, batch_size=64, validation_data=(te_x, te_y), callbacks=[early_stopping])

# Evaluate the model
test_loss, test_acc = model.evaluate(te_x, te_y)
print(f"Test Accuracy: {test_acc:.4f}")

Downloading and preparing dataset 158.67 MiB (download: 158.67 MiB, generated: 435.14 MiB, total: 593.80 MiB) to /root/tensorflow_datasets/yelp_polarity_reviews/0.2.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/560000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/yelp_polarity_reviews/incomplete.UXGJ0L_0.2.0/yelp_polarity_reviews-train.…

Generating test examples...:   0%|          | 0/38000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/yelp_polarity_reviews/incomplete.UXGJ0L_0.2.0/yelp_polarity_reviews-test.t…

Dataset yelp_polarity_reviews downloaded and prepared to /root/tensorflow_datasets/yelp_polarity_reviews/0.2.0. Subsequent calls will reuse this data.




Epoch 1/10
[1m8750/8750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 20ms/step - accuracy: 0.8701 - loss: 0.3432 - val_accuracy: 0.9357 - val_loss: 0.1873
Epoch 2/10
[1m8750/8750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 20ms/step - accuracy: 0.9401 - loss: 0.1806 - val_accuracy: 0.9447 - val_loss: 0.1632
Epoch 3/10
[1m8750/8750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 20ms/step - accuracy: 0.9519 - loss: 0.1496 - val_accuracy: 0.9419 - val_loss: 0.1740
Epoch 4/10
[1m8750/8750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 20ms/step - accuracy: 0.9579 - loss: 0.1335 - val_accuracy: 0.9461 - val_loss: 0.1579
Epoch 5/10
[1m8750/8750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 20ms/step - accuracy: 0.9625 - loss: 0.1212 - val_accuracy: 0.9459 - val_loss: 0.1653
Epoch 6/10
[1m8750/8750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 20ms/step - accuracy: 0.9651 - loss: 0.1128 - val_accuracy: 0.9435 - val_loss: 0.173

In [3]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

# Function to predict sentiment of input text
def predict_sentiment(model, text, tokenizer, max_length=200):
    # Convert text to sequence using the tokenizer
    sequence = tokenizer.texts_to_sequences([text])

    # Pad the sequence to the fixed length
    padded_sequence = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')

    # Get prediction probability
    prediction = model.predict(padded_sequence)[0][0]

    # Classify sentiment
    sentiment = "Positive" if prediction > 0.5 else "Negative"

    # Confidence score
    confidence = prediction if sentiment == "Positive" else 1 - prediction

    return sentiment, confidence

In [4]:
# Function to decode a sequence back to text
def sequence_to_text(sequence, tokenizer):
    return tokenizer.sequences_to_texts([sequence])[0]

# Select random samples from the test set
num_samples = 5  # Number of samples to test
random_indices = np.random.choice(len(te_x), num_samples, replace=False)

# Test the selected samples
for i in random_indices:
    # Get the sequence and label
    sequence = te_x[i]
    true_label = te_y[i]

    # Decode the sequence to text
    review_text = sequence_to_text(sequence, tokenizer)

    # Predict sentiment using the model
    predicted_sentiment, confidence = predict_sentiment(model, review_text, tokenizer)

    # Map the true label to a sentiment
    true_sentiment = "Positive" if true_label == 1 else "Negative"

    # Print results
    print(f"Review: {review_text}")
    print(f"True Sentiment: {true_sentiment}")
    print(f"Predicted Sentiment: {predicted_sentiment} (Confidence: {confidence:.4f})")
    print("-" * 50)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 216ms/step
Review: looking for our cheers n ni live in the area and wanted to check this place out for some time now i've been looking for a comfortable place to hang out for a couple three happy hours a week and maybe a weekend meal n nmy girlfriend and i dropped in on a playoff football sunday about 2 pm first thing we noticed when we walk in the door is the hostess is very cute and the place is loud i mean really loud not with your normal every day sports bar chatter but the game audio coming from the ceiling mounted speakers was deafening the hostess asked if we wanted a high top table and that is usually our preference though the one she suggested was positioned directly under a speaker i quickly surveyed the speaker system and chose a table slightly off center of a speaker though unfortunately i soon came to realize it was right near the patio door access to the outside smoking area the door was used often and some smo

In [5]:
# List of positive and negative reviews to test
test_reviews = [
    # Positive reviews
    "The food was absolutely delicious, and the service was exceptional!",
    "I had a wonderful experience at this restaurant. The ambiance was perfect, and the staff was very friendly.",
    "Highly recommend this place! The dishes were flavorful, and the presentation was stunning.",

    # Negative reviews
    "The worst experience ever. The food was cold, and the waiter was rude.",
    "I was extremely disappointed with the service. The staff ignored us, and the food was overpriced.",
    "Terrible quality. The place was dirty, and the food tasted awful."
]

# Function to predict sentiment for a list of reviews
def test_reviews_sentiment(model, reviews, tokenizer, max_length=200):
    for review in reviews:
        # Predict sentiment
        sentiment, confidence = predict_sentiment(model, review, tokenizer, max_length)

        # Print results
        print(f"Review: {review}")
        print(f"Predicted Sentiment: {sentiment} (Confidence: {confidence:.4f})")
        print("-" * 50)

# Test the reviews
test_reviews_sentiment(model, test_reviews, tokenizer)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Review: The food was absolutely delicious, and the service was exceptional!
Predicted Sentiment: Positive (Confidence: 0.9780)
--------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
Review: I had a wonderful experience at this restaurant. The ambiance was perfect, and the staff was very friendly.
Predicted Sentiment: Positive (Confidence: 0.9885)
--------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
Review: Highly recommend this place! The dishes were flavorful, and the presentation was stunning.
Predicted Sentiment: Positive (Confidence: 0.9738)
--------------------------------------------------
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
Review: The worst experience ever. The food was cold, and the waiter was rude.
Predicted Sentiment: Negative (Confi

In [6]:
text1=["Bad food"]
test_reviews_sentiment(model, text1, tokenizer)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
Review: Bad food
Predicted Sentiment: Negative (Confidence: 0.6572)
--------------------------------------------------


In [13]:
text2=["Sushi was great"]
test_reviews_sentiment(model, text2, tokenizer)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
Review: Sushi was great
Predicted Sentiment: Positive (Confidence: 0.8970)
--------------------------------------------------
