In [2]:
import os
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
import pandas as pd
from bs4 import BeautifulSoup
import re
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.offline as pyo
from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [3]:
(train_data, train_labels), (test_data, test_labels) = tf.keras.datasets.imdb.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [4]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [5]:
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def text_cleaning(text):
    soup = BeautifulSoup(text, "html.parser")
    text = re.sub(r'\[[^]]*\]', '', soup.get_text())
    pattern = r"[^a-zA-Z0-9\s,']"
    text = re.sub(pattern, '', text)
    return text

In [7]:
def predict_sentiment(review_text):
    cleaned_text = text_cleaning(review_text)
    encoded_text = tokenizer.encode_plus(
        cleaned_text,
        max_length=128,
        truncation=True,
        padding='max_length',
        return_tensors='tf'
    )
    prediction = model.predict([encoded_text['input_ids'], encoded_text['token_type_ids'], encoded_text['attention_mask']])
    pred_label = tf.argmax(prediction.logits, axis=1).numpy()[0]
    sentiment = "Positive" if pred_label == 1 else "Negative"
    return sentiment

In [8]:
def visualize_sentiment_counts(train_labels):
    sentiment_counts = pd.Series(train_labels).value_counts()
    fig = px.bar(x=['Negative', 'Positive'], y=sentiment_counts.values, color=sentiment_counts.index,
                 color_discrete_sequence=px.colors.qualitative.Dark24,
                 title='Sentiment Counts')
    fig.update_layout(xaxis_title='Sentiment', yaxis_title='Counts', template='plotly_dark')
    fig.show()

In [14]:
# Function to generate word clouds
def generate_wordcloud(text, title):
    all_text = " ".join([tokenizer.decode(review) for review in text])
    wordcloud = WordCloud(width=800, height=400, stopwords=set(STOPWORDS), background_color='black').generate(all_text)
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(title)
    plt.show()


In [24]:
def chatbot():
    print("Welcome to the Sentiment Analysis Chatbot!")
    print("Type 'exit' to quit.")
    while True:
        user_input = input("Enter a review: ")
        if user_input.lower() == 'exit':
            print("Goodbye!")
            break
        else:
            sentiment = predict_sentiment(user_input)
            print("Predicted sentiment:", sentiment)
            visualize_sentiment_counts(train_labels)

In [25]:
chatbot()

Welcome to the Sentiment Analysis Chatbot!
Type 'exit' to quit.
Enter a review: I was wonderful movie, I just loved the movie
Predicted sentiment: Positive


Enter a review: exit
Goodbye!


In [42]:
def evaluate_model(predictions, true_labels):
    # Convert numerical labels to strings
    pred_labels = ["Positive" if label == 1 else "Negative" for label in predictions]
    true_labels = ["Positive" if label == 1 else "Negative" for label in true_labels]
    print(classification_report(true_labels, pred_labels))


In [43]:
evaluate_model(test_data, test_labels)


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.



              precision    recall  f1-score   support

    Negative       0.50      1.00      0.67     12500
    Positive       0.00      0.00      0.00     12500

    accuracy                           0.50     25000
   macro avg       0.25      0.50      0.33     25000
weighted avg       0.25      0.50      0.33     25000




Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

