In [8]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, GlobalMaxPool1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, LearningRateScheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np
import re
from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as snspi
import contractions

In [9]:
def clean_text(text):
    """Normalise a raw review into lower-case, space-separated ASCII words.

    Steps, in order: HTML-like tags become a space, contractions are expanded
    via ``contractions.fix``, every character that is not an ASCII letter or
    whitespace is dropped, the text is lower-cased, and runs of whitespace
    collapse to a single space with the ends trimmed.

    Args:
        text: The raw review string.

    Returns:
        The cleaned string (possibly empty).
    """
    # Tags are swapped for a space so the words on either side stay separate.
    without_tags = re.sub(r'<[^>]+>', ' ', text)
    expanded = contractions.fix(without_tags)
    # Filter first, lower-case second: the filter works on the original
    # characters, so the order matters for non-ASCII input.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', expanded)
    lowered = letters_only.lower()
    return re.sub(r'\s+', ' ', lowered).strip()

In [10]:
# Load the IMDB reviews and keep handles on the two splits used below.
dataset = load_dataset("imdb")
train_data, test_data = dataset["train"], dataset["test"]

# Run every review through clean_text before tokenisation.
train_texts = list(map(clean_text, (row["text"] for row in train_data)))
test_texts = list(map(clean_text, (row["text"] for row in test_data)))

# Labels as NumPy arrays, in the same row order as the cleaned texts.
y_train = np.array([row["label"] for row in train_data])
y_test = np.array([row["label"] for row in test_data])

Using the latest cached version of the dataset since imdb couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'plain_text' at C:\Users\hp\.cache\huggingface\datasets\imdb\plain_text\0.0.0\e6281661ce1c48d982bc483cf8a173c1bbeb5d31 (last modified on Tue Apr  8 12:05:30 2025).


In [11]:
# Build the vocabulary from the training reviews; words outside the
# num_words limit are mapped to the "<OOV>" token.
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(train_texts)

# Convert texts to sequences
train_sequences = tokenizer.texts_to_sequences(train_texts)
test_sequences = tokenizer.texts_to_sequences(test_texts)

max_length = 256  # Increased sequence length
# Keep the full padded training matrix under its own name so the split below
# never consumes its own output.
X_train_full = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
X_test = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')

# Rebuild the complete label vector here instead of reusing `y_train`: this
# cell rebinds `y_train` to the training subset, so running the cell a second
# time would otherwise pair every padded review with only the subset labels
# and make train_test_split fail on mismatched lengths.
y_train_full = np.array([example["label"] for example in train_data])

# Split validation set (20% held out, fixed seed for a repeatable split)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, test_size=0.2, random_state=42
)

In [12]:
embedding_dim = 256
# Take the embedding table size from the tokenizer so the two cannot drift
# apart: texts_to_sequences only emits indices below tokenizer.num_words.
vocab_size = tokenizer.num_words

# Embedding -> BiLSTM (per-timestep outputs) -> max over time -> dense head.
model = Sequential([
    # Explicit input spec; replaces the deprecated `input_length` argument of
    # Embedding, which newer Keras versions ignore with a warning.
    tf.keras.Input(shape=(max_length,), dtype="int32"),
    Embedding(input_dim=vocab_size,
              output_dim=embedding_dim),
    Bidirectional(LSTM(128, return_sequences=True,
                       kernel_regularizer=l2(0.001))),
    Dropout(0.5),
    GlobalMaxPool1D(),
    Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
    Dropout(0.4),
    # Single sigmoid unit: output is compared against 0.5 for the class.
    Dense(1, activation='sigmoid')
])

optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(loss='binary_crossentropy',
              optimizer=optimizer,
              metrics=['accuracy'])



In [13]:
# Watch validation loss with a patience of 3 epochs, and restore the weights
# from the best epoch when training stops.
early_stop = EarlyStopping(
    restore_best_weights=True,
    patience=3,
    monitor='val_loss',
)

def lr_scheduler(epoch, lr):
    """Return the learning rate to use for ``epoch``.

    The rate is left untouched for epoch indices 0-2; from index 3 onward the
    incoming rate is halved on every call, so the decay compounds.

    Args:
        epoch: Epoch index supplied by the scheduler callback.
        lr: Learning rate currently in effect.

    Returns:
        ``lr`` unchanged, or ``lr * 0.5`` once ``epoch`` exceeds 2.
    """
    return lr if epoch <= 2 else lr * 0.5

# Train for up to 15 epochs; early stopping and the halving LR schedule are
# both attached as callbacks.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=256,
    epochs=15,
    callbacks=[early_stop, LearningRateScheduler(lr_scheduler)],
    validation_data=(X_val, y_val),
)

Epoch 1/15
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m291s[0m 4s/step - accuracy: 0.5211 - loss: 1.4323 - val_accuracy: 0.7336 - val_loss: 1.3057 - learning_rate: 1.0000e-04
Epoch 2/15
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m328s[0m 4s/step - accuracy: 0.6472 - loss: 1.2556 - val_accuracy: 0.7664 - val_loss: 1.0624 - learning_rate: 1.0000e-04
Epoch 3/15
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m291s[0m 4s/step - accuracy: 0.8116 - loss: 0.9230 - val_accuracy: 0.8234 - val_loss: 0.8759 - learning_rate: 1.0000e-04
Epoch 4/15
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m359s[0m 5s/step - accuracy: 0.8738 - loss: 0.7384 - val_accuracy: 0.8572 - val_loss: 0.7716 - learning_rate: 5.0000e-05
Epoch 5/15
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m357s[0m 5s/step - accuracy: 0.9069 - loss: 0.6316 - val_accuracy: 0.8648 - val_loss: 0.7410 - learning_rate: 2.5000e-05
Epoch 6/15
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [14]:
# Score the test reviews, then threshold the sigmoid outputs at 0.5.
test_probabilities = model.predict(X_test)
y_pred = (test_probabilities > 0.5).astype("int32")

# Compute the metrics up front, then report them.
test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"F1 Score: {test_f1:.4f}")
print("\nClassification Report:")
print(report)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m104s[0m 133ms/step
Test Accuracy: 0.8503
F1 Score: 0.8504

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.85      0.85     12500
           1       0.85      0.85      0.85     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000



In [15]:
def predict_sentiment(text):
    """Classify one piece of text and describe the result.

    The text goes through the same pipeline as the training data
    (``clean_text`` -> ``tokenizer`` -> ``pad_sequences``) before it is scored
    by ``model``.

    Args:
        text: Raw user-supplied text.

    Returns:
        ``"Positive (Confidence: NN.N%)"`` or ``"Negative (Confidence: NN.N%)"``,
        or an ``"Unknown (...)"`` message when nothing is left of the text
        after cleaning (for example digits or punctuation only).
    """
    cleaned_text = clean_text(text)
    if not cleaned_text:
        # An empty string would become an all-padding sequence; scoring that
        # says nothing about the input, so report it instead of guessing.
        return "Unknown (no recognizable words in input)"

    sequence = tokenizer.texts_to_sequences([cleaned_text])
    padded = pad_sequences(sequence, maxlen=max_length, padding='post', truncating='post')

    # One input row, one sigmoid unit -> take the single scalar.
    prediction = float(model.predict(padded, verbose=0)[0][0])
    sentiment = "Positive" if prediction > 0.5 else "Negative"
    # Confidence is the probability of whichever class was chosen.
    confidence = prediction if sentiment == "Positive" else 1 - prediction

    return f"{sentiment} (Confidence: {confidence*100:.1f}%)"

In [16]:
# CLI Interface
# Read a line, classify it, print the verdict; repeat until the user types
# "exit".
print("\nSentiment Analysis CLI (type 'exit' to quit)")
while True:
    text = input("\nEnter text: ")
    stripped = text.strip()
    # Compare the trimmed input so that "exit " or "  Exit" also quits instead
    # of being classified as a review.
    if stripped.lower() == "exit":
        break
    if not stripped:
        print("Error: Please enter valid text")
        continue
    # Echo the text exactly as typed, followed by its prediction.
    print(text+":")
    print(predict_sentiment(text))


Sentiment Analysis CLI (type 'exit' to quit)
it was a very good film and must watch awsome experience:
Positive (Confidence: 60.0%)
it was very bad and worst experience:
Negative (Confidence: 56.6%)
it was very good:
Positive (Confidence: 57.1%)
it was nice :
Positive (Confidence: 55.6%)


In [None]:
import tkinter as tk
from tkinter import ttk, messagebox

def create_gui():
    """Build the sentiment-analyzer window and run its Tk event loop.

    The window holds a multi-line text box, an "Analyze Sentiment" button that
    sends the text to ``predict_sentiment``, a label showing the result, and
    an "Exit" button that destroys the window. The call returns once
    ``root.mainloop()`` finishes.
    """
    root = tk.Tk()
    root.title("Sentiment Analyzer")
    root.geometry("600x400")

    style = ttk.Style()
    style.configure('TButton', font=('Helvetica', 12))
    style.configure('TLabel', font=('Helvetica', 14))

    main_frame = ttk.Frame(root, padding="20")
    main_frame.pack(fill=tk.BOTH, expand=True)

    # Colour of the result text, keyed by the leading word of the prediction.
    # Anything else (an unexpected message) falls back to a neutral grey.
    verdict_colours = {"Positive": "green", "Negative": "red"}

    def analyze_sentiment():
        """Classify the text box contents and show the outcome in the label."""
        text = input_text.get("1.0", tk.END).strip()
        if not text:
            messagebox.showwarning("Input Error", "Please enter some text to analyze!")
            return

        try:
            result = predict_sentiment(text)
            verdict = result.split(" ", 1)[0]
            result_label.config(text=result,
                                foreground=verdict_colours.get(verdict, 'gray'))
        except Exception as e:
            # Surface the failure in a dialog rather than losing it in the
            # Tk callback machinery.
            messagebox.showerror("Error", f"An error occurred: {str(e)}")

    # Input Section
    input_label = ttk.Label(main_frame, text="Enter your text:")
    input_label.pack(pady=5, anchor=tk.W)

    input_text = tk.Text(main_frame, height=8, width=60, wrap=tk.WORD)
    input_text.pack(pady=5, fill=tk.X)

    # Analyze Button
    analyze_btn = ttk.Button(main_frame, text="Analyze Sentiment", command=analyze_sentiment)
    analyze_btn.pack(pady=10)

    # Result Display
    result_label = ttk.Label(main_frame, text="", wraplength=550)
    result_label.pack(pady=10, fill=tk.X)

    # Exit Button
    exit_btn = ttk.Button(main_frame, text="Exit", command=root.destroy)
    exit_btn.pack(pady=10)

    root.mainloop()

# Run the GUI instead of the CLI
# create_gui() enters the Tk main loop, so this cell keeps running until the
# window is closed.
print("\nLaunching Sentiment Analysis GUI...")
create_gui()


Launching Sentiment Analysis GUI...
