In [None]:
### 🛠️ Step 1: Install & Import Required Libraries

# Google Colab users must first install the required libraries using:

!pip install numpy pandas tensorflow keras scikit-learn spacy matplotlib
!python -m spacy download en_core_web_sm

# Now, we import the necessary libraries:
import numpy as np  # numerical computations
import pandas as pd  # Data handling
import spacy  # NLP processing (tokenization, lemmatization)
import string  # String operations
from sklearn.model_selection import train_test_split  # Splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # Converting text to numbers
from tensorflow.keras.preprocessing.sequence import pad_sequences  # Standardizing input size
from tensorflow.keras.utils import to_categorical  # One-hot encoding labels
import matplotlib.pyplot as plt  # Visualization

###📌 Step 2: Load and Clean the Data

# Dataset Information:
# Reviews are taken from Yelp, Amazon, and IMDB datasets.
# Labels: 0 (Negative), 1 (Positive).
# We must apply text cleaning (lowercasing, punctuation removal, and lemmatization) to get rid of noise.

# Load spaCy's English NLP model
# The "en_core_web_sm" package must already be installed (see the
# `python -m spacy download en_core_web_sm` step at the top of this file).
# The resulting pipeline object is shared at module level and is called by
# clean_text() on every review.
nlp = spacy.load("en_core_web_sm")  # Essential for tokenization & lemmatization

# Write the function to load datasets
def load_data(data_dir="."):
    """Load the Yelp, Amazon and IMDB labelled-review files into one DataFrame.

    Each file is read as tab-separated text without a header row: the first
    field is the review text and the second is its label.

    Args:
        data_dir: Directory that contains ``yelp_labelled.txt``,
            ``amazon_cells_labelled.txt`` and ``imdb_labelled.txt``.
            Defaults to the current working directory, which matches the
            previous hard-coded behaviour.

    Returns:
        A DataFrame with the columns ``Review`` and ``Sentiment`` holding the
        rows of the three files in the order Yelp, Amazon, IMDB, re-indexed
        from 0.

    Raises:
        FileNotFoundError: If any of the three files is missing.
    """
    import csv
    import os

    column_name = ['Review', 'Sentiment']
    file_names = (
        "yelp_labelled.txt",
        "amazon_cells_labelled.txt",
        "imdb_labelled.txt",
    )

    frames = [
        pd.read_csv(
            os.path.join(data_dir, file_name),
            sep='\t',
            header=None,
            names=column_name,
            # Reviews contain literal double quotes. With the default quoting
            # mode an unbalanced quote makes the parser swallow the following
            # lines into one field, silently dropping rows.
            quoting=csv.QUOTE_NONE,
        )
        for file_name in file_names
    ]

    return pd.concat(frames, ignore_index=True)

# Now load data
# load_data() returns the three review files concatenated into one DataFrame;
# printing its shape shows (number of reviews, number of columns).
data = load_data()
print("Dataset Loaded Successfully! Here is the shape:", data.shape)

# Define a function that removes punctuation and lemmatizes the text
def clean_text(text):
    """Return *text* as space-separated, lower-cased lemmas without punctuation.

    The text is run through the module-level spaCy pipeline ``nlp``. Tokens
    are dropped when they are whitespace, when spaCy flags them as
    punctuation, or when they consist solely of ASCII punctuation characters.

    Args:
        text: The raw review string.

    Returns:
        The remaining tokens' lemmas, lower-cased, stripped and joined by
        single spaces. An empty string if no token survives.
    """
    punctuation_chars = set(string.punctuation)
    tokens = []
    for token in nlp(text):
        # A plain `token.text in string.punctuation` check is a substring
        # test, so multi-character tokens such as "..." or "--" slipped
        # through; test every character instead.
        if token.is_space or token.is_punct:
            continue
        if all(char in punctuation_chars for char in token.text):
            continue
        lemma = token.lemma_.lower().strip()
        if lemma:  # avoid empty entries that would produce double spaces
            tokens.append(lemma)
    return " ".join(tokens)

# Normalise every review with clean_text and store the result in a new column
data["Cleaned_Review"] = [clean_text(review) for review in data["Review"]]

# Make sure the labels are integers (0: Negative, 1: Positive)
data["Sentiment"] = data["Sentiment"].astype(int)

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    data["Cleaned_Review"],
    data["Sentiment"],
    test_size=0.2,
    random_state=42,
)

print("Data Split Completed!")

###📌 Step 3: Tokenization & Padding
# Why Tokenization?
# - Convert words into numerical sequences.
# - Use padding to ensure all sequences have the same length.

# Vocabulary is capped at 5000 entries; unseen words map to the "<OOV>" token.
# The vocabulary is learned from the training split only.
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

# Turn both splits into lists of integer word indices
X_train_seq, X_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# The longest training sequence defines the common padded length
max_length = max(map(len, X_train_seq))

# Pad at the end of each sequence so every row has max_length entries
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=max_length, padding="post")
    for sequences in (X_train_seq, X_test_seq)
)

# One-hot encode the two sentiment classes
y_train, y_test = (
    to_categorical(labels, num_classes=2) for labels in (y_train, y_test)
)

print(" Tokenization and Padding Finished!")

###📌 Step 4: Build the Deep Learning Model (LSTM)
# Why LSTM?
# It handles sequential data better than standard neural networks.
# It also stores context from previous words.

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Two stacked LSTM layers on top of a trainable embedding, followed by a small
# dense head that outputs a probability for each of the two sentiment classes.
model = Sequential([
    # Declare the input shape explicitly instead of via Embedding's
    # `input_length` argument, which Keras 3 deprecates. The model is still
    # built up-front, so model.summary() can report output shapes.
    tf.keras.Input(shape=(max_length,)),
    Embedding(
        input_dim=tokenizer.num_words,  # keep in sync with the tokenizer's vocabulary cap
        output_dim=128,
        # Sequences are post-padded with index 0, which the tokenizer never
        # assigns to a word. Masking stops the LSTMs from processing the
        # trailing padding as if it were real input.
        mask_zero=True,
    ),  # Word embeddings
    LSTM(64, return_sequences=True),  # First LSTM layer (emits the full sequence for the next LSTM)
    LSTM(32),  # Second LSTM layer (emits only the final state)
    Dense(32, activation="relu"),  # Fully connected layer
    Dense(2, activation="softmax"),  # Output layer: one probability per class
])

# Labels are one-hot encoded, hence categorical cross-entropy.
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

model.summary()

###📌 Step 5: Train the Model

# Train for five epochs in mini-batches of 32 and report metrics on the
# held-out split after each epoch.
history = model.fit(
    x=X_train_padded,
    y=y_train,
    epochs=5,
    batch_size=32,
    validation_data=(X_test_padded, y_test),
)

###📌 Step 6: Evaluate Model & Make Predictions

# Final loss and accuracy on the held-out split
test_loss, test_acc = model.evaluate(x=X_test_padded, y=y_test)
print(f"test Accuracy: {test_acc:.2f}")

# Predict on sample reviews
sample_reviews = ["This product is not useful.", "I loved it. It is very good."]

# Apply the same cleaning (lemmatization, lowercasing, punctuation removal)
# that the training text went through. Otherwise inflected words such as
# "loved" may not match the lemmatized vocabulary the tokenizer was fitted on.
sample_cleaned = [clean_text(review) for review in sample_reviews]
sample_seq = tokenizer.texts_to_sequences(sample_cleaned)
sample_padded = pad_sequences(sample_seq, maxlen=max_length, padding="post")

# One row of class probabilities per review; position 0 is Negative, 1 is Positive
predictions = model.predict(sample_padded)
sentiments = ["Negative", "Positive"]

# Show the original (uncleaned) review next to the most probable class
for review, prediction in zip(sample_reviews, predictions):
    print(f"Review: \"{review}\" → Sentiment: {sentiments[np.argmax(prediction)]}")

