<a href="https://www.kaggle.com/code/nadaarfaoui/bilstm-to-detect-emotions-in-customer-reviews?scriptVersionId=289083621" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
import numpy as np
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

# ==========================
# 1️⃣ Load and label dataset
# ==========================
df_binary = pd.read_csv("/kaggle/input/d/nadaarfaoui/cleaned-amazon-sales-and-reviews-dataset/cleaned_dataset.csv")
# Fill missing reviews *before* casting to str: astype(str) turns NaN into the
# literal string "nan", after which fillna("") has nothing left to replace and
# the token "nan" would leak into the vocabulary.
df_binary["cleaned_review_text"] = df_binary["cleaned_review_text"].fillna("").astype(str)
print("Class distribution (binary):")
print(df_binary["sentiment"].value_counts())

# ==========================
# 2️⃣ Tokenize and encode
# ==========================
# Words unseen at fit time are mapped to the "<OOV>" token instead of being dropped.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(df_binary["cleaned_review_text"])


# Vocabulary size
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1  # +1 because word indices start at 1 (0 is left for padding)
print(f"Vocabulary size: {vocab_size}")

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(df_binary["cleaned_review_text"])

# ==========================
# Maximum sequence length
# ==========================
# Every review is padded up to the longest one, so nothing is truncated.
sequence_lengths = [len(seq) for seq in sequences]
max_len = max(sequence_lengths)
print(f"\nMaximum sequence length: {max_len}")

X = pad_sequences(sequences, maxlen=max_len, padding='post')
print("\nPadded sequences (first 5 samples):")
# Slice instead of indexing 0..4 so a dataset with fewer than five rows
# prints what it has rather than raising IndexError.
for i, row in enumerate(X[:5], start=1):
    print(f"Sample {i}: {row}")
print(f"\nShape of padded input: {X.shape}")


# Integer-encode the sentiment labels, then one-hot them for the softmax head.
encoder = LabelEncoder()
y = encoder.fit_transform(df_binary["sentiment"])
y_cat = to_categorical(y)

# Hold out 20% for testing, preserving the class ratio in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_cat,
    test_size=0.2,
    random_state=42,
    stratify=y_cat,
)

# ==========================
# 3️⃣ Compute class weights
# ==========================
# "balanced" weights are computed on the integer labels and keyed by class
# index, which is the form passed to model.fit(class_weight=...).
class_weights = {
    class_idx: weight
    for class_idx, weight in enumerate(
        compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
    )
}
print("Class Weights:", class_weights)

# ==========================
# 4️⃣ Build BiLSTM model
# ==========================
# Size the output layer from the one-hot labels instead of hard-coding 2, so the
# network always matches the number of encoded sentiment classes.
num_classes = y_cat.shape[1]

model = Sequential([
    # `input_length` is deprecated in Keras 3 and redundant here: the explicit
    # model.build(...) call below fixes the sequence length.
    Embedding(input_dim=vocab_size, output_dim=128),
    Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])
model.build(input_shape=(None, max_len))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# ==========================
# 5️⃣ Train with early stopping
# ==========================
# Stop when validation loss has not improved for 3 epochs and keep the best weights.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Training configuration gathered in one place; 20% of the training split is
# used for validation.
fit_options = {
    "validation_split": 0.2,
    "epochs": 15,
    "batch_size": 64,
    "class_weight": class_weights,
    "callbacks": [early_stop],
    "verbose": 1,
}
history = model.fit(X_train, y_train, **fit_options)

# Evaluate on the held-out test split
loss, acc = model.evaluate(X_test, y_test)
print(f"\n✅ Test Accuracy: {acc:.2f}")

# ==========================
# 6️⃣ Save model & tokenizer
# ==========================
model.save("sentiment_model.keras")

# The tokenizer and label encoder are pickled next to the model so inference
# can reproduce the same text -> index and index -> label mappings.
for artifact_path, artifact in (
    ("tokenizer.pkl", tokenizer),
    ("label_encoder.pkl", encoder),
):
    with open(artifact_path, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)

In [None]:
def analyze_brand_sentiment(brand_name):
    """Count the predicted sentiment labels for every review of one brand.

    The saved Keras model, tokenizer and label encoder are reloaded from the
    working directory on each call. Reviews come from the module-level
    ``df_binary`` (brand match is case-insensitive) and are padded to the
    module-level ``max_len``.

    Returns:
        A dict with the keys "Positive" and "Negative" mapping to prediction
        counts, or a message string when the brand has no reviews.
    """
    # Restore the artifacts saved after training.
    model = load_model("sentiment_model.keras")
    with open("tokenizer.pkl", "rb") as tokenizer_file:
        tokenizer = pickle.load(tokenizer_file)
    with open("label_encoder.pkl", "rb") as encoder_file:
        encoder = pickle.load(encoder_file)

    # Select this brand's cleaned review texts.
    is_brand = df_binary["brand"].str.lower() == brand_name.lower()
    brand_reviews = df_binary.loc[is_brand, "cleaned_review_text"]
    if brand_reviews.empty:
        return f"No reviews found for brand '{brand_name}'."

    # Same preprocessing as training: index the words, then post-pad.
    padded = pad_sequences(
        tokenizer.texts_to_sequences(brand_reviews),
        maxlen=max_len,
        padding='post',
    )

    # Highest-probability class per review, mapped back to its label.
    class_ids = np.argmax(model.predict(padded, verbose=0), axis=1)
    pred_labels = encoder.inverse_transform(class_ids)

    # Report both labels even when one of them was never predicted.
    counts = pd.Series(pred_labels).value_counts().to_dict()
    result = {}
    for sent in ("Positive", "Negative"):
        result[sent] = counts.get(sent, 0)
    return result

In [None]:
# Quick check: print whatever analyze_brand_sentiment returns for one brand
# (the Positive/Negative counts, or the "No reviews found" message).
print(analyze_brand_sentiment("Redmi"))

In [None]:
result = analyze_brand_sentiment("Redmi")

# analyze_brand_sentiment returns a message string (not a dict) when the brand
# has no reviews, so only compute percentages for the dict case.
if isinstance(result, dict):
    # Compute percentages; a zero total (no prediction matched either label)
    # yields 0.0 for every label instead of dividing by zero.
    total = sum(result.values())
    percentages = {
        k: round((v / total) * 100, 2) if total else 0.0
        for k, v in result.items()
    }

    print("📊 Sentiment Percentages for Redmi:")
    for sentiment, pct in percentages.items():
        print(f"{sentiment}: {pct}%")
else:
    print(result)

In [None]:
# Read the secret named "HUGGING_FACE_TOKEN" through Kaggle's secrets client.
# The value is kept in `secret_value_0` and later passed to huggingface_hub.login.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("HUGGING_FACE_TOKEN")

In [None]:
# ============================================================
# 💬 Continue after the sentiment analysis percentages section
# ============================================================
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from huggingface_hub import login
import re

# --- Login to Hugging Face ---
# Authenticate with the token fetched from the Kaggle secret.
login(token=secret_value_0)

# --- Load model and tokenizer ---
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer_hf = AutoTokenizer.from_pretrained(model_name)

# Half-precision weights, with device placement left to device_map="auto".
hf_load_options = {
    "torch_dtype": torch.float16,
    "device_map": "auto",
}
model_hf = AutoModelForCausalLM.from_pretrained(model_name, **hf_load_options)

In [None]:
# --- Helper to build emotion prompt ---
def build_emotion_prompt(brand_name, brand_reviews):
    """Build the single-word emotion-classification prompt for one brand.

    Args:
        brand_name: Brand name interpolated into the instruction text.
        brand_reviews: Iterable of review texts. Missing entries (``None`` or
            float NaN) are skipped and any other non-string value is converted
            with ``str()``, so a raw DataFrame column cannot break the join.

    Returns:
        The prompt with leading and trailing whitespace stripped; the reviews
        appear one per line under the "Reviews:" heading.
    """
    usable_reviews = [
        str(review)
        for review in brand_reviews
        # `review != review` is true only for NaN, the usual "missing" marker
        # in a text column read from CSV.
        if review is not None
        and not (isinstance(review, float) and review != review)
    ]
    joined_reviews = "\n".join(usable_reviews)
    prompt = f"""
You are an expert emotion classifier. Analyze the following reviews for brand "{brand_name}".
Each review expresses emotions such as: enjoyment, satisfaction, anger, disappointment, confusion, excitement, trust, or surprise.

Identify the **most prominent overall emotion** across all reviews.

⚠️ Respond ONLY with one word — the emotion (e.g., enjoyment, anger, trust, confusion, disappointment, satisfaction, excitement, surprise).

Reviews:
{joined_reviews}

Now output only one word — the most prominent emotion:
"""
    return prompt.strip()


# --- Combined function for both Sentiment + Emotion ---
def full_brand_analysis(brand_name):
    """Print and return a rating, sentiment and emotion summary for one brand.

    Combines three signals for the brand's rows in the module-level
    ``df_binary`` (brand match is case-insensitive):

    * the mean of the ``rating`` column,
    * the Positive/Negative split from ``analyze_brand_sentiment``,
    * the dominant emotion named by the Hugging Face causal LM ``model_hf``.

    Args:
        brand_name: Brand to analyse.

    Returns:
        A dict with the keys ``average_rating``, ``sentiment_percentages`` and
        ``dominant_emotion``, or a message string when the brand has no reviews.
    """
    # Bail out before any scoring: for an unknown brand analyze_brand_sentiment
    # returns a message string, which has no .values() to sum.
    brand_reviews_df = df_binary[df_binary["brand"].str.lower() == brand_name.lower()]
    if brand_reviews_df.empty:
        return f"No reviews found for brand '{brand_name}'."

    # --- Sentiment from BiLSTM ---
    sentiment_result = analyze_brand_sentiment(brand_name)
    total = sum(sentiment_result.values())
    # A zero total (no prediction matched either label) gives 0.0 everywhere
    # instead of a ZeroDivisionError.
    percentages = {
        k: round((v / total) * 100, 2) if total else 0.0
        for k, v in sentiment_result.items()
    }

    # --- Average rating ---
    avg_rating = brand_reviews_df["rating"].mean()

    # --- Emotion from Mistral ---
    # Drop missing reviews and force str so the prompt join cannot fail on NaN.
    brand_reviews = brand_reviews_df["review_text"].dropna().astype(str).tolist()
    emotion_prompt = build_emotion_prompt(brand_name, brand_reviews)
    # NOTE(review): truncation presumably cuts the end of an over-long prompt,
    # which is where the final instruction sits — consider capping the number
    # of reviews sent to the model.
    inputs = tokenizer_hf(emotion_prompt, return_tensors="pt", truncation=True).to(model_hf.device)

    with torch.no_grad():
        emotion_output = model_hf.generate(**inputs, max_new_tokens=100, pad_token_id=tokenizer_hf.eos_token_id)

    # generate() returns the prompt followed by the new tokens. Decode only the
    # new tokens: the prompt itself lists every emotion word, so searching the
    # full text would always match "enjoyment" from the instructions.
    prompt_length = inputs["input_ids"].shape[1]
    emotion_text = tokenizer_hf.decode(emotion_output[0][prompt_length:], skip_special_tokens=True)
    emotion_match = re.search(
        r"\b(enjoyment|satisfaction|anger|disappointment|confusion|excitement|trust|surprise)\b",
        emotion_text,
        re.IGNORECASE
    )
    emotion = emotion_match.group(1).capitalize() if emotion_match else "Unknown"

    # --- Print nicely formatted report ---
    print(f"\n⭐ Average Rating for {brand_name}: {avg_rating:.2f}/5\n")
    print(f"📊 Sentiment Report for {brand_name}:")
    for sentiment, pct in percentages.items():
        print(f"  {sentiment}: {pct}%")

    print(f"\n💫 Most Prominent Emotion: {emotion}")

    return {
        "average_rating": round(avg_rating, 2),
        "sentiment_percentages": percentages,
        "dominant_emotion": emotion
    }


# --- Example run ---
# Prints the formatted report and stores the returned value in `final_result`
# (a summary dict, or a message string if the brand has no reviews).
final_result = full_brand_analysis("Redmi")