<a href="https://www.kaggle.com/code/nadaarfaoui/bilstm-to-detect-emotions-in-customer-reviews?scriptVersionId=289392421" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import pandas as pd
import numpy as np
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight

# ==========================
# 1Ô∏è‚É£ Load and label dataset
# ==========================
df_binary = pd.read_csv("/kaggle/input/d/nadaarfaoui/cleaned-amazon-sales-and-reviews-dataset/cleaned_dataset.csv")
df_binary["cleaned_review_text"] = df_binary["cleaned_review_text"].astype(str).fillna("")
print("Class distribution (binary):")
print(df_binary["sentiment"].value_counts())

# ==========================
# 2Ô∏è‚É£ Tokenize and encode
# ==========================
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(df_binary["cleaned_review_text"])


# Vocabulary size
word_index = tokenizer.word_index
vocab_size = len(word_index) + 1  # +1 because indexing starts at 1
print(f"Vocabulary size: {vocab_size}")

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(df_binary["cleaned_review_text"])

# ==========================
# Step 2: Maximum sequence length
# ==========================
sequence_lengths = [len(seq) for seq in sequences]
max_len = max(sequence_lengths)
print(f"\nMaximum sequence length: {max_len}")

X = pad_sequences(sequences, maxlen=max_len, padding='post')
print("\nPadded sequences (first 5 samples):")
for i in range(5):
    print(f"Sample {i+1}: {X[i]}")
print(f"\nShape of padded input: {X.shape}")


encoder = LabelEncoder()
y = encoder.fit_transform(df_binary["sentiment"])
y_cat = to_categorical(y)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_cat, test_size=0.2, random_state=42, stratify=y_cat
)

# ==========================
# 3Ô∏è‚É£ Compute class weights
# ==========================
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y),
    y=y
)
class_weights = dict(enumerate(class_weights))
print("Class Weights:", class_weights)

# ==========================
# 4Ô∏è‚É£ Build BiLSTM model
# ==========================
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),
    Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(2, activation='softmax')
])
model.build(input_shape=(None, max_len))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# ==========================
# 5Ô∏è‚É£ Train with early stopping
# ==========================
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_train, y_train,
    validation_split=0.2,
    epochs=15,
    batch_size=64,
    class_weight=class_weights,
    callbacks=[early_stop],
    verbose=1
)

# Evaluate
loss, acc = model.evaluate(X_test, y_test)
print(f"\n‚úÖ Test Accuracy: {acc:.2f}")

# ==========================
# 6Ô∏è‚É£ Save model & tokenizer
# ==========================
model.save("sentiment_model.keras")
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(encoder, f)

2025-12-31 12:30:24.645998: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767184224.845005      23 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767184224.898653      23 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767184225.335510      23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767184225.335553      23 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767184225.335555      23 computation_placer.cc:177] computation placer alr

Class distribution (binary):
sentiment
Positive    4448
Negative     406
Name: count, dtype: int64
Vocabulary size: 3614

Maximum sequence length: 102

Padded sequences (first 5 samples):
Sample 1: [ 102   45    4   13    2   71    7  234  143  634 1216   15 1217  635
   21  301 1218  636  277  278  197   26  637    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0]
Sample 2: [ 20   2  20   7 302   4 120  12  92  15   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0  

I0000 00:00:1767184238.513835      23 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 15513 MB memory:  -> device: 0, name: Tesla P100-PCIE-16GB, pci bus id: 0000:00:04.0, compute capability: 6.0


Epoch 1/15
[1m49/49[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m28s[0m 405ms/step - accuracy: 0.6995 - loss: 0.6748 - val_accuracy: 0.8224 - val_loss: 0.4150
Epoch 2/15
[1m49/49[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m19s[0m 386ms/step - accuracy: 0.8357 - loss: 0.4280 - val_accuracy: 0.9125 - val_loss: 0.2351
Epoch 3/15
[1m49/49[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m19s[0m 390ms/step - accuracy: 0.9222 - loss: 0.2313 - val_accuracy: 0.8932 - val_loss: 0.2978
Epoch 4/15
[1m49/49[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m19s[0m 387ms/step - accuracy: 0.9456 - loss: 0.1731 - val_accuracy: 0.9176 - val_loss: 0.2566
Epoch 5/15
[1m49/49[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m19s[0m 388ms/step - accuracy: 0.9683 - loss: 0.1189 - val_accuracy: 0.8739 - va

In [2]:
def analyze_brand_sentiment(brand_name):
    # Load components
    model = load_model("sentiment_model.keras")
    with open("tokenizer.pkl", "rb") as f:
        tokenizer = pickle.load(f)
    with open("label_encoder.pkl", "rb") as f:
        encoder = pickle.load(f)

    # Filter reviews from the balanced dataset
    brand_reviews = df_binary[df_binary["brand"].str.lower() == brand_name.lower()]["cleaned_review_text"]
    if brand_reviews.empty:
        return f"No reviews found for brand '{brand_name}'."

    # Prepare sequences
    sequences = tokenizer.texts_to_sequences(brand_reviews)
    padded = pad_sequences(sequences, maxlen=max_len, padding='post')

    # Predict
    preds = model.predict(padded, verbose=0)
    pred_labels = encoder.inverse_transform(np.argmax(preds, axis=1))

    # Count results
    counts = pd.Series(pred_labels).value_counts().to_dict()
    result = {sent: counts.get(sent, 0) for sent in ["Positive", "Negative"]}
    return result

In [3]:
print(analyze_brand_sentiment("Redmi"))

{'Positive': 45, 'Negative': 14}


In [4]:
result = analyze_brand_sentiment("Redmi")

# Compute percentages
total = sum(result.values())
percentages = {k: round((v / total) * 100, 2) for k, v in result.items()}

print("üìä Sentiment Percentages for Redmi:")
for sentiment, pct in percentages.items():
    print(f"{sentiment}: {pct}%")

üìä Sentiment Percentages for Redmi:
Positive: 76.27%
Negative: 23.73%


In [5]:
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("HUGGING_FACE_TOKEN")

In [6]:
# ============================================================
# üí¨ Continue after the sentiment analysis percentages section
# ============================================================
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from huggingface_hub import login
import re

# --- Login to Hugging Face ---
login(secret_value_0)

# --- Load model and tokenizer ---
model_name = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer_hf = AutoTokenizer.from_pretrained(model_name)
model_hf = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map="auto"
)

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/601 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.55G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [7]:
# --- Helper to build emotion prompt ---
def build_emotion_prompt(brand_name, brand_reviews):
    joined_reviews = "\n".join(brand_reviews)
    prompt = f"""
You are an expert emotion classifier. Analyze the following reviews for brand "{brand_name}".
Each review expresses emotions such as: enjoyment, satisfaction, anger, disappointment, confusion, excitement, trust, or surprise.

Identify the **most prominent overall emotion** across all reviews.

‚ö†Ô∏è Respond ONLY with one word ‚Äî the emotion (e.g., enjoyment, anger, trust, confusion, disappointment, satisfaction, excitement, surprise).

Reviews:
{joined_reviews}

Now output only one word ‚Äî the most prominent emotion:
"""
    return prompt.strip()


# --- Combined function for both Sentiment + Emotion ---
def full_brand_analysis(brand_name):
    # --- Sentiment from BiLSTM ---
    sentiment_result = analyze_brand_sentiment(brand_name)
    total = sum(sentiment_result.values())
    percentages = {k: round((v / total) * 100, 2) for k, v in sentiment_result.items()}

    # --- Average rating ---
    brand_reviews_df = df_binary[df_binary["brand"].str.lower() == brand_name.lower()]
    if brand_reviews_df.empty:
        return f"No reviews found for brand '{brand_name}'."

    avg_rating = brand_reviews_df["rating"].mean()

    # --- Emotion from Mistral ---
    brand_reviews = brand_reviews_df["review_text"].tolist()
    emotion_prompt = build_emotion_prompt(brand_name, brand_reviews)
    inputs = tokenizer_hf(emotion_prompt, return_tensors="pt", truncation=True).to(model_hf.device)

    with torch.no_grad():
        emotion_output = model_hf.generate(**inputs, max_new_tokens=100, pad_token_id=tokenizer_hf.eos_token_id)

    emotion_text = tokenizer_hf.decode(emotion_output[0], skip_special_tokens=True)
    emotion_match = re.search(
        r"\b(enjoyment|satisfaction|anger|disappointment|confusion|excitement|trust|surprise)\b",
        emotion_text,
        re.IGNORECASE
    )
    emotion = emotion_match.group(1).capitalize() if emotion_match else "Unknown"

    # --- Print nicely formatted report ---
    print(f"\n‚≠ê Average Rating for {brand_name}: {avg_rating:.2f}/5\n")
    print(f"üìä Sentiment Report for {brand_name}:")
    for sentiment, pct in percentages.items():
        print(f"  {sentiment}: {pct}%")

    print(f"\nüí´ Most Prominent Emotion: {emotion}")

    return {
        "average_rating": round(avg_rating, 2),
        "sentiment_percentages": percentages,
        "dominant_emotion": emotion
    }


# --- Example run ---
final_result = full_brand_analysis("Redmi")







Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.



‚≠ê Average Rating for Redmi: 4.12/5

üìä Sentiment Report for Redmi:
  Positive: 76.27%
  Negative: 23.73%

üí´ Most Prominent Emotion: Enjoyment
