<h1 style="text-align:center; font-family:Georgia; font-weight:bold; ">BiLSTM with Word2Vec</h1>

## Imports


In [4]:
import json
import pickle
import re
from collections import Counter

import gensim.downloader as api
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
import tensorflow as tf
from gensim.models import Word2Vec
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import Dense, LSTM, Bidirectional, Embedding, SpatialDropout1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm

# Seed NumPy's and TensorFlow's global random generators with the same fixed value.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

## Load Preprocessed Data


In [None]:
# NOTE: a stray `trainer.train()` call used to live in this cell. No `trainer`
# object is created anywhere in the visible notebook, so the call raised
# NameError on a fresh kernel and blocked Restart & Run All. Training for this
# notebook is done with `model.fit(...)` in the "Model Training" section.

In [None]:
# Read the preprocessed corpus from the CSV file in the project's data folder.
preprocessed_data = pd.read_csv('./Preprocessed Data/preprocessed_text.csv')

# Report the frame's dimensions and its column names, one message per line.
print(
    f"Loaded preprocessed data with shape: {preprocessed_data.shape}",
    f"Columns: {preprocessed_data.columns.tolist()}",
    sep="\n",
)

# Keep the preview as the last expression so the notebook renders it as a table.
preprocessed_data.head()

## Constants and Global Variables


In [None]:
# Label vocabulary: the encoder below maps each emotion to its position in this list.
EMOTIONS = ['happiness', 'neutral', 'sadness', 'anger', 'fear']

# Hyperparameters for tokenization, embedding size, and training.
MAX_WORDS = 10000
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 300
BATCH_SIZE = 64
EPOCHS = 10
VALIDATION_SPLIT = 0.2

# Assign the classes by hand (instead of fitting) so the class order follows EMOTIONS.
encoder = LabelEncoder()
encoder.classes_ = np.array(EMOTIONS)

# Derive integer labels only when the loaded frame does not already carry a 'label' column.
if 'label' not in preprocessed_data:
    preprocessed_data['label'] = encoder.transform(preprocessed_data['Emotion'])

# Class balance: printed as a table, then drawn as a bar chart.
emotion_counts = preprocessed_data['Emotion'].value_counts()
print("Emotion distribution in dataset:", emotion_counts, sep="\n")

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=emotion_counts.index, y=emotion_counts.values, ax=ax)
ax.set_title('Emotion Distribution in Dataset')
ax.set_xlabel('Emotion')
ax.set_ylabel('Count')
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

## Text Preprocessing and Tokenization



In [None]:
def clean_text(text):
    """Lower-case ``text``, keep only ASCII letters, and split it into tokens.

    Parameters
    ----------
    text : object
        Raw input. Anything that is not a ``str`` (for example a NaN cell read
        by pandas) is treated as an empty document.

    Returns
    -------
    list of str
        Lower-cased alphabetic tokens. The result is always a list; non-string
        and empty inputs yield ``[]``.
    """
    if not isinstance(text, str):
        # This branch used to return "" while every other path returned a list.
        # An empty list keeps the return type uniform and has the same length (0).
        return []

    # Every character that is neither an ASCII letter nor whitespace becomes a space.
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', text.lower())

    # str.split() with no arguments collapses whitespace runs and ignores
    # leading/trailing whitespace, so no separate normalisation pass is needed.
    return letters_only.split()

print("Tokenizing texts...")
# Run every document of the 'Text' column through clean_text, with a tqdm progress bar.
tokenized_texts = list(map(clean_text, tqdm(preprocessed_data['Text'])))

# Calculate sequence length statistics
seq_lengths = list(map(len, tokenized_texts))
print(
    f"Average sequence length: {np.mean(seq_lengths):.2f}",
    f"Max sequence length: {np.max(seq_lengths)}",
    f"90th percentile sequence length: {np.percentile(seq_lengths, 90):.2f}",
    sep="\n",
)

# Histogram of token counts; the dashed red line marks the padding/truncation length.
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(seq_lengths, bins=50)
ax.set_title('Distribution of Text Length')
ax.set_xlabel('Number of Tokens')
ax.set_ylabel('Count')
ax.axvline(x=MAX_SEQUENCE_LENGTH, color='r', linestyle='--', label=f'Max Length: {MAX_SEQUENCE_LENGTH}')
ax.legend()
fig.tight_layout()
plt.show()

## Word2Vec Embedding



In [None]:
# Option 1: Train Word2Vec on our dataset
print("Training Word2Vec model on our dataset...")
w2v_model = Word2Vec(sentences=tokenized_texts, vector_size=EMBEDDING_DIM, window=5, min_count=1, workers=4)
print(f"Word2Vec model trained. Vocabulary size: {len(w2v_model.wv.key_to_index)}")

# Option 2: Load pre-trained Word2Vec (Google News)
# Uncomment to use pre-trained model instead
# print("Loading pre-trained Word2Vec model...")
# w2v_model = api.load("word2vec-google-news-300")
# print(f"Pre-trained Word2Vec model loaded. Vocabulary size: {len(w2v_model.key_to_index)}")

# A trained `Word2Vec` keeps its vectors under `.wv`, while the object loaded in
# Option 2 is indexed directly (see its `key_to_index` usage above). Resolving the
# vector store once lets the rest of this cell work with either option.
word_vectors = getattr(w2v_model, "wv", w2v_model)

# Build the word -> integer index used to feed the network.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(tokenized_texts)
word_index = tokenizer.word_index
print(f"Found {len(word_index)} unique tokens")

# Convert each document to a fixed-length integer sequence (padded/truncated at the end).
sequences = tokenizer.texts_to_sequences(tokenized_texts)
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

# Create embedding matrix
# Row i holds the vector of the word with tokenizer index i; rows for words
# without a vector (e.g. the "<OOV>" token) stay all-zero.
vocab_size = min(MAX_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
covered_words = 0
for word, i in word_index.items():
    if i >= vocab_size:
        # Index falls outside the matrix (beyond the MAX_WORDS cap).
        continue
    if word in word_vectors:
        embedding_matrix[i] = word_vectors[word]
        covered_words += 1

print(f"Embedding matrix shape: {embedding_matrix.shape}")
print(f"Words with a Word2Vec vector: {covered_words} of {vocab_size - 1} indexed words")

## Data Preparation


In [None]:
# One-hot encode the integer labels. The width is pinned to len(EMOTIONS) so the
# target shape always matches the classifier's output layer, even when the
# highest-numbered class does not occur in the data (to_categorical would
# otherwise infer a narrower matrix from the largest label present).
labels = to_categorical(preprocessed_data['label'], num_classes=len(EMOTIONS))

# First split: hold out 20% of all rows as the test set, stratified by label.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels, test_size=0.2, stratify=labels, random_state=42
)

# Second split: carve 20% of the remaining rows off as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=42
)

print(f"Training set shape: {X_train.shape}, {y_train.shape}")
print(f"Validation set shape: {X_val.shape}, {y_val.shape}")
print(f"Test set shape: {X_test.shape}, {y_test.shape}")

## BiLSTM Model Building


In [None]:
# Embedding layer initialised from the Word2Vec matrix; trainable=False keeps
# those vectors frozen during training.
embedding_layer = Embedding(
    input_dim=min(MAX_WORDS, len(word_index) + 1),
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

# Two stacked bidirectional LSTMs followed by a dense head with one softmax
# unit per emotion. The first LSTM returns the full sequence so the second
# one can consume it.
model = Sequential([
    embedding_layer,
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True)),
    Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)),
    Dense(64, activation='relu'),
    Dense(len(EMOTIONS), activation='softmax'),
])

# Categorical cross-entropy matches the one-hot encoded targets.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

model.summary()

## Model Training


In [None]:
# Stop when validation loss has not improved for 3 epochs and restore the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)

# Write the model to disk whenever validation accuracy reaches a new maximum.
model_checkpoint = ModelCheckpoint(
    filepath='best_bilstm_model.h5', monitor='val_accuracy', mode='max', save_best_only=True, verbose=1
)

# Collect the fit options in one place, then train on the training split while
# tracking metrics on the validation split.
fit_options = dict(
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=[early_stopping, model_checkpoint],
    verbose=1,
)
history = model.fit(X_train, y_train, **fit_options)

# One panel per metric: (train key, validation key, title, y-axis label, legend position).
curve_specs = [
    ('accuracy', 'val_accuracy', 'Model Accuracy', 'Accuracy', 'lower right'),
    ('loss', 'val_loss', 'Model Loss', 'Loss', 'upper right'),
]

plt.figure(figsize=(12, 5))

for panel, (train_key, val_key, panel_title, y_label, legend_loc) in enumerate(curve_specs, start=1):
    plt.subplot(1, 2, panel)
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(panel_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Validation'], loc=legend_loc)

plt.tight_layout()
plt.show()

## Model Evaluation



In [None]:
# Overall loss and accuracy on the held-out test split.
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f'Test Loss: {test_loss:.4f}')
print(f'Test Accuracy: {test_accuracy:.4f}')

# Collapse predicted probabilities and one-hot targets to integer class ids.
y_pred_proba = model.predict(X_test)
y_pred = np.argmax(y_pred_proba, axis=1)
y_true = np.argmax(y_test, axis=1)

# String versions of the labels, kept for inspection.
y_pred_emotions = encoder.inverse_transform(y_pred)
y_true_emotions = encoder.inverse_transform(y_true)

# Class ids in EMOTIONS order. Passing them as `labels=` pins the row/column
# order of both reports to the order of the display names.
class_ids = list(range(len(EMOTIONS)))

print("\nClassification Report:")
# The report is computed on integer ids, not on the string labels: with string
# labels scikit-learn orders the classes alphabetically, which does not match
# the order of EMOTIONS and would attach the wrong name to each row.
print(classification_report(y_true, y_pred, labels=class_ids, target_names=EMOTIONS))

# `labels=` also keeps the matrix at len(EMOTIONS) x len(EMOTIONS) when a class
# is absent from both y_true and y_pred, so the tick labels stay aligned.
cm = confusion_matrix(y_true, y_pred, labels=class_ids)
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=EMOTIONS, yticklabels=EMOTIONS)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.tight_layout()
plt.show()

## Lexicon Score Analysis



In [None]:
score_columns = [col for col in preprocessed_data.columns if col.endswith('Score')]

# Row positions of the held-out test set. They are recovered by replaying the
# first split of the "Data Preparation" cell (same number of rows, same
# stratification target, same random_state) on the row positions themselves.
# The previous version drew random rows here, so the lexicon scores did not
# belong to the samples described by y_true.
# Defined outside the `if` so that later cells can use it even when there are
# no score columns.
_, test_indices = train_test_split(
    np.arange(len(preprocessed_data)), test_size=0.2, stratify=labels, random_state=42
)
# Sanity check: the recovered rows must be exactly the rows stored in X_test.
assert np.array_equal(padded_sequences[test_indices], X_test), "test_indices do not match X_test"

if score_columns:
    test_scores = preprocessed_data.iloc[test_indices][score_columns].values

    plt.figure(figsize=(12, 8))

    for i, emotion in enumerate(EMOTIONS):
        emotion_indices = np.where(y_true == i)[0]
        if emotion_indices.size == 0:
            # No test sample of this class: skip the panel rather than average an empty slice.
            continue

        # Mean of every lexicon score over the test samples whose true class is `emotion`.
        avg_scores = np.mean(test_scores[emotion_indices], axis=0)

        plt.subplot(2, 3, i+1)
        plt.bar(range(len(score_columns)), avg_scores)
        plt.title(f'Avg Lexicon Scores for {emotion}')
        plt.xticks(range(len(score_columns)), [s.replace('Score', '') for s in score_columns], rotation=45)

    plt.tight_layout()
    plt.show()

## Model for Inference


In [None]:
def predict_emotion(text, tokenizer, model, encoder):
    """Predict the emotion of a single piece of text.

    Parameters
    ----------
    text : str
        Raw input text; it is cleaned with ``clean_text`` before tokenisation.
    tokenizer : object
        Fitted tokenizer providing ``texts_to_sequences``.
    model : object
        Trained classifier whose ``predict`` returns one score per class.
    encoder : object
        Label encoder providing ``inverse_transform`` and ``classes_``; it
        defines which label each output position stands for.

    Returns
    -------
    dict
        ``'emotion'``: the label of the highest-scoring class,
        ``'confidence'``: that class's score as a float,
        ``'scores'``: mapping of every label to its score.
    """
    tokens = clean_text(text)

    # Same padding/truncation settings as used for the training sequences.
    sequence = tokenizer.texts_to_sequences([tokens])
    padded = pad_sequences(sequence, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')

    # A batch of one goes in; keep the single row of class scores.
    prediction = model.predict(padded)[0]
    predicted_class = int(np.argmax(prediction))
    predicted_emotion = encoder.inverse_transform([predicted_class])[0]

    # Key the scores by the classes of the encoder that was passed in (not by
    # the global EMOTIONS list) so they always agree with `predicted_emotion`.
    scores = {str(label): float(score) for label, score in zip(encoder.classes_, prediction)}

    return {
        'emotion': predicted_emotion,
        'confidence': float(np.max(prediction)),
        'scores': scores
    }

# Hand-written sentences used to exercise predict_emotion.
test_examples = [
    "I am so happy today, everything is wonderful!",
    "I feel so sad and depressed after what happened.",
    "I'm absolutely furious about the way they treated me.",
    "I'm really scared about what might happen next.",
    "It's just another normal day, nothing special.",
]

for text in test_examples:
    result = predict_emotion(text, tokenizer, model, encoder)

    # Order the per-class scores from highest to lowest before printing them.
    ranked_scores = sorted(result['scores'].items(), key=lambda pair: pair[1], reverse=True)

    print(f"\nText: '{text}'")
    print(f"Predicted emotion: {result['emotion']} (confidence: {result['confidence']:.2f})")
    print("Scores for all emotions:")
    for emotion, score in ranked_scores:
        print(f"  {emotion}: {score:.4f}")

## Model Comparison


In [None]:
# Summary record for this model, holding its test accuracy and display name.
bilstm_metrics = dict(
    accuracy=test_accuracy,
    model_name='BiLSTM with Word2Vec',
)

print(f"BiLSTM with Word2Vec Accuracy: {test_accuracy:.4f}")

## Text Length Analysis


In [None]:
# Row positions of the held-out test set, recovered by replaying the first
# split of the "Data Preparation" cell (same number of rows, same
# stratification target, same random_state) on the row positions themselves.
# Computing them here keeps this cell independent of the lexicon cell and
# guarantees the texts line up with y_pred / y_true.
_, test_indices = train_test_split(
    np.arange(len(preprocessed_data)), test_size=0.2, stratify=labels, random_state=42
)
# Sanity check: the recovered rows must be exactly the rows stored in X_test.
assert np.array_equal(padded_sequences[test_indices], X_test), "test_indices do not match X_test"

# Whitespace-token count per test text; non-string cells (e.g. NaN) count as length 0.
test_texts = preprocessed_data['Text'].iloc[test_indices]
text_lengths = [len(text.split()) if isinstance(text, str) else 0 for text in test_texts]

bins = [0, 10, 20, 30, 40, 50, 70, 100, 1000]
length_bins = pd.cut(text_lengths, bins=bins)

# One row per test sample: its length, its length bucket, and whether it was classified correctly.
correct_predictions = (y_pred == y_true)
df_results = pd.DataFrame({
    'text_length': text_lengths,
    'length_bin': length_bins,
    'correct': correct_predictions
})

# observed=False keeps every bucket in the result (including empty ones) and
# states the categorical-groupby behaviour explicitly.
accuracy_by_length = df_results.groupby('length_bin', observed=False)['correct'].mean()
samples_by_length = df_results.groupby('length_bin', observed=False).size()

plt.figure(figsize=(12, 6))
accuracy_by_length.plot(kind='bar')
plt.title('BiLSTM Model Accuracy by Text Length')
plt.xlabel('Text Length')
plt.ylabel('Accuracy')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 6))
samples_by_length.plot(kind='bar')
plt.title('Number of Samples by Text Length')
plt.xlabel('Text Length')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## Conclusion


In [None]:
# Persist the trained network and the fitted tokenizer; inference needs both.
model.save("bilstm_emotion_model.h5")

# `pickle` is imported in the notebook's top import cell rather than mid-notebook.
with open('bilstm_tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

print("Model and tokenizer saved successfully!")
print("\n=== BiLSTM with Word2Vec Model Summary ===")
print(f"Test Accuracy: {test_accuracy:.4f}")
# Counts trainable weights only, so variables of layers built with trainable=False are excluded.
print(f"Model size: {sum(np.prod(v.get_shape().as_list()) for v in model.trainable_variables):,} trainable parameters")
print("Embedding type: Word2Vec")
print(f"Embedding dimension: {EMBEDDING_DIM}")
print("\nStrengths:")
print("- Captures sequential nature of text")
print("- Uses both forward and backward context")
print("- Leverages semantic relationships from Word2Vec")

print("\nNext steps:")
print("- Compare with DistilRoBERTa and other models")
print("- Fine-tune hyperparameters")
print("- Try attention mechanisms")
print("- Explore ensemble methods")