In [None]:
import pandas as pd

In [None]:
# Load the merged emotion dataset from the Kaggle input directory.
data = pd.read_csv("/kaggle/input/frankendata/final_fix_V2_merged_emotion_dataset.csv")
# head must be *called*: a bare `data.head` only echoes the bound method
# instead of rendering the first rows as the cell output.
data.head()

In [None]:
# Separate the model input (the 'text' column) from the target (the 'label' column).
X = data['text']
y = data['label']

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, BatchNormalization, GlobalMaxPooling1D, Dense, Dropout, LeakyReLU, ReLU
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Fit the label encoder on the targets and keep the integer-encoded labels.
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

# Fit a word-level tokenizer on the texts (num_words=30000, with an explicit
# out-of-vocabulary token).
tokenizer = Tokenizer(oov_token="<OOV>", num_words=30000)
tokenizer.fit_on_texts(X)

# Convert every text to a sequence of ids, then bring all sequences to
# maxlen=100 with padding applied at the end ('post').
X_sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_sequences, padding='post', maxlen=100)

# Hold out 20% of the samples; the split is seeded with random_state=42.
split_parts = train_test_split(X_padded, y_encoded, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

# Convolutional text classifier, assembled one layer at a time.
model = Sequential()
model.add(Embedding(input_dim=30000, output_dim=256, trainable=True))

# First convolution stage: 256 filters, kernel size 3, L2-regularised kernel.
model.add(Conv1D(256, 3, padding='same', kernel_regularizer=l2(0.0005)))
model.add(BatchNormalization())
model.add(ReLU())

# Second convolution stage: 128 filters, kernel size 5, leaky activation.
model.add(Conv1D(128, 5, padding='same', kernel_regularizer=l2(0.0005)))
model.add(BatchNormalization())
model.add(LeakyReLU(alpha=0.1))

# Pool over the sequence, then classify through a dense head.
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
# One softmax output per class known to the fitted label encoder.
model.add(Dense(len(encoder.classes_), activation='softmax'))

# The targets are integer-encoded, hence the sparse categorical loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.000005),
)

# Both callbacks watch the *validation* loss. Monitoring the training
# accuracy/loss (as before) keeps improving while the model overfits, so early
# stopping would never trigger usefully and restore_best_weights would restore
# the most overfit epoch.
callbacks = [
    # Stop after 2 epochs without val_loss improvement and roll back to the
    # best weights seen.
    tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True, mode='min'),
    # Halve the learning rate after 1 stagnant epoch, never going below 1e-6.
    tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=1, min_lr=1e-6)
]

# validation_split carves 20% off the training data so that the val_* metrics
# the callbacks monitor actually exist; X_test / y_test stay untouched.
history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    callbacks=callbacks
)

In [None]:
import pandas as pd
import numpy as np

# Read the dataset and print a preview of its first rows.
dataset_path = "/kaggle/input/frankendata/final_fix_V2_merged_emotion_dataset.csv"
data = pd.read_csv(dataset_path)
preview = data.head()
print(preview)

# 'text' column -> features, 'label' column -> targets.
X = data['text']
y = data['label']

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, LeakyReLU
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pickle

# Integer-encode the labels with a freshly fitted encoder.
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

# Fit the tokenizer on the texts (num_words=30000, explicit OOV token).
tokenizer = Tokenizer(oov_token="<OOV>", num_words=30000)
tokenizer.fit_on_texts(X)

# Texts -> id sequences -> sequences brought to maxlen=100, padded at the end.
X_sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_sequences, padding='post', maxlen=100)

# Write the fitted tokenizer to disk so it can be reloaded for inference.
with open("tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# 80/20 train/test split, seeded with random_state=42.
split_parts = train_test_split(X_padded, y_encoded, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = split_parts

# Compute class weights to address potential class imbalance
from sklearn.utils import class_weight

# The label ids that actually occur in the training split. The weights returned
# below are ordered like this array.
train_classes = np.unique(y_train)
class_weights_values = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=train_classes,
    y=y_train
)
# Key every weight by its real label id. Enumerating the weights instead would
# silently assume the labels are exactly 0..k-1; if any class were missing
# from y_train, the weights would be attached to the wrong labels.
class_weights = {
    int(label): float(weight)
    for label, weight in zip(train_classes, class_weights_values)
}

# Early stopping on the validation loss: patience of 3 epochs, and the best
# weights are restored when training stops.
early_stopper = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=3,
    monitor='val_loss',
)

# Build the model using LSTM layers
model = Sequential([
    # input_length mirrors the maxlen=100 used when padding the sequences.
    # NOTE(review): newer Keras releases reportedly deprecate this argument;
    # confirm it still has an effect on the installed version.
    Embedding(input_dim=30000, output_dim=256, input_length=100),
    Bidirectional(LSTM(32, recurrent_dropout=0.2)),
    Dense(16, kernel_regularizer=tf.keras.regularizers.l2(0.006)),
    Dropout(0.3),
    LeakyReLU(),
    # Size the output layer from the fitted encoder rather than hard-coding 6,
    # so the model always matches the number of label classes in the data.
    Dense(len(encoder.classes_), activation='softmax')
])

# Integer-encoded targets -> sparse categorical cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=['accuracy']
)

model.summary()

# Training configuration: 20 epochs at most, 20% of the training data held out
# for validation, per-class loss weights, and early stopping.
fit_options = dict(
    epochs=20,
    batch_size=32,
    validation_split=0.2,
    class_weight=class_weights,
    callbacks=[early_stopper],
)

# Fit the model and keep the returned history object.
history = model.fit(X_train, y_train, **fit_options)


In [None]:
# Print which integer id the encoder assigns to each label.
label_ids = encoder.transform(encoder.classes_)
print(dict(zip(encoder.classes_, label_ids)))

In [None]:
import pickle

# Serialise the tokenizer to tokenizer.pkl (binary write mode).
with open("tokenizer.pkl", "wb") as tokenizer_file:
    pickle.Pickler(tokenizer_file).dump(tokenizer)

In [None]:
# Write the model to an .h5 file in the working directory.
model_path = "emotion_classifier_model_rev2.h5"
model.save(model_path)