In [None]:
import random

import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib.pyplot as plt

# Data Processing

## Load Data

In [None]:
TRAIN_DATA_PATH="/kaggle/input/emotions-dataset-for-nlp/train.txt"
TEST_DATA_PATH="/kaggle/input/emotions-dataset-for-nlp/test.txt"
VAL_DATA_PATH="/kaggle/input/emotions-dataset-for-nlp/val.txt"

In [None]:
train_data = pd.DataFrame(columns=['sentence', 'label'])
test_data = pd.DataFrame(columns=['sentence', 'label'])
val_data = pd.DataFrame(columns=['sentence', 'label'])

with open(TRAIN_DATA_PATH, 'r') as file:
    for _, line in enumerate(file):
        train_data = pd.concat([
            pd.DataFrame(
                [line.replace('\n', '').split(';')], columns=train_data.columns
            ),
            train_data
        ], ignore_index=True)

with open(TEST_DATA_PATH, 'r') as file:
    for _, line in enumerate(file):
        test_data = pd.concat([
            pd.DataFrame(
                [line.replace('\n', '').split(';')], columns=test_data.columns
            ),
            test_data
        ], ignore_index=True)

with open(VAL_DATA_PATH, 'r') as file:
    for _, line in enumerate(file):
        val_data = pd.concat([
            pd.DataFrame(
                [line.replace('\n', '').split(';')], columns=val_data.columns
            ),
            val_data
        ], ignore_index=True)

train_data = train_data.sample(frac=1, random_state=42)

train_sentences = train_data['sentence']
test_sentences = test_data['sentence']
val_sentences = test_data['sentence']

train_labels = train_data['label'].replace(
    to_replace=['sadness', 'surprise', 'fear', 'anger', 'joy', 'love'], 
    value=[0, 1, 2, 3, 4, 5]
)
test_labels = test_data['label'].replace(
    to_replace=['sadness', 'surprise', 'fear', 'anger', 'joy', 'love'], 
    value=[0, 1, 2, 3, 4, 5]
)
val_labels = test_data['label'].replace(
    to_replace=['sadness', 'surprise', 'fear', 'anger', 'joy', 'love'], 
    value=[0, 1, 2, 3, 4, 5]
)

In [None]:
print(f'Length of Training Set: { len(train_data) }')
print(f'Length of Test Set: { len(test_data) }')
print(f'Length of Val Set: { len(val_data) }')

print('')

print(f'Number of Labels: { len(train_data["label"].unique()) }')
print('Possible Labels:')
for label in train_data['label'].unique():
    print(f'    { label }')
    print(f'        Count: { len(train_data[train_data["label"] == label]) }')
    print(f'        % of Total: { len(train_data[train_data["label"] == label]) / len(train_data) * 100.}')

### Investigate Data

In [None]:
def view_random_sentence():
    row_num = random.randint(0, len(train_data) - 1)

    print(f'emotion: { train_data.iloc[row_num]["label"] }')
    print(f'sentence: { train_data.iloc[row_num]["sentence"] }')

In [None]:
view_random_sentence()

## Create Vectorizer

In [None]:
text_vectorizer = tf.keras.layers.TextVectorization(
    max_tokens=12000,
    output_mode='int',
    output_sequence_length=15
)

text_vectorizer.adapt(train_sentences)

In [None]:
print(f'''
    Total number of words: { len(text_vectorizer.get_vocabulary()) }
    Top 10 most common words: { text_vectorizer.get_vocabulary()[:10] }
    Top 10 least common words: { text_vectorizer.get_vocabulary()[-10:] }      
''')

# Model

## Model 1: Dense NN

In [None]:
embedding_1 = tf.keras.layers.Embedding(
    input_dim=len(text_vectorizer.get_vocabulary()),
    output_dim=128,
    embeddings_initializer='uniform',
    input_length=15,
    name='embedding'
)

inputs = tf.keras.layers.Input(shape=(1,), dtype='string')

x = text_vectorizer(inputs)
x = embedding_1(x)
x = tf.keras.layers.GlobalAveragePooling1D()(x)

outputs = tf.keras.layers.Dense(6, activation='softmax')(x)

model_1 = tf.keras.Model(inputs, outputs, name='model_1_simple_dense')

model_1.compile(
    optimizer='Adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model_1_history = model_1.fit(
    train_sentences,
    train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels),
)

In [None]:
model_1.evaluate(test_sentences, test_labels)

## Model 2: LSTM

In [None]:
embedding_2 = tf.keras.layers.Embedding(
    input_dim=len(text_vectorizer.get_vocabulary()),
    output_dim=128,
    embeddings_initializer='uniform',
    input_length=15,
    name='embedding_2'
) 

inputs = tf.keras.layers.Input(shape=(1,), dtype='string')

x = text_vectorizer(inputs)
x = embedding_2(x)
x = tf.keras.layers.LSTM(64)(x)

outputs = tf.keras.layers.Dense(6, activation='softmax')(x)

model_2 = tf.keras.Model(inputs, outputs, name='model_2_simple_lstm')

model_2.compile(
    optimizer='Adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model_2_history = model_2.fit(
    train_sentences,
    train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels)
)

In [None]:
model_2.evaluate(test_sentences, test_labels)

## GRU

In [None]:
embedding_3 = tf.keras.layers.Embedding(
    input_dim=len(text_vectorizer.get_vocabulary()),
    output_dim=128,
    embeddings_initializer='uniform',
    input_length=15,
    name='embedding_3'
)

inputs = tf.keras.layers.Input(shape=(1,), dtype='string')

x = text_vectorizer(inputs)
x = embedding_3(x)
x = tf.keras.layers.GRU(64, kernel_regularizer=tf.keras.regularizers.l2(0.01))(x)

outputs = tf.keras.layers.Dense(6, activation='softmax')(x)

model_3 = tf.keras.Model(inputs, outputs, name='model_2_simple_gru')

model_3.compile(
    optimizer='Adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

history_3 = model_3.fit(
    train_sentences,
    train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels)
)

In [None]:
model_3.evaluate(test_sentences, test_labels)

## Model 4: CNN

In [None]:
embedding_4 = tf.keras.layers.Embedding(
    input_dim=len(text_vectorizer.get_vocabulary()),
    output_dim=128,
    embeddings_initializer='uniform',
    input_length=15,
    name='embedding_4'
)

inputs = tf.keras.layers.Input(shape=(1,), dtype='string')

x = text_vectorizer(inputs)
x = embedding_4(x)
x = tf.keras.layers.Conv1D(
    filters=32, 
    kernel_size=5, 
    activation='relu',
    kernel_regularizer=tf.keras.regularizers.l2(0.01)
)(x)
x = tf.keras.layers.GlobalMaxPooling1D()(x)

outputs = tf.keras.layers.Dense(6, activation='softmax')(x)

model_4 = tf.keras.Model(inputs, outputs, name='model_4_simple_cnn')

model_4.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model_4_history = model_4.fit(
    train_sentences,
    train_labels,
    epochs=5,
    validation_data=(val_sentences, val_labels)
)

In [None]:
model_4.evaluate(test_sentences, test_labels)

# Compare Models

In [None]:
plt.subplots(figsize=(12, 7))
plt.subplots_adjust(hspace=0.4)

plt.subplot(2, 2, 1)

plt.plot(model_1_history.history['accuracy'], label='Train')
plt.plot(model_1_history.history['val_accuracy'], label='Validation')
plt.legend(loc='lower right')
plt.title('Dense')
plt.xlabel('Epochs')

plt.subplot(2, 2, 2)

plt.plot(model_2_history.history['accuracy'], label='Train')
plt.plot(model_2_history.history['val_accuracy'], label='Validation')
plt.legend(loc='lower right')
plt.title('LSTM')
plt.xlabel('Epochs')

plt.subplot(2, 2, 3)

plt.plot(history_3.history['accuracy'], label='Train')
plt.plot(history_3.history['val_accuracy'], label='Validation')
plt.legend(loc='lower right')
plt.title('GRU')
plt.xlabel('Epochs')

plt.subplot(2, 2, 4)

plt.plot(history_3.history['accuracy'], label='Train')
plt.plot(history_3.history['val_accuracy'], label='Validation')
plt.legend(loc='lower right')
plt.title('CNN')
plt.xlabel('Epochs')

plt.show()

In [23]:
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
import pickle
import shutil
import os
import json 
import re
import string
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [4]:
TRAIN_DATA_PATH = '/kaggle/input/emotions-dataset-for-nlp/train.txt'
TEST_DATA_PATH  = '/kaggle/input/emotions-dataset-for-nlp/test.txt'
VAL_DATA_PATH   = '/kaggle/input/emotions-dataset-for-nlp/val.txt'

In [5]:
def load_data(path):
    records = []
    with open(path) as f:
        for line in f:
            sentence, label = line.strip().split(';')
            records.append({'sentence': sentence, 'label': label})
    return pd.DataFrame.from_records(records)

In [6]:
train_df = load_data(TRAIN_DATA_PATH).sample(frac=1, random_state=42)
test_df  = load_data(TEST_DATA_PATH)
val_df   = load_data(VAL_DATA_PATH)


In [7]:
train_df = train_df[train_df['label'] != 'love'].sample(frac=1, random_state=42)
val_df   = val_df[val_df['label'] != 'love']
test_df  = test_df[test_df['label'] != 'love']

In [8]:
train_sentences = train_df['sentence']
val_sentences   = val_df['sentence']
test_sentences  = test_df['sentence']

In [9]:
label_map = {'sadness':0, 'surprise':1, 'fear':2, 'anger':3, 'joy':4}
train_labels = train_df['label'].map(label_map)
val_labels   = val_df['label'].map(label_map)
test_labels  = test_df['label'].map(label_map)


In [14]:
def preprocess_text(text):
    text = text.lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [19]:
# Применим предобработку
train_df['sentence'] = train_df['sentence'].apply(preprocess_text)
val_df['sentence']   = val_df['sentence'].apply(preprocess_text)
test_df['sentence']  = test_df['sentence'].apply(preprocess_text)


In [12]:
# Предобработка: TextVectorization
MAX_TOKENS = 12000
SEQ_LEN    = 20
EMBED_DIM  = 256

text_vectorizer = layers.TextVectorization(
    max_tokens=MAX_TOKENS,
    output_mode='int',
    output_sequence_length=SEQ_LEN
)
text_vectorizer.adapt(train_sentences)

In [24]:
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=2)

def save_model_and_vectorizer(model, model_name, vectorizer):
    model.save(f"{model_name}", save_format='tf')

    vectorizer_config = vectorizer.get_config()
    vectorizer_weights = vectorizer.get_weights()

    os.makedirs(f"{model_name}_vectorizer", exist_ok=True)

    with open(f"{model_name}_vectorizer/config.json", "w") as f:
        json.dump(vectorizer_config, f)

    with open(f"{model_name}_vectorizer/weights.pkl", "wb") as f:
        pickle.dump(vectorizer_weights, f)

In [25]:
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = layers.Embedding(len(text_vectorizer.get_vocabulary()), EMBED_DIM)(x)
x = layers.Bidirectional(layers.GRU(128, return_sequences=True, dropout=0.3))(x)
x = layers.Bidirectional(layers.GRU(64, dropout=0.3))(x)
x = layers.Dense(64, activation='relu')(x)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(6, activation='softmax')(x)
model_bigru = models.Model(inputs, outputs, name='model_bigru')
model_bigru.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_bigru.fit(train_sentences, train_labels, epochs=10, validation_data=(val_sentences, val_labels), callbacks=[early_stop, reduce_lr])
model_bigru.evaluate(test_sentences, test_labels)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


[0.3381398618221283, 0.8777838349342346]

In [29]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import numpy as np
import matplotlib.pyplot as plt

y_pred_probs = model_bigru.predict(test_sentences)
y_pred = np.argmax(y_pred_probs, axis=1)

print("Classification Report (Test Set):")
print(classification_report(test_labels, y_pred, target_names=['sadness', 'surprise', 'fear', 'anger', 'joy']))

Classification Report (Test Set):
              precision    recall  f1-score   support

     sadness       0.97      0.95      0.96       581
    surprise       0.95      0.91      0.93        66
        fear       0.96      0.94      0.95       224
       anger       0.94      0.95      0.94       275
         joy       0.96      0.97      0.96       695

    accuracy                           0.96      1841
   macro avg       0.96      0.94      0.95      1841
weighted avg       0.96      0.96      0.96      1841



In [None]:
save_model_and_vectorizer(model_bigru, 'model_bigru', text_vectorizer)

shutil.make_archive('/kaggle/working/model_bigru_all', 'zip', '/kaggle/working/model_bigru')
shutil.make_archive('/kaggle/working/vectorizer', 'zip', '/kaggle/working/model_bigru_vectorizer')
