In [1]:
!pip install nlp

Collecting nlp
  Downloading nlp-0.4.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from nlp)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from nlp)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, nlp
Successfully installed dill-0.3.6 nlp-0.4.0 xxhash-3.2.0


In [2]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import nlp
import random
import pandas as pd
import json
import re
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, Bidirectional, LSTM, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import Callback, LearningRateScheduler, EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/EmotionClassifier/emotion-labels-train.csv')
test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/EmotionClassifier/emotion-labels-test.csv')
val = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/EmotionClassifier/emotion-labels-val.csv')

In [4]:
def extract_data(dataset):
    sentences = []
    labels = []

    # Iterate through each row in the dataset
    for index, row in dataset.iterrows():
        # Extract the text and label from each row
        text = row['text']
        label = row['label']

        # Append the text and label to their respective lists
        sentences.append(text)
        labels.append(label)

    return np.array(sentences), np.array(labels)

def tok_seq_pad(sentences):
  tokenizer = Tokenizer(num_words=10429, oov_token="<OOV>")
  tokenizer.fit_on_texts(sentences)
  total_words = len(tokenizer.word_index)

  seq = tokenizer.texts_to_sequences(sentences)
  maxlength = max([len(sentence) for sentence in seq])
  padded = pad_sequences(seq, maxlen=20, padding="post", truncating="post")

  return padded, maxlength, total_words, tokenizer

nltk.download('stopwords')

def remove_stop_words(sentences):
  cleaned_sentences = []

  # Remove symbols and stopwords from each sentence
  for sentence in sentences:
      # Remove symbols and emojis
      cleaned_sentence = re.sub(r'[^\w\s]', '', sentence)

      # Convert the sentence to lowercase
      cleaned_sentence = cleaned_sentence.lower()

      # Tokenize the sentence into words
      words = cleaned_sentence.split()

      # Remove stop words
      stop_words = set(stopwords.words('english'))
      cleaned_words = [word for word in words if word not in stop_words]

      # Join the cleaned words back into a sentence
      cleaned_sentence = ' '.join(cleaned_words)

      cleaned_sentences.append(cleaned_sentence)

  return np.array(cleaned_sentences)

def categorize_labels(labels):
    label_mapping = {
        'joy': 0,
        'sadness': 1,
        'anger': 2,
        'fear': 3,
    }
    categorized_labels = []

    for label in labels:
        if label in label_mapping:
            categorized_labels.append(label_mapping[label])

    return np.array(categorized_labels)

def shuffle_data(padded, labels):
  # Shuffling data for better generalization
  # Combine test_padded and test_labels into a single array
  combined_data = np.column_stack((padded, labels))

  # Shuffle the combined data along the first axis
  np.random.shuffle(combined_data)

  # Split the shuffled data back into test_padded and test_labels
  shuffled_test_padded = combined_data[:, :-1]
  shuffled_test_labels = combined_data[:, -1]

  return shuffled_test_padded, shuffled_test_labels

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
train_sentences, train_labels = extract_data(train)
test_sentences, test_labels = extract_data(test)
val_sentences, val_labels = extract_data(val)

In [6]:
train_sentences = remove_stop_words(train_sentences)
test_sentences = remove_stop_words(test_sentences)
val_sentences = remove_stop_words(val_sentences)

In [7]:
train_padded, train_maxlength, train_total_words, train_tokenizer = tok_seq_pad(train_sentences)
test_padded, test_maxlength, test_total_words, test_tokenizer = tok_seq_pad(test_sentences)
val_padded, val_maxlength, val_total_words, val_tokenizer = tok_seq_pad(val_sentences)

In [8]:
train_labels = categorize_labels(train_labels)
test_labels = categorize_labels(test_labels)
val_labels = categorize_labels(val_labels)

In [9]:
train_padded, train_labels = shuffle_data(train_padded, train_labels)
test_padded, test_labels = shuffle_data(test_padded, test_labels)
val_padded, val_labels = shuffle_data(val_padded, val_labels)

In [10]:
X_train_pd = pd.DataFrame(train_padded)
X_test_pd = pd.DataFrame(test_padded)
y_train_pd = pd.DataFrame(train_labels)
y_test_pd = pd.DataFrame(test_labels)
X = pd.concat([X_train_pd, X_test_pd])
y = pd.concat([y_train_pd, y_test_pd])

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Replace 'val_loss' with the metric you want to monitor, e.g., 'val_accuracy'
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Create the model
model = Sequential()
model.add(Embedding(10429, 100, input_length=train_maxlength))
model.add(Bidirectional(LSTM(20, return_sequences=True)))
model.add(Bidirectional(LSTM(20, kernel_regularizer=l2(0.001))))
model.add(Dense(units=4, activation="softmax"))

# Compile the model
optimizer = Adam(learning_rate=0.01)
model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=['accuracy'])

# Train the model with callbacks
model.fit(X_train, y_train, epochs=30, steps_per_epoch=8, verbose=1, validation_data=(X_test, y_test), callbacks=[early_stopping])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30


<keras.callbacks.History at 0x7e1d22dbe4a0>

In [16]:
model.save('/content/drive/MyDrive/Colab Notebooks/EmotionClassifier/path_to_save_model')



In [17]:
loaded_model = tf.keras.models.load_model('/content/drive/MyDrive/Colab Notebooks/EmotionClassifier/path_to_save_model')

In [19]:
loaded_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
loaded_model.fit(
    X_test, y_test,
    epochs=10,  # Or any number of epochs you want to train for
    batch_size=32,  # Set batch size according to your data and hardware resources
    validation_data=(X_train, y_train),  # If you have a separate validation set for evaluation
    callbacks=[early_stopping]
    )


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10


<keras.callbacks.History at 0x7e1d28db35b0>

In [None]:
p = model.predict(np.expand_dims(val_padded[0], axis=0))[0]
pred_class=index_to_class[np.argmax(p).astype('uint8')]
print('Predicted Emotion: ', pred_class)

Predicted Emotion:  0


In [None]:
val_sentences[0]

'theclobra lol i thought maybe couldnt decide if there was levity or not'

In [None]:
val_labels[0]

1

In [None]:
classes = set(val_labels)
class_to_index = dict((c,i) for i, c in enumerate(classes))
index_to_class = dict((v,k) for k, v in class_to_index.items())
names_to_ids = lambda labels: np.array([class_to_index.get(x) for x in labels])