In [None]:
import pandas as pd
import numpy as np
df_train = pd.read_csv("https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/train.tsv", sep="\t", names = ["text", "labels", "id"])
df_test = pd.read_csv("https://raw.githubusercontent.com/google-research/google-research/master/goemotions/data/test.tsv", sep="\t", names = ["text", "labels", "id"])

In [None]:
import pandas as pd
df = pd.concat([df_train, df_test], ignore_index=True)

In [None]:
from collections import Counter

label_counts = Counter([label for labels in df['labels'] for label in labels])

minority_classes = sorted(label_counts, key=label_counts.get)[:6]
print("Minority classes:", minority_classes)
print(label_counts)

Minority classes: ['9', '6', '8', '3', '4', '5']
Counter({'2': 25963, '7': 20094, '1': 16031, '0': 8690, ',': 8595, '5': 5718, '4': 4565, '3': 3910, '8': 3048, '6': 2805, '9': 1607})


In [None]:
!pip install nlpaug
!pip install language-tool-python



In [None]:
import language_tool_python
from concurrent.futures import ThreadPoolExecutor

# Load LanguageTool
tool = language_tool_python.LanguageTool('en-US')

def correct_text(text):
    matches = tool.check(text)
    return language_tool_python.utils.correct(text, matches)

# Function to apply correction in parallel
def correct_text_parallel(df, column_name):
    with ThreadPoolExecutor() as executor:
        corrected_texts = list(executor.map(correct_text, df[column_name]))
    df[column_name] = corrected_texts  # Assign back to DataFrame

# Example usage
correct_text_parallel(df, "text")  # Process train data
correct_text_parallel(df_test, "text")  # Process test data

In [None]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')
import nlpaug.augmenter.word as naw
augmenter = naw.SynonymAug(aug_p=0.3)

def augment_text(text):
    return augmenter.augment(text)

augmented_texts = []
augmented_labels = []
augmented_ids = []
for i, row in df.iterrows():
    if any(label in minority_classes for label in row['labels']):
        augmented_ids.append(i)
        new_text = ''.join(augment_text(row['text']))
        augmented_texts.append(new_text)
        augmented_labels.append(row['labels'])

df_augmented = pd.DataFrame({'text': augmented_texts, 'labels': augmented_labels})

df = pd.concat([df, df_augmented], ignore_index=True)
df.head()

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


Unnamed: 0,text,labels,id
0,My favorite food is anything I didn't have to ...,27,eebbqej
1,"Now if he does off himself, everyone will thin...",27,ed00q6i
2,WHY THE FUCK IS BAGLESS ISOING,2,eezlygj
3,To make her feel threatened,14,ed7ypvh
4,Dirty Southern Wankers,3,ed0bdzj


In [None]:
df_augmented

Unnamed: 0,text,labels
0,To make her feeling threatened,14
1,Dirty Southern Wankers,3
2,OMG Peyton international relations and securit...,26
3,Yes I heard astir the f turkey! That have to b...,15
4,We need more than boards and to create a numbe...,820
...,...,...
19963,"They be shaver, they have no souls.",45
19964,I would love a biz with either of these deuce!...,18
19965,"That ’ s what I ’ k thinking likewise, so I en...",15
19966,Thanks. One be diagnosed with BP ane after the...,15


In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# Function for text cleaning
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)  # Remove URLs
    text = re.sub(r"\@\w+|\#", "", text)  # Remove mentions and hashtags
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    text = word_tokenize(text)  # Tokenization
    text = [lemmatizer.lemmatize(word) for word in text if word not in stop_words]  # Lemmatization & stopword removal
    return " ".join(text)

# Apply cleaning to dataset
df["text"] = df["text"].apply(clean_text)
df_test["text"] = df_test["text"].apply(clean_text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [None]:
df["text"]

Unnamed: 0,text
0,favorite food anything didnt cook
1,everyone think he laugh screwing people instea...
2,fuck bagless isoing
3,make feel threatened
4,dirty southern wanker
...,...
68800,shaver soul
68801,would love biz either deuce 2d option caracalla
68802,’ ’ k thinking likewise english hawthorn go re...
68803,thanks one diagnosed bp ane hospitalization well


In [None]:
df.drop(columns=['id'], inplace=True)

In [None]:
df.iloc[augmented_ids]

Unnamed: 0,text,labels
3,make feel threatened,14
4,dirty southern wanker,3
5,omg peyton isnt good enough help u playoff dum...,26
6,yes heard f bomb thanks reply hubby anxiously ...,15
7,need board create bit space name ’ good,820
...,...,...
48825,child soul,45
48826,would love game either two 2nd choice caracalla,18
48827,’ ’ thinking may go referral thanks help,15
48832,thanks diagnosed bp 1 hospitalization well,15


In [None]:
df.head()

Unnamed: 0,text,labels
0,favorite food anything didnt cook,27
1,everyone think he laugh screwing people instea...,27
2,fuck bagless isoing,2
3,make feel threatened,14
4,dirty southern wanker,3


In [None]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.2)

In [None]:
df_train.head()

Unnamed: 0,text,labels
51579,tent fly gull,13
37505,change channel nfc playoff hey ever pay ticket...,7
38763,peace,5
19503,name also steal name evolutionism,37
45726,already hard came,27


In [None]:
emotions = ['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

In [None]:
len(emotions)

28

In [None]:
def OneHotEncode(labels):
  res = np.zeros(28, dtype='float32')
  res[[int(x) for x in labels.split(',')]] = 1
  return res

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.


In [None]:
# Hyperparameters
MAX_NUM_WORDS = 15000  # Vocabulary size
MAX_SEQUENCE_LENGTH = 64  # Max words per comment

In [None]:
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(df_train["text"])

In [None]:
word_index = tokenizer.word_counts

In [None]:
word_index

OrderedDict([('tent', 5),
             ('fly', 43),
             ('gull', 2),
             ('change', 251),
             ('channel', 54),
             ('nfc', 4),
             ('playoff', 68),
             ('hey', 223),
             ('ever', 499),
             ('pay', 215),
             ('ticket', 41),
             ('wondering', 89),
             ('peace', 43),
             ('name', 8476),
             ('also', 825),
             ('steal', 62),
             ('evolutionism', 1),
             ('already', 391),
             ('hard', 481),
             ('came', 206),
             ('luck', 335),
             ('youre', 835),
             ('implying', 18),
             ('im', 1963),
             ('vulnerable', 7),
             ('submissive', 4),
             ('hahhhaaaaaaa', 1),
             ('young', 136),
             ('intel', 23),
             ('loser', 38),
             ('considering', 70),
             ('’', 8674),
             ('thousand', 141),
             ('majoring', 2),
          

In [None]:
def textToTokenizedSequence(text_list):
  sequences = tokenizer.texts_to_sequences(text_list)
  return pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH, padding="post", truncating="post")

In [None]:
sample = np.stack(pd.Series(['1', '2, 1']).apply(lambda x: OneHotEncode(x)))

In [None]:
sample

array([[0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)

In [None]:
y_train = np.stack(df_train['labels'].apply(lambda x: OneHotEncode(x)))
y_test = np.stack(df_test['labels'].apply(lambda x: OneHotEncode(x)))

In [None]:
def SampleVisualiser(textID, train = True):
  positive_emotions_detected = []
  negative_emotions_detected = []
  neutral_emotions_detected = []
  df_local = df_train if train else df_test
  y = y_train if train else y_test
  print('LABELS:', y[textID])
  print('TEXT:', df_local['text'][textID])
  for i in range(len(emotions)):
    if y[textID][i]:
      print(i)
      if i in positive_emotion_idx:
        print(emotions[i])
        positive_emotions_detected.append(emotions[i])
      if i in negative_emotion_idx:
        negative_emotions_detected.append(emotions[i])
      if i in neutral_emotion_idx:
        neutral_emotions_detected.append(emotions[i])
  print('POSITIVE:', positive_emotions_detected)
  print('NEGATIVE:', negative_emotions_detected)
  print('NEUTRAL:', neutral_emotions_detected)

In [None]:
df_train.iloc[0]

Unnamed: 0,51579
text,tent fly gull
labels,13


In [None]:
#starting the model building process
#we will build a 3-way classifier to classify positive, negative, neutral
#then three seperate classifiers for all positive, negative and neutral emotions

In [None]:
x_train = textToTokenizedSequence(df_train['text'])
x_test = textToTokenizedSequence(df_test['text'])

In [None]:
word_index = tokenizer.word_index
reverse_word_index = dict([(value, key) for (key, value) in word_index.items()])

In [None]:
def decode_sentence(text):
    return ' '.join([reverse_word_index.get(i, '?') for i in text])

In [None]:
decode_sentence(x_train[1000])

'one ’ tonne get information technology r <OOV> actually believe bp highly overrated ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?'

In [None]:
!pip install keras-self-attention



In [None]:
from keras_self_attention import SeqWeightedAttention

In [None]:
#HYPERPARAMETERS
EMBEDDING_DIM = 256
LSTM_UNITS = 128
OUTPUT_DIM = 3
DROPOUT_RATE = 0.2

In [None]:
from sklearn.utils.class_weight import compute_class_weight
class_counts = np.sum(y_train, axis=0)
class_weights = len(y_train) / (len(class_counts) * class_counts)
class_weight_dict = {i: class_weights[i] for i in range(len(class_counts))}

In [None]:
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2

# Set global mixed precision policy
tf.keras.mixed_precision.set_global_policy('mixed_float16')

model = tf.keras.Sequential([
    Embedding(MAX_NUM_WORDS, EMBEDDING_DIM, mask_zero=True, dtype=tf.float32),  # Ensure float32 for Embedding
    Bidirectional(LSTM(LSTM_UNITS, return_sequences=True, dropout=0.3, kernel_regularizer=l2(1e-4))),
    Bidirectional(LSTM(LSTM_UNITS, return_sequences=True, dropout=0.3, kernel_regularizer=l2(1e-4))),
    SeqWeightedAttention(),  # Ensure this layer supports mixed precision
    Dense(64, activation='relu', kernel_regularizer=l2(1e-4)),
    BatchNormalization(),
    Dropout(0.3),
    Dense(32, activation='relu', kernel_regularizer=l2(1e-4)),
    BatchNormalization(),
    Dense(len(emotions), activation='sigmoid', dtype=tf.float32)  # Final layer in float32 for numerical stability
])

# Learning Rate Scheduler & Early Stopping
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_auc', factor=0.3, patience=2, min_lr=1e-6)
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Compile model
model.compile(
    loss=tf.keras.losses.CategoricalFocalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(learning_rate=3e-4),  # Lower LR for stability
    metrics=[
        tf.keras.metrics.AUC(name="auc"),
        tf.keras.metrics.Precision(name="precision"),
        tf.keras.metrics.Recall(name="recall"),
        tf.metrics.F1Score(name="f1_score", average='weighted', dtype=tf.float32)  # Cast correctly
    ]
)


In [None]:
y_train.shape

(55044, 28)

In [None]:
y_test.shape

(13761, 28)

In [None]:
x_train.shape

(55044, 64)

In [None]:
class_weight_dict

{0: np.float32(0.43035403),
 1: np.float32(0.8329903),
 2: np.float32(1.131103),
 3: np.float32(0.4380252),
 4: np.float32(0.37530681),
 5: np.float32(1.0081319),
 6: np.float32(0.79236484),
 7: np.float32(0.8089947),
 8: np.float32(1.7154076),
 9: np.float32(0.85065216),
 10: np.float32(0.9334554),
 11: np.float32(2.2674246),
 12: np.float32(6.1433034),
 13: np.float32(1.2765306),
 14: np.float32(1.8270048),
 15: np.float32(0.407685),
 16: np.float32(14.670576),
 17: np.float32(1.2529364),
 18: np.float32(0.5360941),
 19: np.float32(6.6414094),
 20: np.float32(1.0695632),
 21: np.float32(17.24436),
 22: np.float32(1.5790017),
 23: np.float32(7.446429),
 24: np.float32(1.9897339),
 25: np.float32(0.8344046),
 26: np.float32(1.0228184),
 27: np.float32(0.14529617)}

In [None]:
model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_test, y_test), callbacks = [reduce_lr], class_weight=class_weight_dict)

Epoch 1/10
[1m1721/1721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 38ms/step - auc: 0.5179 - f1_score: 0.0422 - loss: 0.8734 - precision: 0.0458 - recall: 0.5392 - val_auc: 0.6159 - val_f1_score: 0.0468 - val_loss: 0.9796 - val_precision: 0.0546 - val_recall: 0.7708 - learning_rate: 3.0000e-04
Epoch 2/10
[1m1721/1721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 33ms/step - auc: 0.6709 - f1_score: 0.0986 - loss: 0.7554 - precision: 0.0617 - recall: 0.7648 - val_auc: 0.7630 - val_f1_score: 0.1568 - val_loss: 0.8757 - val_precision: 0.0737 - val_recall: 0.8564 - learning_rate: 3.0000e-04
Epoch 3/10
[1m1721/1721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 33ms/step - auc: 0.8156 - f1_score: 0.2561 - loss: 0.6499 - precision: 0.0858 - recall: 0.8725 - val_auc: 0.8352 - val_f1_score: 0.2921 - val_loss: 0.7427 - val_precision: 0.1225 - val_recall: 0.7826 - learning_rate: 3.0000e-04
Epoch 4/10
[1m1721/1721[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s

<keras.src.callbacks.history.History at 0x7d51da3d5890>

In [None]:
model.save('bilstm_go_emotions.keras')

In [None]:
tokenizer.to_json()



In [None]:
# prompt: save the tokenizer

import json

# Assuming 'tokenizer' is your tokenizer object
tokenizer_json = tokenizer.to_json()

with open('/content/tokenizer.json', 'w') as f:
  json.dump(tokenizer_json, f)

In [None]:
# prompt: upload the model to drive

from google.colab import drive
drive.mount('/content/drive')

!cp bilstm_go_emotions.keras /content/drive/MyDrive/
!cp /content/tokenizer.json /content/drive/MyDrive/

MessageError: Error: credential propagation was unsuccessful

In [None]:
def correct_text_extraction(text):
    matches = tool.check(text)
    corrected_text = language_tool_python.utils.correct(text, matches)
    return corrected_text

In [None]:
def predict_emotions(text):
    sequence = correct_text_extraction(text)
    sequence = clean_text(sequence)
    sequence = tokenizer.texts_to_sequences([sequence])
    padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=64, padding="post", truncating="post")
    predictions = model.predict(padded_sequence)[0]
    emotions = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise', 'neutral']
    predicted_emotions = [emotions[i] for i, prob in enumerate(predictions) if prob > 0.5]
    return predictions

# Example usage
text_to_predict = input("Enter Text: ")
predicted_emotions = predict_emotions(text_to_predict)
print(f"Predicted emotions for '{text_to_predict}': {predicted_emotions}")


Enter Text: Please, don't hit me!
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 586ms/step
Predicted emotions for 'Please, don't hit me!': [0.05943624 0.01467261 0.43703386 0.51436687 0.17551461 0.4747789
 0.32554322 0.3051039  0.11395215 0.24412313 0.6898498  0.19114204
 0.43683887 0.04897926 0.12555453 0.00820805 0.05636196 0.05516723
 0.00847285 0.16962287 0.5828353  0.13171151 0.38776523 0.07994209
 0.08868065 0.21148716 0.0784521  0.8810571 ]
