In [2]:
#Load the dataset

import tensorflow as tf
import numpy as np
from datasets import load_dataset


In [3]:
dataset = load_dataset("go_emotions")

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})

In [5]:
# Extract text and labels
train_texts = [x['text'] for x in dataset['train']]
test_texts = [x['text'] for x in dataset['test']]
emotions = dataset['train'].features['labels'].feature.names  # Emotion categories
emotions

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

In [6]:
len(train_texts)

43410

In [7]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\GITESH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
#Encoding labels
def one_hot_encode(examples):
    labels = np.zeros(len(emotions))
    for label in examples['labels']:
        labels[label] = 1
    return labels

In [9]:
train_labels = np.array([one_hot_encode(x) for x in dataset['train']])
test_labels = np.array([one_hot_encode(x) for x in dataset['test']])

In [10]:
train_labels.shape

(43410, 28)

In [11]:
#Lemmatizer
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [12]:
#Data Preprocessing
import re
import string
def preprocess_text(text):
    text=text.lower()
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text=re.sub('[^a-z A-z 0-9]',' ',text)
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    text=' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stopwords.words('english')])
    return text
    

In [13]:
# Apply preprocessing

train_texts = [preprocess_text(text) for text in train_texts]
test_texts = [preprocess_text(text) for text in test_texts]

In [14]:
len(train_texts)

43410

In [15]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_texts)
total_words=len(tokenizer.word_index)+1
total_words




22621

In [16]:
tokenizer.word_index

{'name': 1,
 'like': 2,
 'get': 3,
 'love': 4,
 'one': 5,
 'people': 6,
 'would': 7,
 'good': 8,
 'know': 9,
 'think': 10,
 'really': 11,
 'time': 12,
 'make': 13,
 'thanks': 14,
 'lol': 15,
 'see': 16,
 'thing': 17,
 'look': 18,
 'thank': 19,
 'go': 20,
 'year': 21,
 'much': 22,
 'still': 23,
 'even': 24,
 'well': 25,
 'right': 26,
 'want': 27,
 'got': 28,
 'oh': 29,
 'guy': 30,
 'way': 31,
 'feel': 32,
 'yeah': 33,
 'need': 34,
 'never': 35,
 'game': 36,
 'hope': 37,
 'say': 38,
 'great': 39,
 'man': 40,
 'bad': 41,
 'u': 42,
 'sorry': 43,
 'day': 44,
 'back': 45,
 'going': 46,
 'better': 47,
 'could': 48,
 'also': 49,
 'someone': 50,
 'actually': 51,
 'mean': 52,
 'sure': 53,
 'pretty': 54,
 'work': 55,
 'thought': 56,
 'something': 57,
 'though': 58,
 'best': 59,
 'take': 60,
 'post': 61,
 'happy': 62,
 'first': 63,
 'always': 64,
 'yes': 65,
 'life': 66,
 'friend': 67,
 'made': 68,
 'kid': 69,
 'wow': 70,
 'said': 71,
 'let': 72,
 'new': 73,
 'point': 74,
 'maybe': 75,
 'glad': 76

In [17]:
input_sequences_train = []
input_sequences_test=[]

for line in train_texts:  # Iterate over list of sentences
    token_list = tokenizer.texts_to_sequences([line])[0]  # Convert sentence to token sequence
    input_sequences_train.append(token_list)
for line in test_texts:  # Iterate over list of sentences
    token_list = tokenizer.texts_to_sequences([line])[0]  # Convert sentence to token sequence
    input_sequences_test.append(token_list)

In [18]:
train_max_sequence_length=max([len(x) for x in input_sequences_train])
train_max_sequence_length

33

In [19]:
test_max_sequence_length=max([len(x) for x in input_sequences_test])
test_max_sequence_length

18

In [20]:
train_padded_sequence_text=pad_sequences(input_sequences_train,padding='pre',maxlen=train_max_sequence_length,)
test_padded_sequence_text=pad_sequences(input_sequences_test,padding='pre',maxlen=train_max_sequence_length,)

In [21]:
X_train=train_padded_sequence_text
X_test=test_padded_sequence_text
ytrain=train_labels
ytest=test_labels

In [22]:
#LSTM model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout,Bidirectional


lstm_model=Sequential()
lstm_model.add(Embedding(total_words,100,input_length=train_max_sequence_length))
lstm_model.add(Bidirectional(LSTM(150,return_sequences=True))),
lstm_model.add(Dropout(0.2))
lstm_model.add(Bidirectional(LSTM(100)))
lstm_model.add(Dense(28,activation='softmax'))

lstm_model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
lstm_model.summary()






In [23]:
X_train.shape

(43410, 33)

In [24]:
X_test.shape

(5427, 33)

In [25]:
ytrain.shape


(43410, 28)

In [26]:
ytest.shape

(5427, 28)

In [27]:
#train the model
history=lstm_model.fit(X_train,ytrain,epochs=10,batch_size=25,validation_data=(X_test,ytest),verbose=1)

Epoch 1/10


KeyboardInterrupt: 

In [32]:
import tensorflow as tf
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer, TFBertModel
from tensorflow.keras.utils import to_categorical

# Load dataset
dataset = load_dataset("go_emotions")

# Extract text and labels
train_texts = [x['text'] for x in dataset['train']]
test_texts = [x['text'] for x in dataset['test']]
emotions = dataset['train'].features['labels'].feature.names  # Emotion categories

# One-hot encode labels
def one_hot_encode(examples):
    labels = np.zeros(len(emotions))
    for label in examples['labels']:
        labels[label] = 1
    return labels

train_labels = np.array([one_hot_encode(x) for x in dataset['train']])
test_labels = np.array([one_hot_encode(x) for x in dataset['test']])

# Load BERT Tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize text data
MAX_LENGTH = 50  # Adjust based on dataset
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors="np")
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=MAX_LENGTH, return_tensors="np")


In [33]:
from tensorflow.keras import layers, models

# Load BERT Model
bert_model = TFBertModel.from_pretrained("bert-base-uncased")

# Define model architecture
input_ids = layers.Input(shape=(MAX_LENGTH,), dtype=tf.int32, name="input_ids")
attention_mask = layers.Input(shape=(MAX_LENGTH,), dtype=tf.int32, name="attention_mask")

# Get BERT embeddings
bert_output = bert_model(input_ids, attention_mask=attention_mask)[0][:, 0, :]  # CLS token output

# Classification head
x = layers.Dense(128, activation="relu")(bert_output)
x = layers.Dropout(0.3)(x)
output = layers.Dense(len(emotions), activation="sigmoid")(x)  # Multi-label classification

# Create model
bert_classifier = models.Model(inputs=[input_ids, attention_mask], outputs=output)

# Compile model
bert_classifier.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
                        loss="binary_crossentropy",  # Multi-label classification loss
                        metrics=["accuracy"])


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

ValueError: Exception encountered when calling layer 'tf_bert_model' (type TFBertModel).

Data of type <class 'keras.src.backend.common.keras_tensor.KerasTensor'> is not allowed only (<class 'tensorflow.python.framework.tensor.Tensor'>, <class 'bool'>, <class 'int'>, <class 'transformers.utils.generic.ModelOutput'>, <class 'tuple'>, <class 'list'>, <class 'dict'>, <class 'numpy.ndarray'>) is accepted for attention_mask.

Call arguments received by layer 'tf_bert_model' (type TFBertModel):
  • input_ids=<KerasTensor shape=(None, 50), dtype=int32, sparse=False, ragged=False, name=input_ids>
  • attention_mask=<KerasTensor shape=(None, 50), dtype=int32, sparse=False, ragged=False, name=attention_mask>
  • token_type_ids=None
  • position_ids=None
  • head_mask=None
  • inputs_embeds=None
  • encoder_hidden_states=None
  • encoder_attention_mask=None
  • past_key_values=None
  • use_cache=None
  • output_attentions=None
  • output_hidden_states=None
  • return_dict=None
  • training=False