In [1]:
#Load the dataset

import tensorflow as tf
import numpy as np
from datasets import load_dataset       


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset("go_emotions")

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 43410
    })
    validation: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5426
    })
    test: Dataset({
        features: ['text', 'labels', 'id'],
        num_rows: 5427
    })
})

In [4]:
# Extract text and labels
train_texts = [x['text'] for x in dataset['train']]
test_texts = [x['text'] for x in dataset['test']]
emotions = dataset['train'].features['labels'].feature.names  # Emotion categories
emotions

['admiration',
 'amusement',
 'anger',
 'annoyance',
 'approval',
 'caring',
 'confusion',
 'curiosity',
 'desire',
 'disappointment',
 'disapproval',
 'disgust',
 'embarrassment',
 'excitement',
 'fear',
 'gratitude',
 'grief',
 'joy',
 'love',
 'nervousness',
 'optimism',
 'pride',
 'realization',
 'relief',
 'remorse',
 'sadness',
 'surprise',
 'neutral']

In [5]:
#Encoding labels
def one_hot_encode(examples):
    labels = np.zeros(len(emotions))
    for label in examples['labels']:
        labels[label] = 1
    return labels

In [6]:
train_labels = np.array([one_hot_encode(x) for x in dataset['train']])
test_labels = np.array([one_hot_encode(x) for x in dataset['test']])

In [7]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\GITESH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
#Data Preprocessing
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
import re
import string
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans("", "", string.punctuation))  # Remove punctuation
    text=re.sub('[^a-z A-Z 0-9]','',text)
    text=" ".join([lemmatizer.lemmatize(word) for word in text.split() if word not in stopwords.words('english')])
    
    return text.strip()  # Keep meaningful characters



In [9]:
# Apply preprocessing

train_texts = [preprocess_text(text) for text in train_texts]
test_texts = [preprocess_text(text) for text in test_texts]

In [10]:
train_texts

['favourite food anything didnt cook',
 'everyone think he laugh screwing people instead actually dead',
 'fuck bayless isoing',
 'make feel threatened',
 'dirty southern wanker',
 'omg peyton isnt good enough help u playoff dumbass bronco fan circa december',
 'yes heard abt f bomb thanks reply hubby anxiously wait',
 'need board create bit space name well good',
 'damn youtube outrage drama super lucrative reddit',
 'might linked trust factor friend',
 'demographic dont know anybody cable tv',
 'aww shell probably come around eventually im sure jealous name mean woman wouldnt lol',
 'hello everyone im toronto well call visit personal needed',
 'rsleeptrain might time sleep training take look try feel whats right family',
 'name fucking problem slightly better command english language',
 'shit guess accidentally bought payperview boxing match',
 'thank friend',
 'fucking coward',
 'retardation look like',
 'maybe thats happened great white houston zoo',
 'never thought moment sometime

In [11]:
# Load DistilBERT tokenizer
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')




In [12]:

max_word_count = max(len(text.split()) for text in train_texts)
max_word_count


28

In [13]:
# Tokenization function
def tokenize_texts(texts, tokenizer, max_length=128):
    return tokenizer(
        list(texts),  # Ensure it's a list
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors="tf"  # Suitable for TensorFlow
    )


In [14]:
# Tokenize train and test data
train_encodings = tokenize_texts(train_texts, tokenizer,128)
test_encodings = tokenize_texts(test_texts, tokenizer,128)

In [22]:
from transformers import DistilBertConfig, TFDistilBertForSequenceClassification

# Define DistilBERT config with dropout
config = DistilBertConfig(
    num_labels=len(emotions),  # Number of emotion classes
    hidden_dropout_prob=0.3,  # Dropout in hidden layers
    attention_probs_dropout_prob=0.3  # Dropout in attention layers
)

# Load model with custom config
model = TFDistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    config=config
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [26]:
from transformers import AdamWeightDecay

model.compile(optimizer='adam', loss="categorical_crossentropy", metrics=["accuracy"])





In [27]:
model.summary()

Model: "tf_distil_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMa  multiple                  66362880  
 inLayer)                                                        
                                                                 
 pre_classifier (Dense)      multiple                  590592    
                                                                 
 classifier (Dense)          multiple                  21532     
                                                                 
 dropout_39 (Dropout)        multiple                  0         
                                                                 
Total params: 66975004 (255.49 MB)
Trainable params: 66975004 (255.49 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [29]:
history = model.fit(
    train_encodings,  # No need to split into a list
    train_labels,
    validation_data=(test_encodings, test_labels),
    epochs=3,
    batch_size=25,
)


Epoch 1/3

KeyboardInterrupt: 