Emotions and Sentiment


In [1]:
!pip install contractions

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.3-py3-none-any.whl.metadata (1.6 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.3-py3-none-any.whl (345 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m345.1/345.1 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyahocorasick-2.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.9/114.9 kB[0m 

In [2]:
#installing packages
import os
import re
import pandas as pd
from bs4 import BeautifulSoup
import contractions
from tqdm.auto import tqdm

# scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# nltk
import nltk
nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)

# pytorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_sequence, pad_packed_sequence

# utilities
from collections import Counter
import numpy as np

In [3]:
#loading dataset. In this project we will use dair-ai/emotion
dataset = pd.read_parquet("hf://datasets/dair-ai/emotion/unsplit/train-00000-of-00001.parquet")
dataset.head(10)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Unnamed: 0,text,label
0,i feel awful about it too because it s my job ...,0
1,im alone i feel awful,0
2,ive probably mentioned this before but i reall...,1
3,i was feeling a little low few days back,0
4,i beleive that i am much more sensitive to oth...,2
5,i find myself frustrated with christians becau...,2
6,i am one of those people who feels like going ...,1
7,i feel especially pleased about this as this h...,1
8,i was struggling with these awful feelings and...,1
9,i feel so enraged but helpless at the same time,3


0 → sadness  
1 → joy  
2 → love  
3 → anger  
4 → fear  
5 → surprise

In [4]:
id2label = {0: "sadness", 1: "joy", 2: "love", 3: "anger", 4: "fear", 5: "surprise"}
label2id = {v: k for k, v in id2label.items()}


In [5]:
'''Lets add sentiment column to the dataset to also predict sentiment.
mapping becomes:
joy, love → positive
anger, sadness, fear → negative
surprise → neutral'''

def map_sentiment(label_id):
    emotion = id2label[label_id]
    if emotion in {"joy", "love"}:
        return "positive"
    elif emotion in {"anger", "sadness", "fear"}:
        return "negative"
    else:
        return "neutral"

dataset['sentiment'] = dataset['label'].map(map_sentiment)

print("Dataset sample:")
print(dataset.head())

Dataset sample:
                                                text  label sentiment
0  i feel awful about it too because it s my job ...      0  negative
1                              im alone i feel awful      0  negative
2  ive probably mentioned this before but i reall...      1  positive
3           i was feeling a little low few days back      0  negative
4  i beleive that i am much more sensitive to oth...      2  positive


##Data Cleaning

We will use two cleaning functions: (A) heavy_clean for Linear Regression and Naive Bayes models, (B) light_clean for Pytorch LSTM


In [6]:
#heavy_clean for LR and NB models
def heavy_clean(text):
    """Aggressive cleaning for classical models (TF-IDF / BoW)."""
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = BeautifulSoup(text, "html.parser").get_text()
    text = contractions.fix(text)
    text = re.sub(r'http\S+|www\.\S+', '<URL>', text)
    text = re.sub(r'@\w+', '<USER>', text)
    text = re.sub(r'#(\w+)', r'\1', text)  # keep hashtag word
    text = re.sub(r'[^a-zA-Z0-9\s.,!?\'"-]', ' ', text)
    text = re.sub(r'([!?.,])\1+', r'\1', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [7]:
#light_clean for LSTM(there we don't remove stopwords,punctuations)
def light_clean(text):
    """Minimal cleaning for deep models (lowercase + tokens preserved)."""
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = BeautifulSoup(text, "html.parser").get_text()
    text = contractions.fix(text)
    text = re.sub(r'http\S+|www\.\S+', '<URL>', text)
    text = re.sub(r'@\w+', '<USER>', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [8]:
tqdm.pandas()
dataset['text_heavy'] = dataset['text'].progress_apply(heavy_clean)
dataset['text_light'] = dataset['text'].progress_apply(light_clean)

dataset.tail()

  0%|          | 0/416809 [00:00<?, ?it/s]

  0%|          | 0/416809 [00:00<?, ?it/s]

Unnamed: 0,text,label,sentiment,text_heavy,text_light
416804,that was what i felt when i was finally accept...,1,positive,that was what i felt when i was finally accept...,that was what i felt when i was finally accept...
416805,i take every day as it comes i m just focussin...,4,negative,i take every day as it comes i m just focussin...,i take every day as it comes i m just focussin...
416806,i just suddenly feel that everything was fake,0,negative,i just suddenly feel that everything was fake,i just suddenly feel that everything was fake
416807,im feeling more eager than ever to claw back w...,1,positive,i am feeling more eager than ever to claw back...,i am feeling more eager than ever to claw back...
416808,i give you plenty of attention even when i fee...,0,negative,i give you plenty of attention even when i fee...,i give you plenty of attention even when i fee...


In [9]:
# splitting dataset into train_df(80%), val_df(10%), and test_df(10%)
# dataset -> train_df(80%) and temp_df(20%)
train_df, temp_df = train_test_split(dataset, test_size=0.2, random_state=42, stratify=dataset['label'])
# temp_df -> val_df(50%), and test_df(50%)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['label'])

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")


Train: 333447, Val: 41681, Test: 41681


In [10]:
#Converting words into vectors.
#TF-IDF for first model(Logistic Regression)

tfidf_vectorizer = TfidfVectorizer(max_features=3000, ngram_range=(1,2))

X_train_tfidf = tfidf_vectorizer.fit_transform(train_df['text_heavy'])

# Transform the validation and test data using the same vectorizer
X_val_tfidf = tfidf_vectorizer.transform(val_df['text_heavy'])
X_test_tfidf = tfidf_vectorizer.transform(test_df['text_heavy'])

In [11]:
# CountVectorizer(Bag-of-Words)
#will be used for second model(Multinomial Naive Bayes)
# Initialize the TF-IDF vectorizer
vectorizer = CountVectorizer(max_features=3000, ngram_range=(1,2))

# Fit and transform the training data
X_train_cv = vectorizer.fit_transform(train_df['text_heavy'])

# Transform the validation and test data using the same vectorizer
X_val_cv = vectorizer.transform(val_df['text_heavy'])
X_test_cv = vectorizer.transform(test_df['text_heavy'])

In [12]:
import nltk
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [13]:
#tokenization using NLTK(for third model-Pytorch LSTM)
from nltk.tokenize import word_tokenize
from tqdm.auto import tqdm
tqdm.pandas()

def tokenize_with_nltk(text):
    return word_tokenize(text)

X_train_tokens = train_df['text_light'].progress_apply(tokenize_with_nltk)
X_val_tokens   = val_df['text_light'].progress_apply(tokenize_with_nltk)
X_test_tokens  = test_df['text_light'].progress_apply(tokenize_with_nltk)


  0%|          | 0/333447 [00:00<?, ?it/s]

  0%|          | 0/41681 [00:00<?, ?it/s]

  0%|          | 0/41681 [00:00<?, ?it/s]

In [14]:
#creating vocabulary
counter = Counter()
for token_list in X_train_tokens:
    counter.update(token_list)

vocab = {word: i+2 for i, (word, _) in enumerate(counter.most_common(10000))}
vocab["<PAD>"] = 0
vocab["<UNK>"] = 1
print(f"Vocabulary Size: {len(vocab)}")

Vocabulary Size: 10002


In [15]:
#converting tokens into integers

def numericalize_tokens(tokens, vocab=vocab):
    return [vocab.get(tok, vocab['<UNK>']) for tok in tokens]

class EmotionDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


In [16]:
#datasets/dataloaders
X_train_list = [torch.tensor(numericalize_tokens(text)) for text in train_df['text']]
X_val_list   = [torch.tensor(numericalize_tokens(text)) for text in val_df['text']]
X_test_list  = [torch.tensor(numericalize_tokens(text)) for text in test_df['text']]

X_train_pad = pad_sequence(X_train_list, batch_first=True, padding_value=0)
X_val_pad   = pad_sequence(X_val_list,   batch_first=True, padding_value=0)
X_test_pad  = pad_sequence(X_test_list,  batch_first=True, padding_value=0)

In [17]:
#labels
y_train = torch.tensor(train_df['label'].values)
y_val   = torch.tensor(val_df['label'].values)
y_test  = torch.tensor(test_df['label'].values)

#Dataloaders
train_data = EmotionDataset(X_train_pad, y_train)
val_data   = EmotionDataset(X_val_pad, y_val)
test_data  = EmotionDataset(X_test_pad, y_test)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_data, batch_size=32)
test_loader  = DataLoader(test_data, batch_size=32)


print("DataLoaders ready.")

DataLoaders ready.


#Training Models
Models:


1.   Logistic Regression (TF-IDF)

        TF-IDF gives high value to important words like “miserable”, “love”, “angry” and low value to useless words like “the”, “and”, “to”.
2.   Multinomial Naive Bayes

      Naive Bayes works on **word counts**. We will use Bag-of-Words (BoW) vectorization

      - Creates a giant vocabulary of all words
      - Converts each text into a vector counting how many times each word appears

3. PyTorch BiLSTM

      Architecture: Embedding → LSTM → Dropout → Linear → Softmax
      


In [18]:
#training LogisticRegression
print("Training Logistic Regression (TF-IDF)...")
log_reg_model = LogisticRegression(max_iter=300, class_weight='balanced', solver='lbfgs', multi_class='multinomial')

log_reg_model.fit(X_train_tfidf, train_df['label']) #training
val_preds = log_reg_model.predict(X_val_tfidf) #validation
print("Training Complete!")
print("LogReg val acc:", accuracy_score(val_df['label'], val_preds))

Training Logistic Regression (TF-IDF)...




Training Complete!
LogReg val acc: 0.8893500635781291


In [19]:
#validation performance
val_preds = log_reg_model.predict(X_val_tfidf)
val_acc = accuracy_score(val_df['label'], val_preds)

print(f"\nValidation Accuracy: {val_acc:.4f}")




Validation Accuracy: 0.8894


In [20]:
#Training Naive Bayes
print("Training Multinomial Naive Bayes...")

nb = MultinomialNB()
nb.fit(X_train_cv, train_df['label']) #training
val_preds_nb = nb.predict(X_val_cv) #validation
print("NB val acc:", accuracy_score(val_df['label'], val_preds_nb))
print(classification_report(val_df['label'], val_preds_nb, target_names=[id2label[i] for i in range(6)]))
print("Training Complete!")

Training Multinomial Naive Bayes...
NB val acc: 0.8621194309157649
              precision    recall  f1-score   support

     sadness       0.90      0.91      0.90     12119
         joy       0.90      0.88      0.89     14107
        love       0.71      0.80      0.75      3455
       anger       0.88      0.85      0.86      5732
        fear       0.83      0.80      0.81      4771
    surprise       0.70      0.77      0.73      1497

    accuracy                           0.86     41681
   macro avg       0.82      0.83      0.83     41681
weighted avg       0.86      0.86      0.86     41681

Training Complete!


In [21]:
#Adding embedding layer to our LSTM
class BiLSTMEmotion(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super().__init__()
        self.hidden_dim = hidden_dim

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=2, batch_first=True, bidirectional=True, dropout=0.3)
        self.fc = nn.Linear(hidden_dim*2, num_classes)

    def forward(self, x):
        x = self.embedding(x)
        output, (h, c) = self.lstm(x)
        last_forward = h[-2]
        last_backward = h[-1]
        final = torch.cat([last_forward, last_backward], dim=1)
        return self.fc(final)


In [22]:
#setting devide to GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Training on: {device}")

Training on: cuda


In [23]:

model_BiLSTM = BiLSTMEmotion(len(vocab), embed_dim=100, hidden_dim=128, num_classes=6)
model_BiLSTM = model_BiLSTM.to(device)

In [24]:
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_BiLSTM.parameters(), lr=0.001)

In [25]:
#Training Pytprch LSTM model
num_epochs = 5

for epoch in range(num_epochs):
    model_BiLSTM.train()
    train_loss = 0
    correct = 0
    total = 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device) # Move data to GPU

        optimizer.zero_grad() # Reset gradients
        outputs = model_BiLSTM(inputs) # Forward pass
        loss = criterion(outputs, labels) # Calculate loss
        loss.backward() # Backpropagation
        optimizer.step() # Update weights

        train_loss += loss.item()

        # Calculate accuracy
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    avg_loss = train_loss / len(train_loader)
    acc = 100 * correct / total
    print(f"Epoch {epoch+1}/{num_epochs} | Loss: {avg_loss:.4f} | Accuracy: {acc:.2f}%")

print("Training Complete!")

Epoch 1/5 | Loss: 0.6776 | Accuracy: 72.53%
Epoch 2/5 | Loss: 0.1167 | Accuracy: 93.33%
Epoch 3/5 | Loss: 0.1046 | Accuracy: 93.60%
Epoch 4/5 | Loss: 0.1007 | Accuracy: 93.73%
Epoch 5/5 | Loss: 0.1004 | Accuracy: 93.71%
Training Complete!


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [26]:
model_BiLSTM.eval()

# Validation
val_loss = 0
correct_val = 0
total_val = 0
with torch.no_grad():
    for inputs, labels in val_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model_BiLSTM(inputs)
        loss = criterion(outputs, labels)
        val_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total_val += labels.size(0)
        correct_val += (predicted == labels).sum().item()

avg_val_loss = val_loss / len(val_loader)
val_accuracy = 100 * correct_val / total_val
print(f"\nValidation Loss: {avg_val_loss:.4f} | Validation Accuracy: {val_accuracy:.2f}%")

# Test
test_loss = 0
correct_test = 0
total_test = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model_BiLSTM(inputs)
        loss = criterion(outputs, labels)
        test_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total_test += labels.size(0)
        correct_test += (predicted == labels).sum().item()

avg_test_loss = test_loss / len(test_loader)
test_accuracy = 100 * correct_test / total_test
print(f"Test Loss: {avg_test_loss:.4f} | Test Accuracy: {test_accuracy:.2f}%")


Validation Loss: 0.0946 | Validation Accuracy: 93.90%
Test Loss: 0.0970 | Test Accuracy: 93.75%


In [27]:
#creating models and utils folders dor saving
import os
os.makedirs("models", exist_ok=True)
os.makedirs("utils", exist_ok=True)


In [28]:
#saving models to models folder
import joblib
joblib.dump(log_reg_model, "models/log_reg_model.pkl")
joblib.dump(nb, "models/naive_bayes_model.pkl")

['models/naive_bayes_model.pkl']

In [29]:
#saving BiLSTM
torch.save(model_BiLSTM.state_dict(), "models/bilstm_model.pth")


In [30]:
#saving vocabulary
import json
with open("models/vocab.json", "w") as f:
    json.dump(vocab, f)


In [31]:
#creating itils folder for other tools(vectorizations, etc)
joblib.dump(tfidf_vectorizer, "utils/tfidf_vectorizer.pkl")
joblib.dump(vectorizer, "utils/count_vectorizer.pkl")


['utils/count_vectorizer.pkl']

In [32]:
%%writefile utils/cleaning.py
import re
import contractions
from bs4 import BeautifulSoup

def heavy_clean(text):
    """Aggressive cleaning for classical ML models."""
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = BeautifulSoup(text, "html.parser").get_text()
    text = contractions.fix(text)
    text = re.sub(r'http\S+|www\.\S+', '<URL>', text)
    text = re.sub(r'@\w+', '<USER>', text)
    text = re.sub(r'#(\w+)', r'\1', text)
    text = re.sub(r'[^a-zA-Z0-9\s.,!?\'"-]', ' ', text)
    text = re.sub(r'([!?.,])\1+', r'\1', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


def light_clean(text):
    """Minimal cleaning for deep learning models."""
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = BeautifulSoup(text, "html.parser").get_text()
    text = contractions.fix(text)
    text = re.sub(r'http\S+|www\.\S+', '<URL>', text)
    text = re.sub(r'@\w+', '<USER>', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text


Writing utils/cleaning.py


In [33]:
%%writefile utils/tokenizer.py
from nltk.tokenize import word_tokenize

def tokenize_with_nltk(text):
    return word_tokenize(text)


Writing utils/tokenizer.py


In [34]:
%%writefile utils/vocab_tools.py
def numericalize_tokens(tokens, vocab):
    """Convert tokens to integer IDs based on the given vocabulary."""
    return [vocab.get(tok, vocab.get("<UNK>", 1)) for tok in tokens]


Writing utils/vocab_tools.py


In [35]:
!ls -l utils


total 212
-rw-r--r-- 1 root root   1012 Dec  1 10:07 cleaning.py
-rw-r--r-- 1 root root  88557 Dec  1 10:06 count_vectorizer.pkl
-rw-r--r-- 1 root root 112908 Dec  1 10:06 tfidf_vectorizer.pkl
-rw-r--r-- 1 root root    102 Dec  1 10:07 tokenizer.py
-rw-r--r-- 1 root root    180 Dec  1 10:07 vocab_tools.py


In [36]:
!ls -l models


total 6976
-rw-r--r-- 1 root root 6534031 Dec  1 10:06 bilstm_model.pth
-rw-r--r-- 1 root root  144951 Dec  1 10:06 log_reg_model.pkl
-rw-r--r-- 1 root root  288887 Dec  1 10:06 naive_bayes_model.pkl
-rw-r--r-- 1 root root  166570 Dec  1 10:06 vocab.json
