In [98]:
import pandas as pd
import re
import nltk
import numpy as np
# Run the below once 
# nltk.download('punkt_tab')
# nltk.download('wordnet')

In [99]:
dataset = pd.read_csv('data/train.txt',sep=";",header=None, names=["text","emotion"])
dataset_val = pd.read_csv('data/val.txt',sep=";",header=None, names=["text","emotion"])
dataset_test= pd.read_csv('data/test.txt',sep=";",header=None, names=["text","emotion"])
dataset.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [100]:
dataset_val.head(), dataset_test.head()

(                                                text  emotion
 0  im feeling quite sad and sorry for myself but ...  sadness
 1  i feel like i am still looking at a blank canv...  sadness
 2                     i feel like a faithful servant     love
 3                  i am just feeling cranky and blue    anger
 4  i can have for a treat or if i am feeling festive      joy,
                                                 text  emotion
 0  im feeling rather rotten so im not very ambiti...  sadness
 1          im updating my blog because i feel shitty  sadness
 2  i never make her separate from me because i do...  sadness
 3  i left with my bouquet of red and yellow tulip...      joy
 4    i was feeling a little vain when i did this one  sadness)

In [101]:
dataset.describe()

Unnamed: 0,text,emotion
count,16000,16000
unique,15969,6
top,i feel on the verge of tears from weariness i ...,joy
freq,2,5362


In [102]:
print(dataset["text"][0])

i didnt feel humiliated


In [103]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-zA-Z0-9\s]","",text) # remove punctuations
    return text


dataset['text'] = dataset['text'].apply(clean_text)
dataset['text'].head()
dataset_val['text'] = dataset_val['text'].apply(clean_text)

In [104]:
from nltk.tokenize import word_tokenize
from collections import Counter

dataset['tokens'] = dataset['text'].apply(word_tokenize)
print(dataset['tokens'].head())
dataset_val['tokens'] = dataset_val['text'].apply(word_tokenize)
print(dataset_val['tokens'].head())


0                         [i, didnt, feel, humiliated]
1    [i, can, go, from, feeling, so, hopeless, to, ...
2    [im, grabbing, a, minute, to, post, i, feel, g...
3    [i, am, ever, feeling, nostalgic, about, the, ...
4                            [i, am, feeling, grouchy]
Name: tokens, dtype: object
0    [im, feeling, quite, sad, and, sorry, for, mys...
1    [i, feel, like, i, am, still, looking, at, a, ...
2                [i, feel, like, a, faithful, servant]
3            [i, am, just, feeling, cranky, and, blue]
4    [i, can, have, for, a, treat, or, if, i, am, f...
Name: tokens, dtype: object


In [105]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
dataset['label'] = label_encoder.fit_transform(dataset['emotion'])
dataset_val['label'] = label_encoder.transform(dataset_val['emotion'])
dataset.head()

Unnamed: 0,text,emotion,tokens,label
0,i didnt feel humiliated,sadness,"[i, didnt, feel, humiliated]",4
1,i can go from feeling so hopeless to so damned...,sadness,"[i, can, go, from, feeling, so, hopeless, to, ...",4
2,im grabbing a minute to post i feel greedy wrong,anger,"[im, grabbing, a, minute, to, post, i, feel, g...",0
3,i am ever feeling nostalgic about the fireplac...,love,"[i, am, ever, feeling, nostalgic, about, the, ...",3
4,i am feeling grouchy,anger,"[i, am, feeling, grouchy]",0


In [106]:
dataset_val.head()

Unnamed: 0,text,emotion,tokens,label
0,im feeling quite sad and sorry for myself but ...,sadness,"[im, feeling, quite, sad, and, sorry, for, mys...",4
1,i feel like i am still looking at a blank canv...,sadness,"[i, feel, like, i, am, still, looking, at, a, ...",4
2,i feel like a faithful servant,love,"[i, feel, like, a, faithful, servant]",3
3,i am just feeling cranky and blue,anger,"[i, am, just, feeling, cranky, and, blue]",0
4,i can have for a treat or if i am feeling festive,joy,"[i, can, have, for, a, treat, or, if, i, am, f...",2


In [107]:
# Build vocab 

all_tokens = [token for tokens in dataset['tokens'] for token in tokens]
all_tokens[:10]
vocab = Counter(all_tokens)
print(vocab)



In [108]:
# Assign index to each word
word2idx = {word:idx +2 for idx, (word,_) in enumerate(vocab.most_common())}
word2idx['<PAD>'] = 0
word2idx['<UNK>'] = 1
word2idx

{'i': 2,
 'feel': 3,
 'and': 4,
 'to': 5,
 'the': 6,
 'a': 7,
 'feeling': 8,
 'that': 9,
 'of': 10,
 'my': 11,
 'in': 12,
 'it': 13,
 'like': 14,
 'so': 15,
 'for': 16,
 'im': 17,
 'me': 18,
 'but': 19,
 'was': 20,
 'have': 21,
 'is': 22,
 'this': 23,
 'am': 24,
 'with': 25,
 'not': 26,
 'about': 27,
 'be': 28,
 'as': 29,
 'on': 30,
 'you': 31,
 'just': 32,
 'at': 33,
 'when': 34,
 'or': 35,
 'all': 36,
 'because': 37,
 'more': 38,
 'do': 39,
 'can': 40,
 'really': 41,
 'up': 42,
 't': 43,
 'are': 44,
 'by': 45,
 'very': 46,
 'know': 47,
 'been': 48,
 'if': 49,
 'out': 50,
 'myself': 51,
 'time': 52,
 'how': 53,
 'what': 54,
 'get': 55,
 'little': 56,
 'had': 57,
 'now': 58,
 'will': 59,
 'from': 60,
 'being': 61,
 'they': 62,
 'people': 63,
 'them': 64,
 'would': 65,
 'he': 66,
 'want': 67,
 'her': 68,
 'some': 69,
 'think': 70,
 'one': 71,
 'still': 72,
 'ive': 73,
 'him': 74,
 'even': 75,
 'who': 76,
 'an': 77,
 'life': 78,
 'its': 79,
 'make': 80,
 'there': 81,
 'we': 82,
 'bit': 8

In [109]:
def encode_tokens(tokens):
    return [word2idx.get(token, word2idx['<UNK>']) for token in tokens]

In [110]:
dataset['input_ids'] = dataset['tokens'].apply(encode_tokens)
dataset_val['input_ids'] = dataset_val['tokens'].apply(encode_tokens)
print(dataset['input_ids'].head())
print(dataset_val['input_ids'].head())

0                                     [2, 140, 3, 679]
1    [2, 40, 101, 60, 8, 15, 493, 5, 15, 3496, 552,...
2          [17, 3060, 7, 1149, 5, 286, 2, 3, 494, 437]
3    [2, 24, 165, 8, 665, 27, 6, 4158, 2, 59, 47, 9...
4                                     [2, 24, 8, 1065]
Name: input_ids, dtype: object
0    [17, 8, 157, 260, 4, 343, 16, 51, 19, 212, 112...
1    [2, 3, 14, 2, 24, 72, 253, 33, 7, 618, 2743, 6...
2                                [2, 3, 14, 7, 614, 1]
3                         [2, 24, 32, 8, 558, 4, 1313]
4      [2, 40, 21, 16, 7, 1274, 35, 49, 2, 24, 8, 643]
Name: input_ids, dtype: object


In [111]:
MAX_LEN = 50
def pad_sequence(seq):
    if len(seq)< MAX_LEN:
        seq += [word2idx['<PAD>']]*(MAX_LEN-len(seq))
    else:
        seq=seq[:50]

    return seq

In [112]:
dataset['input_ids'] = dataset['input_ids'].apply(pad_sequence)
dataset_val['input_ids'] = dataset_val['input_ids'].apply(pad_sequence)
print(dataset['input_ids'].head())
print(dataset_val['input_ids'].head())

0    [2, 140, 3, 679, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
1    [2, 40, 101, 60, 8, 15, 493, 5, 15, 3496, 552,...
2    [17, 3060, 7, 1149, 5, 286, 2, 3, 494, 437, 0,...
3    [2, 24, 165, 8, 665, 27, 6, 4158, 2, 59, 47, 9...
4    [2, 24, 8, 1065, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
Name: input_ids, dtype: object
0    [17, 8, 157, 260, 4, 343, 16, 51, 19, 212, 112...
1    [2, 3, 14, 2, 24, 72, 253, 33, 7, 618, 2743, 6...
2    [2, 3, 14, 7, 614, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...
3    [2, 24, 32, 8, 558, 4, 1313, 0, 0, 0, 0, 0, 0,...
4    [2, 40, 21, 16, 7, 1274, 35, 49, 2, 24, 8, 643...
Name: input_ids, dtype: object


In [113]:
import torch
from torch.utils.data import Dataset, DataLoader

In [114]:
class EmotionDataset(Dataset):
    def __init__(self, input_ids, labels):
        self.input_ids = torch.tensor(input_ids, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.long)
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.labels[idx]

In [115]:
X_train = dataset['input_ids'].tolist()
X_val = dataset_val['input_ids'].tolist()
y_train = dataset['label'].tolist()
y_val = dataset_val['label'].tolist()

In [116]:
train_dataset = EmotionDataset(X_train,y_train)
val_dataset = EmotionDataset(X_val, y_val)

In [117]:
BATCH_SIZE = 32
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset,batch_size=BATCH_SIZE)


In [118]:
import torch.nn as nn

class EmtionalClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super(EmtionalClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=word2idx['<PAD>'])
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(0.3)

    def forward(self,x):
        x = self.embedding(x)
        _, (hidden,_) = self.lstm(x)
        out = self.fc(self.dropout(hidden[-1]))
        return out
    
model_1 = EmtionalClassifier(vocab_size=len(word2idx), embed_dim=100,hidden_dim=128,output_dim=len(label_encoder.classes_))

In [119]:
import torch.optim as optim
import tqdm


criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_1.parameters(),lr=0.002)


In [120]:
from sklearn.utils.class_weight import compute_class_weight

weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = torch.tensor(weights, dtype=torch.float)

criterion = nn.CrossEntropyLoss(weight=class_weights)


In [123]:
EPOCHS = 7

for epoch in tqdm.tqdm(range(EPOCHS)):
    model_1.train()
    total_loss = 0

    for batch in train_loader:
        inputs , labels = batch
        optimizer.zero_grad()
        outputs = model_1(inputs)
        loss = criterion(outputs,labels)
        loss.backward()
        optimizer.step()
        total_loss+=loss.item()

    print(f"Epoch: {epoch+1} | Loss: {total_loss/len(train_loader):.4f}")

 14%|█▍        | 1/7 [00:24<02:27, 24.64s/it]

Epoch: 1 | Loss: 0.3642


 29%|██▊       | 2/7 [00:52<02:11, 26.26s/it]

Epoch: 2 | Loss: 0.3087


 43%|████▎     | 3/7 [01:16<01:40, 25.25s/it]

Epoch: 3 | Loss: 0.3075


 57%|█████▋    | 4/7 [01:51<01:27, 29.31s/it]

Epoch: 4 | Loss: 0.2918


 71%|███████▏  | 5/7 [02:10<00:51, 25.62s/it]

Epoch: 5 | Loss: 0.2443


 86%|████████▌ | 6/7 [02:26<00:22, 22.45s/it]

Epoch: 6 | Loss: 0.2283


100%|██████████| 7/7 [02:44<00:00, 23.56s/it]

Epoch: 7 | Loss: 0.2448





In [124]:
from sklearn.metrics import classification_report

model_1.eval()
all_preds = []
all_labels = []
with torch.no_grad():
    for batch in val_loader:
        inputs, labels = batch
        outputs = model_1(inputs)
        preds = torch.argmax(outputs,dim=1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())
print(classification_report(all_labels,all_preds,target_names=label_encoder.classes_))


              precision    recall  f1-score   support

       anger       0.16      0.24      0.19       275
        fear       0.17      0.14      0.15       212
         joy       0.38      0.33      0.35       704
        love       0.12      0.06      0.08       178
     sadness       0.30      0.35      0.33       550
    surprise       0.14      0.10      0.12        81

    accuracy                           0.27      2000
   macro avg       0.21      0.20      0.20      2000
weighted avg       0.27      0.27      0.27      2000



In [125]:
import torch
from nltk.tokenize import word_tokenize

MAX_LEN = 50  # or whatever you used during training

def predict_emotion(text, model, word2idx, label_encoder, device='cpu'):
    model.eval()

    # Clean and tokenize
    text = text.lower()
    tokens = word_tokenize(text)
    input_ids = [word2idx.get(token, word2idx['<UNK>']) for token in tokens]

    # Pad or truncate
    if len(input_ids) < MAX_LEN:
        input_ids += [word2idx['<PAD>']] * (MAX_LEN - len(input_ids))
    else:
        input_ids = input_ids[:MAX_LEN]

    # Convert to tensor
    input_tensor = torch.tensor([input_ids], dtype=torch.long).to(device)

    # Predict
    with torch.no_grad():
        output = model(input_tensor)
        predicted_idx = torch.argmax(output, dim=1).item()

    # Map to label
    predicted_emotion = label_encoder.inverse_transform([predicted_idx])[0]
    return predicted_emotion


In [126]:
text = "i now feel compromised and skeptical of the value of every unit of work i put in"
emotion = predict_emotion(text,model_1,word2idx,label_encoder)

In [127]:
print(emotion)

fear


In [128]:
import pickle

In [129]:
with open('label_encoder.pkl','wb') as file:
    pickle.dump(label_encoder,file)

with open('word2idx.pkl','wb') as file:
    pickle.dump(word2idx,file)

In [130]:
torch.save(model_1.state_dict(), "model_1.pth")