# Reading from txt file and converting to DataFrame 

In [1]:
dataset_train = {"text" : [] , "emotions" : []}
dataset_test = {"text" : [] , "emotions" : []}

In [2]:
baseFolder = "data"
#-----------------------------train-----------------
with open(f"{baseFolder}/train.txt" , "r") as file:
    for i in file.readlines():
        sentence = i.split(";")
        dataset_train["text"].append(sentence[0])
        target = sentence[1].split("\n")
        dataset_train["emotions"].append(target[0])
#-----------------------test-------------------------------
with open(f"{baseFolder}/test.txt" , "r") as file:
    for i in file.readlines():
        sentence = i.split(";")
        dataset_test["text"].append(sentence[0])
        target = sentence[1].split("\n")
        dataset_test["emotions"].append(target[0])
#-----------------------valid------------------------------
with open(f"{baseFolder}/val.txt" , "r") as file:
    for i in file.readlines():
        sentence = i.split(";")
        dataset_train["text"].append(sentence[0])
        target = sentence[1].split("\n")
        dataset_train["emotions"].append(target[0])

In [3]:
import pandas as pd 
dataset_train = pd.DataFrame.from_dict(dataset_train)
dataset_test = pd.DataFrame.from_dict(dataset_test)

In [4]:
dataset_train

Unnamed: 0,text,emotions
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
17995,im having ssa examination tomorrow in the morn...,sadness
17996,i constantly worry about their fight against n...,joy
17997,i feel its important to share this info for th...,joy
17998,i truly feel that if you are passionate enough...,joy


# Converting for data pre-processing

In [5]:
import spacy
import nltk
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
 
lemmatizer = WordNetLemmatizer()
nlp = spacy.load("en_core_web_sm")
# ps =PorterStemmer()
def preprocess_sentence(sen):
    doc = nlp(sen)
    new_sen = [token.lemma_ for token in doc ]
    return " ".join(new_sen)

In [6]:
dataset_train["text"] = dataset_train["text"].apply(preprocess_sentence)
dataset_test["text"] = dataset_test["text"].apply(preprocess_sentence)

In [7]:
dataset_train

Unnamed: 0,text,emotions
0,I do not feel humiliate,sadness
1,I can go from feel so hopeless to so damned ho...,sadness
2,I m grab a minute to post I feel greedy wrong,anger
3,I be ever feel nostalgic about the fireplace I...,love
4,I be feel grouchy,anger
...,...,...
17995,I m have ssa examination tomorrow in the morni...,sadness
17996,I constantly worry about their fight against n...,joy
17997,I feel its important to share this info for th...,joy
17998,I truly feel that if you be passionate enough ...,joy


In [8]:
target_dict = {value: index for index , value in enumerate(dataset_train["emotions"].unique().tolist())}

In [9]:
target_dict = {value: index for index , value in enumerate(dataset_test["emotions"].unique().tolist())}

In [10]:
target_dict

{'sadness': 0, 'joy': 1, 'fear': 2, 'anger': 3, 'love': 4, 'surprise': 5}

In [11]:
dataset_train["emotions"].replace(target_dict, inplace=True)
dataset_test["emotions"].replace(target_dict, inplace=True)

In [12]:
dataset_train["emotions"].value_counts()

emotions
1    6066
0    5216
3    2434
2    2149
4    1482
5     653
Name: count, dtype: int64

In [13]:
dataset_train.isnull().sum()

text        0
emotions    0
dtype: int64

In [14]:
dataset_train.isna().sum()

text        0
emotions    0
dtype: int64

In [15]:
dataset_train.duplicated().sum()

1

In [16]:
dataset_train.drop_duplicates(inplace = True)

In [17]:
dataset_train.value_counts().sum()

17999

# Data Preprocessing or feature extraction 

In [18]:
X = dataset_train["text"]
y = dataset_train["emotions"].astype(int)
X_test = dataset_test["text"]
y_test = dataset_test["emotions"].astype(int)

In [19]:
# from sklearn.model_selection import train_test_split
# import pickle
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.en import stop_words
my_stop_words = list(stop_words.STOP_WORDS)
cv = TfidfVectorizer(stop_words=my_stop_words)
X = cv.fit_transform(X)
X_test = cv.transform(X_test)



## Machine Learning Model Naiev Bais , Xgboost , Random Forest , Voting classifier, KNN

In [26]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
nb_classifier = MultinomialNB()
nb_classifier.fit(X, y)

y_pred = nb_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

report = classification_report(y_test, y_pred,zero_division=1)
print("Classification Report:\n", report)

Accuracy: 0.7125
Classification Report:
               precision    recall  f1-score   support

           0       0.70      0.91      0.79       581
           1       0.67      0.99      0.80       695
           2       0.91      0.38      0.54       224
           3       0.96      0.41      0.57       275
           4       0.93      0.09      0.16       159
           5       1.00      0.00      0.00        66

    accuracy                           0.71      2000
   macro avg       0.86      0.46      0.48      2000
weighted avg       0.78      0.71      0.66      2000



In [28]:
from sklearn.ensemble import GradientBoostingClassifier
clf = GradientBoostingClassifier(n_estimators=300, learning_rate=1.0,max_depth=5, random_state=0)
clf.fit(X, y)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

report = classification_report(y_test, y_pred,zero_division=1)
print("Classification Report:\n", report)

Accuracy: 0.837
Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.85      0.88       581
           1       0.82      0.90      0.86       695
           2       0.90      0.76      0.82       224
           3       0.85      0.87      0.86       275
           4       0.70      0.63      0.66       159
           5       0.53      0.71      0.61        66

    accuracy                           0.84      2000
   macro avg       0.79      0.79      0.78      2000
weighted avg       0.84      0.84      0.84      2000



In [24]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100)
rf_classifier.fit(X, y)
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
report = classification_report(y_test, y_pred,zero_division=1)
print("Classification Report:\n", report)

Accuracy: 0.862
Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.90      0.90       581
           1       0.86      0.92      0.89       695
           2       0.83      0.86      0.84       224
           3       0.90      0.84      0.87       275
           4       0.77      0.64      0.70       159
           5       0.65      0.59      0.62        66

    accuracy                           0.86      2000
   macro avg       0.82      0.79      0.80      2000
weighted avg       0.86      0.86      0.86      2000



In [23]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X, y)
y_pred = knn_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
report = classification_report(y_test, y_pred, zero_division=1)
print("Classification Report:")
print(report)

Accuracy: 0.7795
Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.88      0.81       581
           1       0.80      0.84      0.82       695
           2       0.84      0.71      0.77       224
           3       0.79      0.71      0.75       275
           4       0.69      0.50      0.58       159
           5       0.68      0.48      0.57        66

    accuracy                           0.78      2000
   macro avg       0.76      0.69      0.72      2000
weighted avg       0.78      0.78      0.77      2000



In [35]:
from sklearn.ensemble import VotingClassifier
vc = VotingClassifier(estimators = [("xg" ,clf) , ("rf", rf_classifier) ] , voting = "soft")
vc = vc.fit(X , y)
y_pred = vc.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
report = classification_report(y_test, y_pred,zero_division=1)
print("Classification Report:\n", report)

Accuracy: 0.854
Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.88      0.90       581
           1       0.84      0.91      0.87       695
           2       0.90      0.76      0.82       224
           3       0.85      0.87      0.86       275
           4       0.77      0.65      0.71       159
           5       0.54      0.76      0.63        66

    accuracy                           0.85      2000
   macro avg       0.80      0.81      0.80      2000
weighted avg       0.86      0.85      0.85      2000



In [25]:
import pickle
with open("emotions.pkl", "wb") as f:
    pickle.dump((rf_classifier, cv), f)

# Deep Learning Model RNN , GRU , LSTM 

In [38]:
import torch 
from torch import nn 
from torch import optim
from torch.nn.utils.rnn import pad_sequence

In [39]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'

In [22]:
type(y_test)

pandas.core.series.Series

#### <u>Converting to Tensor for training model </u>

In [40]:
X_train_tensor = torch.from_numpy(X.toarray()).to(torch.float32).to(device)
X_train_tensor_pad = pad_sequence(X_train_tensor, batch_first=True, padding_value=0)
X_test_tensor  = torch.from_numpy(X_test.toarray()).to(device)
X_test_tensor_pad = pad_sequence(X_test_tensor, batch_first=True, padding_value=0)
y_train_tensor = torch.Tensor(y.tolist()).to(device)
y_test_tensor = torch.Tensor(y_test.tolist()).long().to(device)

In [41]:
X_train_tensor_pad.shape[1]

12516

In [42]:
X_train_tensor

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

#### <u> Creating rnn , lstm and gru model </u>

In [217]:
class rnnModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_mood=6):
        super(rnnModel, self).__init__()
        self.rnn = nn.RNN(input_size=input_size, hidden_size=hidden_size*2, num_layers=2 , batch_first = True)
        self.model = nn.Sequential(
             nn.Linear(hidden_size * 2, hidden_size),  # multiplied by 4 due to bidirectional LSTM
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(hidden_size // 2, num_mood),
        )
    def forward(self, x):
        output_rnn, _ = self.rnn(x)
        output = self.model(output_rnn)  
        return output


In [43]:
class BiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_mood=6):
        super(BiLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size*2, num_layers=2, batch_first=True, bidirectional=False)
        self.model = nn.Sequential(
            nn.Linear(hidden_size * 2, hidden_size),  # multiplied by 4 due to bidirectional LSTM
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(hidden_size // 2, num_mood),
        )
    def forward(self, x):
        output_rnn, _ = self.lstm(x)
        output = self.model(output_rnn)  
        return output


In [44]:
class BiGRU(nn.Module):
    def __init__(self, input_size, hidden_size, num_mood=6):
        super(BiGRU, self).__init__()
        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_size*2, num_layers=2, batch_first=True, bidirectional=True)
        self.model = nn.Sequential(
            nn.Linear(hidden_size * 4, hidden_size),  # multiplied by 4 due to bidirectional GRU
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Dropout(0.25),
            nn.Linear(hidden_size // 2, num_mood),
        )
        
    def forward(self, x):
        output_rnn, _ = self.gru(x)
        output = self.model(output_rnn)  
        return output


In [45]:
from torch.utils.data import TensorDataset, DataLoader
batch_size = 32
train_dataset = TensorDataset(X_train_tensor_pad, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor_pad, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [46]:
X_train_tensor[0]

tensor([0., 0., 0.,  ..., 0., 0., 0.])

In [47]:
len(X_train_tensor)

17999

In [60]:
epochs = 5
hidden_size = 128 
vocab_size = len(cv.vocabulary_)
model = BiGRU(X_train_tensor.shape[1], hidden_size = hidden_size).to(device)
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr = 0.001)

In [61]:
model

BiGRU(
  (gru): GRU(12516, 256, num_layers=2, batch_first=True, bidirectional=True)
  (model): Sequential(
    (0): Linear(in_features=512, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.25, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.25, inplace=False)
    (6): Linear(in_features=64, out_features=6, bias=True)
  )
)

In [62]:

from torchmetrics import Accuracy
accuracy_metric = Accuracy(task="multiclass" , num_classes=6)

In [63]:
X_test_tensor_pad = X_test_tensor_pad.to(torch.float32)

In [64]:
import torch.nn.functional as F

In [65]:
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        batch_y = batch_y.long()
        loss = loss_func(outputs, batch_y)
        loss.backward()
        # nn.utils.clip_grad_norm_(model.parameters(),0.7)
        optimizer.step()
        total_loss += loss.item()
    avg_loss = total_loss / len(train_loader)
    
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test_tensor_pad)
        probabilities = F.softmax(y_pred, dim=1)
        predicted_classes = torch.argmax(probabilities, dim=1)
        test_loss = loss_func(y_pred, y_test_tensor.long())
        test_acc = torch.sum(predicted_classes == y_test_tensor).item() / len(y_test_tensor)
    
    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {avg_loss:.4f}, Test Loss: {test_loss.item():.4f}, Test Acc: {test_acc:.4f}")


Epoch 1/5, Train Loss: 1.2233, Test Loss: 0.6915, Test Acc: 0.7460
Epoch 2/5, Train Loss: 0.4840, Test Loss: 0.4992, Test Acc: 0.8230
Epoch 3/5, Train Loss: 0.2453, Test Loss: 0.5495, Test Acc: 0.8320
Epoch 4/5, Train Loss: 0.1637, Test Loss: 0.5362, Test Acc: 0.8275
Epoch 5/5, Train Loss: 0.1320, Test Loss: 0.6262, Test Acc: 0.8230


In [None]:
len(cv.vocabulary_)

In [250]:
  print(y_pred.squeeze())

tensor([[ -2.5074,  -8.0761,   6.0209,  -5.4037,  -7.4044,   2.8774],
        [ -4.7107,  -8.0258,   4.8750,  -1.1334,  -5.8353,   4.2952],
        [  3.6265,  -9.6321,  -3.9772, -10.9237,  -7.4823,  12.5051],
        ...,
        [ 14.5118,  -0.8635,  -7.9909, -24.9178,  -8.4342,   2.3578],
        [ 14.3551,  -0.2435,  -7.9499, -25.1739,  -8.3177,   1.7826],
        [ -2.1728,   8.1011, -13.4091, -13.8454,   7.9622,  -7.5289]])
