In [6]:
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
import nltk
warnings.filterwarnings('ignore')

In [2]:
# Read the labelled comments from the CSV file and preview the first five rows.
df = pd.read_csv('nlp_dataset.csv')
df.head(n=5)

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [3]:
import re
def remove_tags(text):
    """Normalise one raw comment before tokenisation.

    Steps, in order: drop HTML/XML-style tags, drop URLs (matched
    case-insensitively), replace every character that is neither a word
    character nor whitespace with a single space, then lower-case.

    Parameters
    ----------
    text : str
        Raw comment. Any non-string value (for example the NaN pandas uses
        for an empty CSV cell, or None) is treated as an empty comment.

    Returns
    -------
    str
        The cleaned, lower-cased comment. Runs of spaces left behind by the
        substitutions are not collapsed.
    """
    if not isinstance(text, str):
        return ''
    text = re.sub(r'<.*?>', '', text)
    # IGNORECASE: the text is only lower-cased at the end, so without the
    # flag "HTTP://..." or "WWW...." would survive as word fragments.
    text = re.sub(r'https?://\S+|www\.\S+', '', text, flags=re.IGNORECASE)
    text = re.sub(r'[^\w\s]', ' ', text)
    return text.lower()

# Clean every comment in place; the function is passed directly, no lambda needed.
df['Comment'] = df['Comment'].apply(remove_tags)

In [4]:
# Display the frame to inspect the comments after the cleaning step.
df

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear
...,...,...
5932,i begun to feel distressed for you,fear
5933,i left feeling annoyed and angry thinking that...,anger
5934,i were to ever get married i d have everything...,joy
5935,i feel reluctant in applying there because i w...,fear


In [5]:
# (number of rows, number of columns) of the dataset.
df.shape

(5937, 2)

In [8]:
# Make sure the NLTK stop-word corpus is available locally, then load the
# English list into a set so membership tests during filtering are cheap.
nltk.download('stopwords')
from nltk.corpus import stopwords
english_stop_words = stopwords.words('english')
stop_words = set(english_stop_words)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# Keep only the whitespace-separated tokens that are not stop words and
# re-join the survivors with single spaces.
df['Comment'] = df['Comment'].apply(
    lambda comment: ' '.join(token for token in comment.split() if token not in stop_words)
)

In [10]:
# Display the frame to inspect the comments after stop-word removal.
df

Unnamed: 0,Comment,Emotion
0,seriously hate one subject death feel reluctan...,fear
1,im full life feel appalled,anger
2,sit write start dig feelings think afraid acce...,fear
3,ive really angry r feel like idiot trusting fi...,joy
4,feel suspicious one outside like rapture happe...,fear
...,...,...
5932,begun feel distressed,fear
5933,left feeling annoyed angry thinking center stu...,anger
5934,ever get married everything ready offer got to...,joy
5935,feel reluctant applying want able find company...,fear


In [12]:
# WordNetLemmatizer reads the WordNet corpus; fetch it so the notebook also
# runs on a machine where it has not been downloaded yet (no-op if cached).
nltk.download('wordnet', quiet=True)

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()


def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Each token is passed to ``lemmatizer.lemmatize`` with its default
    part-of-speech setting.

    Parameters
    ----------
    text : str
        Comment whose tokens are separated by whitespace.

    Returns
    -------
    str
        The lemmas joined by single spaces, without a trailing space
        (an empty string for empty input).
    """
    return " ".join(lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text))


df['Comment'] = df.Comment.apply(lemmatize_text)
df

Unnamed: 0,Comment,Emotion
0,seriously hate one subject death feel reluctan...,fear
1,im full life feel appalled,anger
2,sit write start dig feeling think afraid accep...,fear
3,ive really angry r feel like idiot trusting fi...,joy
4,feel suspicious one outside like rapture happe...,fear
...,...,...
5932,begun feel distressed,fear
5933,left feeling annoyed angry thinking center stu...,anger
5934,ever get married everything ready offer got to...,joy
5935,feel reluctant applying want able find company...,fear


# **STEPS COMPLETED**

**TEXT LOADING :** Loaded the dataset and displayed its first rows.

**STOP WORD REMOVAL :** Removed the stop words (very frequent words that carry little meaning on their own) from the comments.

**TOKENIZATION :** Split each comment into tokens with a whitespace tokenizer and lemmatized every token.

In [17]:

from sklearn.feature_extraction.text import TfidfVectorizer

# Turn each comment into a TF-IDF weighted vector; max_features caps the
# vocabulary at 5000 terms.
tfidf = TfidfVectorizer(max_features=5000)
# NOTE(review): the vectorizer is fitted on every row, before any train/test
# split, and `x` is not referenced by the later cells visible here — confirm
# whether this cell is still needed.
x = tfidf.fit_transform(df['Comment'])
y = df['Emotion']

**TF - IDF :** Here we convert each comment into a vector of weights. A TF-IDF weight is computed for every word, so that words which are frequent in a comment but rare across the whole dataset (the more informative ones) receive higher weights.

In [20]:
from sklearn.preprocessing import LabelEncoder

# Map each emotion string to an integer class id.
emotion = df['Emotion'].values
encoder = LabelEncoder()
encoded_emotion = encoder.fit_transform(emotion)

# Model inputs: the preprocessed comment strings as a plain array.
Comments = df['Comment'].values

**Label Encoder :** Here we encode the emotion labels as integers (0, 1, 2, …, one per class), since the model cannot work with text labels directly.

In [21]:
from sklearn.model_selection import train_test_split

In [23]:
# Stratified 75/25 split (0.25 is the library default, spelled out here).
# The fixed seed makes the split, and therefore every metric reported
# afterwards, reproducible across kernel restarts.
train_sentances, test_sentances, train_labels, test_labels = train_test_split(
    Comments,
    encoded_emotion,
    test_size=0.25,
    stratify=encoded_emotion,
    random_state=42,
)

In [24]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [25]:
# Hyperparameters of the model
vocab_size = 3000  # choose based on statistics
oov_tok = '<OOV>'  # explicit placeholder for words not seen while fitting
embedding_dim = 100
max_length = 200  # choose based on statistics, for example 150 to 200
padding_type = 'post'
trunc_type = 'post'

# Fit the tokenizer on the training comments only, so the test set does not
# influence the vocabulary.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentances)
word_index = tokenizer.word_index

# Convert the train set to integer sequences and pad/truncate to max_length.
# padding_type / trunc_type are passed explicitly: previously they were
# declared but unused, so truncation silently fell back to the library
# default ('pre') instead of the declared 'post'.
train_sequences = tokenizer.texts_to_sequences(train_sentances)
train_padded = pad_sequences(
    train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type
)

# Same conversion for the test set, using the vocabulary learned above.
test_sequences = tokenizer.texts_to_sequences(test_sentances)
test_padded = pad_sequences(
    test_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type
)

In [42]:
# One output unit per distinct emotion label.
num_classes = len(encoder.classes_)

# Embedding -> bidirectional LSTM -> dense head with a softmax over the classes.
model = keras.Sequential([
    # Declaring the input shape here builds the model immediately and replaces
    # the deprecated `input_length` argument of Embedding.
    keras.Input(shape=(max_length,)),
    keras.layers.Embedding(vocab_size, embedding_dim),
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dense(24, activation='relu'),
    keras.layers.Dense(num_classes, activation='softmax'),
])

# The labels are integer class ids (LabelEncoder output), hence the sparse
# variant of categorical cross-entropy.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.summary()

In [43]:
# Train for a few epochs, holding out 10% of the training rows for validation.
num_epochs = 5
fit_options = dict(epochs=num_epochs, verbose=1, validation_split=0.1)
history = model.fit(train_padded, train_labels, **fit_options)

Epoch 1/5
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 74ms/step - accuracy: 0.5190 - loss: 0.9609 - val_accuracy: 0.8296 - val_loss: 0.4654
Epoch 2/5
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 68ms/step - accuracy: 0.9438 - loss: 0.1726 - val_accuracy: 0.9081 - val_loss: 0.2858
Epoch 3/5
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 72ms/step - accuracy: 0.9830 - loss: 0.0518 - val_accuracy: 0.9058 - val_loss: 0.2979
Epoch 4/5
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 75ms/step - accuracy: 0.9905 - loss: 0.0281 - val_accuracy: 0.9013 - val_loss: 0.3012
Epoch 5/5
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 70ms/step - accuracy: 0.9955 - loss: 0.0158 - val_accuracy: 0.9103 - val_loss: 0.3227


In [44]:
from sklearn.metrics import accuracy_score

In [45]:
# Class probabilities for every test comment (one row per comment).
prediction = model.predict(test_padded)

# The predicted class is the column holding the largest probability.
pred_labels = prediction.argmax(axis=1)

test_accuracy = accuracy_score(test_labels, pred_labels)
print("Accuracy of prediction on test set: ", test_accuracy)

[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 27ms/step
Accuracy of prediction on test set:  0.9313131313131313


In [46]:

from sklearn.preprocessing import LabelEncoder

# Decode with the encoder that produced the training labels instead of
# fitting a second, independent LabelEncoder: the ids in `pred_labels` are
# only meaningful with respect to `encoder`'s class order.
le = encoder          # alias kept so any later reference to `le` still works
y = encoded_emotion   # the same integer labels the refit used to produce

predicted_emotions = encoder.inverse_transform(pred_labels)

In [49]:
# New, unseen comments to classify.
Comment = [
    "I am absolutely delighted with how my project turned out!",
    "I am so frustrated because the app keeps crashing!",
    "I was suddenly terrified when I heard a loud noise in the dark.",
    "I was terrified of the ride, but then I felt a huge rush of joy."
]

# Apply the same preprocessing the training comments went through
# (tag/URL/punctuation cleanup, stop-word removal, lemmatization). Without
# it, stop words the tokenizer never saw are all mapped to the OOV index and
# the model receives inputs unlike anything it was trained on.
cleaned_comments = [
    lemmatize_text(
        ' '.join(word for word in remove_tags(raw_comment).split() if word not in stop_words)
    )
    for raw_comment in Comment
]

sequences = tokenizer.texts_to_sequences(cleaned_comments)
padded = pad_sequences(sequences, padding='post', maxlen=max_length)

# Get probabilities
prediction = model.predict(padded)

# Get the best class index for each sentence
pred_indices = np.argmax(prediction, axis=1)

# Convert indices back to original text labels
predicted_names = encoder.inverse_transform(pred_indices)

# Report against the original (unprocessed) sentence so the output stays readable.
for comment_text, predicted_name in zip(Comment, predicted_names):
    print(f"Comment: {comment_text}")
    print(f"Predicted Emotion: {predicted_name}")
    print("-" * 40)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
Comment: I am absolutely delighted with how my project turned out!
Predicted Emotion: joy
----------------------------------------
Comment: I am so frustrated because the app keeps crashing!
Predicted Emotion: anger
----------------------------------------
Comment: I was suddenly terrified when I heard a loud noise in the dark.
Predicted Emotion: fear
----------------------------------------
Comment: I was terrified of the ride, but then I felt a huge rush of joy.
Predicted Emotion: fear
----------------------------------------
