In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
import nltk
warnings.filterwarnings('ignore')

In [11]:
# Load the labelled comments from the CSV that sits next to the notebook.
dataset_path = 'nlp_dataset.csv'
data = pd.read_csv(dataset_path)

In [12]:
# Preview the first rows of the loaded frame (Comment text and its Emotion label).
data.head()

Unnamed: 0,Comment,Emotion
0,i seriously hate one subject to death but now ...,fear
1,im so full of life i feel appalled,anger
2,i sit here to write i start to dig out my feel...,fear
3,ive been really angry with r and i feel like a...,joy
4,i feel suspicious if there is no one outside l...,fear


In [13]:
# Show the column labels of the loaded frame.
print(data.columns)

Index(['Comment', 'Emotion'], dtype='object')


In [14]:
import re
def remove_tags(text):
    """Normalise one raw comment before tokenisation.

    Steps, in order: lowercase the text, strip HTML-style tags, strip URLs,
    then replace every remaining character that is neither a word character
    nor whitespace with a single space.

    Args:
        text: The raw comment. ``None`` and float ``NaN`` (a missing cell) are
            treated as an empty comment; any other non-string value is
            converted with ``str`` first.

    Returns:
        The cleaned, lower-cased string. Runs of spaces are not collapsed.
    """
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Lowercase first so the URL pattern also catches "HTTP://..." and "WWW....".
    text = str(text).lower()
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    return text
# Clean every comment element-wise and write the result back to the same column.
data['Comment'] = data['Comment'].map(remove_tags)

In [15]:
nltk.download('stopwords')
from nltk.corpus import stopwords

# A set gives constant-time membership checks while filtering tokens.
stop_words = set(stopwords.words('english'))


def _drop_stop_words(sentence):
    """Return ``sentence`` with every token found in ``stop_words`` removed.

    Tokens are split on whitespace and re-joined with single spaces.
    """
    kept_tokens = []
    for token in sentence.split():
        if token not in stop_words:
            kept_tokens.append(token)
    return ' '.join(kept_tokens)


data['Comment'] = data['Comment'].apply(_drop_stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ANAMIKA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# Fetch the WordNet corpus through NLTK's downloader. As the last expression in
# the cell, the call's return value is what the cell displays.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ANAMIKA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [17]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()


def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of ``text``.

    Every lemma is followed by one space, so a non-empty result ends with a
    trailing space; text with no tokens yields an empty string.
    """
    lemmas = (lemmatizer.lemmatize(token) for token in w_tokenizer.tokenize(text))
    return "".join(lemma + " " for lemma in lemmas)


data['Comment'] = data.Comment.apply(lemmatize_text)
data

Unnamed: 0,Comment,Emotion
0,seriously hate one subject death feel reluctan...,fear
1,im full life feel appalled,anger
2,sit write start dig feeling think afraid accep...,fear
3,ive really angry r feel like idiot trusting fi...,joy
4,feel suspicious one outside like rapture happe...,fear
...,...,...
5932,begun feel distressed,fear
5933,left feeling annoyed angry thinking center stu...,anger
5934,ever get married everything ready offer got to...,joy
5935,feel reluctant applying want able find company...,fear


**Text Cleaning :**
All text was converted to lowercase, and unnecessary elements such as HTML tags, URLs, special characters, and extra spaces were removed.

**Tokenization :**
The cleaned text was split into individual words (tokens) using whitespace tokenization.

**Stopword Removal :**
Commonly occurring words such as *is*, *the*, *and*, and *of* were removed using NLTK’s English stopword list.

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF representation of the cleaned comments, limited to 5000 features.
tfidf = TfidfVectorizer(max_features=5000)
comment_texts = data['Comment']
X = tfidf.fit_transform(comment_texts)
# Target column: the emotion name for each comment.
y = data['Emotion']

TF-IDF transforms text data into numerical feature vectors by assigning weights to words based on how frequently they appear in a document and how rare they are across the entire dataset.

In [21]:
from sklearn.preprocessing import LabelEncoder

In [23]:
# Pull the text and label columns out of the frame as arrays.
text_column, label_column = data['Comment'], data['Emotion']
Comment = text_column.values
labels = label_column.values

# Fit the encoder on the emotion names, then map each name to its integer id.
encoder = LabelEncoder().fit(labels)
encoded_labels = encoder.transform(labels)

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
# `Comment` is the array of cleaned comment texts, aligned row-for-row with
# `encoded_labels`. Stratify on the labels so every emotion keeps its share in
# both partitions, and pin the seed so the partition is identical on every run.
train_sentences, test_sentences, train_labels, test_labels = train_test_split(
    Comment,
    encoded_labels,
    stratify=encoded_labels,
    random_state=42,
)

In [26]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [27]:
# Hyperparameters of the model
vocab_size = 3000 # choose based on statistics
oov_tok = ''
embedding_dim = 100
max_length = 200 # choose based on statistics, for example 150 to 200
padding_type='post'
trunc_type='post'
# tokenize sentences
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_sentences)
word_index = tokenizer.word_index
# convert train dataset to sequence and pad sequences
train_sequences = tokenizer.texts_to_sequences(train_sentences)
train_padded = pad_sequences(train_sequences, padding='post', maxlen=max_length)
# convert Test dataset to sequence and pad sequences
test_sequences = tokenizer.texts_to_sequences(test_sentences)
test_padded = pad_sequences(test_sequences, padding='post', maxlen=max_length)

In [28]:
# model initialization
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dense(24, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid')
])
# compile model
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.build(input_shape=(None, max_length))
# model summary
model.summary()

In [29]:
# Train for a fixed number of passes over the training data; a tenth of the
# training data is held out for validation.
num_epochs = 5
fit_options = {
    'epochs': num_epochs,
    'verbose': 1,
    'validation_split': 0.1,
}
history = model.fit(train_padded, train_labels, **fit_options)

Epoch 1/5
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 88ms/step - accuracy: 0.3220 - loss: 0.0593 - val_accuracy: 0.3610 - val_loss: -0.0153
Epoch 2/5
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 79ms/step - accuracy: 0.3467 - loss: -9.1843 - val_accuracy: 0.4731 - val_loss: -24.2282
Epoch 3/5
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 75ms/step - accuracy: 0.4720 - loss: -69.1613 - val_accuracy: 0.4462 - val_loss: -103.5424
Epoch 4/5
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 79ms/step - accuracy: 0.5457 - loss: -184.8298 - val_accuracy: 0.5471 - val_loss: -225.4636
Epoch 5/5
[1m126/126[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 80ms/step - accuracy: 0.5721 - loss: -342.1447 - val_accuracy: 0.5942 - val_loss: -374.4141


In [30]:
from sklearn.metrics import accuracy_score

In [35]:
prediction = model.predict(test_padded)
# Turn model outputs into integer class ids.
# - One output column: treat it as a probability and threshold at 0.5
#   (1 if p >= 0.5 else 0).
# - Several output columns: treat them as per-class scores and take the argmax.
# Thresholding alone can never produce a class id above 1.
if prediction.shape[-1] == 1:
    pred_labels = (prediction.ravel() >= 0.5).astype(int).tolist()
else:
    pred_labels = np.argmax(prediction, axis=-1).tolist()
print("Accuracy of prediction on test set : ", accuracy_score(test_labels, pred_labels))

[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 30ms/step
Accuracy of prediction on test set :  0.5797979797979798


In [41]:
from sklearn.preprocessing import LabelEncoder

# Fit a label encoder on the emotion names and keep the encoded column in `y`.
emotion_column = data['Emotion']
le = LabelEncoder().fit(emotion_column)
y = le.transform(emotion_column)

# Map the predicted integer ids back to emotion names.
predicted_emotions = le.inverse_transform(pred_labels)


In [42]:

# reviews on which we need to predict
# NOTE: this rebinds `Comment`, which earlier held the array of training texts.
Comment = ["i feel reluctant in applying there because i want to be able to find a company where i know at least one person"]

# Apply the same cleaning as the training text (tag/URL/punctuation removal,
# stopword filtering, lemmatization) so the tokens match the fitted vocabulary.
cleaned_comments = [
    lemmatize_text(
        ' '.join(word for word in remove_tags(text).split() if word not in stop_words)
    )
    for text in Comment
]
# convert to a sequence
sequences = tokenizer.texts_to_sequences(cleaned_comments)
# pad the sequence
padded = pad_sequences(sequences, padding='post', maxlen=max_length)
prediction = model.predict(padded)
# One output column: threshold the probability at 0.5 (1 if p >= 0.5 else 0).
# Several output columns: take the argmax over the per-class scores.
if prediction.shape[-1] == 1:
    pred_labels = (prediction.ravel() >= 0.5).astype(int).tolist()
else:
    pred_labels = np.argmax(prediction, axis=-1).tolist()
# Decode the ids predicted for THESE comments. `predicted_emotions` holds the
# decoded test-set predictions and must not be indexed here.
comment_emotions = le.inverse_transform(pred_labels)
for text, emotion in zip(Comment, comment_emotions):
    print("Comment:", text)
    print("Predicted Emotion:", emotion)
    print("-" * 40)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step
Comment: i feel reluctant in applying there because i want to be able to find a company where i know at least one person
Predicted Emotion: fear
----------------------------------------


This emotion classification system uses a bidirectional LSTM model to classify emotions in text. The text is preprocessed, converted into numerical sequences using tokenization and padding, and then passed through an embedding layer to the bidirectional LSTM, which predicts the most likely emotion based on learned patterns in the data.