**Importing necessary libraries**

In [12]:
import pandas as pd
import numpy as np
import re
import tensorflow as tf

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from  tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM
import warnings
warnings.filterwarnings('ignore')

**Load and Data set**

In [14]:
data = pd.read_csv('tweet_emotions.csv', encoding = 'latin-1')
pd.set_option('display.max_colwidth', None)
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habit earlier and i started freakin at his part =[
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin on your call...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,"@dannycastillo We want to trade with someone who has Houston tickets, but no one will."


In [15]:
data.content.value_counts()

Unnamed: 0_level_0,count
content,Unnamed: 1_level_1
I just received a mothers day card from my lovely daughter wishing my a happy mothers day http://tr.im/kWK9,14
FREE UNLIMITED RINGTONES!!! - http://tinyurl.com/freeringring - USA ONLY - Awesome 4 iphone,13
Happy Mother's Day!,10
Happy Mothers Day,10
happy mother's day,8
...,...
Is leaving Utah today Super Sad Face,1
"@KulpreetSingh Good god, no. You can fit very few swear words into 140 characters",1
@mleshock Noooo! Poor SE,1
I wish GM stock would turn around at 80 cents a share I would be willing to buy some...if I knew it would do something,1


In [16]:
data.shape

(40000, 3)

In [17]:
# Check for null values
print('Null values-\n',data.isna().sum(),'\n')


Null values-
 tweet_id     0
sentiment    0
content      0
dtype: int64 



**Preprocessing**

In [18]:
# Data Preprocessing

# Text cleaning function

def clean_text(text):
    text = text.lower()  # Convert text to lowercase
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # Remove @mentions
    text = re.sub(r'#', '', text)  # Remove hashtags
    # text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove non-alphabetic characters
    return text

# Clean the tweet text
data['content'] = data['content'].apply(clean_text)

data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,i know i was listenin to bad habit earlier and i started freakin at his part =[
1,1956967666,sadness,layin n bed with a headache ughhhh...waitin on your call...
2,1956967696,sadness,funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends soon!
4,1956968416,neutral,"we want to trade with someone who has houston tickets, but no one will."


In [19]:
# Feature Extraction - TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(data['content'])
y = data['sentiment']


**Encoding**

In [20]:
# Convert labels to numerical values
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
data['label_encoded'] = label_encoder.fit_transform(data['sentiment'])

In [21]:
data.head(10)

Unnamed: 0,tweet_id,sentiment,content,label_encoded
0,1956967341,empty,i know i was listenin to bad habit earlier and i started freakin at his part =[,2
1,1956967666,sadness,layin n bed with a headache ughhhh...waitin on your call...,10
2,1956967696,sadness,funeral ceremony...gloomy friday...,10
3,1956967789,enthusiasm,wants to hang out with friends soon!,3
4,1956968416,neutral,"we want to trade with someone who has houston tickets, but no one will.",8
5,1956968477,worry,re-pinging : why didn't you go to prom? bc my bf didn't like my friends,12
6,1956968487,sadness,"i should be sleep, but im not! thinking about an old friend who i want. but he's married now. damn, &amp; he wants me 2! scandalous!",10
7,1956968636,worry,hmmm. is down,12
8,1956969035,sadness,charlene my love. i miss you,10
9,1956969172,sadness,i'm sorry at least it's friday?,10


In [22]:
data['label_encoded'].value_counts()

Unnamed: 0_level_0,count
label_encoded,Unnamed: 1_level_1
8,8638
12,8459
5,5209
10,5165
7,3842
11,2187
4,1776
9,1526
6,1323
2,827


**SPLITTING THE DATASET**

In [23]:
X = X_tfidf
y = data['label_encoded']

In [24]:
# Train-Test Split
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

**Model building and Evaluation**

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

**LogisticRegression**

In [26]:
# LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Evaluate the model
accuracy_lr = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", accuracy_lr)
print("Classification Report:\n", classification_report(y_test, y_pred_lr))

Logistic Regression Accuracy: 0.34925
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        19
           1       0.00      0.00      0.00        31
           2       0.33      0.01      0.01       162
           3       0.00      0.00      0.00       163
           4       0.05      0.01      0.01       338
           5       0.35      0.36      0.36      1028
           6       0.49      0.14      0.22       268
           7       0.52      0.38      0.44       762
           8       0.34      0.57      0.43      1740
           9       0.31      0.02      0.04       352
          10       0.33      0.25      0.28      1046
          11       0.37      0.05      0.09       425
          12       0.33      0.49      0.39      1666

    accuracy                           0.35      8000
   macro avg       0.26      0.17      0.17      8000
weighted avg       0.34      0.35      0.31      8000



**Naive Bayes**

In [27]:
# Naive Bayes
from sklearn.naive_bayes import MultinomialNB
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb))


Naive Bayes Accuracy: 0.315375
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        19
           1       0.00      0.00      0.00        31
           2       0.00      0.00      0.00       162
           3       0.00      0.00      0.00       163
           4       0.00      0.00      0.00       338
           5       0.37      0.26      0.31      1028
           6       0.50      0.00      0.01       268
           7       0.52      0.25      0.33       762
           8       0.30      0.55      0.39      1740
           9       0.00      0.00      0.00       352
          10       0.38      0.09      0.15      1046
          11       0.00      0.00      0.00       425
          12       0.29      0.61      0.40      1666

    accuracy                           0.32      8000
   macro avg       0.18      0.14      0.12      8000
weighted avg       0.29      0.32      0.26      8000



**Train a Deep leaning model**

In [28]:
#TOKENIZE AND PAD THE DATA
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(data['content'])
X_seq = tokenizer.texts_to_sequences(data['content'])
X_pad = pad_sequences(X_seq, maxlen=100)

X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(X_pad, y, test_size=0.2, random_state=42)

In [36]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip


--2025-03-09 07:31:33--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-03-09 07:31:33--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-03-09 07:31:34--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [37]:
embedding_index = {}
glove_path = "glove.6B.100d.txt"  # Ensure this file exists in your working directory

# Load the GloVe embeddings
with open(glove_path, encoding='utf8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        embedding_index[word] = np.array(values[1:], dtype='float32')

# Create the embedding matrix
embedding_matrix = np.zeros((5000, 100))
for word, i in tokenizer.word_index.items():
    if i < 5000:
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector


**LSTM Model**

In [38]:
# Building LSTM Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
model = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=100),
    LSTM(64, return_sequences=True),
    Dropout(0.2),
    LSTM(32),
    Dense(16, activation='relu'),
    Dropout(0.2),
    Dense(len(set(y)), activation='softmax')
])

In [39]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [41]:
from tensorflow.keras.callbacks import EarlyStopping

# Early Stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train the Model with Early Stopping
model.fit(X_train_dl, y_train_dl,
          epochs=15, batch_size=64,
          validation_data=(X_test_dl, y_test_dl),
          callbacks=[early_stopping])


Epoch 1/15
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - accuracy: 0.4735 - loss: 1.5662 - val_accuracy: 0.3289 - val_loss: 2.0597
Epoch 2/15
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 12ms/step - accuracy: 0.4948 - loss: 1.5103 - val_accuracy: 0.3284 - val_loss: 2.1664
Epoch 3/15
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 15ms/step - accuracy: 0.5143 - loss: 1.4431 - val_accuracy: 0.3224 - val_loss: 2.2334
Epoch 4/15
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - accuracy: 0.5224 - loss: 1.4017 - val_accuracy: 0.3122 - val_loss: 2.3493


<keras.src.callbacks.history.History at 0x788ba7288a10>