In [57]:
#impport modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical

In [58]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [59]:
#reading dataset
data=pd.read_csv('/content/drive/MyDrive/csv/tweet_emotions .csv')
data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [60]:
#no need of tweet id .so dropping the column
data.drop(['tweet_id'],axis=1)

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...
...,...,...
39995,neutral,@JohnLloydTaylor
39996,love,Happy Mothers Day All my love
39997,love,Happy Mother's Day to all the mommies out ther...
39998,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [61]:
#checking is there any null values present
data.isnull().sum()

tweet_id     0
sentiment    0
content      0
dtype: int64

In [62]:
data.describe()

Unnamed: 0,tweet_id
count,40000.0
mean,1845184000.0
std,118857900.0
min,1693956000.0
25%,1751431000.0
50%,1855443000.0
75%,1962781000.0
max,1966441000.0


In [63]:
# Creating an array of the sentiment values, Note tweet_id is not relevent to us
# No preprocessing needs to be done on the sentiment values
sentiments = data["sentiment"].values
contents= data["content"].values # Extracting the content column, Based on this we will be developing the model

In [64]:
contents[1:10] # Example of unclean data

array(['Layin n bed with a headache  ughhhh...waitin on your call...',
       'Funeral ceremony...gloomy friday...',
       'wants to hang out with friends SOON!',
       '@dannycastillo We want to trade with someone who has Houston tickets, but no one will.',
       "Re-pinging @ghostridah14: why didn't you go to prom? BC my bf didn't like my friends",
       "I should be sleep, but im not! thinking about an old friend who I want. but he's married now. damn, &amp; he wants me 2! scandalous!",
       'Hmmm. http://www.djhero.com/ is down',
       '@charviray Charlene my love. I miss you',
       "@kelcouch I'm sorry  at least it's Friday?"], dtype=object)

There is url,hashtags,expressions .so we need to clean the sentance

In [65]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [66]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [67]:
def preprocess_text(text):
    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)

    # Remove usernames and hashtags
    text = re.sub(r"@[^\s]+|\#[^\s]+", "", text)

    # Remove punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Convert to lowercase
    text = text.lower()

    # Tokenize the text
    tokens = nltk.word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Join tokens back into a single string
    preprocessed_text = " ".join(lemmatized_tokens)

    return preprocessed_text

In [68]:
# Preprocess the tweets in the dataset
data['content'] = data['content'].apply(preprocess_text)


In [69]:
clean_tweet = [preprocess_text(content) for content in contents]

In [70]:
clean_tweet[0:10]

['know listenin bad habit earlier started freakin part',
 'layin n bed headache ughhhhwaitin call',
 'funeral ceremonygloomy friday',
 'want hang friend soon',
 'want trade someone houston ticket one',
 'repinging didnt go prom bc bf didnt like friend',
 'sleep im thinking old friend want he married damn amp want 2 scandalous',
 'hmmm',
 'charlene love miss',
 'im sorry least friday']

In [71]:
# Splitting the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(clean_tweet, sentiments, test_size=0.3, random_state=50)

In [72]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer

In [73]:
# Preprocessing the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
vocab_size = len(tokenizer.word_index) + 1

In [74]:
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [75]:
max_sequence_length = 100
X_train = pad_sequences(X_train, maxlen=max_sequence_length)
X_test = pad_sequences(X_test, maxlen=max_sequence_length)

In [76]:
# Encoding emotions into numerical labels
label_encoder = LabelEncoder()
label_encoder.fit(Y_train)
Y_train = label_encoder.transform(Y_train)
Y_test = label_encoder.transform(Y_test)


In [77]:
Y_test

array([ 3, 12, 11, ..., 11,  8, 12])

In [81]:
# Convert the numerical labels into one-hot encodings
Y_train = to_categorical(Y_train)
Y_test = to_categorical(Y_test)


In [82]:
Y_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 1., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

Next step is to develop the model

In [83]:
# Step 3: Model Selection
embedding_dim = 100

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_sequence_length))
model.add(LSTM(128))
model.add(Dense(64, activation='relu'))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))


In [84]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [85]:
 #Model Training
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f9df31fe9e0>

we get 87.5 accuracy rate ,now we need to predict

In [115]:
# Step 4: Model Evaluation
loss, accuracy = model.evaluate(X_test, Y_test)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

Test Loss: 4.571835517883301
Test Accuracy: 0.26466667652130127


In [112]:
# Step 5: Prediction on New Sentences
new_sentences = ["So sleepy again and it's not even that late. I fail once again.", "wants to hang out with friends SOON!",'tomorrow i have exam i didint studied anything','RIP leonardo. You were a great mini fiddler crab']
new_sequences = tokenizer.texts_to_sequences(new_sentences)
new_sequences = pad_sequences(new_sequences, maxlen=max_sequence_length)
predictions = model.predict(new_sequences)



In [113]:
predicted_emotions = label_encoder.inverse_transform(np.argmax(predictions, axis=1))
for sentence, emotion in zip(new_sentences, predicted_emotions):
    print('Sentence:', sentence)
    print('Predicted Emotion:', emotion)

Sentence: So sleepy again and it's not even that late. I fail once again.
Predicted Emotion: worry
Sentence: wants to hang out with friends SOON!
Predicted Emotion: neutral
Sentence: tomorrow i have exam i didint studied anything
Predicted Emotion: worry
Sentence: RIP leonardo. You were a great mini fiddler crab
Predicted Emotion: fun
