## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.layers import Embedding,LSTM,Dense,Flatten,Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
import warnings
warnings.filterwarnings('ignore')

## Data Loading and Understanding

In [2]:
# Load the tweet CSV into a pandas DataFrame.
# The path is relative, so Twitter_Data.csv must be in the notebook's working directory.

data=pd.read_csv("Twitter_Data.csv")

In [3]:
# Preview the first 5 rows to confirm the CSV parsed into the expected columns

data.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [4]:
# Preview the last 5 rows to confirm the file was read through to the end

data.tail()

Unnamed: 0,textID,text,selected_text,sentiment
27476,4eac33d1c0,wish we could come see u on Denver husband l...,d lost,negative
27477,4f4c4fc327,I`ve wondered about rake to. The client has ...,", don`t force",negative
27478,f67aae2310,Yay good for both of you. Enjoy the break - y...,Yay good for both of you.,positive
27479,ed167662a5,But it was worth it ****.,But it was worth it ****.,positive
27480,6f7127d9d7,All this flirting going on - The ATG smiles...,All this flirting going on - The ATG smiles. Y...,neutral


In [5]:
# Number of rows and columns as a (rows, columns) tuple

data.shape

(27481, 4)

In [6]:
# Column dtypes and non-null counts.
# Comparing each Non-Null Count with the total number of entries reveals missing values
# that must be handled before the text is cleaned and tokenized.

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27481 non-null  object
 1   text           27480 non-null  object
 2   selected_text  27480 non-null  object
 3   sentiment      27481 non-null  object
dtypes: object(4)
memory usage: 858.9+ KB


## Text Preprocessing

In [7]:
# Make sure every entry in 'text' is a string.
# Missing tweets are replaced with an empty string first: astype(str) on its own would
# turn NaN into the literal text 'nan', which would later be tokenized as a real word.

data['text']=data['text'].fillna('').astype(str)

In [8]:
# Inspect the 'text' column after the string cast
data['text']

0                      I`d have responded, if I were going
1            Sooo SAD I will miss you here in San Diego!!!
2                                my boss is bullying me...
3                           what interview! leave me alone
4         Sons of ****, why couldn`t they put them on t...
                               ...                        
27476     wish we could come see u on Denver  husband l...
27477     I`ve wondered about rake to.  The client has ...
27478     Yay good for both of you. Enjoy the break - y...
27479                           But it was worth it  ****.
27480       All this flirting going on - The ATG smiles...
Name: text, Length: 27481, dtype: object

In [9]:
# Build 'clean_text' by stripping every character that is not an ASCII letter or
# whitespace, then lower-casing the result.
# The pattern is a raw string so that `\s` reaches the regex engine as written instead of
# being an invalid Python string escape (which raises a warning on recent Python versions).

data['clean_text']= data['text'].str.replace(r'[^a-zA-Z\s]','',regex=True).str.lower()

In [10]:
# Inspect the 'clean_text' column to verify that punctuation, digits and upper-case
# letters were removed

data['clean_text']

0                        id have responded if i were going
1               sooo sad i will miss you here in san diego
2                                   my boss is bullying me
3                            what interview leave me alone
4         sons of  why couldnt they put them on the rel...
                               ...                        
27476     wish we could come see u on denver  husband l...
27477     ive wondered about rake to  the client has ma...
27478     yay good for both of you enjoy the break  you...
27479                                but it was worth it  
27480       all this flirting going on  the atg smiles ...
Name: clean_text, Length: 27481, dtype: object

In [11]:
# Remove the 'textID' and 'selected_text' columns, which are not used as model inputs.
# The result is reassigned (rather than dropped in place) and missing columns are ignored,
# so re-running this cell does not raise a KeyError once the columns are already gone.

data = data.drop(columns=['textID','selected_text'], errors='ignore')

In [12]:
# Display the DataFrame to confirm which columns remain after the drop
data

Unnamed: 0,text,sentiment,clean_text
0,"I`d have responded, if I were going",neutral,id have responded if i were going
1,Sooo SAD I will miss you here in San Diego!!!,negative,sooo sad i will miss you here in san diego
2,my boss is bullying me...,negative,my boss is bullying me
3,what interview! leave me alone,negative,what interview leave me alone
4,"Sons of ****, why couldn`t they put them on t...",negative,sons of why couldnt they put them on the rel...
...,...,...,...
27476,wish we could come see u on Denver husband l...,negative,wish we could come see u on denver husband l...
27477,I`ve wondered about rake to. The client has ...,negative,ive wondered about rake to the client has ma...
27478,Yay good for both of you. Enjoy the break - y...,positive,yay good for both of you enjoy the break you...
27479,But it was worth it ****.,positive,but it was worth it


In [13]:
# Model input (x) is the cleaned tweet text; the target (y) is the sentiment label

x, y = data['clean_text'], data['sentiment']

In [14]:
# Collect the distinct sentiment labels present in 'y'.
# The length of this array is used later as the number of output classes when the
# labels are one-hot encoded.

unique_sentiments = y.unique()

In [15]:
# Encode sentiment labels as integers: 'negative' -> 0, 'neutral' -> 1, 'positive' -> 2.
# The mapping is applied to the untouched data['sentiment'] column (the column 'y' was
# taken from) so that re-running this cell always produces the same result.

sentiment_to_id = {'negative':0, 'neutral':1,'positive':2}
y = data['sentiment'].map(sentiment_to_id)

# .map() yields NaN for any label that is not in the mapping; fail loudly here instead of
# letting an unexpected label leak into the train/test split.
if y.isna().any():
    unknown_labels = sorted(data.loc[y.isna(), 'sentiment'].astype(str).unique())
    raise ValueError(f"Unexpected sentiment labels: {unknown_labels}")

y = y.astype('int64')

### Divide data into two parts: test and train.

In [16]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing.
# stratify=y splits on the class labels, and random_state pins the shuffle so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

### Tokenizes the text data and converts it into sequences for training and testing

In [17]:
# Tokenizer is imported in the first cell.

# Fit the vocabulary on the training texts only, then reuse the same fitted tokenizer to
# convert both splits into sequences of integer token ids.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(x_train)

x_train_seq, x_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (x_train, x_test)
)

### Pads the tokenized sequences to ensure uniform length for training and testing 

In [18]:
# pad_sequences is imported in the first cell.

# Both splits share the same settings: a fixed length of 100 with padding='post'.
pad_options = dict(maxlen=100, padding='post')

x_train_pad = pad_sequences(x_train_seq, **pad_options)
x_test_pad = pad_sequences(x_test_seq, **pad_options)

### Encodes the sentiment labels into numerical values for training and testing

In [19]:
# LabelEncoder is imported in the first cell.

# Learn the label -> integer mapping from the training labels only, then apply that same
# fitted mapping to the test labels. Re-fitting the encoder on the test split could
# produce a different mapping whenever the set of labels in the two splits differs.
label_encoder=LabelEncoder()
y_train_encoded=label_encoder.fit_transform(y_train)
y_test_encoded=label_encoder.transform(y_test)

### Converts the encoded labels into one-hot encoded format for training and testing

In [20]:
# tensorflow is imported as tf in the first cell.

# One output column per distinct sentiment label found earlier
num_classes = len(unique_sentiments)

# Convert the integer-encoded labels of each split into one-hot rows
y_train_onehot, y_test_onehot = (
    tf.keras.utils.to_categorical(encoded_labels, num_classes=num_classes)
    for encoded_labels in (y_train_encoded, y_test_encoded)
)

### Create the Model

In [21]:
#from tensorflow.keras.layers import Embedding,LSTM,Dense,Flatten

In [22]:
# Initialize the Sequential model (a linear stack of layers)
model = Sequential()

# Declare the input shape explicitly: each sample is a sequence of 100 token ids, matching
# the maxlen used when padding. This replaces Embedding's `input_length` argument, which
# recent Keras versions deprecate and ignore.
model.add(tf.keras.Input(shape=(100,)))

# Add the Embedding layer: 5000 possible token ids (matching the tokenizer's num_words),
# each mapped to a 100-dimensional vector
model.add(Embedding(input_dim=5000, output_dim=100))

# Add an LSTM layer with 128 units and dropout of 0.2; return_sequences=True makes it
# output a value for every timestep rather than only the last one
model.add(LSTM(128, return_sequences=True, dropout=0.2))

# Flatten the per-timestep LSTM outputs into a single vector per sample
model.add(Flatten())

# Add the Dropout layer (rate 0.5) for regularization
model.add(Dropout(0.5))

# Add the Dense layer with 64 neurons and ReLU activation
model.add(Dense(64, activation='relu'))

# Add the output Dense layer with 3 neurons (one per sentiment class) and softmax activation
model.add(Dense(3, activation='softmax'))

In [24]:
# Configure training: Adam with a learning rate of 0.0001, categorical cross-entropy loss
# (the targets are one-hot encoded), and accuracy as the reported metric

adam_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)
model.compile(
    optimizer=adam_optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [25]:
# Early stopping watches validation accuracy with a patience of 4 epochs and is
# configured to restore the best weights seen during training

early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=4,
    restore_best_weights=True,
)

In [26]:
# Train for up to 50 epochs in batches of 64, holding out 20% of the training data for
# validation and letting the early-stopping callback end training early

model.fit(
    x_train_pad,
    y_train_onehot,
    epochs=50,
    batch_size=64,
    validation_split=0.2,
    callbacks=[early_stopping],
)

Epoch 1/50
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 296ms/step - accuracy: 0.4057 - loss: 1.0841 - val_accuracy: 0.5383 - val_loss: 0.9362
Epoch 2/50
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m123s[0m 446ms/step - accuracy: 0.5927 - loss: 0.8668 - val_accuracy: 0.6516 - val_loss: 0.7738
Epoch 3/50
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 379ms/step - accuracy: 0.6826 - loss: 0.7367 - val_accuracy: 0.6686 - val_loss: 0.7489
Epoch 4/50
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 413ms/step - accuracy: 0.7227 - loss: 0.6646 - val_accuracy: 0.6791 - val_loss: 0.7401
Epoch 5/50
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 426ms/step - accuracy: 0.7368 - loss: 0.6346 - val_accuracy: 0.6889 - val_loss: 0.7196
Epoch 6/50
[1m275/275[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m116s[0m 421ms/step - accuracy: 0.7597 - loss: 0.6060 - val_accuracy: 0.6887 - val_loss: 0.7386
Epoch

<keras.src.callbacks.history.History at 0x2701057bdd0>

In [27]:
# Predict class probabilities for the padded test sequences, then take the index of the
# highest probability in each row as the predicted class label

y_pred_probs = model.predict(x_test_pad)
y_pred = y_pred_probs.argmax(axis=1)

[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 75ms/step


In [28]:
# Per-class precision, recall and F1-score of the predictions against the true test labels

from sklearn.metrics import confusion_matrix, classification_report

report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.75      0.61      0.67      1556
           1       0.63      0.77      0.69      2224
           2       0.80      0.72      0.76      1717

    accuracy                           0.71      5497
   macro avg       0.73      0.70      0.71      5497
weighted avg       0.72      0.71      0.71      5497



* Accuracy: 71% of the total predictions are correct.
* Macro Average: Averages precision, recall, and F1-score across all classes equally (0.73, 0.70, 0.71).
* Weighted Average: Averages metrics, weighted by the number of instances in each class (0.72, 0.71, 0.71).

In [29]:
# Confusion matrix of the test predictions: y_test is passed as the true labels and
# y_pred as the predicted labels

test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)

[[ 953  544   59]
 [ 276 1707  241]
 [  47  440 1230]]


The model correctly classified 953 instances of class 0, but misclassified 544 instances as class 1 and 59 instances as class 2. Similarly, it correctly classified 1,707 instances of class 1, while 276 instances were misclassified as class 0 and 241 as class 2. For class 2, the model accurately classified 1,230 instances, with 47 instances misclassified as class 0 and 440 as class 1.

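* The model has the highest recall on class 1 (1,707 of 2,224 correct, 77%), but class 1 also has the lowest precision (0.63) because many class 0 and class 2 instances are predicted as class 1.
* Most misclassifications involve class 1: 544 class 0 and 440 class 2 instances are predicted as class 1, while 276 and 241 class 1 instances are predicted as class 0 and class 2 respectively.
* Class 2 has the fewest misclassifications in absolute terms (487 of 1,717), and classes 0 and 2 are rarely confused with each other (59 and 47 instances).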

In [30]:
import pickle

In [31]:
# Saves the trained model to a file named 'sentiment_model2.h5' in the working directory.
# NOTE(review): the .h5 extension presumably selects the legacy HDF5 format — confirm
# against the installed Keras version before relying on it for deployment.

model.save("sentiment_model2.h5")



In [32]:
# Persist the fitted tokenizer to 'tokenizer.pickle' so the same vocabulary can be
# reloaded later; the highest available pickle protocol is used

with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)