In [None]:
import pandas as pd
import re
import string
import nltk
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [None]:
# Read the labelled tweet dataset from disk and preview its first rows.
csv_path = "/content/Twitter_Data.csv"
Twitter_Data = pd.read_csv(csv_path)
Twitter_Data.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [None]:
# Size of the freshly loaded frame as (rows, columns).
n_rows, n_columns = Twitter_Data.shape
(n_rows, n_columns)

(162980, 2)

In [None]:
# Class balance: how many tweets carry each raw sentiment code.
Twitter_Data["category"].value_counts()

 1.0    72250
 0.0    55213
-1.0    35510
Name: category, dtype: int64

In [None]:
# Translate the numeric sentiment codes into readable labels.
# `replace` (rather than `map`) leaves any value that is not a key of the
# dictionary untouched, so running this cell a second time keeps the labels
# instead of silently turning every row into NaN.
sentiment_names = {-1.0: 'Negative', 0.0: 'Neutral', 1.0: 'Positive'}
Twitter_Data['category'] = Twitter_Data['category'].replace(sentiment_names)

In [None]:
# Number of missing values in each column.
missing_per_column = Twitter_Data.isna().sum()
missing_per_column

clean_text    4
category      7
dtype: int64

In [None]:
# Discard every row that has a missing value in any column
# (tweet text or sentiment label).
Twitter_Data = Twitter_Data.dropna(axis=0, how="any")

In [None]:
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')

# Keep the stop-word set under its own name. Rebinding `stopwords` would
# shadow the imported NLTK corpus reader and make this cell fail with an
# AttributeError the second time it is run.
english_stopwords = set(stopwords.words('english'))

# Translation table that deletes ASCII punctuation; built once instead of
# once per row.
punctuation_table = str.maketrans('', '', string.punctuation)


def clean_tweet(text):
    """Normalise one tweet for tokenisation.

    Steps, in order:
      1. replace every run of non-word characters with a single space,
      2. lowercase,
      3. delete the remaining ASCII punctuation (the underscore is a word
         character, so step 1 leaves it in place),
      4. drop English stop words.

    Args:
        text: the raw tweet as a string.

    Returns:
        The cleaned tweet, with the kept words joined by single spaces.
    """
    text = re.sub(r'\W+', ' ', text)
    text = text.lower()
    text = text.translate(punctuation_table)
    return ' '.join(word for word in text.split() if word not in english_stopwords)


# A single pass over the column applies all four cleaning steps.
Twitter_Data['clean_text'] = Twitter_Data['clean_text'].apply(clean_tweet)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Add a Sentence_Length column: the number of whitespace-separated words
# left in each cleaned tweet.
word_lists = Twitter_Data['clean_text'].str.split()
Twitter_Data['Sentence_Length'] = word_lists.str.len()


In [None]:
# Preview the cleaned text together with the new length column.
Twitter_Data.head(n=5)

Unnamed: 0,clean_text,category,Sentence_Length
0,modi promised minimum government maximum gover...,Negative,21
1,talk nonsense continue drama vote modi,Neutral,6
2,say vote modi welcome bjp told rahul main camp...,Positive,13
3,asking supporters prefix chowkidar names modi ...,Positive,19
4,answer among powerful world leader today trump...,Positive,10


In [None]:
# Size of the frame after dropping nulls and adding Sentence_Length.
n_rows, n_columns = Twitter_Data.shape
(n_rows, n_columns)

(162969, 3)

In [None]:
# Features (cleaned tweet text) and target (sentiment label) as two Series.
X, y = Twitter_Data['clean_text'], Twitter_Data['category']

In [None]:
# Upper bound on the vocabulary passed to the tokenizer as `num_words`.
vocab_size = 10000

# Learn the word index from the cleaned tweets; words the tokenizer does not
# keep are mapped to the '<OOV>' token.
tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(X)

# Turn every tweet into a list of integer word ids.
sequences = tokenizer.texts_to_sequences(X)

# Pad at the front so that all sequences share the length of the longest
# tweet, measured by the Sentence_Length column.
max_length = int(Twitter_Data['Sentence_Length'].max())
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')


In [None]:
# Convert the string sentiment labels back to integer codes.
label_to_code = {'Neutral': 0, 'Negative': -1, 'Positive': 1}
Twitter_Data['category'] = Twitter_Data['category'].map(label_to_code)

In [None]:
# One-hot encode the integer sentiment codes.
# The codes are -1/0/1, but to_categorical expects class ids in the range
# 0..num_classes-1. A -1 only "works" through negative index wrap-around and
# lands in the last column. Shifting every code by one gives the column
# order Negative, Neutral, Positive.
y_encoded = tf.keras.utils.to_categorical(Twitter_Data['category'] + 1, num_classes=3)

In [None]:
# LSTM sentiment classifier: embedding -> LSTM -> 3-way softmax.
# The two names below alias values defined in the tokenisation cell.
vocabulary_size = vocab_size
input_length = max_length

# Declare the layer stack in one place instead of adding layers one by one.
model = Sequential([
    Embedding(vocabulary_size, 64, input_length=input_length),
    LSTM(64),
    Dense(3, activation='softmax'),
])

# Categorical cross-entropy matches the one-hot encoded targets.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
# One-hot encode the target: one indicator column per sentiment label.
y = pd.get_dummies(data=y)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
split_parts = train_test_split(padded_sequences, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Train for 10 epochs in mini-batches of 32 samples.
model.fit(x=X_train, y=y_train, batch_size=32, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fe123bb7b80>

In [None]:
# Per-class scores from the softmax layer for every test tweet.
y_pred = model.predict(x=X_test)
y_pred



array([[2.2303022e-03, 9.6508205e-01, 3.2687627e-02],
       [2.0999552e-04, 4.0414609e-05, 9.9974960e-01],
       [9.9634689e-01, 6.5293520e-06, 3.6465952e-03],
       ...,
       [1.2618621e-03, 9.9809283e-01, 6.4535282e-04],
       [9.0481043e-03, 1.2420677e-04, 9.9082774e-01],
       [7.4681344e-07, 2.0504974e-06, 9.9999726e-01]], dtype=float32)

In [None]:
# Turn the predicted class probabilities into one-hot predictions.
#
# argmax picks the highest-scoring class in every row. It replaces a
# hand-written comparison chain that sent any exact tie to class 0, even
# when class 0 had the lowest score, and that printed two lines per test row.
class_probabilities = model.predict(X_test)
predicted_class = class_probabilities.argmax(axis=1)

# Same layout as before: one row per test sample, a single 1.0 in the
# column of the predicted class.
y_pred = tf.keras.utils.to_categorical(
    predicted_class, num_classes=class_probabilities.shape[1]
)

# Show a small sample instead of printing every row.
y_pred[:5]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
30094 for count is 2
[0. 0. 1.]
30095 for count is 0
[1. 0. 0.]
30096 for count is 1
[0. 1. 0.]
30097 for count is 0
[1. 0. 0.]
30098 for count is 2
[0. 0. 1.]
30099 for count is 2
[0. 0. 1.]
30100 for count is 1
[0. 1. 0.]
30101 for count is 2
[0. 0. 1.]
30102 for count is 1
[0. 1. 0.]
30103 for count is 1
[0. 1. 0.]
30104 for count is 1
[0. 1. 0.]
30105 for count is 0
[1. 0. 0.]
30106 for count is 1
[0. 1. 0.]
30107 for count is 1
[0. 1. 0.]
30108 for count is 1
[0. 1. 0.]
30109 for count is 0
[1. 0. 0.]
30110 for count is 0
[1. 0. 0.]
30111 for count is 2
[0. 0. 1.]
30112 for count is 1
[0. 1. 0.]
30113 for count is 0
[1. 0. 0.]
30114 for count is 0
[1. 0. 0.]
30115 for count is 2
[0. 0. 1.]
30116 for count is 1
[0. 1. 0.]
30117 for count is 0
[1. 0. 0.]
30118 for count is 1
[0. 1. 0.]
30119 for count is 2
[0. 0. 1.]
30120 for count is 1
[0. 1. 0.]
30121 for count is 0
[1. 0. 0.]
30122 for count is 1
[0. 1. 0.]
30123 f

In [None]:
# Shape of the held-out targets as (samples, label columns).
n_test_rows, n_label_columns = y_test.shape
(n_test_rows, n_label_columns)

(32594, 3)

In [None]:
# Shape of the predictions; it should match the shape of y_test.
n_pred_rows, n_pred_columns = y_pred.shape
(n_pred_rows, n_pred_columns)

(32594, 3)

In [None]:
# Share of test tweets whose one-hot prediction row equals the true label row.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy Score : ", test_accuracy)

Accuracy Score :  0.8832914033257655


In [None]:
# Per-class precision / recall / F1 on the test set.
# The label columns of y_test carry the sentiment names, so pass them as
# target_names; otherwise the rows are printed as anonymous 0 / 1 / 2.
sentiment_labels = list(y_test.columns)
print(classification_report(y_test, y_pred, target_names=sentiment_labels))

              precision    recall  f1-score   support

           0       0.82      0.83      0.82      7152
           1       0.91      0.91      0.91     11067
           2       0.90      0.89      0.89     14375

   micro avg       0.88      0.88      0.88     32594
   macro avg       0.87      0.88      0.88     32594
weighted avg       0.88      0.88      0.88     32594
 samples avg       0.88      0.88      0.88     32594

