In [35]:
import pandas as pd
import re
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, Flatten, SimpleRNN
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

#### Loading the dataset

In [11]:
df = pd.read_csv('./cleaned_data.csv')

df.head()

Unnamed: 0,Sentiment,Tweets,Cleaned_Tweets,Word_Count,Character_Count
0,0,Wants to go home... bored of work now,want go home bored work,5,23
1,0,@jeffLmsu good call on holding for take 3........,good call holding take 3 cause take 1 wa huge ...,11,53
2,1,just watched the new moon trailer aa i love it,watched new moon trailer aa love,6,32
3,0,Should I go check what it iss????,go check,2,8
4,1,the mammaaaaa &amp; sister. http://twitpic.co...,mammaaaaa amp sister,3,20


#### Data Cleaning

In [12]:
df.isna().sum()

Sentiment           0
Tweets              0
Cleaned_Tweets     47
Word_Count          0
Character_Count     0
dtype: int64

In [15]:
df = df.dropna().reset_index()

In [16]:

df.isna().sum()

index              0
Sentiment          0
Tweets             0
Cleaned_Tweets     0
Word_Count         0
Character_Count    0
dtype: int64

#### Tokenization

In [17]:
sentences = df['Cleaned_Tweets'].values

sentences

array(['want go home bored work',
       'good call holding take 3 cause take 1 wa huge failure',
       'watched new moon trailer aa love', ...,
       'gettin thangs together head san antonio wish wa goin spur game lol good',
       'evening traffic jam', 'talk people hows everybody today'],
      dtype=object)

In [19]:
words = []

for sent in sentences:
    words.extend(re.findall(r'[\w]+',sent))

words = list(set(words))

print(len(words))

13914


In [20]:
token = Tokenizer()

token.fit_on_texts(words)

word_index = token.word_index

word_index

{'t': 1,
 'gt': 2,
 '0': 3,
 '11': 4,
 'panda': 5,
 'parade': 6,
 'b': 7,
 'patch': 8,
 'poop': 9,
 'sexy': 10,
 'block': 11,
 'universe': 12,
 '9': 13,
 'menu': 14,
 'john': 15,
 'ani': 16,
 'bee': 17,
 'sex': 18,
 'o': 19,
 'livi': 20,
 'jazzquot': 21,
 'shane': 22,
 'chá': 23,
 'surfingi': 24,
 'ðºððºðññð¾': 25,
 'diggin': 26,
 'avoided': 27,
 '1000': 28,
 'lyxanda': 29,
 'utaban': 30,
 'plasma': 31,
 'wondering': 32,
 'map': 33,
 'slap': 34,
 'mee8': 35,
 'risha': 36,
 'yaaay': 37,
 'jonasbrothers': 38,
 'weve': 39,
 'disorder': 40,
 'gooodmorning': 41,
 'napseems': 42,
 'î³ï': 43,
 'wshops': 44,
 'grail': 45,
 'nambu': 46,
 'alhamdulilah': 47,
 'rebecca': 48,
 'gorgeous': 49,
 'plant': 50,
 'myanmar': 51,
 'aim': 52,
 'widespread': 53,
 'daynothing': 54,
 'tanya': 55,
 'minswoohoo': 56,
 'bluray': 57,
 'feed': 58,
 'assistant': 59,
 'declare': 60,
 'flashcards1': 61,
 'ruined': 62,
 'himmalayen': 63,
 'island': 64,
 'whipped': 65,
 'accomplishment': 66,
 'boutta': 67,
 'ð½ðððñðð':

In [21]:
sequences = token.texts_to_sequences(sentences)

sequences

[[13275, 9217, 4870, 8623, 7863],
 [11323, 2149, 10779, 6193, 10859, 3544, 6193, 5485, 12515, 8873, 5836],
 [10067, 814, 3976, 13662, 7476, 8849],
 [9217, 4574],
 [12943, 739, 9127],
 [2330, 4648, 8119, 7056, 13583, 5176],
 [11011, 2751, 1192, 8285, 12080, 12999, 5856, 8366, 1477, 4902],
 [1298, 9566, 5186, 9030, 11082, 7016, 13475],
 [7333, 1908, 8285, 197, 13648, 739, 6319, 11598],
 [11073, 2, 11073],
 [8064],
 [13339, 13067, 4526, 8126, 2330, 13339, 4976, 3836, 10714],
 [6502, 8285, 9464, 1836, 11035],
 [8774, 225, 1588, 1101, 6254],
 [4683, 6040],
 [2189, 2112],
 [8366, 12725, 711, 8526],
 [4701,
  248,
  7201,
  3107,
  13464,
  4701,
  12285,
  9047,
  7567,
  4669,
  12285,
  6537,
  3517],
 [11918, 6237, 2930, 8224, 2880, 12676, 12919, 1427],
 [1607, 12599, 4378, 8837, 2932, 12871, 7567, 10170, 7396],
 [1952, 11323, 7577, 4786, 7577, 13246, 3716, 12515, 4976, 10067],
 [1192, 313],
 [10088, 704, 3495, 3299],
 [7846],
 [5230,
  2906,
  9791,
  8459,
  9263,
  12725,
  9700,
  181

In [22]:
max_len = df['Word_Count'].sort_values(ascending=False).values[0] + 1


max_len

27

In [23]:
padded_sequence = pad_sequences(sequences=sequences, maxlen=max_len, padding='post')

padded_sequence

array([[13275,  9217,  4870, ...,     0,     0,     0],
       [11323,  2149, 10779, ...,     0,     0,     0],
       [10067,   814,  3976, ...,     0,     0,     0],
       ...,
       [ 7896,  1264,   711, ...,     0,     0,     0],
       [ 8840, 11673,  1046, ...,     0,     0,     0],
       [ 6647,  2519,  4259, ...,     0,     0,     0]])

#### Spliting the Data

In [24]:
x_train, x_test, y_train, y_test = train_test_split(padded_sequence, df['Sentiment'], test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
print(x_val.shape, y_val.shape)

(7962, 27) (7962,)
(995, 27) (995,)
(996, 27) (996,)


#### Model Creation

In [69]:
embedding_length = 200
e_pochs = 10
batch_size = 64

In [71]:
model = Sequential()

# input layer
model.add(Embedding(input_dim = len(word_index)+1, output_dim=embedding_length, input_length=max_len))
model.add(SimpleRNN(units=embedding_length, return_sequences=True, dropout=0.2))
model.add(SimpleRNN(units=50))

# hidden layer
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.2))

# output layer
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(learning_rate=0.0001), loss = 'binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 27, 200)           2781600   
                                                                 
 simple_rnn_15 (SimpleRNN)   (None, 27, 200)           80200     
                                                                 
 simple_rnn_16 (SimpleRNN)   (None, 50)                12550     
                                                                 
 dense_14 (Dense)            (None, 32)                1632      
                                                                 
 dropout_7 (Dropout)         (None, 32)                0         
                                                                 
 dense_15 (Dense)            (None, 16)                528       
                                                                 
 dropout_8 (Dropout)         (None, 16)               

In [72]:
early_stopping = EarlyStopping( monitor='val_accuracy', restore_best_weights=True, patience=3)

In [73]:
model.fit(x_train, y_train, epochs=e_pochs, batch_size=batch_size, validation_data=(x_val, y_val), callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


<keras.src.callbacks.History at 0x21510ece6d0>

In [74]:
print("Accuracy :",model.evaluate(x_test, y_test)[1]*100,"%")

Accuracy : 68.74371767044067 %
