In [38]:
import pandas as pd
import re
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, GRU, LSTM
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

#### Loading the dataset

In [39]:
df = pd.read_csv('./cleaned_data.csv')

df.head()

Unnamed: 0,Sentiment,Tweets,Cleaned_Tweets,Word_Count,Character_Count
0,0,Wants to go home... bored of work now,want go home bored work,5,23
1,0,@jeffLmsu good call on holding for take 3........,good call holding take 3 cause take 1 wa huge ...,11,53
2,1,just watched the new moon trailer aa i love it,watched new moon trailer aa love,6,32
3,0,Should I go check what it iss????,go check,2,8
4,1,the mammaaaaa &amp; sister. http://twitpic.co...,mammaaaaa amp sister,3,20


#### Data Cleaning

In [40]:
df.isna().sum()

Sentiment           0
Tweets              0
Cleaned_Tweets     47
Word_Count          0
Character_Count     0
dtype: int64

In [41]:
df = df.dropna().reset_index()

In [42]:

df.isna().sum()

index              0
Sentiment          0
Tweets             0
Cleaned_Tweets     0
Word_Count         0
Character_Count    0
dtype: int64

#### Tokenization

In [43]:
sentences = df['Cleaned_Tweets'].values

sentences

array(['want go home bored work',
       'good call holding take 3 cause take 1 wa huge failure',
       'watched new moon trailer aa love', ...,
       'gettin thangs together head san antonio wish wa goin spur game lol good',
       'evening traffic jam', 'talk people hows everybody today'],
      dtype=object)

In [44]:
words = []

for sent in sentences:
    words.extend(re.findall(r'[\w]+',sent))

words = list(set(words))

print(len(words))

13914


In [45]:
token = Tokenizer()

token.fit_on_texts(words)

word_index = token.word_index

word_index

{'t': 1,
 '0': 2,
 'gt': 3,
 '11': 4,
 'ani': 5,
 'patch': 6,
 'o': 7,
 'universe': 8,
 'sexy': 9,
 'john': 10,
 'menu': 11,
 'block': 12,
 'poop': 13,
 'sex': 14,
 'panda': 15,
 'parade': 16,
 'b': 17,
 'livi': 18,
 'bee': 19,
 '9': 20,
 'plata': 21,
 'stopmotion': 22,
 'mu': 23,
 'charge': 24,
 'photovoltaic': 25,
 'quiz': 26,
 'productivity': 27,
 'sehr': 28,
 'perform': 29,
 'x': 30,
 'ecclesiastes': 31,
 'material': 32,
 'againbut': 33,
 'legendary': 34,
 'puzzle': 35,
 'wahtever': 36,
 'werk': 37,
 'hunter': 38,
 'anythingquot': 39,
 'rosa': 40,
 'quotim': 41,
 'geographic': 42,
 'macarenagag': 43,
 'pass': 44,
 'kama': 45,
 'kaneohe': 46,
 'generally': 47,
 'sterwat': 48,
 'aladin': 49,
 'snoring': 50,
 '2348': 51,
 'twittering': 52,
 'axl': 53,
 'jewel': 54,
 'phineas': 55,
 'brandt': 56,
 'lucias': 57,
 'impressive': 58,
 'gad': 59,
 'gtg': 60,
 'rehearsal': 61,
 'noris': 62,
 'tia': 63,
 'gb': 64,
 'evet': 65,
 'anxiety': 66,
 'wah': 67,
 'walked': 68,
 'install': 69,
 'conce

In [46]:
sequences = token.texts_to_sequences(sentences)

sequences

[[12627, 11910, 11883, 7985, 6756],
 [11230, 11236, 2620, 4372, 10588, 2035, 4372, 8125, 4502, 10302, 2736],
 [11007, 3825, 2913, 5679, 13201, 8547],
 [11910, 4679],
 [6099, 13563, 7052],
 [9340, 13068, 3827, 5308, 1455, 13482],
 [704, 1583, 10140, 3768, 102, 12669, 6676, 10843, 6396, 9351],
 [3477, 9803, 7010, 10186, 6412, 8138, 3961],
 [682, 10717, 3768, 799, 10409, 13563, 724, 1305],
 [4939, 3, 4939],
 [11443],
 [3231, 798, 10533, 9827, 9340, 3231, 7111, 4602, 12994],
 [12091, 3768, 10213, 2298, 12929],
 [5050, 13325, 11442, 6721, 7120],
 [5446, 8019],
 [3894, 8858],
 [10843, 8446, 12691, 9886],
 [8078,
  7980,
  4280,
  12898,
  3248,
  8078,
  4552,
  5598,
  12281,
  5429,
  4552,
  11967,
  5599],
 [13310, 12891, 7012, 1273, 8105, 1842, 3783, 6428],
 [7348, 4796, 12947, 10882, 12800, 9837, 12281, 5583, 12629],
 [1814, 11230, 11703, 5171, 11703, 5791, 399, 4502, 7111, 11007],
 [10140, 8234],
 [4039, 12374, 1985, 4110],
 [10231],
 [4425,
  6779,
  3711,
  1808,
  5947,
  8446,
  1

In [47]:
max_len = df['Word_Count'].sort_values(ascending=False).values[0] + 1


max_len

27

In [48]:
padded_sequence = pad_sequences(sequences=sequences, maxlen=max_len, padding='post')

padded_sequence

array([[12627, 11910, 11883, ...,     0,     0,     0],
       [11230, 11236,  2620, ...,     0,     0,     0],
       [11007,  3825,  2913, ...,     0,     0,     0],
       ...,
       [ 5942,  9642, 12691, ...,     0,     0,     0],
       [ 5380,  9004, 13664, ...,     0,     0,     0],
       [ 4911, 12392,  7746, ...,     0,     0,     0]])

#### Spliting the Data

In [49]:
x_train, x_test, y_train, y_test = train_test_split(padded_sequence, df['Sentiment'], test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
print(x_val.shape, y_val.shape)

(7962, 27) (7962,)
(995, 27) (995,)
(996, 27) (996,)


#### Model Creation

In [50]:
embedding_length = 100
e_pochs = 12
batch_size = 64

In [61]:
model = Sequential()

# input layer
model.add(Embedding(input_dim = len(word_index)+1, output_dim=embedding_length, input_length=max_len))
model.add(GRU(units=embedding_length, return_sequences=True, dropout=0.2))
model.add(GRU(units=50))

# hidden layer
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.2))

# output layer
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(learning_rate=0.0001), loss = 'binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 27, 100)           1390800   
                                                                 
 gru_10 (GRU)                (None, 27, 100)           60600     
                                                                 
 gru_11 (GRU)                (None, 50)                22800     
                                                                 
 dense_24 (Dense)            (None, 32)                1632      
                                                                 
 dropout_16 (Dropout)        (None, 32)                0         
                                                                 
 dense_25 (Dense)            (None, 16)                528       
                                                                 
 dropout_17 (Dropout)        (None, 16)               

In [62]:
early_stopping = EarlyStopping( monitor='val_accuracy', restore_best_weights=True, patience=3)

In [63]:
model.fit(x_train, y_train, epochs=e_pochs, batch_size=batch_size, validation_data=(x_val, y_val), callbacks=early_stopping)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12


<keras.src.callbacks.History at 0x27a62fef890>

In [64]:
print("Accuracy :",model.evaluate(x_test, y_test)[1]*100,"%")

Accuracy : 53.96984815597534 %
