In [2]:
import pandas as pd
import re
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping

#### Loading the dataset

In [3]:
df = pd.read_csv('./cleaned_data.csv')

df.head()

Unnamed: 0,Sentiment,Tweets,Cleaned_Tweets,Word_Count,Character_Count
0,0,Wants to go home... bored of work now,want go home bored work,5,23
1,0,@jeffLmsu good call on holding for take 3........,good call holding take 3 cause take 1 wa huge ...,11,53
2,1,just watched the new moon trailer aa i love it,watched new moon trailer aa love,6,32
3,0,Should I go check what it iss????,go check,2,8
4,1,the mammaaaaa &amp; sister. http://twitpic.co...,mammaaaaa amp sister,3,20


#### Data Cleaning

In [4]:
df.isna().sum()

Sentiment           0
Tweets              0
Cleaned_Tweets     47
Word_Count          0
Character_Count     0
dtype: int64

In [5]:
df = df.dropna().reset_index()

In [6]:

df.isna().sum()

index              0
Sentiment          0
Tweets             0
Cleaned_Tweets     0
Word_Count         0
Character_Count    0
dtype: int64

#### Tokenization

In [7]:
sentences = df['Cleaned_Tweets'].values

sentences

array(['want go home bored work',
       'good call holding take 3 cause take 1 wa huge failure',
       'watched new moon trailer aa love', ...,
       'gettin thangs together head san antonio wish wa goin spur game lol good',
       'evening traffic jam', 'talk people hows everybody today'],
      dtype=object)

In [8]:
words = []

for sent in sentences:
    words.extend(re.findall(r'[\w]+',sent))

words = list(set(words))

print(len(words))

13914


In [9]:
token = Tokenizer()

token.fit_on_texts(words)

word_index = token.word_index

word_index

{'t': 1,
 '0': 2,
 'gt': 3,
 'universe': 4,
 'panda': 5,
 'ani': 6,
 '11': 7,
 '9': 8,
 'livi': 9,
 'bee': 10,
 'b': 11,
 'menu': 12,
 'block': 13,
 'sexy': 14,
 'sex': 15,
 'parade': 16,
 'john': 17,
 'poop': 18,
 'o': 19,
 'patch': 20,
 '810': 21,
 'jerome': 22,
 'smackdownecw': 23,
 'erased': 24,
 'major': 25,
 'libraryno': 26,
 'dying': 27,
 'gah': 28,
 'asking': 29,
 'trailerpark': 30,
 'license': 31,
 'forcing': 32,
 'weaponquot': 33,
 'superdy': 34,
 'premiumï½': 35,
 'creature': 36,
 'fortunately': 37,
 'overtweaked': 38,
 'rockedyou': 39,
 'crashing': 40,
 'recognizes': 41,
 'man': 42,
 'superman': 43,
 'funshopping': 44,
 'virgina': 45,
 'panera': 46,
 'derailing': 47,
 'hovercraft': 48,
 'ocupare': 49,
 'â': 50,
 'ellen': 51,
 'jakes': 52,
 'âmy': 53,
 'quit': 54,
 'richly': 55,
 'eh': 56,
 'tingle': 57,
 'magically': 58,
 'sean': 59,
 'athritis': 60,
 'offspring': 61,
 'cat': 62,
 'explain': 63,
 'dermatologist': 64,
 'quoted': 65,
 'mcfly': 66,
 'shrug': 67,
 '47': 68,
 'l

In [10]:
sequences = token.texts_to_sequences(sentences)

sequences

[[1310, 7415, 8736, 173, 983],
 [6300, 5089, 2169, 3957, 7377, 2937, 3957, 3673, 6364, 1540, 11816],
 [1118, 11990, 10329, 3183, 3610, 212],
 [7415, 930],
 [5513, 1356, 10142],
 [9339, 11402, 808, 8539, 4270, 10357],
 [7893, 2508, 10096, 9861, 4416, 13524, 8427, 13407, 566, 4902],
 [11117, 8891, 9554, 7058, 10682, 4486, 208],
 [4055, 5966, 9861, 5340, 3202, 1356, 12058, 8163],
 [5992, 3, 5992],
 [12221],
 [3232, 13735, 7657, 13370, 9339, 3232, 8840, 3443, 5393],
 [9405, 9861, 13413, 1238, 2598],
 [5856, 5294, 6820, 8859, 9439],
 [8248, 8628],
 [11503, 5636],
 [13407, 11493, 6049, 5075],
 [5163,
  6271,
  10866,
  13062,
  10930,
  5163,
  117,
  4015,
  11918,
  10116,
  117,
  13709,
  4464],
 [9511, 13579, 13145, 11158, 11610, 7513, 12746, 3512],
 [9233, 6841, 4422, 10714, 13895, 5083, 11918, 10987, 3438],
 [4359, 6300, 8868, 2435, 8868, 12029, 6031, 6364, 8840, 1118],
 [10096, 5061],
 [10662, 7881, 9379, 7134],
 [455],
 [8119,
  1165,
  272,
  11600,
  8081,
  11493,
  8291,
  13289

In [11]:
max_len = df['Word_Count'].sort_values(ascending=False).values[0] + 1


max_len

27

In [12]:
padded_sequence = pad_sequences(sequences=sequences, maxlen=max_len, padding='post')

padded_sequence

array([[ 1310,  7415,  8736, ...,     0,     0,     0],
       [ 6300,  5089,  2169, ...,     0,     0,     0],
       [ 1118, 11990, 10329, ...,     0,     0,     0],
       ...,
       [11470,  5649,  6049, ...,     0,     0,     0],
       [ 1208,  8099,  4280, ...,     0,     0,     0],
       [ 6497,  7173, 10655, ...,     0,     0,     0]])

#### Spliting the Data

In [13]:
x_train, x_test, y_train, y_test = train_test_split(padded_sequence, df['Sentiment'], test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
print(x_val.shape, y_val.shape)

(7962, 27) (7962,)
(995, 27) (995,)
(996, 27) (996,)


#### Model Creation

In [19]:
embedding_length = 200
e_pochs = 10
batch_size = 64

In [24]:
model = Sequential()

# input layer
model.add(Embedding(input_dim = len(word_index)+1, output_dim=embedding_length, input_length=max_len))
model.add(Bidirectional(LSTM(units=embedding_length, return_sequences=True, dropout=0.2)))
model.add(Bidirectional(LSTM(units=50)))

# hidden layer
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.2))

# output layer
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(learning_rate=0.0001), loss = 'binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 27, 200)           2781600   
                                                                 
 bidirectional_4 (Bidirecti  (None, 27, 400)           641600    
 onal)                                                           
                                                                 
 bidirectional_5 (Bidirecti  (None, 100)               180400    
 onal)                                                           
                                                                 
 dense_6 (Dense)             (None, 64)                6464      
                                                                 
 dropout_4 (Dropout)         (None, 64)                0         
                                                                 
 dense_7 (Dense)             (None, 32)               

In [25]:
early_stopping = EarlyStopping( monitor='val_accuracy', restore_best_weights=True, patience=3)

In [26]:
model.fit(x_train, y_train, epochs=e_pochs, batch_size=batch_size, validation_data=(x_val, y_val), callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


<keras.src.callbacks.History at 0x27bdab58350>

In [27]:
print("Accuracy :",model.evaluate(x_test, y_test)[1]*100,"%")

Accuracy : 71.7587947845459 %
