In [90]:
import pandas as pd
import re
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, Flatten, LSTM, GlobalMaxPooling1D, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping
from gensim.models import Word2Vec
import numpy as np

#### Loading the dataset

In [91]:
df = pd.read_csv('./cleaned_data.csv')

df.head()

Unnamed: 0,Sentiment,Tweets,Cleaned_Tweets,Word_Count,Character_Count
0,0,Wants to go home... bored of work now,want go home bored work,5,23
1,0,@jeffLmsu good call on holding for take 3........,good call holding take 3 cause take 1 wa huge ...,11,53
2,1,just watched the new moon trailer aa i love it,watched new moon trailer aa love,6,32
3,0,Should I go check what it iss????,go check,2,8
4,1,the mammaaaaa &amp; sister. http://twitpic.co...,mammaaaaa amp sister,3,20


#### Data Cleaning

In [92]:
df.isna().sum()

Sentiment           0
Tweets              0
Cleaned_Tweets     47
Word_Count          0
Character_Count     0
dtype: int64

In [93]:
df = df.dropna().reset_index()

In [94]:

df.isna().sum()

index              0
Sentiment          0
Tweets             0
Cleaned_Tweets     0
Word_Count         0
Character_Count    0
dtype: int64

#### Tokenization

In [95]:
sentences = df['Cleaned_Tweets'].values

sentences

array(['want go home bored work',
       'good call holding take 3 cause take 1 wa huge failure',
       'watched new moon trailer aa love', ...,
       'gettin thangs together head san antonio wish wa goin spur game lol good',
       'evening traffic jam', 'talk people hows everybody today'],
      dtype=object)

In [96]:
words = []

for sent in sentences:
    words.extend(re.findall(r'[\w]+',sent))

words = list(set(words))

print(len(words))

13914


In [97]:
token = Tokenizer()

token.fit_on_texts(words)

word_index = token.word_index

word_index

{'t': 1,
 '0': 2,
 'gt': 3,
 'sex': 4,
 'ani': 5,
 'bee': 6,
 'b': 7,
 'parade': 8,
 'john': 9,
 'sexy': 10,
 'patch': 11,
 'panda': 12,
 'menu': 13,
 'livi': 14,
 'poop': 15,
 'block': 16,
 'universe': 17,
 'o': 18,
 '11': 19,
 '9': 20,
 'grim': 21,
 'referring': 22,
 'discovery': 23,
 'sang': 24,
 '50th': 25,
 'writerslol': 26,
 'jordi': 27,
 'ninaaaa': 28,
 'netty': 29,
 '1022': 30,
 'september': 31,
 'lameurijah': 32,
 'negative': 33,
 'damn': 34,
 'prom': 35,
 'ice': 36,
 'breathe': 37,
 'rally': 38,
 'gcse': 39,
 'rb': 40,
 'outside': 41,
 'laugh': 42,
 'ah8u9sdig': 43,
 'ily': 44,
 'ikr': 45,
 'jake': 46,
 'crap': 47,
 'grader': 48,
 'virtually': 49,
 'peacenik': 50,
 'clay': 51,
 'iplayer': 52,
 '30stm': 53,
 'khols': 54,
 'respect': 55,
 'andrew': 56,
 'valencia': 57,
 'scary': 58,
 'noches': 59,
 'phplurkcom': 60,
 'loneliness': 61,
 '20082009': 62,
 'mouthsays': 63,
 'gampg': 64,
 'norah': 65,
 'yapldn': 66,
 'rained': 67,
 'bouta': 68,
 'er': 69,
 'either': 70,
 'holiday': 

In [98]:
sequences = token.texts_to_sequences(sentences)

sequences

[[8047, 4447, 2901, 462, 3743],
 [10474, 12075, 10593, 4667, 9013, 13058, 4667, 6815, 10355, 8200, 6575],
 [533, 13820, 4986, 12372, 7264, 11172],
 [4447, 3610],
 [10245, 2843, 4344],
 [9676, 2638, 7749, 2330, 987, 6323],
 [11348, 9369, 5612, 4845, 1423, 6381, 13797, 4318, 10859, 4379],
 [3718, 4893, 2204, 4848, 13771, 6507, 1631],
 [3661, 1425, 4845, 8071, 4327, 2843, 11900, 9025],
 [545, 3, 545],
 [7935],
 [2806, 10144, 11857, 2236, 9676, 2806, 3581, 4504, 7776],
 [8777, 4845, 7323, 9937, 4618],
 [1808, 866, 1993, 6247, 7542],
 [7454, 9618],
 [11103, 5107],
 [4318, 10360, 5281, 6062],
 [11276,
  3682,
  8137,
  5567,
  1057,
  11276,
  12907,
  6611,
  6940,
  629,
  12907,
  6041,
  3570],
 [654, 10434, 9041, 9370, 2149, 7527, 10096, 928],
 [1707, 3873, 11033, 5718, 5455, 2967, 6940, 8099, 12225],
 [2300, 10474, 13080, 9264, 13080, 6419, 5507, 10355, 3581, 533],
 [5612, 8195],
 [7100, 10848, 7529, 10249],
 [58],
 [9218,
  9424,
  3167,
  13148,
  13886,
  10360,
  11518,
  851,
  62

In [99]:
max_len = df['Word_Count'].sort_values(ascending=False).values[0] + 1

max_len

27

In [100]:
padded_sequence = pad_sequences(sequences=sequences, maxlen=max_len, padding='post')

padded_sequence

array([[ 8047,  4447,  2901, ...,     0,     0,     0],
       [10474, 12075, 10593, ...,     0,     0,     0],
       [  533, 13820,  4986, ...,     0,     0,     0],
       ...,
       [ 9653, 11471,  5281, ...,     0,     0,     0],
       [10126,  8954,  6613, ...,     0,     0,     0],
       [ 6083,   173,  5500, ...,     0,     0,     0]])

#### Spliting the Data

In [101]:
x_train, x_test, y_train, y_test = train_test_split(padded_sequence, df['Sentiment'], test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
print(x_val.shape, y_val.shape)

(7962, 27) (7962,)
(995, 27) (995,)
(996, 27) (996,)


In [102]:
embedding_length = 200
e_pochs = 10
batch_size = 64

#### Word2Vec

1. Converting word to vector

In [103]:
word_list = list(map(lambda x: re.findall(r'[\w]+',x), df['Cleaned_Tweets']))

word_list

[['want', 'go', 'home', 'bored', 'work'],
 ['good',
  'call',
  'holding',
  'take',
  '3',
  'cause',
  'take',
  '1',
  'wa',
  'huge',
  'failure'],
 ['watched', 'new', 'moon', 'trailer', 'aa', 'love'],
 ['go', 'check'],
 ['mammaaaaa', 'amp', 'sister'],
 ['oh', 'ouchie', 'back', 'maybe', 'reason', 'woke'],
 ['lol',
  'best',
  'friend',
  'im',
  'assuming',
  'mean',
  'snooter',
  'great',
  'pic',
  'way'],
 ['favicon', 'ripoff', 'smashing', 'magazine', 'ltgt', 'see', 'similarity'],
 ['didnt', 'topic', 'im', 'trying', 'branch', 'amp', 'relationship', 'stuff'],
 ['life', 'gt', 'life'],
 ['yes'],
 ['pervs',
  'twitter',
  'dont',
  'say',
  'oh',
  'pervs',
  'would',
  'assume',
  'freudian'],
 ['nooo', 'im', 'loosin', 'follower', 'aweee'],
 ['car', 'wiff', 'danny', 'going', '209'],
 ['congratulation', 'finally'],
 ['temporary', 'happiness'],
 ['great', 'get', 'together', 'dgbians'],
 ['theyre',
  'still',
  'young',
  'wait',
  'year',
  'theyre',
  'playing',
  'prank',
  'first

In [104]:
word2vec = Word2Vec(word_list, vector_size=embedding_length, workers=8, min_count=1)

In [105]:
print("Vocabolary length :",len(word2vec.wv.key_to_index))
word2vec.wv.key_to_index

Vocabolary length : 13914


{'im': 0,
 'day': 1,
 'wa': 2,
 'good': 3,
 'get': 4,
 'like': 5,
 'got': 6,
 'go': 7,
 'dont': 8,
 'u': 9,
 'today': 10,
 'love': 11,
 'work': 12,
 'going': 13,
 'cant': 14,
 'time': 15,
 'back': 16,
 'one': 17,
 'know': 18,
 'lol': 19,
 'want': 20,
 'see': 21,
 'na': 22,
 'well': 23,
 'really': 24,
 'think': 25,
 'still': 26,
 'night': 27,
 'home': 28,
 'new': 29,
 '2': 30,
 'ha': 31,
 'oh': 32,
 'thanks': 33,
 'amp': 34,
 'need': 35,
 'miss': 36,
 'make': 37,
 'feel': 38,
 'last': 39,
 'tomorrow': 40,
 'much': 41,
 'morning': 42,
 'ill': 43,
 'happy': 44,
 'great': 45,
 'thats': 46,
 'hope': 47,
 'haha': 48,
 'wish': 49,
 'twitter': 50,
 'tonight': 51,
 'didnt': 52,
 'fun': 53,
 'right': 54,
 'thing': 55,
 'nice': 56,
 'would': 57,
 'way': 58,
 'sad': 59,
 'friend': 60,
 'bad': 61,
 'week': 62,
 'come': 63,
 'getting': 64,
 'better': 65,
 'gon': 66,
 'say': 67,
 'hey': 68,
 'sorry': 69,
 'watching': 70,
 'look': 71,
 'wait': 72,
 'sleep': 73,
 'ive': 74,
 'youre': 75,
 'could': 76,


2. Creating Embedding matrix

In [106]:
embedding_matrix = np.zeros((len(word2vec.wv.key_to_index), embedding_length))

embedding_matrix.shape

(13914, 200)

In [107]:
for token, idx in word_index.items():
    if word2vec.wv.__contains__(token):
        embedding_matrix[idx] = word2vec.wv[token]

print("Embedding matrix :")
print(embedding_matrix)

Embedding matrix :
[[ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 8.77706427e-03 -6.50832153e-05  5.43886796e-03 ... -1.20586539e-02
  -1.25540624e-04 -7.39952503e-03]
 ...
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00 ...  0.00000000e+00
   0.00000000e+00  0.00000000e+00]]


#### Model Creation

In [120]:
model = Sequential()

# input layer
model.add(Embedding(input_dim = len(word2vec.wv.key_to_index),
                    output_dim=embedding_length,
                    weights=[embedding_matrix], 
                    input_length=max_len, 
                    trainable = True))
model.add(Bidirectional(LSTM(units=100, return_sequences=True, dropout=0.2)))
model.add(Bidirectional(LSTM(units=50, return_sequences=True, dropout=0.2)))
model.add(GlobalMaxPooling1D())

# hidden layer
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.2))

# output layer
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(learning_rate=0.0001), loss = 'binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 27, 200)           2782800   
                                                                 
 bidirectional_10 (Bidirect  (None, 27, 200)           240800    
 ional)                                                          
                                                                 
 bidirectional_11 (Bidirect  (None, 27, 100)           100400    
 ional)                                                          
                                                                 
 global_max_pooling1d_7 (Gl  (None, 100)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_23 (Dense)            (None, 16)                1616      
                                                      

In [121]:
early_stopping = EarlyStopping( monitor='val_accuracy', restore_best_weights=True, patience=3)

In [122]:
model.fit(x_train, y_train, epochs=e_pochs, batch_size=batch_size, validation_data=(x_val, y_val), callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10


<keras.src.callbacks.History at 0x1c3a65f8450>

In [123]:
print("Accuracy :",model.evaluate(x_test, y_test)[1]*100,"%")

Accuracy : 73.46733808517456 %
