In [69]:
import pandas as pd
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential
import re

#### Import Dataset

In [2]:
df = pd.read_csv('./cleaned_data.csv')

df.head()

Unnamed: 0,Sentiment,Tweets,Cleaned_Tweets,Word_Count,Character_Count
0,0,Wants to go home... bored of work now,want go home bored work,5,23
1,0,@jeffLmsu good call on holding for take 3........,good call holding take 3 cause take 1 wa huge ...,11,53
2,1,just watched the new moon trailer aa i love it,watched new moon trailer aa love,6,32
3,0,Should I go check what it iss????,go check,2,8
4,1,the mammaaaaa &amp; sister. http://twitpic.co...,mammaaaaa amp sister,3,20


#### Data Cleaning

In [46]:
df.isna().sum()

Sentiment           0
Tweets              0
Cleaned_Tweets     47
Word_Count          0
Character_Count     0
dtype: int64

In [50]:
df = df.dropna().reset_index()

#### tokenization

In [51]:
sentences = df['Cleaned_Tweets'].values

sentences

array(['want go home bored work',
       'good call holding take 3 cause take 1 wa huge failure',
       'watched new moon trailer aa love', ...,
       'gettin thangs together head san antonio wish wa goin spur game lol good',
       'evening traffic jam', 'talk people hows everybody today'],
      dtype=object)

In [53]:
words = []

for idx,sent in enumerate(sentences):
    words.extend(re.findall(r'[\w]+',sent))

words = list(set(words))
print(len(words))

13914


In [54]:
token = Tokenizer()

token.fit_on_texts(words)

token.word_index

{'t': 1,
 '0': 2,
 'gt': 3,
 'poop': 4,
 'panda': 5,
 'parade': 6,
 'block': 7,
 'menu': 8,
 'universe': 9,
 'john': 10,
 'livi': 11,
 'sex': 12,
 'b': 13,
 'bee': 14,
 '9': 15,
 'patch': 16,
 'sexy': 17,
 'o': 18,
 'ani': 19,
 '11': 20,
 'jersey': 21,
 'aroâs': 22,
 'semifinal': 23,
 'crittersyou': 24,
 'gurgaon': 25,
 'mornin': 26,
 'warrrm': 27,
 'pub': 28,
 'quotjoe': 29,
 'combination': 30,
 'protect': 31,
 'gorjusss': 32,
 'scorpion': 33,
 'chloe': 34,
 'chantelle': 35,
 'antidote': 36,
 'close': 37,
 'procrastenating': 38,
 'saladmy': 39,
 'er': 40,
 'tamara': 41,
 'hmv': 42,
 'helenes': 43,
 'grab': 44,
 'wuz': 45,
 'cheating': 46,
 'mentionsometimes': 47,
 'goat': 48,
 'oh': 49,
 'inconvenient': 50,
 'kat': 51,
 'newport': 52,
 'deluxe': 53,
 'honey': 54,
 'newspring': 55,
 'twatted': 56,
 'presne': 57,
 'favesss': 58,
 'neat': 59,
 'mhhh': 60,
 'jemur': 61,
 'twitterberry': 62,
 '1353': 63,
 'quottoo': 64,
 'expected': 65,
 'earth': 66,
 'stargate': 67,
 'ogame': 68,
 'notebo

In [55]:
sequence = token.texts_to_sequences(sentences)

sequence

[[3463, 5068, 1231, 3057, 5790],
 [9176, 13292, 3993, 8098, 12026, 4774, 8098, 3816, 7440, 831, 1306],
 [9412, 12273, 2764, 2240, 12501, 2546],
 [5068, 9135],
 [8347, 12497, 6099],
 [49, 11616, 11352, 13815, 9982, 1431],
 [80, 9302, 7365, 4218, 10084, 12969, 4168, 2047, 4346, 13147],
 [13654, 12707, 1413, 13215, 5154, 3922, 2290],
 [5028, 82, 4218, 4822, 12746, 12497, 2038, 6789],
 [631, 3, 631],
 [8387],
 [5448, 13700, 8452, 8316, 49, 5448, 1915, 11641, 7875],
 [5877, 4218, 2614, 5966, 13688],
 [7027, 9654, 2790, 5708, 6261],
 [5535, 7334],
 [13674, 13346],
 [2047, 4783, 2193, 4153],
 [12060,
  715,
  11657,
  10551,
  7096,
  12060,
  9450,
  9076,
  9625,
  9125,
  9450,
  13393,
  3758],
 [12401, 7711, 9471, 7257, 4076, 13782, 9593, 8681],
 [383, 13016, 10364, 2592, 4875, 10496, 9625, 8369, 4923],
 [3896, 9176, 3366, 5810, 3366, 2464, 9768, 7440, 1915, 9412],
 [7365, 5934],
 [7837, 11500, 4545, 7393],
 [13389],
 [3884,
  7719,
  335,
  11496,
  8169,
  4783,
  7375,
  4863,
  9965,

In [62]:
max_len = df['Word_Count'].sort_values(ascending=False).values[0]

max_len

26

In [64]:
padded_sequence = pad_sequences(sequences=sequence, maxlen=max_len, padding='post')

padded_sequence

array([[ 3463,  5068,  1231, ...,     0,     0,     0],
       [ 9176, 13292,  3993, ...,     0,     0,     0],
       [ 9412, 12273,  2764, ...,     0,     0,     0],
       ...,
       [ 2728,  6196,  2193, ...,     0,     0,     0],
       [ 1571,  7670, 12534, ...,     0,     0,     0],
       [ 1441,  5190, 11311, ...,     0,     0,     0]])

In [65]:
padded_sequence.shape

(9953, 26)

#### Spliting the dataset

In [86]:
x_train, x_test, y_train, y_test = train_test_split(padded_sequence, df['Sentiment'], test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
print(x_val.shape, y_val.shape)

(7962, 26) (7962,)
(995, 26) (995,)
(996, 26) (996,)


#### Model Creation

In [87]:
def sigmoid_function(lst):
    output = []
    for i in lst:
        if i >= 0.5:
            output.append(1)
        else:
            output.append(0)
    return output

In [93]:
model = Sequential()

# input layer
model.add(Dense(128, activation='relu', input_shape=(max_len,)))

# hidden layer
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))

# output layer
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics = ['accuracy'])

model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 128)               3456      
                                                                 
 dense_7 (Dense)             (None, 64)                8256      
                                                                 
 dense_8 (Dense)             (None, 32)                2080      
                                                                 
 dense_9 (Dense)             (None, 1)                 33        
                                                                 
Total params: 13825 (54.00 KB)
Trainable params: 13825 (54.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [94]:
model.fit(x_train, y_train, batch_size=64, epochs=10, validation_data=(x_val, y_val))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x22c616414d0>

In [95]:
y_pred = sigmoid_function(model.predict(x_test))

print(y_test.values)
print(len(y_test.values))

[0 0 1 1 0 1 1 1 1 1 0 0 0 1 1 0 0 0 0 0 0 1 1 1 1 0 1 1 1 0 0 0 1 1 1 1 1
 1 1 0 1 0 1 1 1 1 0 1 0 0 1 1 0 0 1 0 1 0 0 0 1 1 1 0 1 0 0 1 1 0 1 1 0 1
 1 1 1 1 0 0 1 1 0 0 0 0 1 1 1 1 0 1 0 1 1 1 0 1 1 1 1 1 1 0 0 1 0 1 1 1 1
 1 1 0 0 0 0 1 1 1 0 1 0 1 0 1 0 0 1 1 1 1 1 0 0 1 1 1 1 0 1 1 0 1 0 1 0 1
 0 0 0 1 1 1 1 0 1 0 0 1 1 1 0 0 0 1 1 0 0 0 0 0 1 0 0 1 1 1 1 1 1 0 0 1 1
 1 1 0 0 0 1 1 1 1 0 0 0 0 1 1 1 0 0 1 1 0 1 1 0 1 1 1 1 0 1 1 1 0 1 0 0 0
 0 0 0 1 0 1 0 0 0 1 0 1 1 0 1 0 1 0 1 0 1 0 1 1 1 0 0 0 1 1 1 1 1 1 0 0 1
 0 0 0 0 1 0 0 0 1 0 1 1 0 1 0 0 1 0 1 0 1 0 1 1 0 1 1 1 0 1 0 1 1 0 1 0 0
 1 1 1 0 0 1 1 0 0 1 1 1 0 1 0 1 0 0 1 1 0 1 1 0 0 1 1 1 0 0 0 0 0 0 0 0 1
 1 0 0 1 0 0 0 0 0 1 0 0 0 1 1 0 1 1 0 1 1 0 0 1 0 0 1 0 0 0 0 1 1 0 0 0 0
 1 0 1 1 0 1 1 0 1 0 0 0 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 0 0 1 0 1 0 0 1 0 1
 1 1 1 1 1 1 0 1 1 1 0 1 0 0 1 1 1 1 0 0 1 1 1 0 1 1 1 1 0 0 0 0 1 1 1 0 0
 1 0 0 0 1 1 1 0 1 0 0 1 0 0 0 1 1 1 1 1 1 1 0 1 1 0 1 0 0 0 0 0 1 0 0 0 0
 0 1 1 1 0 0 0 0 1 1 1 0 

In [96]:
print(y_pred)
print(len(y_pred))

[1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 

In [97]:
print("Accuracy :",round(model.evaluate(x_test, y_test)[1]*100, 2),"%")

Accuracy : 53.37 %
