In [1]:
import pandas as pd
import re
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, Flatten, LSTM
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping




#### Loading the dataset

In [2]:
df = pd.read_csv('./cleaned_data.csv')

df.head()

Unnamed: 0,Sentiment,Tweets,Cleaned_Tweets,Word_Count,Character_Count
0,0,Wants to go home... bored of work now,want go home bored work,5,23
1,0,@jeffLmsu good call on holding for take 3........,good call holding take 3 cause take 1 wa huge ...,11,53
2,1,just watched the new moon trailer aa i love it,watched new moon trailer aa love,6,32
3,0,Should I go check what it iss????,go check,2,8
4,1,the mammaaaaa &amp; sister. http://twitpic.co...,mammaaaaa amp sister,3,20


#### Data Cleaning

In [3]:
df.isna().sum()

Sentiment           0
Tweets              0
Cleaned_Tweets     47
Word_Count          0
Character_Count     0
dtype: int64

In [4]:
df = df.dropna().reset_index()

In [5]:

df.isna().sum()

index              0
Sentiment          0
Tweets             0
Cleaned_Tweets     0
Word_Count         0
Character_Count    0
dtype: int64

#### Tokenization

In [6]:
sentences = df['Cleaned_Tweets'].values

sentences

array(['want go home bored work',
       'good call holding take 3 cause take 1 wa huge failure',
       'watched new moon trailer aa love', ...,
       'gettin thangs together head san antonio wish wa goin spur game lol good',
       'evening traffic jam', 'talk people hows everybody today'],
      dtype=object)

In [7]:
words = []

for sent in sentences:
    words.extend(re.findall(r'[\w]+',sent))

words = list(set(words))

print(len(words))

13914


In [8]:
token = Tokenizer()

token.fit_on_texts(words)

word_index = token.word_index

word_index

{'t': 1,
 '0': 2,
 'gt': 3,
 'panda': 4,
 'parade': 5,
 'universe': 6,
 'o': 7,
 'b': 8,
 'livi': 9,
 'bee': 10,
 'john': 11,
 'sexy': 12,
 'menu': 13,
 'ani': 14,
 '11': 15,
 '9': 16,
 'poop': 17,
 'patch': 18,
 'block': 19,
 'sex': 20,
 'ohmygod': 21,
 'subtext': 22,
 'dangi': 23,
 'quotoldquot': 24,
 'shoulderback': 25,
 'machine': 26,
 'fandom': 27,
 'chlobug': 28,
 'oscar': 29,
 'macbeth': 30,
 'gloomy': 31,
 'noodle': 32,
 'spock': 33,
 'culinary': 34,
 'waitingand': 35,
 'summons': 36,
 'impression': 37,
 'melo': 38,
 'caused': 39,
 'wut': 40,
 'phew': 41,
 'medicern': 42,
 'ol': 43,
 'mariee': 44,
 'racism': 45,
 'levine': 46,
 'cracker': 47,
 '19': 48,
 'surrounded': 49,
 'dreamweaver': 50,
 'dos': 51,
 'quotburnin': 52,
 'teasinglet': 53,
 'qualifyingrace': 54,
 'hit': 55,
 'cig': 56,
 'slurpee': 57,
 '6': 58,
 'sunglass': 59,
 'goodluck': 60,
 'hii': 61,
 'cook': 62,
 'quotpoker': 63,
 'lasted': 64,
 'kick': 65,
 'worshipping': 66,
 'layla': 67,
 'international': 68,
 'obvs'

In [9]:
sequences = token.texts_to_sequences(sentences)

sequences

[[8341, 3576, 5697, 2861, 2586],
 [6419, 10249, 5382, 13907, 12786, 7836, 13907, 11217, 5586, 12200, 9251],
 [9611, 1668, 10446, 13250, 11764, 497],
 [3576, 13115],
 [10889, 1293, 13880],
 [1708, 7243, 13062, 13310, 13596, 1900],
 [1232, 3440, 750, 2229, 12578, 7317, 6489, 5563, 1167, 3323],
 [13898, 7716, 8885, 11014, 3238, 1986, 9865],
 [571, 7787, 2229, 11658, 2450, 1293, 5875, 12795],
 [1937, 3, 1937],
 [3744],
 [7275, 13284, 10097, 10996, 1708, 7275, 11326, 8597, 6704],
 [2470, 2229, 11766, 1328, 7327],
 [5969, 6228, 12355, 11694, 10698],
 [12364, 2081],
 [11402, 12750],
 [5563, 4479, 3257, 277],
 [880,
  1366,
  6160,
  7165,
  8358,
  880,
  13438,
  4507,
  11052,
  6786,
  13438,
  4952,
  1772],
 [11556, 9362, 1152, 1054, 4264, 8634, 7821, 8482],
 [9306, 12536, 13455, 4331, 5088, 12463, 11052, 5633, 4179],
 [112, 6419, 10664, 8498, 10664, 3392, 9325, 5586, 11326, 9611],
 [750, 5580],
 [13797, 6195, 7419, 1633],
 [12571],
 [1217,
  9856,
  13213,
  13660,
  7788,
  4479,
  606

In [10]:
max_len = df['Word_Count'].sort_values(ascending=False).values[0] + 1


max_len

27

In [11]:
padded_sequence = pad_sequences(sequences=sequences, maxlen=max_len, padding='post')

padded_sequence

array([[ 8341,  3576,  5697, ...,     0,     0,     0],
       [ 6419, 10249,  5382, ...,     0,     0,     0],
       [ 9611,  1668, 10446, ...,     0,     0,     0],
       ...,
       [ 2404, 10692,  3257, ...,     0,     0,     0],
       [ 7386,  7326,  6847, ...,     0,     0,     0],
       [ 2502,  3582,  6764, ...,     0,     0,     0]])

#### Spliting the Data

In [12]:
x_train, x_test, y_train, y_test = train_test_split(padded_sequence, df['Sentiment'], test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
print(x_val.shape, y_val.shape)

(7962, 27) (7962,)
(995, 27) (995,)
(996, 27) (996,)


#### Model Creation

In [13]:
embedding_length = 200
e_pochs = 10
batch_size = 64

In [14]:
model = Sequential()

# input layer
model.add(Embedding(input_dim = len(word_index)+1, output_dim=embedding_length, input_length=max_len))
model.add(LSTM(units=embedding_length, return_sequences=True, dropout=0.2))
model.add(LSTM(units=50))

# hidden layer
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(16, activation='relu'))
model.add(Dropout(0.2))

# output layer
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(learning_rate=0.0001), loss = 'binary_crossentropy', metrics=['accuracy'])

model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 27, 200)           2781600   
                                                                 
 lstm (LSTM)                 (None, 27, 200)           320800    
                                                                 
 lstm_1 (LSTM)               (None, 50)                50200     
                                                                 
 dense (Dense)               (None, 32)                1632      
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 16)                528       
                                                                 
 dropout_1 (Dropout)         (None, 16)                

In [15]:
early_stopping = EarlyStopping( monitor='val_accuracy', restore_best_weights=True, patience=3)

In [16]:
model.fit(x_train, y_train, epochs=e_pochs, batch_size=batch_size, validation_data=(x_val, y_val), callbacks=[early_stopping])

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


<keras.src.callbacks.History at 0x22045397950>

In [17]:
print("Accuracy :",model.evaluate(x_test, y_test)[1]*100,"%")

Accuracy : 71.85929417610168 %
