In [29]:
import pandas as pd
import re
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, Flatten
from keras.optimizers import Adam

#### Import the Dataset

In [2]:
df = pd.read_csv('./cleaned_data.csv')

df.head()

Unnamed: 0,Sentiment,Tweets,Cleaned_Tweets,Word_Count,Character_Count
0,0,Wants to go home... bored of work now,want go home bored work,5,23
1,0,@jeffLmsu good call on holding for take 3........,good call holding take 3 cause take 1 wa huge ...,11,53
2,1,just watched the new moon trailer aa i love it,watched new moon trailer aa love,6,32
3,0,Should I go check what it iss????,go check,2,8
4,1,the mammaaaaa &amp; sister. http://twitpic.co...,mammaaaaa amp sister,3,20


## Data Cleaning

In [3]:
df.isna().sum()

Sentiment           0
Tweets              0
Cleaned_Tweets     47
Word_Count          0
Character_Count     0
dtype: int64

In [7]:
df.dropna().reset_index(inplace=True)

In [8]:
df.isna().sum()

Sentiment          0
Tweets             0
Cleaned_Tweets     0
Word_Count         0
Character_Count    0
dtype: int64

#### Tokenization

In [10]:
sentences = df['Cleaned_Tweets'].values

sentences

array(['want go home bored work',
       'good call holding take 3 cause take 1 wa huge failure',
       'watched new moon trailer aa love', ...,
       'gettin thangs together head san antonio wish wa goin spur game lol good',
       'evening traffic jam', 'talk people hows everybody today'],
      dtype=object)

In [12]:
words = []

for sent in sentences:
    words.extend(re.findall(r'[\w]+',sent))

words = list(set(words))

print(len(words))

13914


In [14]:
token = Tokenizer()

token.fit_on_texts(words)

word_index = token.word_index

word_index

{'t': 1,
 '0': 2,
 'gt': 3,
 'ani': 4,
 '11': 5,
 '9': 6,
 'menu': 7,
 'block': 8,
 'universe': 9,
 'bee': 10,
 'sex': 11,
 'patch': 12,
 'john': 13,
 'sexy': 14,
 'livi': 15,
 'o': 16,
 'b': 17,
 'parade': 18,
 'poop': 19,
 'panda': 20,
 'hk': 21,
 'easy': 22,
 'autobiography': 23,
 'bep': 24,
 'daniel': 25,
 'tornado': 26,
 'dump': 27,
 'ant': 28,
 'hartsfield': 29,
 '200k': 30,
 'calculate': 31,
 'prototyped': 32,
 'macguyver': 33,
 'business': 34,
 'clap': 35,
 'tou': 36,
 'quotget': 37,
 'degraves': 38,
 'forbus': 39,
 'deceased': 40,
 'unrelated': 41,
 'chaperoning': 42,
 '9900': 43,
 'freind': 44,
 'secret': 45,
 'spin': 46,
 'calculus': 47,
 'karate': 48,
 'twit': 49,
 'techonology': 50,
 'ahahah': 51,
 'flavored': 52,
 'tweetdare': 53,
 'cept': 54,
 'barqs': 55,
 'hang': 56,
 'dubai': 57,
 'ñð¾ððµ': 58,
 'secon': 59,
 'forever': 60,
 '17thï½': 61,
 'mself': 62,
 'full': 63,
 'log': 64,
 'honda': 65,
 'courthouse': 66,
 'tea': 67,
 'yucky': 68,
 '1100': 69,
 'duper': 70,
 'topi

In [15]:
sequences = token.texts_to_sequences(sentences)

sequences

[[1350, 3839, 2199, 12855, 253],
 [5210, 3019, 2691, 9305, 1066, 3043, 9305, 10269, 11262, 13158, 8099],
 [2488, 3766, 9334, 6154, 5029, 12088],
 [3839, 13195],
 [1063, 11035, 2992],
 [522, 9808, 11516, 12763, 10978, 3512],
 [724, 8651, 8122, 11795, 189, 11054, 3777, 4596, 1815, 1791],
 [6261, 2705, 2285, 7316, 7784, 4770, 4081],
 [359, 2880, 11795, 13449, 4895, 11035, 9432, 4235],
 [4710, 3, 4710],
 [12068],
 [3704, 333, 12561, 7497, 522, 3704, 5595, 8932, 8076],
 [11397, 11795, 7538, 13528, 5808],
 [13667, 10242, 6707, 10121, 3923],
 [6168, 7422],
 [11763, 10271],
 [4596, 243, 9487, 346],
 [5727,
  9468,
  13827,
  11711,
  13153,
  5727,
  4000,
  10778,
  12886,
  12553,
  4000,
  4337,
  4230],
 [7155, 13833, 2729, 5734, 2777, 4906, 4347, 12258],
 [8609, 1714, 11177, 1161, 3545, 2340, 12886, 6639, 2300],
 [1502, 5210, 13625, 2315, 13625, 6455, 6808, 11262, 5595, 2488],
 [8122, 1197],
 [13413, 8788, 11545, 5383],
 [3733],
 [8014,
  10942,
  6039,
  12131,
  3934,
  243,
  3217,
  9

In [20]:
max_len = df['Word_Count'].sort_values(ascending=False).values[0] + 1


max_len

27

In [23]:
padded_sequence = pad_sequences(sequences=sequences, maxlen=max_len, padding='post')

padded_sequence

array([[ 1350,  3839,  2199, ...,     0,     0,     0],
       [ 5210,  3019,  2691, ...,     0,     0,     0],
       [ 2488,  3766,  9334, ...,     0,     0,     0],
       ...,
       [ 1983, 13675,  9487, ...,     0,     0,     0],
       [11728,  7922,  6463, ...,     0,     0,     0],
       [13802,  8300,  5019, ...,     0,     0,     0]])

#### Spliting the data

In [27]:
x_train, x_test, y_train, y_test = train_test_split(padded_sequence, df['Sentiment'], test_size=0.2, random_state=42)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5, random_state=42)

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
print(x_val.shape, y_val.shape)

(7962, 27) (7962,)
(995, 27) (995,)
(996, 27) (996,)


#### Model Creation

In [35]:
embedding_length = 100
e_pochs = 10
batch_size = 64

In [34]:
model = Sequential()

# input layer
model.add(Embedding(input_dim = len(word_index)+1, output_dim=embedding_length, input_length=max_len))
model.add(Flatten())

# hidden layer
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

# output layer
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=Adam(learning_rate=0.0001), loss = 'binary_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 27, 100)           1390800   
                                                                 
 flatten_1 (Flatten)         (None, 2700)              0         
                                                                 
 dense_4 (Dense)             (None, 128)               345728    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_5 (Dense)             (None, 64)                8256      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_6 (Dense)             (None, 32)               

In [36]:
model.fit(x_train, y_train, epochs=e_pochs, batch_size=batch_size, validation_data=(x_val, y_val))

Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x270a3c23710>

In [38]:
print("Accuracy :",model.evaluate(x_test, y_test)[1]*100,"%")

Accuracy : 68.14070343971252 %
