In [1]:
# imports
import numpy as np
import pandas as pd
import tensorflow as tf

In [2]:
# open and read [i.e. first 5 entries] training data
Data = pd.read_csv('./Dataset/twitter_training.csv')
Data.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [3]:
# A little bit of Data exploraton to understand the data
Data.shape # info on total entries

(74681, 4)

In [4]:
# general info about the dataset
Data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74681 entries, 0 to 74680
Data columns (total 4 columns):
 #   Column                                                 Non-Null Count  Dtype 
---  ------                                                 --------------  ----- 
 0   2401                                                   74681 non-null  int64 
 1   Borderlands                                            74681 non-null  object
 2   Positive                                               74681 non-null  object
 3   im getting on borderlands and i will murder you all ,  73995 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [5]:
# know the different unique entries...
Data.nunique()

2401                                                     12447
Borderlands                                                 32
Positive                                                     4
im getting on borderlands and i will murder you all ,    69490
dtype: int64

In [6]:
# determine how many duplicates there are
Data.duplicated().sum()

2700

In [7]:
# drop all duplicates from our dataset
Data.drop_duplicates(inplace=True)

In [8]:
# re-check to see if job was done
Data.duplicated().sum()

0

In [9]:
# Size of our dataset after
Data.shape

(71981, 4)

In [10]:
# Know the columns we dealing with in this dataset
Data.columns

Index(['2401', 'Borderlands', 'Positive',
       'im getting on borderlands and i will murder you all ,'],
      dtype='object')

In [11]:
# Rename the columns in our dataset
Data.rename(columns={"2401": "#Text", "Borderlands": "Entity", "Positive": "Sentiment", 
                     "im getting on borderlands and i will murder you all ,": "Text"}, inplace=True)
Data.head()

Unnamed: 0,#Text,Entity,Sentiment,Text
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


In [12]:
# Want to know the sentiment category in our dataset as well as their entry count
Data.groupby('Sentiment').count()

Unnamed: 0_level_0,#Text,Entity,Text
Sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Irrelevant,12584,12584,12537
Negative,21787,21787,21698
Neutral,17800,17800,17708
Positive,19810,19810,19712


In [13]:
# Want to know the entites in our dataset as well as their entry count
Data.groupby('Entity').count()

Unnamed: 0_level_0,#Text,Sentiment,Text
Entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Amazon,2264,2264,2249
ApexLegends,2289,2289,2278
AssassinsCreed,2160,2160,2156
Battlefield,2267,2267,2255
Borderlands,2210,2210,2205
CS-GO,2207,2207,2195
CallOfDuty,2322,2322,2314
CallOfDutyBlackopsColdWar,2261,2261,2242
Cyberpunk2077,2193,2193,2175
Dota2,2229,2229,2225


In [14]:
# Drop unwanted columns
Data.drop(columns=['#Text'], inplace=True)
Data.head()

Unnamed: 0,Entity,Sentiment,Text
0,Borderlands,Positive,I am coming to the borders and I will kill you...
1,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,Borderlands,Positive,im coming on borderlands and i will murder you...
3,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,Borderlands,Positive,im getting into borderlands and i can murder y...


In [15]:
# encode Sentiments into values
Data['Sentiment'].replace({'Positive': 1, 'Negative': 0, 'Neutral': 2, 'Irrelevant': 3}, inplace=True)

In [16]:
testData = Data
testData.head()

Unnamed: 0,Entity,Sentiment,Text
0,Borderlands,1,I am coming to the borders and I will kill you...
1,Borderlands,1,im getting on borderlands and i will kill you ...
2,Borderlands,1,im coming on borderlands and i will murder you...
3,Borderlands,1,im getting on borderlands 2 and i will murder ...
4,Borderlands,1,im getting into borderlands and i can murder y...


In [17]:
# Combining Sentiment and Text columns to form one column
testData['Text'] = testData['Entity'].str.cat(testData['Text'], sep=" ")
testData.head()

Unnamed: 0,Entity,Sentiment,Text
0,Borderlands,1,Borderlands I am coming to the borders and I w...
1,Borderlands,1,Borderlands im getting on borderlands and i wi...
2,Borderlands,1,Borderlands im coming on borderlands and i wil...
3,Borderlands,1,Borderlands im getting on borderlands 2 and i ...
4,Borderlands,1,Borderlands im getting into borderlands and i ...


In [18]:
# Drop Entitiy column table

testData.drop(columns="Entity", inplace=True)
testData.head()

Unnamed: 0,Sentiment,Text
0,1,Borderlands I am coming to the borders and I w...
1,1,Borderlands im getting on borderlands and i wi...
2,1,Borderlands im coming on borderlands and i wil...
3,1,Borderlands im getting on borderlands 2 and i ...
4,1,Borderlands im getting into borderlands and i ...


In [19]:
# explore and clean validation dataset

# open val data
valData = pd.read_csv('./Dataset/twitter_validation.csv')
valData.head()

Unnamed: 0,3364,Facebook,Irrelevant,"I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tomâ€™s great auntie as â€˜Hayley canâ€™t get out of bedâ€™ and told to his grandma, who now thinks Iâ€™m a lazy, terrible person ðŸ¤£"
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp Iâ€™ve had Madeleine McCann in my cel...


In [20]:
# know the columns in our dataset
valData.columns

Index(['3364', 'Facebook', 'Irrelevant',
       'I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tomâ€™s great auntie as â€˜Hayley canâ€™t get out of bedâ€™ and told to his grandma, who now thinks Iâ€™m a lazy, terrible person ðŸ¤£'],
      dtype='object')

In [21]:
# change Dataset columns
valData.rename(columns={"3364": "#Text", "Facebook": "Entity", "Irrelevant": "Sentiment", 
                     "I mentioned on Facebook that I was struggling for motivation to go for a run the other day, which has been translated by Tomâ€™s great auntie as â€˜Hayley canâ€™t get out of bedâ€™ and told to his grandma, who now thinks Iâ€™m a lazy, terrible person ðŸ¤£": "Text"}, inplace=True)
valData.head()

Unnamed: 0,#Text,Entity,Sentiment,Text
0,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
1,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
2,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
3,4433,Google,Neutral,Now the President is slapping Americans in the...
4,6273,FIFA,Negative,Hi @EAHelp Iâ€™ve had Madeleine McCann in my cel...


In [22]:
# Encode Sentiments into values
valData['Sentiment'].replace({'Positive': 1, 'Negative': 0, 'Neutral': 2, 'Irrelevant': 3}, inplace=True)

In [23]:
# merge
# Combining Sentiment and Text columsn to form one column
valData['Text'] = valData['Entity'].str.cat(valData['Text'], sep=" ")
valData.head()

Unnamed: 0,#Text,Entity,Sentiment,Text
0,352,Amazon,2,Amazon BBC News - Amazon boss Jeff Bezos rejec...
1,8312,Microsoft,0,Microsoft @Microsoft Why do I pay for WORD whe...
2,4371,CS-GO,0,CS-GO CSGO matchmaking is so full of closet ha...
3,4433,Google,2,Google Now the President is slapping Americans...
4,6273,FIFA,0,FIFA Hi @EAHelp Iâ€™ve had Madeleine McCann in m...


In [24]:
# drop Entitiy column from valData
valData.drop(columns="Entity", inplace=True)
valData.head()

Unnamed: 0,#Text,Sentiment,Text
0,352,2,Amazon BBC News - Amazon boss Jeff Bezos rejec...
1,8312,0,Microsoft @Microsoft Why do I pay for WORD whe...
2,4371,0,CS-GO CSGO matchmaking is so full of closet ha...
3,4433,2,Google Now the President is slapping Americans...
4,6273,0,FIFA Hi @EAHelp Iâ€™ve had Madeleine McCann in m...


In [25]:
# determine if there are duplicates
valData.duplicated().sum()

0

In [26]:
# valData size
valData.shape

(999, 3)

In [27]:
# one-hot encoding on sentiments (y-values)
# split the dataset as well
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

lencoder = LabelEncoder()

# train-test split -> 80-20 split
xTrain, xTest, yTrain, yTest = train_test_split(testData['Text'], testData['Sentiment'], shuffle=True, test_size=0.2)

#one-hot encoding
yTrain = lencoder.fit_transform(yTrain)
yTest = lencoder.fit_transform(yTest)

yTrain = tf.one_hot(yTrain, depth=len(lencoder.classes_))
yTest = tf.one_hot(yTest, depth=len(lencoder.classes_))

In [28]:
xTrain = xTrain.astype(str)
xTest = xTest.astype(str)

In [29]:
# one-hot enocding for valData and segmenting
valX = valData['Text'].values
valX = valX.astype(str)

# one-hot encoding
valY = lencoder.fit_transform(valData['Sentiment'].values)
valY = tf.one_hot(valY, depth=len(lencoder.classes_))

In [29]:
# Encode Text Sentiments for testData

from tensorflow.keras.layers import TextVectorization

# Create a TextVectorization layer
vectorizer = TextVectorization(max_tokens=50000, output_mode='int', output_sequence_length=15)

# adapt with trainData
vectorizer.adapt(xTrain)

# Build Sentiment model
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(1,), dtype=tf.string),
    vectorizer,
    tf.keras.layers.Embedding(len(vectorizer.get_vocabulary()), 64, mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(4, activation='softmax')
])

model.compile(optimizer="adam", loss=tf.keras.losses.categorical_crossentropy, metrics=['accuracy'])

In [30]:
# train model

# Training call-backs
terminationProtocol = tf.keras.callbacks.EarlyStopping(
    monitor='loss',
    patience=8,
    restore_best_weights=True
)

model.fit(xTrain,
          yTrain, 
          epochs=10,
          batch_size=30,
          validation_data=(valX, valY),
          shuffle=True,
          callbacks=[terminationProtocol])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x17b335c4850>

In [31]:
# save model for reuse/future
model.save('Sentiment-Model.tf')

INFO:tensorflow:Assets written to: Sentiment-Model.tf\assets


INFO:tensorflow:Assets written to: Sentiment-Model.tf\assets


In [35]:
# Evaluate the model on data outside its training data..
model.evaluate(xTest.values, yTest)



[0.7256035208702087, 0.861707329750061]

In [52]:
# Let's test the model out

text = "Amazon That was bad delivery" # Text is in 2 parts 'entity n text' -> Amazon is the entity, what follows is the text

pred = model.predict([text])



In [53]:
# tensor output of the final prediction
pred

array([[0.8514418 , 0.03204292, 0.08525458, 0.03126077]], dtype=float32)

In [55]:
# lets see the user readable output
sentiment = np.argmax(pred)

if sentiment == 0:
    print("Negative")
elif sentiment == 1:
    print("Positive")
elif sentiment == 2:
    print("Neutral")
else:
    print("Irrelevant")

Negative


In [30]:
# reload the model
model = tf.keras.models.load_model("Sentiment-Model.tf")

In [31]:
# lets evaluate it on a some (not all) of our testData

# segment some part of testData for evaluation
# decided to use 1000...which is quiet alot testData
dataX = xTest.values[:1000]
dataY = yTest[:1000] 

# evaluate model
model.evaluate(dataX, dataY)



[0.23690973222255707, 0.9459999799728394]

In [32]:
# lets use in the way we will in production

text = "Amazon Got a good service delivery."
# the text is made up of two parts: the entity and the text
# for text, entity is Amazon and the text is what proceeds after

# using the model to determine the prediction
sentiment = model.predict([text]) # value must be put in an array
sentiment



array([[1.3686526e-03, 9.9510843e-01, 3.1261600e-03, 3.9664903e-04]],
      dtype=float32)

In [33]:
# determine which class the text-sentiment belongs to
res = np.argmax(sentiment)

if res == 0:
    print("Negative")
elif res == 1:
    print("Positive")
elif res == 2:
    print("Neutral")
else:
    print("Irrelevant")

Positive
