# Imports

In [4]:
import sys
# Record the interpreter version this notebook was run with.
print(sys.version)

3.9.18 (main, Sep 11 2023, 13:30:38) [MSC v.1916 64 bit (AMD64)]


In [21]:
import re

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from nltk.corpus import stopwords
from sklearn.metrics import classification_report

In [3]:
# Ask TensorFlow which GPU devices it can see; a count of 0 means none were found.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


# Preprocessing

In [4]:
# Dataset CSV paths.
# NOTE(review): these are absolute paths on the original author's machine;
# point them at your local copies of the CSV files before running.
training_path = r'C:\Users\chris\Documents\cs5100\CS5100_FinalProject\twitter_training.csv'
testing_path = r'C:\Users\chris\Documents\cs5100\CS5100_FinalProject\twitter_validation.csv'

# The CSV files have no header row: their first line is already a tweet record.
# Reading them with the default header handling turns that record into the
# column names and silently drops it from the data, so the columns are named
# explicitly instead. The first two names are descriptive guesses based on the
# values seen in the files (a numeric id and a topic such as "Borderlands").
csv_columns = ['tweet_id', 'entity', 'sentiment', 'text']

# Import data
train_data = pd.read_csv(training_path, header=None, names=csv_columns)
test_data = pd.read_csv(testing_path, header=None, names=csv_columns)

# Keep only the columns used downstream: the sentiment label and the tweet text.
# Returning new frames (instead of dropping in place) keeps this cell re-runnable.
train_data = train_data.drop(columns=['tweet_id', 'entity'])
test_data = test_data.drop(columns=['tweet_id', 'entity'])

In [5]:
# Labels that are discarded so that only two sentiment classes remain.
unused_labels = ['Neutral', 'Irrelevant']

# For each split: drop rows with missing values, remove the unused labels,
# then renumber the index from zero.
train_data = (
    train_data
    .dropna()
    .loc[lambda frame: ~frame['sentiment'].isin(unused_labels)]
    .reset_index(drop=True)
)
test_data = (
    test_data
    .dropna()
    .loc[lambda frame: ~frame['sentiment'].isin(unused_labels)]
    .reset_index(drop=True)
)

# Matches every character that is not an ASCII letter.
_NON_LETTER_PATTERN = re.compile("[^a-zA-Z]")

# Lazily filled cache for the English stopword set, so the corpus is read at
# most once instead of once per tweet.
_stopword_cache = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them on first use."""
    # Loaded on demand (not at definition time) so the corpus only has to be
    # available by the time the first tweet is cleaned.
    if "english" not in _stopword_cache:
        _stopword_cache["english"] = frozenset(stopwords.words("english"))
    return _stopword_cache["english"]


def tweet_to_words(tweet):
    """Normalise a raw tweet into a space-separated string of meaningful words.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, and English stopwords are
    removed.

    Args:
        tweet: The raw tweet text (a string).

    Returns:
        The remaining words joined by single spaces; an empty string when
        nothing is left.
    """
    letters_only = _NON_LETTER_PATTERN.sub(" ", tweet)
    stops = _english_stopwords()
    return " ".join(w for w in letters_only.lower().split() if w not in stops)

# Fetch the NLTK stopword corpus (a no-op when it is already up to date)
# before any tweet is cleaned.
nltk.download('stopwords')

# Store the cleaned version of every tweet next to the raw text in both splits.
for frame in (train_data, test_data):
    frame['clean_tweet'] = frame['text'].apply(tweet_to_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
# Encode the sentiment label as an integer target: 1 for 'Positive', 0 for
# every other label (expected to be only 'Negative' at this point, since the
# other labels were filtered out earlier). The comparison is vectorised rather
# than applied row by row.
train_data['state'] = (train_data['sentiment'] == 'Positive').astype('int64')
test_data['state'] = (test_data['sentiment'] == 'Positive').astype('int64')

# Spot-check a random handful of rows.
train_data.sample(10)

Unnamed: 0,sentiment,text,clean_tweet,state
23187,Negative,Is Battlefield 1 a good game and why?,battlefield good game,0
29915,Positive,Red Dead Redemption 2 >>> The V,red dead redemption v,1
36462,Negative,I hate that I need to get Madden now,hate need get madden,0
312,Positive,"I know it doesn't look like much, but Borderla...",know look like much borderlands favorite game ...,1
41045,Positive,30 minutes to my latest video on @ GhostRecons...,minutes latest video ghostrecons pvp even thou...,1
5291,Positive,I’m very excited! Finally a solution for these...,excited finally solution money troubles thanks...,1
22702,Positive,Battlefield Brigade 1 looked REALLY good,battlefield brigade looked really good,1
42082,Positive,Dell XPS 17 9700 Graphics Card Missing and Fli...,dell xps graphics card missing flickering scre...,1
23374,Negative,India's Parthenon after learning that the gove...,india parthenon learning government banned,0
2269,Negative,@ Treyarch I pre-ordered black ops Cold War an...,treyarch pre ordered black ops cold war let pl...,0


In [7]:
# Features are the cleaned tweet strings; labels are the 0/1 state column.
x_train, y_train = train_data['clean_tweet'], train_data['state']
x_test, y_test = test_data['clean_tweet'], test_data['state']

In [8]:
# train_data.groupby('sentiment').describe() # Checking for imbalance in the dataset

In [9]:
# TF Hub handles for the BERT preprocessing model and the matching encoder.
preprocess_handle = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
encoder_handle = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

# Wrap both models as Keras layers so they can be used in the model graph.
bert_preprocess = hub.KerasLayer(preprocess_handle)
bert_encoder = hub.KerasLayer(encoder_handle)

In [10]:
# --- BERT layers ---
# Raw strings go through the preprocessing layer and then through the encoder.
text_input = tf.keras.layers.Input(name='text', dtype=tf.string, shape=())
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# --- Neural network layers ---
# Todo: look at differences between functional and sequential model
# Todo: Look up drop layers and cosine similarity
dropout_layer = tf.keras.layers.Dropout(0.1, name='dropout')
dropout = dropout_layer(outputs['pooled_output'])

# One sigmoid unit: a value above 0.5 is read as positive sentiment,
# anything below as negative.
output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name='output')
output = output_layer(dropout)

# --- Final model ---
model = tf.keras.Model(inputs=[text_input], outputs=[output])

In [11]:
model.summary() # Print each layer with its output shape, parameter count and inputs

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 keras_layer (KerasLayer)       {'input_type_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128),                                                          
                                 'input_word_ids':                                                
                                (None, 128)}                                                  

In [13]:
# Metrics tracked in addition to the loss; their order here is the order in
# which they are reported.
accuracy_metric = tf.keras.metrics.BinaryAccuracy(name='accuracy')
precision_metric = tf.keras.metrics.Precision(name='precision')
recall_metric = tf.keras.metrics.Recall(name='recall')
metrics = [accuracy_metric, precision_metric, recall_metric]

# Training is driven by the loss function.
# Todo: look at other potential loss functions to justify choice
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=metrics,
)

In [14]:
# Train for 10 epochs. The History object returned by fit() is kept so the
# per-epoch loss and metric values stay available (history.history) instead of
# being discarded after the cell finishes.
history = model.fit(x_train, y_train, epochs=10)





Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x29e123fd820>

In [15]:
# Score the trained model on the test split. The returned list holds the loss
# followed by the compiled metrics in order: accuracy, precision, recall.
model.evaluate(x_test, y_test)







[0.5199918150901794,
 0.7587476968765259,
 0.7664233446121216,
 0.7581227421760559]

In [16]:
# Model outputs for every test tweet, flattened to a 1-D array of sigmoid scores.
positive_probability = model.predict(x_test).flatten()

# Scores above 0.5 become class 1, everything else class 0.
y_predicted = np.where(positive_probability > 0.5, 1, 0)







In [22]:
# Per-class precision / recall / F1 on the test split. The label order is pinned
# so each row is named after the sentiment it encodes (0 = Negative, 1 = Positive).
# classification_report is imported with the other dependencies in the import cell.
print(classification_report(
    y_test,
    y_predicted,
    labels=[0, 1],
    target_names=['Negative', 'Positive'],
))

              precision    recall  f1-score   support

           0       0.75      0.76      0.76       266
           1       0.77      0.76      0.76       277

    accuracy                           0.76       543
   macro avg       0.76      0.76      0.76       543
weighted avg       0.76      0.76      0.76       543

