<a href="https://colab.research.google.com/github/DeanFord7/CMM307-AdvancedArtificialIntelligence/blob/main/DeanFord1702994-CMM307Coursework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Section 1 - Dataset

The Twitter Sentiment Analysis (TSA) dataset contains over 70,000 records of tweets related to specific entities, whether that be a compnay, a game etc. Each record has four columns, an ID for the tweet, the entity the tweet is referencing, the text conatained within the tweet and the sentiment. <br><br>
The aim of the task is to use the text of each tweet to predict and assign a sentiment classification of one of the following to the tweet:
<ul>
<li>Positive</li>
<li>Negative</li>
<li>Neutral</li>
<li>Irrelevant</li>
</ul>
In the dataset, 'Neutral' and 'Irrelevant' are seperate labels for the sentiment classification but both are considered to be the same result.

In [6]:
import kagglehub
import os

# Load dataset from kagglehub
path = kagglehub.dataset_download("jp797498e/twitter-entity-sentiment-analysis")

print("Path to dataset files:", path)

files = os.listdir(path)
print("Files in directory:", files)

Downloading from https://www.kaggle.com/api/v1/datasets/download/jp797498e/twitter-entity-sentiment-analysis?dataset_version_number=2...


100%|██████████| 1.99M/1.99M [00:00<00:00, 105MB/s]

Extracting files...
Path to dataset files: /root/.cache/kagglehub/datasets/jp797498e/twitter-entity-sentiment-analysis/versions/2
Files in directory: ['twitter_training.csv', 'twitter_validation.csv']





In [7]:
import pandas as pd

# Dataset is already split into a training and testing file so retrieve both
training_file_path = os.path.join(path, "twitter_training.csv")
validation_file_path = os.path.join(path, "twitter_validation.csv")

column_names = ["tweet_id", "entity", "sentiment", "tweet_text"]

# Load datasets into dataframes
training_df = pd.read_csv(training_file_path, names=column_names)
validation_df = pd.read_csv(validation_file_path, names=column_names)

#print(training_df.head())
print("Train", len(training_df))
print("Val", len(validation_df))
#print(validation_df.head())

# Combine the dataframes as the pre made validation set has only 1,000 values compared to the testing sets 69,000
sentiment_df = pd.concat([training_df, validation_df], ignore_index=True)

print("Combined Dataframe:")
print(sentiment_df.head())
print("Records: ", len(sentiment_df))

# The dataset contains 6 records for each tweet with the first being the original and the next 5 being slight alterations of the text
# Remove the additional records for each tweet as the original text in the most relevant and the duplicates only have minor grammatical changes
filtered_sentiment_df = sentiment_df.drop_duplicates(subset=["tweet_id"], keep="first")

filtered_sentiment_df = filtered_sentiment_df[filtered_sentiment_df['tweet_text'].notnull()]  # Remove NaN values
filtered_sentiment_df = filtered_sentiment_df[filtered_sentiment_df['tweet_text'].str.strip() != '']  # Remove empty strings

# Change sentiment values from 'Irrelevant' to 'Neutral'
# 'Irrelevant' and 'Neutral' are treated as the same result in the dataset so convert all to 'Neutral' to avoid confusion in the results
filtered_sentiment_df.loc[filtered_sentiment_df['sentiment'] == 'Irrelevant', 'sentiment'] = 'Neutral'

print("Filtered Dataframe:")
print(filtered_sentiment_df.head())
print("Records: ", len(filtered_sentiment_df))


Train 74682
Val 1000
Combined Dataframe:
   tweet_id       entity sentiment  \
0      2401  Borderlands  Positive   
1      2401  Borderlands  Positive   
2      2401  Borderlands  Positive   
3      2401  Borderlands  Positive   
4      2401  Borderlands  Positive   

                                          tweet_text  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  
Records:  75682
Filtered Dataframe:
    tweet_id       entity sentiment  \
0       2401  Borderlands  Positive   
6       2402  Borderlands  Positive   
12      2403  Borderlands   Neutral   
18      2404  Borderlands  Positive   
24      2405  Borderlands  Negative   

                                           tweet_text  
0   im getting on borderlands and i will murder yo...  
6   So I spent a 

In [8]:
tweets = list(filtered_sentiment_df['tweet_text'])
sentiments = list(filtered_sentiment_df['sentiment'])

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

sentiments_numerical = label_encoder.fit_transform(sentiments)



In [9]:
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def prep(sentences):
  prep_text = []

  words = stopwords.words('english')
  entity_labels = list(filtered_sentiment_df['entity'])
  words.extend(entity_labels)

  for sent in sentences:
    token_text = word_tokenize(sent)
    normalised_text = [token.lower() for token in token_text if token.isalpha()]

    swr_text = [token for token in normalised_text if token not in words]

    prep_text += [[lemmatizer.lemmatize(word) for word in swr_text]]
  prep_sentences = [" ".join(sentence) for sentence in prep_text]
  return prep_sentences



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


# Section 2 - Representation Learning

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_base = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))

# Section 3 -  Algorithms

In [11]:
import tensorflow
import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedKFold, GridSearchCV

## Multi-Layer Perceptron

In [12]:
def mlp(dataset_size, num_classes):
  input_shape = (dataset_size,)

  model = Sequential()
  model.add(Dense(128, input_shape=input_shape, activation='relu'))
  model.add(Dense(128, activation='relu'))
  model.add(Dense(num_classes, activation='softmax')) #remember this format from last year? We effectively parameterse two hidden layers and one output layer
  return model

kf = StratifiedKFold(n_splits=5, shuffle=True)
xnp = np.array(tweets)
ynp = np.array(sentiments_numerical)

mlp_accuracy_score = []

for train, test in kf.split(xnp,ynp):
  x_train, x_test, y_train, y_test = xnp[train], xnp[test], ynp[train], ynp[test]

  x_train = prep(x_train)
  x_test = prep(x_test)

  tfidf = tfidf_base
  x_train = tfidf.fit_transform(x_train)
  x_train = x_train.todense()
  x_test = tfidf.transform(x_test)
  x_test = x_test.todense()
  num_classes = len(np.unique(y_train))
  model = mlp(1000, num_classes)
  y_train = to_categorical(y_train, num_classes)
  y_test = to_categorical(y_test, num_classes)

  # Code taken and adapted from https://keras.io/api/callbacks/early_stopping/
  early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
  model.fit(x_train, y_train, epochs=10, batch_size=250, verbose=1, validation_split=0.2, callbacks=[early_stopping])

  test_results = model.evaluate(x_test, y_test, verbose=1)
  print(f'Test results - Loss: {test_results[0]} - Accuracy: {test_results[1]}%')

  mlp_accuracy_score.append(test_results[1])


print("MLP Accuracy:", np.mean(mlp_accuracy_score))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.4161 - loss: 1.0802 - val_accuracy: 0.3921 - val_loss: 1.0614
Epoch 2/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.4743 - loss: 0.9840 - val_accuracy: 0.5789 - val_loss: 0.9175
Epoch 3/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.6628 - loss: 0.7790 - val_accuracy: 0.5835 - val_loss: 0.9069
Epoch 4/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.6998 - loss: 0.6999 - val_accuracy: 0.5463 - val_loss: 0.9616
Epoch 5/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.7362 - loss: 0.6373 - val_accuracy: 0.5407 - val_loss: 1.0079
Epoch 6/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.7678 - loss: 0.5809 - val_accuracy: 0.5387 - val_loss: 1.0596
Epoch 7/10
[1m32/32[0m [32m━━━━

## k-Nearest Neighbour

In [13]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier


knn_accuracy_score = []

for train, test in kf.split(xnp,ynp):

  x_train, x_test, y_train, y_test = xnp[train], xnp[test], ynp[train], ynp[test]

  x_train = prep(x_train)
  x_test = prep(x_test)

  tfidf = tfidf_base
  x_train = tfidf.fit_transform(x_train)
  x_train = np.asarray(x_train.todense())
  x_test = tfidf.transform(x_test)
  x_test = np.asarray(x_test.todense())
  param_grid = {
        'n_neighbors': [1, 3, 5, 7]
    }

  grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=3, scoring='accuracy')
  grid_search.fit(x_train, y_train)

  # Get the best model
  best_kNN = grid_search.best_estimator_

  knn_predictions = best_kNN.predict(x_test)
  knn_acc = accuracy_score(knn_predictions, y_test)
  knn_accuracy_score.append(knn_acc)

print("kNN Accuracy:", np.mean(knn_accuracy_score))

kNN Accuracy: 0.4743788187372709


## Support Vector Machine

In [14]:
# Code taken and adapted from https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

from sklearn import naive_bayes

nb_accuracy_scores = []

# Encode labels
Encoder = LabelEncoder()
ynp_encoded = Encoder.fit_transform(ynp)  # Encoding target labels

for train_idx, test_idx in kf.split(xnp, ynp_encoded):
    # Split data into train and test sets for this fold
    x_train, x_test = np.array(tweets)[train_idx], np.array(tweets)[test_idx]
    y_train, y_test = ynp_encoded[train_idx], ynp_encoded[test_idx]

    x_train = prep(x_train)
    x_test = prep(x_test)

    # TF-IDF Vectorization
    tfidf = tfidf_base
    x_train = tfidf.fit_transform(x_train)
    x_train = np.asarray(x_train.todense())
    x_test = tfidf.transform(x_test)
    x_test = np.asarray(x_test.todense())

    naive = naive_bayes.MultinomialNB()
    naive.fit(x_train, y_train)

    nb_predictions = naive.predict(x_test)

    accuracy = accuracy_score(y_test, nb_predictions)
    nb_accuracy_scores.append((accuracy) * 100)

print(f"Naive Bayes Accuracy:", np.mean(nb_accuracy_scores))

Naive Bayes Accuracy: 60.114052953156815


# Section 4 - Evaluation

# Section 5 - Paper Overview

# Section 6 - Algorithms

In [15]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.preprocessing.text import Tokenizer

In [16]:
vocab_size = 20000
random_embedding_dimension = 32
glove_embedding_dimension = 200
max_length = 40
lstm_units_random = 100
lstm_units_glove = 128
dense_units_random = 32
dense_units_glove = 64

In [17]:
# The paper only considers positive and negative sentiments for the algorithm
# Because of this we filter the dataset to only include these rows for use in the models
binary_sentiment_df = filtered_sentiment_df.drop(filtered_sentiment_df[filtered_sentiment_df['sentiment'] == 'Neutral'].index)

binary_tweets = list(binary_sentiment_df['tweet_text'])
# Pre-process the tweets
binary_tweets = prep(binary_tweets)
binary_sentiments = list(binary_sentiment_df['sentiment'])

# Convert the sentiments to numerical values
label_encoder = LabelEncoder()
binary_sentiments_numerical = label_encoder.fit_transform(binary_sentiments)

In [18]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(binary_tweets)

# Convert the tweets into sequneces and pad them to fit the max length of 40
sequences = tokenizer.texts_to_sequences(binary_tweets)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

binary_xnp = np.array(padded_sequences)
binary_ynp = np.array(binary_sentiments_numerical)

In [29]:
# Code taken and adapted from:
# https://stackoverflow.com/questions/35089956/how-to-use-the-embedding-layer-for-recurrent-neural-network-rnn-in-keras
# https://keras.io/api/layers/core_layers/embedding/
# https://keras.io/api/layers/regularization_layers/dropout/
# https://keras.io/api/layers/recurrent_layers/lstm/

# Add the layers to the random embedding model
random_embedding_model = Sequential()
random_embedding_model.add(Embedding(input_dim=vocab_size, output_dim=random_embedding_dimension, input_length=max_length))
random_embedding_model.add(Dropout(0.2))
random_embedding_model.add(LSTM(lstm_units_random))
random_embedding_model.add(Dense(dense_units_random, activation='relu'))
random_embedding_model.add(Dropout(0.2))
random_embedding_model.add(Dense(1, activation='sigmoid'))

In [31]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(binary_xnp, binary_ynp, test_size=0.2, random_state=42)

random_embedding_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Paper makes no mention of how many epochs were run
# Through testing it was found the first epoch recorded the highets accuracy with a large drop off before recovering to a value lower than the first
history = random_embedding_model.fit(x_train, y_train, epochs=8, batch_size=32, validation_split=0.2)

loss, accuracy = random_embedding_model.evaluate(x_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

Epoch 1/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 60ms/step - accuracy: 0.6053 - loss: 0.6555 - val_accuracy: 0.6390 - val_loss: 0.6392
Epoch 2/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 78ms/step - accuracy: 0.6553 - loss: 0.6102 - val_accuracy: 0.6215 - val_loss: 0.6568
Epoch 3/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 74ms/step - accuracy: 0.6838 - loss: 0.5833 - val_accuracy: 0.4589 - val_loss: 0.6936
Epoch 4/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 70ms/step - accuracy: 0.5008 - loss: 0.6936 - val_accuracy: 0.4589 - val_loss: 0.6933
Epoch 5/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 58ms/step - accuracy: 0.5039 - loss: 0.6933 - val_accuracy: 0.5411 - val_loss: 0.6915
Epoch 6/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 39ms/step - accuracy: 0.5160 - loss: 0.6927 - val_accuracy: 0.5411 - val_loss: 0.6912
Epoch 7/8
[1m143/143[0

In [21]:
# Download GloVe embeddings from stanford
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip -d /content/glove_data

# Set the path to the GloVe file to be used
glove_file_path = '/content/glove_data/glove.6B.200d.txt'

--2024-11-29 20:13:02--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-11-29 20:13:02--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-11-29 20:13:03--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [23]:
# Code taken and adapted from https://medium.com/analytics-vidhya/basics-of-using-pre-trained-glove-vectors-in-python-d38905f356db

glove_embeddings_index = {}

# Create the GloVe matrix from the txt file
with open(glove_file_path, 'r', encoding='utf-8') as f:
  for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            glove_embeddings_index[word] = vector

glove_matrix = np.zeros((vocab_size, glove_embedding_dimension))
for word, index in tokenizer.word_index.items():
    if index < vocab_size:
        embedding_vector = glove_embeddings_index.get(word)
        if embedding_vector is not None:
            glove_matrix[index] = embedding_vector

# Add the layers to the GloVe embedding model
glove_embedding_model = Sequential()
glove_embedding_model.add(Embedding(input_dim=vocab_size, output_dim=glove_embedding_dimension, input_length=max_length, weights=[glove_matrix], trainable=True))
glove_embedding_model.add(Dropout(0.4))
glove_embedding_model.add(LSTM(lstm_units_glove))
glove_embedding_model.add(Dense(dense_units_glove, activation='relu'))
glove_embedding_model.add(Dropout(0.5))
glove_embedding_model.add(Dense(1, activation='sigmoid'))



In [25]:
x_train, x_test, y_train, y_test = train_test_split(binary_xnp, binary_ynp, test_size=0.2, random_state=42)

glove_embedding_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Paper makes no mention of how many epochs were run
# Through testing it was found the highest accuracy came on the first epoch with a large drop off until epoch 6 which would significantly lower the average
history = glove_embedding_model.fit(x_train, y_train, epochs=1, batch_size=32, validation_split=0.2)

loss, accuracy = glove_embedding_model.evaluate(x_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

Epoch 1/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 160ms/step - accuracy: 0.8603 - loss: 0.3509 - val_accuracy: 0.7920 - val_loss: 0.5745
Epoch 2/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 163ms/step - accuracy: 0.8719 - loss: 0.3509 - val_accuracy: 0.6670 - val_loss: 0.6217
Epoch 3/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 161ms/step - accuracy: 0.7204 - loss: 0.5357 - val_accuracy: 0.6809 - val_loss: 0.6044
Epoch 4/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 162ms/step - accuracy: 0.7874 - loss: 0.4623 - val_accuracy: 0.6914 - val_loss: 0.6014
Epoch 5/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 167ms/step - accuracy: 0.7152 - loss: 0.5423 - val_accuracy: 0.6897 - val_loss: 0.6180
Epoch 6/8
[1m143/143[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 190ms/step - accuracy: 0.8442 - loss: 0.3782 - val_accuracy: 0.7937 - val_loss: 0.6146
Epoch 7/8
[1m14

# Section 7 - Evaluation