<a href="https://colab.research.google.com/github/DeanFord7/CMM307-AdvancedArtificialIntelligence/blob/main/DeanFord1702994-CMM307Coursework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Section 1 - Dataset

The Twitter Sentiment Analysis (TSA) dataset contains over 70,000 records of tweets related to specific entities, whether that be a compnay, a game etc. Each record has four columns, an ID for the tweet, the entity the tweet is referencing, the text conatained within the tweet and the sentiment. <br><br>
The aim of the task is to use the text of each tweet to predict and assign a sentiment classification of one of the following to the tweet:
<ul>
<li>Positive</li>
<li>Negative</li>
<li>Neutral</li>
<li>Irrelevant</li>
</ul>
In the dataset, 'Neutral' and 'Irrelevant' are seperate labels for the sentiment classification but both are considered to be the same result.

In [17]:
import kagglehub
import os

# Load dataset from kagglehub
path = kagglehub.dataset_download("jp797498e/twitter-entity-sentiment-analysis")

print("Path to dataset files:", path)

files = os.listdir(path)
print("Files in directory:", files)

Path to dataset files: /root/.cache/kagglehub/datasets/jp797498e/twitter-entity-sentiment-analysis/versions/2
Files in directory: ['twitter_training.csv', 'twitter_validation.csv']


In [18]:
import pandas as pd

# Dataset is already split into a training and testing file so retrieve both
training_file_path = os.path.join(path, "twitter_training.csv")
validation_file_path = os.path.join(path, "twitter_validation.csv")

column_names = ["tweet_id", "entity", "sentiment", "tweet_text"]

# Load datasets into dataframes
training_df = pd.read_csv(training_file_path, names=column_names)
validation_df = pd.read_csv(validation_file_path, names=column_names)

#print(training_df.head())
print("Train", len(training_df))
print("Val", len(validation_df))
#print(validation_df.head())

# Combine the dataframes as the pre made validation set has only 1,000 values compared to the testing sets 69,000
sentiment_df = pd.concat([training_df, validation_df], ignore_index=True)

print("Combined Dataframe:")
print(sentiment_df.head())
print("Records: ", len(sentiment_df))

# The dataset contains 6 records for each tweet with the first being the original and the next 5 being slight alterations of the text
# Remove the additional records for each tweet as the original text in the most relevant and the duplicates only have minor grammatical changes
filtered_sentiment_df = sentiment_df.drop_duplicates(subset=["tweet_id"], keep="first")

filtered_sentiment_df = filtered_sentiment_df[filtered_sentiment_df['tweet_text'].notnull()]  # Remove NaN values
filtered_sentiment_df = filtered_sentiment_df[filtered_sentiment_df['tweet_text'].str.strip() != '']  # Remove empty strings

# Change sentiment values from 'Irrelevant' to 'Neutral'
# 'Irrelevant' and 'Neutral' are treated as the same result in the dataset so convert all to 'Neutral' to avoid confusion in the results
filtered_sentiment_df.loc[filtered_sentiment_df['sentiment'] == 'Irrelevant', 'sentiment'] = 'Neutral'

print("Filtered Dataframe:")
print(filtered_sentiment_df.head())
print("Records: ", len(filtered_sentiment_df))


Train 74682
Val 1000
Combined Dataframe:
   tweet_id       entity sentiment  \
0      2401  Borderlands  Positive   
1      2401  Borderlands  Positive   
2      2401  Borderlands  Positive   
3      2401  Borderlands  Positive   
4      2401  Borderlands  Positive   

                                          tweet_text  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  
Records:  75682
Filtered Dataframe:
    tweet_id       entity sentiment  \
0       2401  Borderlands  Positive   
6       2402  Borderlands  Positive   
12      2403  Borderlands   Neutral   
18      2404  Borderlands  Positive   
24      2405  Borderlands  Negative   

                                           tweet_text  
0   im getting on borderlands and i will murder yo...  
6   So I spent a 

In [19]:
tweets = list(filtered_sentiment_df['tweet_text'])
sentiments = list(filtered_sentiment_df['sentiment'])

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

sentiments_numerical = label_encoder.fit_transform(sentiments)



In [32]:
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def prep(sentences):
  prep_text = []

  words = stopwords.words('english')
  entity_labels = list(filtered_sentiment_df['entity'])
  words.extend(entity_labels)

  for sent in sentences:
    token_text = word_tokenize(sent)
    normalised_text = [token.lower() for token in token_text if token.isalpha()]

    swr_text = [token for token in normalised_text if token not in words]

    prep_text += [[lemmatizer.lemmatize(word) for word in swr_text]]
  prep_sentences = [" ".join(sentence) for sentence in prep_text]
  return prep_sentences



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Section 2 - Representation Learning

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
acc_score = []
tfidf_base = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))

# Section 3 -  Algorithms

## Multi-Layer Perceptron

In [27]:
import tensorflow
import numpy as np

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedKFold

def mlp(dataset_size, num_classes):
  input_shape = (dataset_size,)

  model = Sequential()
  model.add(Dense(128, input_shape=input_shape, activation='relu'))
  model.add(Dense(128, activation='relu'))
  model.add(Dense(num_classes, activation='softmax')) #remember this format from last year? We effectively parameterse two hidden layers and one output layer
  return model

kf = StratifiedKFold(n_splits=5, shuffle=True)
xnp = np.array(tweets) #convert to numpy to standardise our arrays for the split
ynp = np.array(sentiments_numerical)

for train, test in kf.split(xnp,ynp):
  x_train, x_test, y_train, y_test = xnp[train], xnp[test], ynp[train], ynp[test]

  x_train = prep(x_train) #we preprocess our train and test datasets
  x_test = prep(x_test)

  tfidf = tfidf_base #notice we copy a blank tfidf so there is no leakage
  x_train = tfidf.fit_transform(x_train)
  x_train = x_train.todense() #by default, tfidf will output a sparse matris to conserve memory. This is incompatible with our deep learner
  x_test = tfidf.transform(x_test)
  x_test = x_test.todense()
  num_classes = len(np.unique(y_train))
  model = mlp(1000, num_classes) #we also instantiate a new mlp to prevent leakage of train and test set
  y_train = to_categorical(y_train, num_classes) #convert y to one hot vectors
  y_test = to_categorical(y_test, num_classes)

  early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

  # Configure the model and start training
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) #we have parameterised with fairly standard metrics - do feel free to alter and investigate
  model.fit(x_train, y_train, epochs=10, batch_size=250, verbose=1, validation_split=0.2, callbacks=[early_stopping])

  # Test the model after training
  test_results = model.evaluate(x_test, y_test, verbose=1)
  print(f'Test results - Loss: {test_results[0]} - Accuracy: {test_results[1]}%')

  acc_score.append(test_results[1])


print("Accuracy:", np.mean(acc_score))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 20ms/step - accuracy: 0.4197 - loss: 1.0806 - val_accuracy: 0.3951 - val_loss: 1.0530
Epoch 2/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.5118 - loss: 0.9489 - val_accuracy: 0.5708 - val_loss: 0.9244
Epoch 3/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.7010 - loss: 0.7052 - val_accuracy: 0.5759 - val_loss: 0.9340
Epoch 4/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.7637 - loss: 0.5884 - val_accuracy: 0.5667 - val_loss: 1.0689
Epoch 5/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.8115 - loss: 0.5031 - val_accuracy: 0.5484 - val_loss: 1.1553
Epoch 6/10
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.8402 - loss: 0.4374 - val_accuracy: 0.5428 - val_loss: 1.2427
Epoch 7/10
[1m32/32[0m [32m━━━━

KeyboardInterrupt: 

# Section 4 - Evaluation

# Section 5 - Paper Overview

# Section 6 - Algorithms

In [46]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow.keras.preprocessing.text import Tokenizer

In [47]:
vocab_size = 20000
random_embedding_dimension = 32
glove_embedding_dimension = 200
max_length = 40
lstm_units_random = 100
lstm_units_glove = 128
dense_units_random = 32
dense_units_glove = 64

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences


tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(tweets)

sequences = tokenizer.texts_to_sequences(tweets)

padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

xnp = np.array(padded_sequences)
ynp = np.array(sentiments_numerical)

In [36]:
random_embedding_model = Sequential([
    Embedding(input_dim=vocab_size,
              output_dim=random_embedding_dimension,
              input_length=max_length),
    Dropout(0.2),
    LSTM(lstm_units_random),
    Dense(dense_units_random, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])




In [43]:
# Download GloVe embeddings from stanford
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip -d /content/glove_data

# Set the path to the GloVe file to be used
glove_file_path = '/content/glove_data/glove.6B.200d.txt'

--2024-11-28 15:26:14--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-11-28 15:26:14--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-11-28 15:26:14--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [48]:
glove_embeddings_index = {}
with open(glove_file_path, 'r', encoding='utf-8') as f:
  for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            glove_embeddings_index[word] = vector

glove_matrix = np.zeros((vocab_size, glove_embedding_dimension))
for word, index in tokenizer.word_index.items():
    if index < vocab_size:
        embedding_vector = glove_embeddings_index.get(word)
        if embedding_vector is not None:
            glove_matrix[index] = embedding_vector

glove_embedding_model = Sequential([
    Embedding(input_dim=vocab_size,
              output_dim=glove_embedding_dimension,
              input_length=max_length,
              weights=[glove_matrix],
              trainable=True),
    Dropout(0.4),
    LSTM(lstm_units_glove),
    Dense(dense_units_glove, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

In [42]:
from sklearn.model_selection import train_test_split


x_train, x_test, y_train, y_test = train_test_split(xnp, ynp, test_size=0.2, random_state=42)

random_embedding_model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])

history = random_embedding_model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = random_embedding_model.evaluate(x_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

Epoch 1/10
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 46ms/step - accuracy: 0.5342 - loss: 0.0000e+00 - val_accuracy: 0.4664 - val_loss: 0.0000e+00
Epoch 2/10
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 48ms/step - accuracy: 0.4987 - loss: 0.0000e+00 - val_accuracy: 0.4659 - val_loss: 0.0000e+00
Epoch 3/10
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 45ms/step - accuracy: 0.5236 - loss: 0.0000e+00 - val_accuracy: 0.4608 - val_loss: 0.0000e+00
Epoch 4/10
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 45ms/step - accuracy: 0.5298 - loss: 0.0000e+00 - val_accuracy: 0.4455 - val_loss: 0.0000e+00
Epoch 5/10
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 38ms/step - accuracy: 0.5213 - loss: 0.0000e+00 - val_accuracy: 0.4598 - val_loss: 0.0000e+00
Epoch 6/10
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 43ms/step - accuracy: 0.5152 - loss: 0.0000e+00 - val_accuracy:

In [52]:


tokenizer = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(tweets)
sequences = tokenizer.texts_to_sequences(tweets)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

xnp = np.array(padded_sequences)
ynp = np.array(sentiments_numerical)

x_train, x_test, y_train, y_test = train_test_split(xnp, ynp, test_size=0.2, random_state=42)

glove_embedding_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = glove_embedding_model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = glove_embedding_model.evaluate(x_test, y_test)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

Epoch 1/10


  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 187ms/step - accuracy: 0.3109 - loss: 0.0000e+00 - val_accuracy: 0.2790 - val_loss: 0.0000e+00
Epoch 2/10
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 189ms/step - accuracy: 0.2994 - loss: 0.0000e+00 - val_accuracy: 0.2790 - val_loss: 0.0000e+00
Epoch 3/10
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 191ms/step - accuracy: 0.2946 - loss: 0.0000e+00 - val_accuracy: 0.2790 - val_loss: 0.0000e+00
Epoch 4/10
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 184ms/step - accuracy: 0.3078 - loss: 0.0000e+00 - val_accuracy: 0.2790 - val_loss: 0.0000e+00
Epoch 5/10
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m85s[0m 196ms/step - accuracy: 0.3002 - loss: 0.0000e+00 - val_accuracy: 0.2790 - val_loss: 0.0000e+00
Epoch 6/10
[1m246/246[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 180ms/step - accuracy: 0.3101 - loss: 0.0000e+00 - val_accuracy: 0.2

# Section 7 - Evaluation