<a href="https://colab.research.google.com/github/DeanFord7/CMM307-AdvancedArtificialIntelligence/blob/main/DeanFord1702994-CMM307Coursework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Section 1 - Dataset

The Twitter Sentiment Analysis (TSA) dataset contains over 70,000 records of tweets related to specific entities, whether that be a compnay, a game etc. Each record has four columns, an ID for the tweet, the entity the tweet is referencing, the text conatained within the tweet and the sentiment. <br><br>
The aim of the task is to use the text of each tweet to predict and assign a sentiment classification of one of the following to the tweet:
<ul>
<li>Positive</li>
<li>Negative</li>
<li>Neutral/Irrelevant</li>
</ul>

In [42]:
import kagglehub
import os

# Load dataset from kagglehub
path = kagglehub.dataset_download("jp797498e/twitter-entity-sentiment-analysis")

print("Path to dataset files:", path)

files = os.listdir(path)
print("Files in directory:", files)

Path to dataset files: /root/.cache/kagglehub/datasets/jp797498e/twitter-entity-sentiment-analysis/versions/2
Files in directory: ['twitter_training.csv', 'twitter_validation.csv']


In [43]:
import pandas as pd

# Dataset is already split into a training and testing file so retrieve both
training_file_path = os.path.join(path, "twitter_training.csv")
validation_file_path = os.path.join(path, "twitter_validation.csv")

column_names = ["tweet_id", "entity", "sentiment", "tweet_text"]

# Load datasets into dataframes
training_df = pd.read_csv(training_file_path, names=column_names)
validation_df = pd.read_csv(validation_file_path, names=column_names)

#print(training_df.head())
print("Train", len(training_df))
print("Val", len(validation_df))
#print(validation_df.head())

# Combine the dataframes as the pre made validation set has only 1,000 values compared to the testing sets 69,000
sentiment_df = pd.concat([training_df, validation_df], ignore_index=True)

print("Combined Dataframe:")
print(sentiment_df.head())
print("Records: ", len(sentiment_df))

# The dataset contains 6 records for each tweet with the first being the original and the next 5 being slight alterations of the text
# Remove the additional records for each tweet as the original text in the most relevant and the duplicates only have minor grammatical changes
filtered_sentiment_df = sentiment_df.drop_duplicates(subset=["tweet_id"], keep="first")

filtered_sentiment_df = filtered_sentiment_df[filtered_sentiment_df['tweet_text'].notnull()]  # Remove NaN values
filtered_sentiment_df = filtered_sentiment_df[filtered_sentiment_df['tweet_text'].str.strip() != '']  # Remove empty strings

print("Filtered Dataframe:")
print(filtered_sentiment_df.head())
print("Records: ", len(filtered_sentiment_df))


Train 74682
Val 1000
Combined Dataframe:
   tweet_id       entity sentiment  \
0      2401  Borderlands  Positive   
1      2401  Borderlands  Positive   
2      2401  Borderlands  Positive   
3      2401  Borderlands  Positive   
4      2401  Borderlands  Positive   

                                          tweet_text  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  
Records:  75682
Filtered Dataframe:
    tweet_id       entity sentiment  \
0       2401  Borderlands  Positive   
6       2402  Borderlands  Positive   
12      2403  Borderlands   Neutral   
18      2404  Borderlands  Positive   
24      2405  Borderlands  Negative   

                                           tweet_text  
0   im getting on borderlands and i will murder yo...  
6   So I spent a 

In [44]:
tweets = list(filtered_sentiment_df['tweet_text'])
sentiments = list(filtered_sentiment_df['sentiment'])

from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

sentiments_numerical = label_encoder.fit_transform(sentiments)



In [59]:
import nltk

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

def prep(sentences):
  prep_text = []

  words = stopwords.words('english')
  #entity_labels = list(filtered_sentiment_df['entity'])
  #words.extend(entity_labels)

  for sent in sentences:
    token_text = word_tokenize(sent)
    normalised_text = [token.lower() for token in token_text if token.isalpha()]

    swr_text = [token for token in normalised_text if token not in words]

    stemmer = SnowballStemmer('english')
    prep_text += [[stemmer.stem(word) for word in swr_text]]
  prep_sentences = [" ".join(sentence) for sentence in prep_text]
  return prep_sentences



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [58]:
prep_x = prep(tweets)
print(prep_x)



# Section 2 - Representation Learning

In [47]:
from gensim.models import Word2Vec
import numpy as np

def word2vec_rep(sentence, w2v_model):
  embs = [w2v_model.wv[word] for word in sentence if word in w2v_model.wv.index_to_key]

  if len(embs) == 0:
      return np.zeros(w2v_model.vector_size)

  sent_emb = np.mean(np.array(embs), 0)
  return sent_emb

# Section 3 -  Algorithms

## Multi-Layer Perceptron

In [60]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

mlp_score = []

xnp = np.array(prep_x)
ynp = np.array(sentiments_numerical)
kf = StratifiedKFold(n_splits=5, shuffle=True)

for train, test in kf.split(xnp, ynp):
  x_train, x_test, y_train, y_test = xnp[train], xnp[test], ynp[train], ynp[test]

  w2v_model = Word2Vec(vector_size=500, window=5, min_count=10, workers=4)
  w2v_model.build_vocab(x_train, update=None)
  w2v_model.train(corpus_iterable=x_train, total_examples=len(x_train), epochs=50)

  x_train_representations = [word2vec_rep(instance, w2v_model) for instance in x_train]
  x_test_representations = [word2vec_rep(instance, w2v_model) for instance in x_test]

  mlp = MLPClassifier(alpha=1, max_iter=500)
  mlp.fit(x_train_representations, y_train)
  mlp_predictions = mlp.predict(x_test_representations)
  mlp_acc = accuracy_score(mlp_predictions, y_test)
  mlp_score.append(mlp_acc)

print("MLP Accuracy: ", np.mean(mlp_score))



MLP Accuracy:  0.3421588594704684


In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer #we will import the tfidf functionality from sklearn
acc_score = []
tfidf_base = TfidfVectorizer(max_features=1000) #we can instantiate, but we will need to recreate every fold in our evaluation (to prevent test set leakage)
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

def mlp(dataset_size, num_classes):
  input_shape = (dataset_size,)

  model = Sequential()
  model.add(Dense(128, input_shape=input_shape, activation='relu'))
  model.add(Dense(128, activation='relu'))
  model.add(Dense(num_classes, activation='softmax')) #remember this format from last year? We effectively parameterse two hidden layers and one output layer
  return model

kf = StratifiedKFold(n_splits=5, shuffle=True)
xnp = np.array(tweets) #convert to numpy to standardise our arrays for the split
ynp = np.array(sentiments_numerical)

for train, test in kf.split(xnp,ynp):
  x_train, x_test, y_train, y_test = xnp[train], xnp[test], ynp[train], ynp[test]

  x_train = prep(x_train) #we preprocess our train and test datasets
  x_test = prep(x_test)

  tfidf = tfidf_base #notice we copy a blank tfidf so there is no leakage
  x_train = tfidf.fit_transform(x_train)
  x_train = x_train.todense() #by default, tfidf will output a sparse matris to conserve memory. This is incompatible with our deep learner
  x_test = tfidf.transform(x_test)
  x_test = x_test.todense()
  num_classes = len(np.unique(y_train))
  model = mlp(1000, num_classes) #we also instantiate a new mlp to prevent leakage of train and test set
  y_train = to_categorical(y_train, num_classes) #convert y to one hot vectors
  y_test = to_categorical(y_test, num_classes)

  # Configure the model and start training
  model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) #we have parameterised with fairly standard metrics - do feel free to alter and investigate
  model.fit(x_train, y_train, epochs=10, batch_size=250, verbose=1, validation_split=0.2)

  # Test the model after training
  test_results = model.evaluate(x_test, y_test, verbose=1)
  print(f'Test results - Loss: {test_results[0]} - Accuracy: {test_results[1]}%')

  acc_score.append(test_results[1])


print("Accuracy:", np.mean(acc_score))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10


ValueError: Arguments `target` and `output` must have the same rank (ndim). Received: target.shape=(None, 4, 2), output.shape=(None, 2)

# Section 4 - Evaluation

# Section 5 - Paper Overview

# Section 6 - Algorithms

# Section 7 - Evaluation