**COM3110/COM4115 Lab: Word Embeddings**

This lab sheet was created by João Augusto Leite

In [None]:
!pip install nltk gensim scikit-learn plotly pandas numpy

In [None]:
from nltk.corpus import gutenberg
from gensim.models import Word2Vec
from sklearn.manifold import TSNE
from matplotlib import pyplot
import gensim.downloader
import plotly.express as px
import pandas as pd
import nltk
import numpy as np
nltk.download('gutenberg')
nltk.download('punkt')

In [None]:
# Helper function, you don't need to look into it.
def plot_vectors(X, annotations, perplexity=1, random_state=None):
  """Project vectors to 2D with t-SNE and show them as a labelled scatter plot.

  Args:
    X: 2D array with one vector per row.
    annotations: labels to draw next to the points, one per row of X.
    perplexity: t-SNE perplexity. The default of 1 matches the small examples
      in this notebook; recent scikit-learn versions require it to be smaller
      than the number of rows in X.
    random_state: seed forwarded to TSNE. With the default of None the layout
      can differ between runs; pass an int for a reproducible plot.
  """
  tsne = TSNE(
      n_components=2,
      learning_rate='auto',
      init='random',
      perplexity=perplexity,
      random_state=random_state,
  )
  X_embedded = tsne.fit_transform(X)
  df = pd.DataFrame({'x': X_embedded[:, 0], 'y': X_embedded[:, 1], 'word': annotations})
  fig = px.scatter(df, x='x', y='y', text='word')
  fig.update_traces(textposition='top center')  # draw each label above its point
  fig.show()

# Part 1: Computing word embeddings using Gensim

In [None]:
# Loading the moby dick book corpus
sents = gutenberg.sents("melville-moby_dick.txt")
sents = [[word.lower() for word in sent] for sent in sents]  # lowercasing

In [None]:
print("First sentence:", sents[0])
print("First token in the first sentence:", sents[0][0])

In [None]:
# Training the word2vec model is done in a single line.
# You can look into the Word2Vec class documentation for more details:
# https://radimrehurek.com/gensim/auto_examples/tutorials/run_word2vec.html#training-parameters
# Only the `.wv` attribute of the trained Word2Vec object is kept, so `model`
# refers to the trained word vectors rather than to the Word2Vec object itself.
# NOTE(review): no seed is passed, so the vectors presumably differ between runs — confirm.

model = Word2Vec(sentences=sents, vector_size=100, window=5, min_count=1, workers=4).wv

In [None]:
# After trained, you can access the vectors like this.
# This object is a matrix where each row contains a 100-dimensional vector for each word in the corpus
model.vectors

In [None]:
model.vectors.shape  # (n_words, 100) matrix

In [None]:
# Mapping of words to indices in the matrix
print(model.key_to_index)

In [None]:
print(model.key_to_index["ship"])

In [None]:
# To retrieve the single vector representing a given word,
# you can first retrieve its index in the matrix, then access the matrix at that index:

index_to_ship = model.key_to_index["ship"]
model.vectors[index_to_ship]  # vector for the word 'ship'

In [None]:
# This is another way to retrieve the vector of a word.
model['ship']

In [None]:
all(model['ship'] == model.vectors[index_to_ship])  # all dimensions of both vectors are equal

In [None]:
# It is possible that the model does not contain a particular word.
# Looking up a missing word raises a KeyError; it is caught and printed here
# so that running all cells does not stop at this demonstration:
try:
  model['brontosaurus']
except KeyError as error:
  print("KeyError:", error)

In [None]:
# You can check if there is a vector for a particular word in the matrix
# by checking if the word is in the model:
"brontosaurus" in model

## Vector operations
You can perform mathematical operations using vectors. Note that a vector sum operation is an element-wise sum over each dimension of the vectors. This is done automatically by numpy when you use the + operator for two multidimensional vectors.

In [None]:
# This is vector sum operation between the vectors representing the words whale and ship.
u =  model['whale']
v = model['ship']

print(u + v)

In [None]:
a = u * v # element-wise multiplication
b = u / v # element-wise division
c = u + v # element-wise sum
d = u - v # element-wise difference
e = u @ v # dot product

print(e)  # look at different results (a through e)

In [None]:
# Since you can perform mathematical operations using vectors, you are able
# to compute the cosine similarity between two vectors

def cosine_similarity(u, v):
    """Return the cosine similarity between two vectors.

    Args:
        u: 1D sequence or numpy array of numbers.
        v: 1D sequence or numpy array of numbers, same length as u.

    Returns:
        The dot product of u and v divided by the product of their Euclidean
        norms, or 0.0 when either vector has zero norm (the similarity is
        undefined in that case).

    Raises:
        ValueError: if u and v do not have the same length.
    """
    u = np.asarray(u)
    v = np.asarray(v)

    denominator = np.linalg.norm(u) * np.linalg.norm(v)
    if denominator == 0:  # avoid division by 0
        return 0.0

    return np.dot(u, v) / denominator

u = model['whale']
v = model['ship']
z = model['giraffe']

print("Similarity between 'whale' and 'ship':", cosine_similarity(u, v))
print("Similarity between 'whale' and 'giraffe':", cosine_similarity(u, z))
# v is the 'ship' vector, so this line compares 'ship' with 'giraffe' as its label says.
print("Similarity between 'ship' and 'giraffe':", cosine_similarity(v, z))

In [None]:
# Use the 'plot_vectors' helper function to plot the vectors for 'whale', 'ship', and 'giraffe'.

X = np.stack([u,v,z], axis=0)  # stack the three vectors vertically
print(X.shape)

In [None]:
plot_vectors(X=X, annotations=["whale", "ship", "giraffe"])

In [None]:
# Gensim also comes with a list of pre-trained models (i.e. you do not need to train a model yourself)
# You can also load models using gensim.downloader

print(list(gensim.downloader.info()['models'].keys()))

In [None]:
# You can load one of these pre-trained models and use it off the shelf
# For instance, below you will be loading a word2vec model trained with similar data from Mikolov et al. (2013)
# Loading it will take a while, since the model is big!

w2v_vectors = gensim.downloader.load('word2vec-google-news-300')

In [None]:
# Since this is a more general and much bigger model, you should be able to replicate some of the semantic operations discussed in the lecture
# Write code to perform the 'king' - 'man' + 'woman' operation and compare the result with the vector of 'queen'

# YOUR CODE HERE

###

In [None]:
# Use the 'plot_vectors' helper function to plot the vectors for 'king', 'queen', 'man', 'woman' and 'king - man + woman'.

# YOUR CODE HERE

###

## Task 1: Write your own "most_similar" method.
Write your own function to retrieve the topn most similar words to a given word.
Return a list of tuples containing (word, similarity) in the same format as the model.most_similar method.

**Note**: remember that the most similar word is always itself, and you don't need to include it.

In [None]:
# Write your own function to retrieve the topn most similar words to a given word.
# Return a list of tuples containing (word, similarity) in the same format
# as the model.most_similar method.
# Note: remember that the most similar word is always itself, and you don't need to include it.

def get_top_similar_words(model, word, topn):
  """Return the `topn` words most similar to `word`, excluding `word` itself.

  Exercise stub: write code between the markers below that defines
  `top_similar_words` before it is returned.

  Args:
    model: word-vector model to search.
    word: the query word.
    topn: number of similar words to return.

  Returns:
    A list of (word, similarity) tuples, in the same format as
    model.most_similar.
  """
  # YOUR CODE HERE

  ###

  return top_similar_words  # list of tuples with (word, similarity)


top_words = get_top_similar_words(model, "ship", 50)
top_words

In [None]:
# Use gensim method to calculate the most similar words

topn = 50
sims = model.most_similar('ship', topn=topn) # get the most similar words
sims

# This method returns the top n most similar words to the given word, along
# with their cosine similarities.

In [None]:
# Is your implementation retrieving the exact same words as gensim's implementation?
# The full word lists are compared: zip() alone stops at the shorter list, so an
# empty or truncated result would wrongly be reported as a match.
[word for word, _ in top_words] == [word for word, _ in sims]

In [None]:
# Plot the top n similar words.
# Note that this plot is a 2D representation of the 100-dimensional vectors,
# thus it is possible that you see slight differences from what your similarity
# scores indicated.

your_top_words = [w for w, _ in top_words]
vectors = np.stack([model[w] for w in your_top_words])
plot_vectors(vectors, your_top_words)

# Part 2: Text classification with embeddings

In this section we will use embeddings that were precomputed by researchers at Stanford. The GloVe embeddings we will use were trained on a Twitter corpus with a total of 27 billion tokens and a vocabulary size of more than 1 million unique tokens.
You can read more about it at https://nlp.stanford.edu/projects/glove/.

In [None]:
# Download and unzip the pretrained embeddings if you don't already have it
# Note: this can take a few minutes
!wget https://nlp.stanford.edu/data/glove.twitter.27B.zip
!unzip glove.twitter.27B.zip

In [None]:
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
import pandas as pd

In [None]:
# Load the embeddings as a gensim model (this can take a few seconds)
glove_input_file = 'glove.twitter.27B.100d.txt'
model = KeyedVectors.load_word2vec_format(glove_input_file, binary=False, no_header=True)

In [None]:
model.vectors.shape

In [None]:
# Note how the top similar words are different from the embeddings we trained previously.
sims = model.most_similar("ship", topn=5)
sims

In [None]:
# Plot the words most similar to 'ship' (the `sims` computed above) in the GloVe vector space
words = [w for w, _ in sims]
vectors = np.stack([model[w] for w in words], axis=0)

plot_vectors(vectors, words)

## Sentiment Classification
We will train a machine learning model to predict the sentiment of tweets associated with airline companies. We will use the embeddings of the tweets as features.

**Please upload the Tweets_short.csv file in this notebook.** You can click the folder option "Files" in the left part of the UI, and then "Upload to session storage", or simply drag and drop the file.

In [None]:
import re
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

In [None]:
df = pd.read_csv("Tweets_short.csv")
df

In [None]:
# Distribution of classes
df["airline_sentiment"].value_counts()

## Task 1:
Implement normalisation and tokenisation for the dataset. The GloVe embeddings we are going to use were trained by mapping hashtags to the token \<hashtag>, numbers to the token \<number>, URLs to \<url>, and @users to \<user>. Also, all tokens should be lowercased.

The full list of normalisations can be found at https://nlp.stanford.edu/projects/glove/preprocess-twitter.rb. You don't need to implement all normalisation steps. You can play around and see how the performance of your model is affected by extra normalisation steps.

In [None]:
def normalise_tweet(tweet):
  """Normalise a raw tweet so that it matches the preprocessing of the GloVe Twitter embeddings.

  Exercise stub: as provided, the tweet is returned unchanged. The intended
  normalisations are mapping hashtags, numbers, urls and @users to the
  <hashtag>, <number>, <url> and <user> tokens, and lowercasing.

  Args:
    tweet: the text of one tweet.

  Returns:
    The normalised tweet.
  """
  ### YOUR CODE HERE

  ###

  return tweet

def tokenise_tweet(tweet):
  """Split a normalised tweet into tokens.

  Exercise stub: as provided, the input is returned unchanged.

  Args:
    tweet: the normalised tweet.

  Returns:
    The tokenised tweet.
  """
  ### YOUR CODE HERE

  ###

  return tweet


In [None]:
# Sentiment names and the integer label used for each of them.
mapping = {"neutral": 0, "negative": -1, "positive": 1}

# Integer label for every tweet, as a numpy array.
labels = df["airline_sentiment"].apply(lambda sentiment: mapping[sentiment]).to_numpy()

# Normalise every tweet, then tokenise it, and collect the results in a list.
text = df["text"].apply(normalise_tweet).apply(tokenise_tweet).tolist()

## Task 2:
Implement a function to compute the embedding of a SENTENCE. Up until now we were working with WORD embeddings. Now you have to produce a single vector to represent the whole sentence. There are many ways to aggregate the word vectors into a sentence vector. We will implement a simple averaging method across the embeddings of each word in the sentence.

Your function should sum the embeddings of the words in the tweet, and divide this vector by the number of tokens in the tweet.

Remember to check if a word is in the model before retrieving it. Assign a vector of zeros if the word is not in the model: np.zeros(100).


In [None]:
def get_tweet_embedding(tweet_tokenised):
  """Return a single vector representing a whole tokenised tweet.

  Exercise stub: write code between the markers below that defines
  `tweet_embedding` before it is returned. The intended method is to sum the
  embeddings of the words in the tweet and divide by the number of tokens,
  using np.zeros(100) for words that are not in the model.

  Args:
    tweet_tokenised: the tokens of one tweet.

  Returns:
    The embedding of the tweet.
  """
  ### YOUR CODE HERE

  ###


  return tweet_embedding

In [None]:
# Build the feature matrix X: one row per tweet, holding that tweet's embedding
X = np.stack([get_tweet_embedding(tokens) for tokens in text])
X.shape

In [None]:
# We will use the tweet embeddings to train a classifier to predict if a given tweet
# has a positive, neutral, or negative sentiment.
print("Sentence embeddings:", X)
print("Labels:", labels)

In [None]:
# Split the data into train and test sets
train_idxs, test_idxs = train_test_split(range(len(df)), train_size=0.7, random_state=42, stratify=labels)

X_train = X[train_idxs]
X_test = X[test_idxs]

y_train = labels[train_idxs]
y_test = labels[test_idxs]

In [None]:
# Train the classifier
clf = SVC(random_state=42, class_weight="balanced")
clf.fit(X_train, y_train);

In [None]:
# Make predictions on the test set
predictions = clf.predict(X_test)
predictions

In [None]:
# Score the classifier
# scikit-learn metrics take the true labels first and the predictions second.
accuracy = accuracy_score(y_test, predictions)
f1_macro = f1_score(y_test, predictions, average="macro")
print(f"Accuracy: {accuracy*100:.2f}/100")
print(f"F1-Macro: {f1_macro*100:.2f}/100")

## Task 3:
Perform error analysis. Analyse the tweets that were incorrectly classified by your model. Explore the types of mistakes your classifier made. How many negative tweets did it predict as positive? And how many as neutral?

Take a look at [confusion matrices](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html) and try to use sklearn's implementation in your analysis.

In [None]:
# Analyse the tweets that were incorrectly classified by your model
# Entries of test_idxs whose prediction differs from the true label.
misclassified_idxs = np.array(test_idxs)[predictions != y_test]

# Test rows in the same order as `predictions`; reset_index() adds an "index"
# column holding each row's previous index label.
pred_df = df.iloc[test_idxs].reset_index()
pred_df["predicted_sentiment"] = predictions

# Convert the numeric predictions back to their sentiment names.
reverse_mapping = {v: k for k, v in mapping.items()}
pred_df["predicted_sentiment"] = pred_df["predicted_sentiment"].apply(lambda x: reverse_mapping[x])
# NOTE(review): matching "index" labels against the positional test_idxs assumes
# df still has its default 0..n-1 index — confirm if df is ever re-indexed.
mistakes_df = pred_df[pred_df["index"].isin(misclassified_idxs)]
mistakes_df

In [None]:
condition = mistakes_df["airline_sentiment"] == "negative"
mistakes_df[condition]