In [2]:
from transformers import BertModel, BertTokenizerFast
import numpy as np
import torch
from sklearn.manifold import TSNE
import plotly.express as px
import pandas as pd

# "Cased" BERT checkpoints keep upper- and lower-case words distinct
# (and so have a different vocabulary size), whereas "uncased"
# checkpoints treat all text as lower case. The uncased checkpoint is
# used here; comparing against the cased one would be an interesting
# follow-up.

pretrained_checkpoint = "bert-base-uncased"

tokenizer = BertTokenizerFast.from_pretrained(pretrained_checkpoint)
model = BertModel.from_pretrained(pretrained_checkpoint)

In [3]:
# Locations of the files and folders used throughout the notebook
embeddings_folder_path = "../Embeddings/"
article_names_file_path = "../Data/articleOrder.txt"
all_article_data_file_path = "../Data/AllTheData.txt"

# Load the corpus. The whole file is read as one string, surrounding
# whitespace is stripped, and the text is cut on newlines so that each
# list element holds the content of one article.
with open(all_article_data_file_path, mode="r") as corpus_file:
    articles = corpus_file.read().strip().split("\n")

print(f"We have {len(articles)} amount of articles!")

We have 10000 amount of articles!


In [None]:
# The BERT model takes a max of 512 tokens as input per sequence, and
# two of those positions are used by the special "[CLS]" and "[SEP]"
# tokens added when the article is encoded. tokenizer.tokenize() does
# not include them, so they are added to the count before comparing
# against the limit.
maxSequenceSize = 0

# Find the token count of the longest article, and print the index of
# every article that would not fit into a single BERT input.
for i, article in enumerate(articles):
    currentLength = len(tokenizer.tokenize(article))
    if currentLength + 2 > 512:
        print(i)
    maxSequenceSize = max(maxSequenceSize, currentLength)


print(f"The longest article representation is {maxSequenceSize}" +
      " tokens long")

The longest article representation is 432 tokens long


In [None]:
# Tokenizing all the inputs, adding the padding, and
# creating the attention masks for each article.

# Every article becomes a sequence of maxSequenceSize+2 token ids; the
# extra two account for the special tokens "[CLS]" at the start of the
# sequence and "[SEP]" at the end. Shorter articles are padded on the
# right with "[PAD]" tokens so that all sequences have the same length
# and can later be stacked into a single tensor. Each sequence also gets
# an attention mask telling the model which tokens should be attended to.

all_encoding_objects = []
all_token_ids = []
all_attention_masks = []

# Iterate over the articles themselves rather than a fixed count, so the
# loop stays correct whatever the size of the corpus.
# Output: List of lists
for article in articles:
    encodings = tokenizer(text = article,
                          padding = "max_length",
                          padding_side = "right",
                          max_length = maxSequenceSize+2)
    # A single text yields exactly one encoding object; it carries the
    # token ids and the attention mask together.
    article_encoding = encodings.encodings[0]
    all_encoding_objects.append(article_encoding)
    all_token_ids.append(article_encoding.ids)
    all_attention_masks.append(article_encoding.attention_mask)

In [None]:
# Getting the Embeddings

# Transforming all our data into tensors
tensor_input_ids = torch.tensor(all_token_ids)
tensor_attention_masks = torch.tensor(all_attention_masks)

# Articles in [start, end) are embedded, `step` articles per batch.
start = 0
end = 2000
step = 100

# Inference only: no gradients are needed.
with torch.no_grad():
    for i in range(start, end, step):
        # Passing inputs to the BERT model. The slice width follows
        # `step` so that batches never overlap or leave gaps.
        outputs = model(input_ids = tensor_input_ids[i:i+step,:],
                        attention_mask = tensor_attention_masks[i:i+step,:])

        # The batch number is deliberately a float ("0.0", "1.0", ...):
        # the pooling step loads the files under exactly that name.
        batch_number = str(i/step)
        batch_file_path = embeddings_folder_path + "embeddings" + batch_number + ".pt"

        print("batch "+batch_number+" done")
        # We only need the last layer of the output
        torch.save(outputs.last_hidden_state, batch_file_path)
        print("Saved in file path " + batch_file_path)

In [None]:
# Name of the pooling method that was applied most recently.
# The pooling functions overwrite it, and it is used when building
# the file name under which the resulting embeddings are stored.
method_name = ""

# Functions that will determine the type of pooling
# to be used
def with_max(current_embedding, axis):
    """Max-pool `current_embedding` along `axis`.

    As a side effect, sets the module-level `method_name` to
    "Max_Pooling_".
    """
    global method_name
    method_name = "Max_Pooling_"
    pooled = np.max(current_embedding, axis=axis)
    return pooled

def with_min(current_embedding, axis):
    """Min-pool `current_embedding` along `axis`.

    As a side effect, sets the module-level `method_name` to
    "Min_Pooling_".
    """
    global method_name
    method_name = "Min_Pooling_"
    pooled = np.min(current_embedding, axis=axis)
    return pooled

def with_mean(current_embedding, axis):
    """Mean-pool `current_embedding` along `axis`.

    As a side effect, sets the module-level `method_name` to
    "Mean_Pooling_".
    """
    global method_name
    method_name = "Mean_Pooling_"
    pooled = np.mean(current_embedding, axis=axis)
    return pooled


def pooling(folder_path, pooling_method, batches, attention_masks=None):
    """
    Pools embeddings from multiple batches of saved embeddings.

    Args:
        folder_path (str): The path to the folder containing the saved embeddings.
        pooling_method (function): The function to use for pooling the embeddings.
            It is called as ``pooling_method(embedding, 0)`` on one article at a time.
        batches (int): The number of batches of embeddings to process.
        attention_masks (sequence of sequences of int, optional): One attention
            mask per article, in the same order as the saved embeddings. When
            omitted, the module-level ``all_attention_masks`` is used.

    Returns:
        np.ndarray: The pooled embeddings, concatenated into a single flat array
        (empty when no article was processed).

    Notes:
        - The function loads embeddings from files named "embeddings{index}.pt"
          where {index} is the batch index written as a float ("0.0", "1.0", ...).
        - Every article found in a batch file is processed, and each one is
          paired with its own attention mask through a counter that runs
          across batches.
        - The attention masks are used to mask out the "[PAD]" token embeddings
          by filling their embeddings with zeros. Those zero rows still take
          part in the pooling, which may affect min and max pooling (and
          dilutes mean pooling).
        - The pooling method is applied to the masked embeddings to obtain the pooled embeddings.
    """
    if attention_masks is None:
        attention_masks = all_attention_masks

    # Collected in a list and joined once at the end, instead of growing
    # an array with a concatenate call per article.
    pooled_vectors = []
    # Position of the current article across all batches; selects the
    # attention mask that belongs to it.
    article_index = 0

    # Going through all the saved embeddings batches
    for i in range(batches):
        current_embeddings = torch.load(f = folder_path + "embeddings" + str(float(i)) + ".pt",
                                        weights_only = True).numpy().astype(np.float64)
        # Going through every embedded article in the batch
        for article_embedding in current_embeddings:
            # Boolean mask over the token positions of this article
            keep_token = np.asarray(attention_masks[article_index], dtype=bool)

            # Broadcast the per-token mask over the feature axis and
            # zero out the rows that belong to "[PAD]" tokens.
            current_embedding = np.where(keep_token[:, np.newaxis],
                                         article_embedding,
                                         0.0)

            # Using input function to pool the embeddings
            pooled_vectors.append(pooling_method(current_embedding, 0))
            article_index += 1

        print("Done with file number "+str(i))

    if not pooled_vectors:
        return np.array([], dtype=np.float64)
    return np.concatenate(pooled_vectors, axis = 0)

# Choose the pooling method of choice
pooling_method = with_max

# Pool every saved batch, then view the flat result as one row per
# article (20 batches of 100 articles = 2000 rows).
pooled_embeddings = np.asarray(pooling(embeddings_folder_path, pooling_method, batches=20),
                               dtype=np.float64).reshape(2000, -1)

print("Pooling Completed!")

# Storing the resulting numpy array in a file. It is written to the
# data folder because that is where the loading cell reads
# "<method>Embeddings.npy" from.
with open("../Data/" + method_name + "Embeddings.npy", "wb") as f:
    np.save(f, pooled_embeddings)

Done with file number 0
Done with file number 1
Done with file number 2
Done with file number 3
Done with file number 4
Done with file number 5
Done with file number 6
Done with file number 7
Done with file number 8
Done with file number 9
Done with file number 10
Done with file number 11
Done with file number 12
Done with file number 13
Done with file number 14
Done with file number 15
Done with file number 16
Done with file number 17
Done with file number 18
Done with file number 19
Pooling Completed!


In [7]:
# Load the pooled embeddings produced by each pooling method
max_embeddings = np.load("../Data/Max_Pooling_Embeddings.npy")
min_embeddings = np.load("../Data/Min_Pooling_Embeddings.npy")
mean_embeddings = np.load("../Data/Mean_Pooling_Embeddings.npy")

In [8]:
# Read the file listing all article names and cut it on newlines
with open(article_names_file_path, mode="r") as names_file:
    names_text = names_file.read()
article_names_list = names_text.strip().split(sep="\n")

# Embeddings were computed for the first 2000 articles only,
# so keep just the matching names
article_names = article_names_list[:2000]


In [None]:
# Project the pooled embeddings to 2-D with t-SNE
# (t-distributed Stochastic Neighbor Embedding)

# The same hyperparameters are used for every pooling method.
# They could be tuned to obtain different results; this is left
# for future work.
tsne_settings = dict(n_components = 2,
                     random_state = 42,
                     perplexity = 40)

tsne_results_min_pool = TSNE(**tsne_settings).fit_transform(min_embeddings)

tsne_results_max_pool = TSNE(**tsne_settings).fit_transform(max_embeddings)

tsne_results_mean_pool = TSNE(**tsne_settings).fit_transform(mean_embeddings)



In [None]:
# PLOTTING FUNCTION

def plot_using_tsne(tsne_results, article_names, plot_width, plot_height,
                    pooling_method_name):
    """
    Show an interactive scatter plot of 2-D t-SNE results.

    Each point is one article; hovering over a point displays the
    article's name. The title names the pooling method and is centred.

    Parameters:
    tsne_results (numpy.ndarray): 2-D array of t-SNE results with one row
                                  per article; columns 0 and 1 are plotted.
    article_names (list of str): Article names, in the same order as the
                                 rows of `tsne_results`.
    plot_width (int): The width of the plot (in pixels).
    plot_height (int): The height of the plot (in pixels).
    pooling_method_name (str): Name of the pooling method, inserted into
                               the plot title.

    Returns:
    None
    """
    x_column = 't-SNE Dimension 1'
    y_column = 't-SNE Dimension 2'
    name_column = 'Article Name'
    plot_title = ("t-SNE Visualization of BERT " + pooling_method_name +
                  " Pooled Embeddings")

    # One row per article: its two t-SNE coordinates and its name
    tsne_embeddings_df = pd.DataFrame({
        x_column: tsne_results[:, 0],
        y_column: tsne_results[:, 1],
        name_column: article_names,
    })

    # Scatter plot with the article name shown on hover
    fig = px.scatter(tsne_embeddings_df,
                     x = x_column,
                     y = y_column,
                     hover_name = name_column,
                     title = plot_title,
                     width = plot_width, height = plot_height)

    # Center the title
    fig.update_layout(title = {'text': plot_title,
                               'x': 0.5,
                               'xanchor': 'center'})
    fig.show()


In [11]:
# Max-pooled embeddings, 700 x 700 px
plot_using_tsne(tsne_results = tsne_results_max_pool,
                article_names = article_names,
                plot_width = 700,
                plot_height = 700,
                pooling_method_name = "Max")

In [12]:
# Min-pooled embeddings, 700 x 700 px
plot_using_tsne(tsne_results = tsne_results_min_pool,
                article_names = article_names,
                plot_width = 700,
                plot_height = 700,
                pooling_method_name = "Min")

In [13]:
# Mean-pooled embeddings, 900 x 900 px
plot_using_tsne(tsne_results = tsne_results_mean_pool,
                article_names = article_names,
                plot_width = 900,
                plot_height = 900,
                pooling_method_name = "Mean")

As expected, the shape of the plot and element-wise distances in the clusters are very different.
An example is the cluster that contains several dates in AD. In the min- and max-pooled plots, it sits in almost the same position: it is the cluster furthest to the left. In the mean-pooled plot, however, it is located in the top left, and the embedding of the article "Anno Domini" appears to be further away, possibly not even in the same cluster.