In [18]:
import numpy as np
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
import sklearn
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import plotly.express as px

In [14]:
# Locations of the two input files used throughout the notebook.
# NOTE(review): these are absolute paths on the author's machine; point them at
# your own copies of the data before re-running.
all_article_data_file_path = "/home/a-baboudjian/Desktop/Semantics Project/AllTheData.txt"
article_names_file_path = "/home/a-baboudjian/Desktop/Semantics Project/articleOrder.txt"

# Load the corpus: the file is split on newlines, so each list element holds
# the abstract of one article.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open(all_article_data_file_path, "r", encoding="utf-8") as f:
    text_content = f.read().strip().split("\n")

In [None]:
# Turn every article into its list of word tokens (punctuation is dropped)
tokenized_articles = list(map(simple_preprocess, text_content))

# Report how many tokens the longest article has
longest_article_length = max(map(len, tokenized_articles))
print(f"The length of the longest article is: {longest_article_length}")

The length of the longest article is: 149


In [None]:
# Fit a Word2Vec model on the tokenized corpus.
# Only the context window and the number of worker threads are set here;
# every other hyperparameter is left at the library default.
word2vec_model = Word2Vec(
    sentences=tokenized_articles,
    window=7,
    workers=4,
)

In [None]:
def get_average_embedding(model, tokens):
    """
    Computes the average (mean-pooled) embedding vector for a list of tokens
    using a given word embedding model.

    Args:
        - model: Either a trained model that exposes its word vectors through
        a ``.wv`` attribute (e.g. gensim ``Word2Vec``) or the keyed-vectors
        object itself (e.g. gensim ``KeyedVectors``). In both cases the object
        must provide a ``vector_size`` attribute.
        - tokens (list of str): A list of tokens (words)
        for which the average embedding is to be computed.

    Returns:
        numpy.ndarray: The average embedding vector for the input tokens.
                       If none of the tokens are found in the model, returns a
                       zero vector of length ``model.vector_size``.

    Notes:
        - Tokens that are not present in the model's vocabulary are skipped.
        - If the input list of tokens is empty or none of the tokens are found
          in the model, the function returns a zero vector.
        - Otherwise the result is the element-wise mean of the vectors of all
          tokens that were found (repeated tokens count once per occurrence).
    """
    # A full model keeps its vectors under `.wv`; a bare keyed-vectors object
    # *is* the lookup table. Supporting both matches the documented contract.
    keyed_vectors = getattr(model, "wv", model)

    ### AVERAGE POOLING ###
    embeddings = [keyed_vectors[word] for word in tokens if word in keyed_vectors]
    if not embeddings:
        return np.zeros(model.vector_size)  # Handle empty articles
    return np.mean(embeddings, axis=0)

# Mean-pool every tokenized article into one vector and stack the vectors
# row-wise, giving one embedding row per article
word2vec_embeddings = np.array(
    [get_average_embedding(word2vec_model, tokens) for tokens in tokenized_articles]
)


In [None]:
# Project the article embeddings onto two dimensions with t-SNE.
# random_state is pinned so the projection of a given input is repeatable.
tsne_Word2Vec = TSNE(
    n_components=2,
    perplexity=40,
    random_state=42,
).fit_transform(word2vec_embeddings)

In [13]:
# Sanity check: expect one row per article and two columns (n_components=2)
tsne_Word2Vec.shape

(10000, 2)

In [21]:
# PLOTTING Word2Vec EMBEDDINGS

# Read the article names, one per line. The encoding is passed explicitly so
# decoding does not depend on the platform's default locale encoding.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
with open(article_names_file_path, "r", encoding="utf-8") as article_names_file:
    article_names = article_names_file.read().strip().split(sep="\n")

# Names are attached to points purely by position, so both sequences must have
# the same length; fail early with a message that says what went wrong.
if len(article_names) != len(tsne_Word2Vec):
    raise ValueError(
        f"Got {len(article_names)} article names for {len(tsne_Word2Vec)} "
        "t-SNE points; the names file must contain one line per article."
    )

# Create a DataFrame with the t-SNE results and article names
df_tsne_mean_pool = pd.DataFrame({
    't-SNE Dimension 1': tsne_Word2Vec[:, 0],
    't-SNE Dimension 2': tsne_Word2Vec[:, 1],
    'Article Name': article_names
})

# Plot the scatter plot; hovering over a point shows the article's name
fig = px.scatter(df_tsne_mean_pool, x='t-SNE Dimension 1', y='t-SNE Dimension 2',
                 hover_name='Article Name',
                 title="t-SNE Visualization of Word2Vec Embeddings",
                 width=1000, height=1000)

# Center the title. Only the position is updated; the title text was already
# set by px.scatter above, so it is not repeated here.
fig.update_layout(title_x=0.5, title_xanchor='center')

fig.show()

This plot contains 10,000 article embeddings rather than 2,000, so finding differences is a little more difficult. However, if we look at the cluster of "AD" date articles in the top left, we can see that, unlike in the previous plots, the cluster is less well defined and some articles, such as "Anno Domini", are missing from it. Also, a cluster that lies far from everything else can be spotted in the bottom left; it contains most of the "Grammy Award" articles. Interesting.