In [5]:
from sentence_transformers import SentenceTransformer
import pandas as pd
from sklearn.manifold import TSNE
import plotly.express as px
import numpy as np

In [3]:
# Pretrained MPNet sentence-embedding model used for this notebook
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Article texts: the file is read whole, stripped of surrounding whitespace
# and split on newlines.
with open('../Data/AllTheData.txt') as text_file:
    articles = text_file.read().strip().split('\n')

# Article names, read the same way as the texts.
# NOTE(review): presumably line i names article i — confirm against the data files.
with open('../Data/articleOrder.txt') as names_file:
    articlesNames = names_file.read().strip().split('\n')

# Encode only the first 2000 articles
first_articles = articles[:2000]
embeddings = model.encode(first_articles)

In [16]:
# Persist the embeddings as a .npy file so later cells can reload them
# without re-running the encoder.
mpnet_embeddings_path = "../Data/Mpnet_Embeddings.npy"
with open(mpnet_embeddings_path, "wb") as npy_out:
    np.save(npy_out, np.asarray(embeddings))

In [7]:
# Reload the MPNet embeddings that were saved to disk
mpnet_embeddings = np.load("../Data/Mpnet_Embeddings.npy")

In [20]:
# Reduce the MPNet embeddings to two dimensions with t-SNE.
# random_state is fixed so the projection is repeatable across runs.
tsne_model = TSNE(n_components=2, random_state=42, perplexity=40)
tsne_results = tsne_model.fit_transform(mpnet_embeddings)

In [14]:
# Min-max normalise the first t-SNE coordinate into [0, 1] for use as a
# colour value.
tsne_x = tsne_results[:, 0]

# Both bounds must come from the SAME column that is being normalised.
# (The upper bound was previously taken from column 1, i.e. the y axis,
# which produced values outside [0, 1].)
min_x_tsne = tsne_x.min()
max_x_tsne = tsne_x.max()

x_span = max_x_tsne - min_x_tsne
if x_span == 0:
    # Degenerate case: every point shares the same x, so avoid dividing by zero.
    color_range = [0.0] * len(tsne_x)
else:
    # Vectorised equivalent of the per-element Python loop.
    color_range = ((tsne_x - min_x_tsne) / x_span).tolist()

print(len(color_range))

2000


In [15]:
# MPNET VISUALIZATION

# Create a DataFrame with the t-SNE results and article names.
# 'Colors' holds the first t-SNE coordinate, so points are shaded by their
# horizontal position in this projection.
tsne_embeddings_df = pd.DataFrame({
    't-SNE Dimension 1': tsne_results[:, 0],
    't-SNE Dimension 2': tsne_results[:, 1],
    'Article Name': articlesNames[0:2000],
    'Colors': tsne_results[:, 0],
})

# Plot the scatter plot.
# The title is set once, here, with the correct model name; it used to be a
# copy-pasted "MiniLM-L6-v2 Embeddings" string that was only hidden because
# update_layout overwrote it afterwards.
fig = px.scatter(tsne_embeddings_df,
                 x='t-SNE Dimension 1',
                 y='t-SNE Dimension 2',
                 hover_name='Article Name',
                 title="t-SNE Visualization of MPNET",
                 template="plotly_dark",
                 color='Colors',
                 width=700, height=700)

# Center the title (update_layout merges into the existing title, so the
# text set above is kept).
fig.update_layout(title={'x': 0.5,
                         'xanchor': 'center'})
fig.show()

In [24]:
# Reload the MiniLM embeddings that were saved to disk
miniLM_embeddings = np.load("../Data/MiniLM_Embeddings.npy")

# Project them to 2-D with the same t-SNE settings used for MPNet
tsne_model_mini = TSNE(n_components=2, random_state=42, perplexity=40)
tsne_results_mini = tsne_model_mini.fit_transform(miniLM_embeddings)

In [25]:
# Visualizing how MiniLM "placed" the embeddings differently compared to MPNET.
# Points are positioned by the MiniLM projection but coloured by their first
# coordinate in the MPNET projection (tsne_results).

# Assemble the columns: MiniLM t-SNE coordinates, article names, colour values
mini_columns = {
    't-SNE Dimension 1': tsne_results_mini[:, 0],
    't-SNE Dimension 2': tsne_results_mini[:, 1],
    'Article Name': articlesNames[:2000],
    'Colors': tsne_results[:, 0],
}
tsne_embeddings_mini_df = pd.DataFrame(mini_columns)

# Scatter-plot settings gathered in one place, then unpacked into px.scatter
scatter_options = dict(
    x='t-SNE Dimension 1',
    y='t-SNE Dimension 2',
    hover_name='Article Name',
    title="t-SNE Visualization of MiniLM-L6-v2 Embeddings",
    template="plotly_dark",
    color='Colors',
    width=700,
    height=700,
)
fig = px.scatter(tsne_embeddings_mini_df, **scatter_options)

# Replace the title text and center it horizontally
centered_title = dict(text="t-SNE Visualization of MiniLM",
                      x=0.5,
                      xanchor='center')
fig.update_layout(title=centered_title)
fig.show()