 # Import & Setup

In [1]:
import numpy as np
import pandas as pd
from glob import glob
from pathlib import Path
from pprint import pprint
from warnings import warn
from datetime import datetime
from ppmi_model import PPMIModel
from tppmi_model import TPPMIModel
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# --- currently unused --- #
import os
import tppmi_functions
import matplotlib.pyplot as plt

In [2]:
# Define directory and target words
DIR = Path("../../data")
ppmi_path = DIR / "ppmi-matrices"

In [3]:
target_words = ["twitter", "elon", "musk"]

Load the data into a dictionary with keys identifying the months

In [4]:
filenames = glob(str(ppmi_path / "*.txt"))
ppmi_dfs = {filename.split("2022-")[1][0:2]: pd.read_csv(ppmi_path / filename, sep=" ") for filename in filenames}

Set the vocab to be the index (instead of being the first attribute)

In [5]:
ppmi_dfs = {key:ppmi_df.set_index(ppmi_df.columns[0]) for key, ppmi_df in ppmi_dfs.items()}

Create ppmi_model objects

In [6]:
ppmi_models = {key: PPMIModel.construct_from_data(ppmi_df) for key, ppmi_df in ppmi_dfs.items()}

# Create TPPMI Models

In [7]:
ppmi_models.keys()

dict_keys(['09', '11', '08', '10', '12'])

In [8]:
tppmi_model = TPPMIModel(ppmi_models, target_words)

init
All words are contained in the vocabulary
elon - not in vocab of timestep: 09
elon - not in vocab of timestep: 08
musk - not in vocab of timestep: 09
musk - not in vocab of timestep: 08


In [9]:
pprint(tppmi_model.get_vocabulary()[:100], width=100, compact=True)

['appleton', 'ankeny', 'hinshaws', 'implement', 'probe', 'iuic', 'height', 'hpv', 'studio',
 'question', 'encountered', 'smoke', 'fumbling', 'tries', 'armed', 'demeaning', 'stronger', 'mama',
 'oversees', 'lies', 'discovering', 'files', 'fordham', 'jamesmelville', 'abpoli', 'beds',
 'rightwing', 'estate', 'wolfs', 'from', 'sordid', 'new', 'chemotherapy', 'clovis', 'untrue',
 'stadium', '51', 'meghan', 'councillors', 'recovered', 'brandnew', '69', 'truthful', 'divisive',
 'genocide', 'liberties', 'nomination', 'output', 'wash', 'lifestyles', 'immunisation', '250000',
 'reading', 'therickydavila', 'campuses', 'gordner', '2022', 'funny', 'implications', 'fold',
 'hike', 'stakes', 'pending', 'engagement', 'delete', 'free', 'alert', 'nsw', 'removedto',
 'household', 'lee', 'smk', 'holy', 'once', 'biggest', 'creator', 'spreads', 'pelosi', 'rapidly',
 'manchester', 'terrible', 'educating', 'newest', 'robin', 'aps', 'boulder', 'guilty', 'andrew',
 'yep', 'genitals', 'besides', 'sensory', 'engi

In [18]:
print(f"Size of the vocabulary: {tppmi_model.get_vocabulary_size()}")

Size of the vocabulary: 11624


In [19]:
vectors = tppmi_model.get_2d_representation(use_tsne=False)

In [20]:
vectors

{'twitter_09': array([  7.95903118, -11.61064671]),
 'twitter_11': array([25.90919645, -6.03353092]),
 'twitter_08': array([  6.29481126, -10.85692105]),
 'twitter_10': array([ 19.36825537, -21.26298013]),
 'twitter_12': array([32.37357664, 30.46065707]),
 'elon_09': array([-7.17823458, -2.0063212 ]),
 'elon_11': array([-10.42748052,  -3.27662236]),
 'elon_08': array([-7.17823458, -2.0063212 ]),
 'elon_10': array([-10.16725011,  -3.36896984]),
 'elon_12': array([-12.63605096,  19.81113809]),
 'musk_09': array([-7.17823458, -2.0063212 ]),
 'musk_11': array([-8.20744323, -2.57253341]),
 'musk_08': array([-7.17823458, -2.0063212 ]),
 'musk_10': array([-12.08766203,  -3.19756666]),
 'musk_12': array([-9.66604572, 19.93326071])}

Create a relative time scales stored in list

In [13]:
dates = [filename.split("s/ppmi-")[1].split(".")[0] for filename in filenames]
date_objects = [datetime.strptime(date, '%Y-%m-%d') for date in dates]
numdates = [(date - min(date_objects)).days for date in date_objects]
numdates_list = []

In [14]:
if any(date is None for date in numdates):
    raise ValueError("NA-s or invalid dates detected!")

# Reduce Dimensionality

In [15]:
def plot_word_vectors_2d(word_vectors_dict):
    plt.figure(figsize=(10, 8))

    unique_words = set([word.split("_")[0] for word in word_vectors_dict.keys()])
    color_map = plt.cm.rainbow(np.linspace(0, 1, len(unique_words)))
    word_color_dict = dict(zip(unique_words, color_map))

    legend_words = []  # Keep track of words added to the legend

    for word, vectors_dict in word_vectors_dict.items():
        pc1_values = vectors_dict[0]
        pc2_values = vectors_dict[1]
        color = word_color_dict[word.split("_")[0]]

        # Only add the word to the legend if it hasn't been added before
        if word.split("_")[0] not in legend_words:
            plt.scatter(pc1_values, pc2_values, label=word.split("_")[0], color=color)
            legend_words.append(word.split("_")[0])
        else:
            plt.scatter(pc1_values, pc2_values, color=color)

    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.title("2D Visualization of Word Vectors")
    plt.legend()
    plt.grid(True)
    plt.show()

In [16]:
import plotly.io as pio
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

pio.templates.default = "plotly"

def plot_word_vectors_2d_plotly(word_vectors_dict):
    unique_words = set([word.split("_")[0] for word in word_vectors_dict.keys()])
    color_map = px.colors.qualitative.Plotly  # Get a set of plotly colors
    word_color_dict = dict(zip(unique_words, color_map))

    data = []  # List to store scatter plot and line plot data
    prev_coords = {}  # Dictionary to store previous point's coordinates for each color

    for word, vectors_dict in word_vectors_dict.items():
        word_key = word.split("_")[0]
        pc1_values = vectors_dict[0]
        pc2_values = vectors_dict[1]
        color = word_color_dict[word_key]

        # Create scatter plot data for the point
        marker_trace = go.Scatter(
            x=[pc1_values],
            y=[pc2_values],
            mode='markers+text',  # Include text labels with markers
            name=word_key,
            showlegend=False,
            marker=dict(color=color),
            text=[word_key],  # Set the text label for the point
            textposition="top center"  # Position of the text label
        )
        data.append(marker_trace)

        # Check if there's a previous point for the same color
        if color in prev_coords:
            prev_pc1, prev_pc2 = prev_coords[color]
            # Create line plot data connecting the previous point and the current point
            line_trace = go.Scatter(
                x=[prev_pc1, pc1_values],
                y=[prev_pc2, pc2_values],
                mode='lines',
                showlegend=False,
                line=dict(color=color, width=1)
            )
            data.append(line_trace)
        else:
            # Create a specific symbol marker for the first point of each line
            start_marker_trace = go.Scatter(
                x=[pc1_values],
                y=[pc2_values],
                mode='markers',
                showlegend=False,
                marker=dict(symbol="diamond", size=10, color=color),
            )
            data.append(start_marker_trace)

        # Store the current point's coordinates as the previous coordinates for the color
        prev_coords[color] = (pc1_values, pc2_values)

    layout = dict(
        title="2D Visualization of Word Vectors",
        xaxis=dict(title="PC1", range=[-50, 50]),
        yaxis=dict(title="PC2", range=[-50, 50]),
        legend=dict(x=1.02, y=1.0)
    )

    # Create a Figure object with the data and layout
    fig = go.Figure(data=data, layout=layout)
    # Display the plot in a notebook or save to an HTML file

    init_notebook_mode(connected=True)
    iplot(fig, filename='word-embedding-plot')

In [17]:
plot_word_vectors_2d_plotly(vectors)