# Model Analysis

In [187]:
import numpy as np
import glob
import warnings
import random
from pathlib import Path
from gensim.models.word2vec import Word2Vec
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# matplotlib
# %matplotlib notebook (makes plot interactive but slows things down and makes them buggy eventually)
import matplotlib.pyplot as plt
# plotly
from plotly.offline import init_notebook_mode, iplot, plot
import plotly.graph_objs as go
import plotly.io as pio
pio.templates.default = "plotly"

In [188]:
# Silence every warning for the rest of the session (this filter is global, not cell-local)
warnings.filterwarnings("ignore")

# Directory that is searched for trained models
model_dir = Path("model/")
# Paths of all "*.model" files directly inside model_dir.
# NOTE(review): glob does not guarantee any particular order, while later cells
# pick entries out of lists derived from this one by position.
model_files = glob.glob(str(model_dir / "*.model"))

In [189]:
# Get models by their split date
# A model belongs to an event when the event's split date occurs in its path
model_files_griner = [path for path in model_files if "07_Oct" in path]
model_files_twitter = [path for path in model_files if "01_Oct" in path]
model_files_pelosi = [path for path in model_files if "26_Oct" in path]
model_files_colorado_springs = [path for path in model_files if "18_Nov" in path]

In [190]:
model_files_griner

['model/01_Jun_to_07_Oct.model', 'model/07_Oct_to_28_Apr.model']

In [191]:
model_files_twitter

['model/01_Oct_to_28_Apr.model', 'model/01_Jun_to_01_Oct.model']

In [192]:
model_files_pelosi

['model/26_Oct_to_28_Apr.model', 'model/01_Jun_to_26_Oct.model']

In [193]:
model_files_colorado_springs

['model/01_Jun_to_18_Nov.model', 'model/18_Nov_to_28_Apr.model']

In [194]:
def _load_split_models(split_files):
    """Load the (pre, post) Word2Vec models belonging to one split date.

    The order of the paths produced by glob is not guaranteed, so the two
    models are identified by file name rather than by list position: the
    "pre" file covers 01_Jun up to the split date, the "post" file covers
    the split date up to 28_Apr.

    Args:
        split_files: paths of the two model files for one split date.

    Returns:
        Tuple ``(model_pre, model_post)``.

    Raises:
        ValueError: if there is not exactly one pre file and one post file.
    """
    def _only(matches, role):
        # Fail loudly instead of silently loading the wrong period
        if len(matches) != 1:
            raise ValueError(f"expected exactly one {role} model file, found {matches!r}")
        return matches[0]

    pre_file = _only(
        [f for f in split_files if Path(f).name.startswith("01_Jun_to_")], "pre-split"
    )
    post_file = _only(
        [f for f in split_files if Path(f).name.endswith("_to_28_Apr.model")], "post-split"
    )
    return Word2Vec.load(pre_file), Word2Vec.load(post_file)


model_griner_pre, model_griner_post = _load_split_models(model_files_griner)

model_twitter_pre, model_twitter_post = _load_split_models(model_files_twitter)

model_pelosi_pre, model_pelosi_post = _load_split_models(model_files_pelosi)

model_colorado_springs_pre, model_colorado_springs_post = _load_split_models(model_files_colorado_springs)

# Model stored under a fixed name, independent of any split date
model_static = Word2Vec.load("model/word2vec.model")

# 2-D Visualization

2-D visualization of word embeddings.
The function can use either PCA or t-SNE to reduce the dimensionality of the word vectors.
It can plot with either matplotlib or plotly.

In [195]:
def display_scatterplot(model, words, keyword, title = "Words in the embedding space", subtitle = None, use_tsne=False, use_plotly = False):
    """Project selected word vectors to 2-D and plot them.

    Args:
        model: object whose ``wv`` attribute is handed to the reduction function.
        words: words to visualize; copied, so the caller's list is not modified.
        keyword: word to highlight; added to the plotted words if it is missing.
        title: plot title.
        subtitle: optional plot subtitle.
        use_tsne: reduce with ``perform_tsne`` when true, else ``perform_pca``.
        use_plotly: plot with ``plot_with_plotly`` when true, else ``plot_with_matplotlib``.
    """
    # Work on a copy so that appending the keyword stays local to this call
    word_list = words.copy()
    if keyword not in word_list:
        # Without this the keyword itself would be absent from the plot
        word_list.append(keyword)

    # Pick the dimensionality reduction, then the plotting backend
    reduce_dimensions = perform_tsne if use_tsne else perform_pca
    vectors_2d = reduce_dimensions(model.wv, word_list)

    render = plot_with_plotly if use_plotly else plot_with_matplotlib
    render(vectors_2d, word_list, keyword, title = title, subtitle = subtitle)

## Variants of Plotting

In [196]:
def plot_with_plotly(vectors, labels, keyword, title = "Words in the embedding space", subtitle = None, plot_in_notebook=True):
    """Draw 2-D word vectors as a labelled plotly scatter plot.

    Args:
        vectors: iterable of (x, y) pairs, one per entry in ``labels``.
        labels: words printed next to the points, in the same order as ``vectors``.
        keyword: label drawn in red; every other label is drawn in black.
        title: figure title, centred horizontally.
        subtitle: optional text placed just above the plotting area; no
            annotation is added when it is empty or None.
        plot_in_notebook: when true the figure is rendered inline with
            ``iplot``; otherwise it is written to ``word-embedding-plot.html``.
    """
    # One trace per word so that every point can carry its own colour
    data = []
    for label, (x, y) in zip(labels, vectors):
        # keyword is red, other words are black
        text_color = 'red' if label == keyword else 'black'
        data.append(go.Scatter(x=[x], y=[y], mode='markers+text', text=[label], textfont=dict(color=text_color),
                               marker=dict(color=text_color), textposition='top center'))

    # Only annotate when there is a subtitle; an annotation without text
    # would otherwise be added to the figure for nothing
    annotations = []
    if subtitle:
        annotations.append(
            dict(
                x=0.5,
                y=1.08,
                xref='paper',
                yref='paper',
                text=subtitle,
                showarrow=False,
            )
        )

    # Title font and position are set through the nested title properties
    # rather than the deprecated `titlefont` shortcut
    layout = go.Layout(
        title=dict(text=title, x=0.5, font=dict(size=22)),
        xaxis=dict(title='X'),
        yaxis=dict(title='Y'),
        showlegend = False,
        annotations=annotations,
    )

    # Create a Figure object with the data and layout
    fig = go.Figure(data=data, layout=layout)

    # Display the plot in a notebook or save to an HTML file
    if plot_in_notebook:
        init_notebook_mode(connected=True)
        iplot(fig, filename='word-embedding-plot')
    else:
        # Pass the figure (not just the traces) so title and subtitle are kept
        plot(fig, filename='word-embedding-plot.html')

In [197]:
def plot_with_matplotlib(vectors, labels, keyword, title = "Words in the embedding space", subtitle = None):
    """Draw 2-D word vectors as a labelled matplotlib scatter plot.

    Args:
        vectors: 2-D array; column 0 holds x values and column 1 holds y values.
        labels: words written next to the points, in the same order as ``vectors``.
        keyword: label written in red; all other labels are written in orange.
        title: figure-level title.
        subtitle: optional axes title shown below the figure title.
    """
    plt.figure(figsize=(6,6))

    # All markers share one look; only the text colour singles out the keyword
    xs = vectors[:,0]
    ys = vectors[:,1]
    plt.scatter(xs, ys, edgecolors='k', c='orange')

    for label, point in zip(labels, vectors):
        px, py = point
        label_color = 'orange'
        if label == keyword:
            label_color = 'red'
        # Small shift so the text does not sit on top of its marker
        plt.text(px + 0.02, py + 0.02, label, color=label_color)

    plt.suptitle(title, fontsize=14)
    if not subtitle:
        return
    plt.title(subtitle, fontsize=10)

## Variants of dimension reduction

In [198]:
def perform_pca(model, words):
    """Return the first two principal components of the given words' vectors.

    Args:
        model: mapping-like object; ``model[word]`` yields the word's vector.
        words: words whose vectors are projected.

    Returns:
        Array with one row per word and the first two PCA columns.
    """
    # Stack the looked-up vectors into one matrix, one row per word
    stacked = np.array([model[token] for token in words])

    # PCA is fitted with its default component count; only the two leading
    # components are kept afterwards
    projected = PCA().fit_transform(stacked)
    return projected[:, :2]

In [199]:
from sklearn.preprocessing import StandardScaler
def perform_tsne(model, words):
    """Embed the given words' vectors in 2-D with t-SNE and standardize the result.

    Args:
        model: mapping-like object; ``model[word]`` yields the word's vector.
        words: words whose vectors are embedded.

    Returns:
        The t-SNE output after ``StandardScaler.fit_transform``.
    """
    # Stack the looked-up vectors into one matrix, one row per word
    stacked = np.array([model[token] for token in words])

    # Use perplexity 5 at most, and always stay below the number of words
    capped_perplexity = min(5, len(words) - 1)

    # Fixed random_state keeps the embedding the same between runs
    reducer = TSNE(n_components=2, perplexity=capped_perplexity, random_state=1040)
    embedded = reducer.fit_transform(stacked)

    # Standardize both output axes before returning
    return StandardScaler().fit_transform(embedded)

## Sampling methods

In [200]:
def sample_from_most_similar(model_pre, model_post, keyword, sample_size=6, topn=20):
    """Randomly sample neighbours of ``keyword`` from two models.

    Half of ``sample_size`` (integer division) is drawn from the ``topn``
    most similar words of each model. The two draws are concatenated
    (pre-model words first) and duplicates are removed.

    Args:
        model_pre: model whose ``wv.most_similar`` supplies the first half.
        model_post: model whose ``wv.most_similar`` supplies the second half.
        keyword: word whose neighbours are sampled.
        sample_size: requested total number of words; capped at ``topn``.
        topn: number of most similar words requested from each model.

    Returns:
        List of unique words. It can be shorter than ``sample_size`` when
        ``sample_size`` is odd, when both models yield the same word, or when
        a model returns fewer neighbours than requested.
    """
    # Only the `topn` most similar words are retrieved per model
    sample_size = min(topn, sample_size)
    per_model = sample_size // 2

    similar_words_pre = model_pre.wv.most_similar(keyword, topn=topn)
    similar_words_post = model_post.wv.most_similar(keyword, topn=topn)

    # random.sample raises ValueError when asked for more items than exist,
    # so never request more than a model actually returned
    sampled_words_pre = random.sample(similar_words_pre, min(per_model, len(similar_words_pre)))
    sampled_words_post = random.sample(similar_words_post, min(per_model, len(similar_words_post)))

    # dict.fromkeys keeps first-seen order while dropping words drawn from both models
    return list(dict.fromkeys(word for word, _ in sampled_words_pre + sampled_words_post))

# Experiment

## Release of Brittney Griner

On December 8, 2022, Russia and the United States conducted a prisoner exchange, trading Brittney Griner, an American basketball player, for Viktor Bout, a Russian arms dealer. Griner, a WNBA champion star and Team USA Olympic athlete, had been convicted of smuggling and possession of cannabis in Russia earlier in 2022 and sentenced to nine years in prison.

Source: [Wikipedia article](https://en.wikipedia.org/wiki/Viktor_Bout%E2%80%93Brittney_Griner_prisoner_exchange)

In [201]:
# Keyword whose neighbourhood is inspected in both Griner models
test_word_griner = "griner"

01.06.2022 - 07.10.2022

In [202]:
# Words most similar to the keyword according to model_griner_pre
model_griner_pre.wv.most_similar(test_word_griner)

[('jens', 0.9540700912475586),
 ('rosenthal', 0.9530154466629028),
 ('colton', 0.9523400068283081),
 ('horton', 0.9502237439155579),
 ('leanne', 0.948662519454956),
 ('lesley', 0.9486276507377625),
 ('sasso', 0.9476233124732971),
 ('torrez', 0.9474111795425415),
 ('alanna', 0.9470879435539246),
 ('donaldson', 0.9470842480659485)]

07.10.2022 - 28.04.2023

In [203]:
# Words most similar to the keyword according to model_griner_post
model_griner_post.wv.most_similar(test_word_griner)

[('belarus', 0.8117255568504333),
 ('snatched', 0.8009015321731567),
 ('circa', 0.8008100390434265),
 ('hoi', 0.7934396266937256),
 ('lenin', 0.793010950088501),
 ('lionel', 0.7874842882156372),
 ('rudy', 0.7828936576843262),
 ('mcfadden', 0.7730939388275146),
 ('1929', 0.770993173122406),
 ('kidnapped', 0.7703869342803955)]

In [204]:
# Words plotted next to the keyword (display_scatterplot adds the keyword itself if it is missing)
vis_list_griner = ["brittney", "freedman", "32yearold", "kremlin", "zelensky", "vladimir", "joni"]
# Shared title of both Griner plots
title_griner = "Brittney Griner prisoner exchange"

In [205]:
# PCA projection of the pre-split model, rendered with plotly
display_scatterplot(
    model_griner_pre,
    vis_list_griner,
    test_word_griner,
    title=title_griner,
    subtitle="01 June 2022 - 07 October 2022",
    use_tsne=False,
    use_plotly=True,
)

In [207]:
# PCA projection of the post-split model, rendered with plotly
display_scatterplot(
    model_griner_post,
    vis_list_griner,
    test_word_griner,
    title=title_griner,
    subtitle="07 October 2022 - 28 April 2023",
    use_tsne=False,
    use_plotly=True,
)

## Elon Musk Twitter takeover

Business magnate Elon Musk initiated an acquisition of American social media company Twitter, Inc. on April 14, 2022, and concluded it on October 27, 2022. Musk had begun buying shares of the company in January 2022, becoming its largest shareholder by April with a 9.1 percent ownership stake.

source: [Wikipedia article](https://en.wikipedia.org/wiki/Acquisition_of_Twitter_by_Elon_Musk#:~:text=Business%20magnate%20Elon%20Musk%20initiated,a%209.1%20percent%20ownership%20stake.)

In [208]:
# Keyword whose neighbourhood is inspected in both Twitter models
test_word_twitter = "musk"

In [209]:
# Words most similar to the keyword according to model_twitter_pre
model_twitter_pre.wv.most_similar(test_word_twitter)

[('elon', 0.9230594038963318),
 ('gab', 0.7670913338661194),
 ('rufos', 0.7479801177978516),
 ('mitchs', 0.7462440133094788),
 ('prolific', 0.736637532711029),
 ('1540', 0.7365007996559143),
 ('karine', 0.7309271693229675),
 ('lords', 0.7297621369361877),
 ('tirade', 0.7284401059150696),
 ('kemi', 0.7270700335502625)]

In [210]:
# Words most similar to the keyword according to model_twitter_post
model_twitter_post.wv.most_similar(test_word_twitter)

[('elon', 0.9650890827178955),
 ('musks', 0.9605391621589661),
 ('amandamarcotte', 0.9394629001617432),
 ('mattwallace888', 0.9056192636489868),
 ('twitter', 0.8312138319015503),
 ('tweets', 0.6574302315711975),
 ('hashtag', 0.6550123691558838),
 ('screenshot', 0.6542471051216125),
 ('censored', 0.6467615962028503),
 ('libs', 0.6451894640922546)]

In [211]:
# Words plotted for the Twitter event; the keyword "musk" is already part of the list
vis_list_twitter = ["facebook", "instagram", "tiktok", "user", "twitter", "elon", "musk", "tesla"]
# Shared title of both Twitter plots
title_twitter = "Acquisition of Twitter by Elon Musk"

In [212]:
# PCA projection of the pre-split model, rendered with plotly
display_scatterplot(
    model_twitter_pre,
    vis_list_twitter,
    test_word_twitter,
    title=title_twitter,
    subtitle="01 June 2022 - 01 October 2022",
    use_tsne=False,
    use_plotly=True,
)

In [213]:
# PCA projection of the post-split model, rendered with plotly
display_scatterplot(
    model_twitter_post,
    vis_list_twitter,
    test_word_twitter,
    title=title_twitter,
    subtitle="01 October 2022 - 28 April 2023",
    use_tsne=False,
    use_plotly=True,
)

## Attack on Paul Pelosi

On October 28, 2022, an intruder attacked Paul Pelosi, the 82-year-old husband of Nancy Pelosi, then the Speaker of the United States House of Representatives. The assailant beat Paul Pelosi with a hammer during a home invasion burglary of the couple's residence in Pacific Heights, San Francisco. He was seriously injured and underwent surgery for his fractured skull.

source: [Wikipedia article](https://en.wikipedia.org/wiki/Attack_on_Paul_Pelosi)

In [214]:
# Keyword whose neighbourhood is inspected in both Pelosi models
test_word_pelosi = "hammer"

In [215]:
# Words most similar to the keyword according to model_pelosi_pre
model_pelosi_pre.wv.most_similar(test_word_pelosi)

[('insulted', 0.7824319005012512),
 ('faking', 0.7689189910888672),
 ('friggin', 0.7634397149085999),
 ('kindergartener', 0.7623264789581299),
 ('cowardly', 0.7585606575012207),
 ('inked', 0.7581943273544312),
 ('jaw', 0.7577055096626282),
 ('detentions', 0.7575969099998474),
 ('affections', 0.7562553286552429),
 ('moscow', 0.7543800473213196)]

In [216]:
# Words most similar to the keyword according to model_pelosi_post
model_pelosi_post.wv.most_similar(test_word_pelosi)

[('pelosis', 0.7470654249191284),
 ('hoe', 0.7042176723480225),
 ('dicking', 0.6948853135108948),
 ('loses', 0.6946367025375366),
 ('virgin', 0.691310703754425),
 ('barebacked', 0.6904866099357605),
 ('fists', 0.6902223825454712),
 ('doggystyled', 0.6830849051475525),
 ('shoplifting', 0.679892361164093),
 ('rapper', 0.6786114573478699)]

In [217]:
# Words plotted for the Pelosi event; the keyword "hammer" is already part of the list
vis_list_pelosi = ["hammer", "nail", "carpenter", "pelosi", "fractured", "pelosis"]
# Shared title of both Pelosi plots
title_pelosi = "Attack on Paul Pelosi"

In [218]:
# PCA projection of the pre-split model, rendered with plotly
display_scatterplot(
    model_pelosi_pre,
    vis_list_pelosi,
    test_word_pelosi,
    title=title_pelosi,
    subtitle="01 June 2022 - 26 October 2022",
    use_tsne=False,
    use_plotly=True,
)

In [219]:
# PCA projection of the post-split model, rendered with plotly
display_scatterplot(
    model_pelosi_post,
    vis_list_pelosi,
    test_word_pelosi,
    title=title_pelosi,
    subtitle="26 October 2022 - 28 April 2023",
    use_tsne=False,
    use_plotly=True,
)

## Colorado Springs shooting at LGBTQ nightclub

On November 19–20, 2022, an anti-LGBT-motivated mass shooting occurred at Club Q, a gay bar in Colorado Springs, Colorado, United States. Five people were murdered, and 25 others were injured, 19 of them by gunfire. The shooter, 22-year-old Anderson Lee Aldrich, was also injured while being restrained, and was taken to a local hospital.

Source: [Wikipedia article](https://en.wikipedia.org/wiki/Colorado_Springs_nightclub_shooting)

In [220]:
# Keyword whose neighbourhood is inspected in both Colorado Springs models
test_word_colorado_springs = "nightclub"

In [221]:
# Words most similar to the keyword according to model_colorado_springs_pre
model_colorado_springs_pre.wv.most_similar(test_word_colorado_springs)

[('uncontrolled', 0.8246869444847107),
 ('viruss', 0.8086864948272705),
 ('bacteria', 0.7970495820045471),
 ('storms', 0.7968387007713318),
 ('scourge', 0.7943069338798523),
 ('deadly', 0.7929344773292542),
 ('ba1', 0.7917760014533997),
 ('waves', 0.7911545634269714),
 ('cholera', 0.7885691523551941),
 ('floods', 0.7823625206947327)]

In [222]:
# Words most similar to the keyword according to model_colorado_springs_post
model_colorado_springs_post.wv.most_similar(test_word_colorado_springs)

[('murders', 0.8588711619377136),
 ('spree', 0.8488684892654419),
 ('terror', 0.8468985557556152),
 ('covenant', 0.8098364472389221),
 ('rampage', 0.8085429668426514),
 ('gunman', 0.8083027005195618),
 ('nashville', 0.8069188594818115),
 ('filmed', 0.8009072542190552),
 ('mass', 0.7958326935768127),
 ('parkland', 0.7932655215263367)]

In [223]:
# Words plotted for the Colorado Springs event; the keyword "nightclub" is already part of the list
vis_list_colorado_springs = ["kentucky", "baltimore", "denver", "massacre", "gunman", "nightclub", "springs"]
# Shared title of both Colorado Springs plots
title_colorado_springs = "Colorado Springs nightclub shooting"

In [224]:
# PCA projection of the pre-split model, rendered with plotly
display_scatterplot(
    model_colorado_springs_pre,
    vis_list_colorado_springs,
    test_word_colorado_springs,
    title=title_colorado_springs,
    subtitle="01 June 2022 - 18 November 2022",
    use_tsne=False,
    use_plotly=True,
)

In [225]:
# PCA projection of the post-split model, rendered with plotly
display_scatterplot(
    model_colorado_springs_post,
    vis_list_colorado_springs,
    test_word_colorado_springs,
    title=title_colorado_springs,
    subtitle="18 November 2022 - 28 April 2023",
    use_tsne=False,
    use_plotly=True,
)