# Model Analysis

In [1]:
import pickle
from glob import glob
from pathlib import Path
import scipy.sparse as sp
from tqdm.notebook import tqdm
from gensim.models.word2vec import Word2Vec

import sys
sys.path.append('../../')

from src.packages.TPPMI.ppmi_model import PPMIModel
from src.packages.TPPMI.tppmi_model import TPPMIModel

from src.visualization.embedding_visualization import  \
    plot_word_vectors_cade,  plot_word_vectors_tppmi, print_most_similar_cade

## Setup

### Paths

In [2]:
# Kept as a string because it is used as a directory name below.
number_of_context_words = "500"

# Directory holding the quarterly model files.
input_dir = Path("../../model", "social-media-data", "quarter")

# Directory holding the PPMI matrices for the chosen number of context words.
ppmi_path = Path(
    "../../data",
    "ppmi-matrices",
    "social-media-data",
    "quarter",
    number_of_context_words,
)

### Load Models

In [3]:
# Every "*.model" file directly inside the quarterly model directory.
cade_model_filenames = glob(str(input_dir.joinpath("*.model")))

In [4]:
# Load one Word2Vec model per file, keyed as "model_<mon>".
# The three-letter month tag is the start of the second "_"-separated field of
# the *file name*. Parsing the name instead of the full path means an
# underscore in any parent directory cannot shift the split index.
cade_models_quarterly = {
    f"model_{Path(model_file).name.split('_')[1][0:3].lower()}": Word2Vec.load(model_file)
    for model_file in tqdm(cade_model_filenames)
}

  0%|          | 0/4 [00:00<?, ?it/s]

Put them in the right chronological order

In [5]:
# Month tags in the order the models should appear.
order = ['jun', 'sep', 'dec', 'mar', 'apr']
month_codes = {"jun": 6, "sep": 9, "dec": 12, "mar": 3}

# Rank every "model_<mon>" key by the position of its month tag in `order`.
sorted_keys = sorted(
    cade_models_quarterly,
    key=lambda model_key: order.index(model_key.split('_')[1]),
)

# Rebuild the dictionary so that iteration follows the sorted key order.
cade_models_quarterly = {
    model_key: cade_models_quarterly[model_key] for model_key in sorted_keys
}

In [6]:
# Show the keys to confirm the order produced by the sorting cell.
cade_models_quarterly.keys()

dict_keys(['model_jun', 'model_sep', 'model_dec', 'model_mar'])

### TPPMI

In [7]:
# List the stored matrices (.npz) and pickled word lists (.pkl); sorting gives
# both listings a stable, name-based order.
ppmi_data_files = glob(str(ppmi_path.joinpath("*.npz")))
ppmi_data_files.sort()
words_files = glob(str(ppmi_path.joinpath("*.pkl")))
words_files.sort()

Split context-words from timestamped-vocabularies

In [8]:
# Pickles whose path mentions "context-words" go in the first list;
# all remaining pickles are treated as timestamped vocabularies.
context_words_file = list(filter(lambda pkl: "context-words" in pkl, words_files))
ppmi_vocab_files = list(filter(lambda pkl: "context-words" not in pkl, words_files))

In [9]:
# Get ppmi-matrices and vocab
# Vocabulary files and matrix files are paired by position in their sorted
# listings, so a differing count would silently pair the wrong files (zip
# truncates to the shorter list). Fail loudly instead.
if len(ppmi_vocab_files) != len(ppmi_data_files):
    raise ValueError(
        f"Found {len(ppmi_vocab_files)} vocabulary files but "
        f"{len(ppmi_data_files)} PPMI matrix files in {ppmi_path}"
    )

ppmi_matrices = {}

for vocab_file, matrix_file in zip(ppmi_vocab_files, ppmi_data_files):
    ppmi_matrix = sp.load_npz(matrix_file)
    # NOTE(security): pickle.load can execute arbitrary code; only load
    # files that were produced by this project.
    with open(vocab_file, "rb") as f:
        vocab = pickle.load(f)
    # The key is the two characters that follow "ppmi-" in the file name.
    # Parsing the name rather than the full path keeps this independent of
    # whether a parent directory (e.g. "ppmi-matrices") also contains "ppmi-".
    key = Path(vocab_file).name.split("ppmi-")[1][0:2]
    ppmi_matrices[key] = {"ppmi_matrix": ppmi_matrix, "vocab": vocab}

# Get common context-words
with open(context_words_file[0], "rb") as f:
    context_words = pickle.load(f)

In [10]:
# Show the keys under which the matrix/vocabulary pairs were stored.
ppmi_matrices.keys()

dict_keys(['03', '06', '09', '12'])

Create ppmi_model objects

In [11]:
# Build one PPMIModel per key from its matrix and vocabulary; every model is
# given the same context-word list.
ppmi_models = {
    period: PPMIModel.construct_from_data(
        entry["ppmi_matrix"], entry["vocab"], context_words
    )
    for period, entry in ppmi_matrices.items()
}

In [12]:
# Wrap the dictionary of PPMIModel objects in a single TPPMIModel.
tppmi_model = TPPMIModel(ppmi_models)

## Mass Shootings

### Overview

- **May:** Uvalde Robb Elementary School shooting (May 24, 2022)
- **November:** Colorado Springs nightclub shooting (November 19, 2022)
- **March:** Nashville school shooting (March 27, 2023)

In [13]:
# Word whose change over time is examined in this section.
target_word_shootings = "shooting"

# Comparison words; each matches a place named in the overview above.
test_words_shootings = [
    "uvalde",
    "colorado",
    "nashville",
]

In [14]:
# Plot title for this section. NOTE(review): none of the plotting calls visible
# in this notebook pass it on — confirm whether it is still needed.
title_shootings = f"Evolution of the word {target_word_shootings} over time"

### Visualizations

#### Cade

In [15]:
# Plot the comparison words plus the target word across the quarterly models.
# NOTE(review): perplexity is passed although use_tsne is False — presumably
# ignored in that case; confirm in plot_word_vectors_cade.
plot_word_vectors_cade(
    cade_models_quarterly,
    [*test_words_shootings, target_word_shootings],
    range=[-2, 2],
    use_tsne=False,
    perplexity=3,
)

#### TPPMI

In [16]:
# TPPMI representation of the comparison words plus the target word,
# requested with smoothing switched on.
tppmi_shootings = tppmi_model.get_tppmi(
    [*test_words_shootings, target_word_shootings],
    smooth=True,
)

All words are contained in the vocabulary


In [17]:
# Plot the comparison words plus the target word from the TPPMI model,
# with t-SNE disabled.
plot_word_vectors_tppmi(
    tppmi_model,
    [*test_words_shootings, target_word_shootings],
    use_tsne=False,
    range=[-10, 15],
)

All words are contained in the vocabulary


In [18]:
# Same request as the earlier smooth=True call, but with get_tppmi's default
# for `smooth`. This rebinds tppmi_shootings, so the earlier result is replaced.
tppmi_shootings = tppmi_model.get_tppmi(
    [*test_words_shootings, target_word_shootings]
)

All words are contained in the vocabulary


## Elon Musk Twitter takeover

### Overview

Business magnate Elon Musk initiated an acquisition of American social media company Twitter, Inc. on April 14, 2022, and concluded it on October 27, 2022. Musk had begun buying shares of the company in January 2022, becoming its largest shareholder by April with a 9.1 percent ownership stake.

source: [Wikipedia article](https://en.wikipedia.org/wiki/Acquisition_of_Twitter_by_Elon_Musk#:~:text=Business%20magnate%20Elon%20Musk%20initiated,a%209.1%20percent%20ownership%20stake.)

In [19]:
# Word whose change over time is examined in this section.
target_word_twitter = "twitter"

# Comparison words plotted alongside the target word.
test_words_twitter = [
    "facebook",
    "elon",
    "instagram",
]

In [20]:
# Plot title for this section. NOTE(review): none of the plotting calls visible
# in this notebook pass it on — confirm whether it is still needed.
title_twitter = f"Evolution of the word {target_word_twitter} over time"

### Visualizations

In [21]:
# Plot the comparison words plus the target word across the quarterly models.
plot_word_vectors_cade(
    cade_models_quarterly,
    [*test_words_twitter, target_word_twitter],
    range=[-8, 8],
)

In [22]:
# TPPMI representation of the comparison words plus the target word.
tppmi_twitter = tppmi_model.get_tppmi(
    [*test_words_twitter, target_word_twitter]
)

All words are contained in the vocabulary


In [23]:
# Plot the comparison words plus the target word from the TPPMI model.
plot_word_vectors_tppmi(
    tppmi_model,
    [*test_words_twitter, target_word_twitter],
    range=[-10, 15],
)

All words are contained in the vocabulary


## President

### Overview

June:
- Boris Johnson: vote of no confidence June 6, 2022
- Lyndon B. Johnson: talks leading up to the Civil Rights Act (July 2, 1964)
- Janet Godwin: The Iowa City-based education testing company ACT will be laying off 106 employees by July 1.

September:
- Scott Hagel: Vice president of marketing

January:
- Richard Cordray: considered as vice chairman of the Federal Reserve for banking supervision

March/April:
- Donald Trump: Trump charged with 34 felony counts in hush money scheme

In [24]:
# Word whose change over time is examined in this section.
target_word_president = "president"

# Comparison words plotted alongside the target word.
test_words_president = [
    "johnson",
    "trump",
    "biden",
    "former",
    "obama",
    "minister",
]

### Visualizations

In [25]:
# Plot the comparison words plus the target word across the quarterly models.
plot_word_vectors_cade(
    cade_models_quarterly,
    [*test_words_president, target_word_president],
    range=[-10, 10],
)

In [26]:
# TPPMI representation of the comparison words plus the target word.
tppmi_president = tppmi_model.get_tppmi(
    [*test_words_president, target_word_president]
)

All words are contained in the vocabulary


In [28]:
# Plot the comparison words plus the target word from the TPPMI model.
plot_word_vectors_tppmi(
    tppmi_model,
    [*test_words_president, target_word_president],
    range=[-10, 15],
)

All words are contained in the vocabulary
