# Creation of Sparse PPMI Embeddings for the NYT Dataset

In [2]:
%load_ext autoreload

In [3]:
import glob
import pickle
import pandas as pd
from pathlib import Path
from pprint import pprint
from tqdm.notebook import tqdm

import sys
sys.path.append('../../')

from src.packages.TPPMI.ppmi_model import PPMIModel
from src.packages.TPPMI.tppmi_model import TPPMIModel
import src.packages.TPPMI.util as tppmi_util

import nltk
# nltk.download('punkt');

## Setup

In [13]:
DATA_DIR = Path("../../data")
input_dir =  DATA_DIR / "processed" / "nyt-data"
min_freq = 5 # infrequent words have been removed in preprocessing
window_size = 5 # default word2vec window-size
number_of_context_words = 500

## Sample Context Words

In [14]:
df = pd.read_csv(input_dir / "corpus_all.csv")
df = df[["text"]]

In [15]:
df = df.dropna()

In [16]:
corpus = ' '.join(df["text"])

In [17]:
top_n = max(2 * number_of_context_words, 2000)
context_words = tppmi_util.sample_from_most_common_words(corpus, top_n=top_n,
                                                         sample_size=number_of_context_words, remove_stopwords=False)

In [18]:
pprint(context_words, compact=True, width=100)

['supposed', 'ended', 'edge', 'great', 'businesses', 'june', 'hit', 'yet', 'fell', 'records',
 'says', '6', 'information', 'continued', 'stock', 'arts', 'restaurants', 'takes', 'weeks',
 'wanted', 'rice', 'scott', 'reviews', 'highest', 'arms', 'street', 'meant', 'death', 'executive',
 'earth', 'market', 'car', 'give', 'consumer', 'described', 'ways', 'working', 'movies', 'panel',
 'cars', 'face', 'covered', 'kill', 'agreed', 'village', 'taken', 'larger', 'grow', 'operating',
 'asked', 'thursday', 'seemed', 'understand', 'growing', 'coverage', 'republican', 'pay', 'dream',
 'pulled', 'hour', 'post', 'looking', 'open', 'appear', 'color', 'expect', 'mayor', 'williams',
 'favor', 'telling', 'tell', 'normal', 'path', 'jersey', 'worst', 'pm', 'actually', 'quickly',
 'faces', 'davis', 'april', 'vote', 'relationship', 'hardly', 'said', 'victory', 'broken', 'attack',
 'college', 'cut', 'film', 'labor', 'nearby', 'area', 'town', 'father', 'researchers', 'memory',
 'ny', 'perhaps', 'services', 'i

## Build Model

In [19]:
csv_files = glob.glob(str(input_dir /  "*data.csv"))

# use dict to be able to identify the dfs later on
splits = {}

In [20]:
for filename in csv_files:
    df = pd.read_csv(filename)
    splits[filename.split("_data")[0][-4:]] = df

In [21]:
ppmi_models = {key: PPMIModel.construct_from_texts(value, context_words=context_words, min_freq=min_freq) for key, value in tqdm(splits.items())}

  0%|          | 0/27 [00:00<?, ?it/s]

## Control

In [22]:
ppmi_models.keys()

dict_keys(['2000', '2001', '1997', '1996', '2016', '1990', '1991', '2010', '2011', '2007', '2006', '1994', '1995', '2014', '2015', '2009', '2008', '2003', '2002', '2004', '2005', '1999', '1998', '1993', '1992', '2013', '2012'])

In [30]:
ppmi_model_2000 = ppmi_models["2000"]
ppmi_model_2000.compute_ppmi_matrix(window_size=window_size, normalize=True)

array([[0.        , 0.        , 0.        , ..., 0.10130641, 0.08115552,
        0.03853088],
       [0.        , 0.        , 0.        , ..., 0.        , 0.2714976 ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [31]:
ppmi_model_2006 = ppmi_models["2006"]
ppmi_model_2006.compute_ppmi_matrix(window_size=window_size, normalize=True)

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.12466418,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [32]:
ppmi_model_2000.get_as_df().loc["bush"]

supposed      0.063795
ended         0.098751
edge          0.000000
great         0.000000
businesses    0.000000
                ...   
parents       0.036076
scientists    0.000000
begins        0.000000
died          0.000000
johnson       0.000000
Name: bush, Length: 500, dtype: float64

In [33]:
ppmi_model_2006.get_as_df().loc["bush"]

supposed      0.000000
ended         0.063651
edge          0.000000
great         0.000000
businesses    0.000000
                ...   
parents       0.000000
scientists    0.000000
begins        0.000000
died          0.076239
johnson       0.000000
Name: bush, Length: 500, dtype: float64

## Vocabulary

In [34]:
for key, ppmi_model in ppmi_models.items():
    print(f"Vocabulary size of timestep {key}: {ppmi_model.get_vocabulary_size()}")

Vocabulary size of timestep 2000: 20680
Vocabulary size of timestep 2001: 20547
Vocabulary size of timestep 1997: 17841
Vocabulary size of timestep 1996: 19706
Vocabulary size of timestep 2016: 17469
Vocabulary size of timestep 1990: 18567
Vocabulary size of timestep 1991: 19057
Vocabulary size of timestep 2010: 20049
Vocabulary size of timestep 2011: 20557
Vocabulary size of timestep 2007: 21273
Vocabulary size of timestep 2006: 11661
Vocabulary size of timestep 1994: 19225
Vocabulary size of timestep 1995: 19358
Vocabulary size of timestep 2014: 20996
Vocabulary size of timestep 2015: 20951
Vocabulary size of timestep 2009: 20266
Vocabulary size of timestep 2008: 21099
Vocabulary size of timestep 2003: 20680
Vocabulary size of timestep 2002: 20684
Vocabulary size of timestep 2004: 21296
Vocabulary size of timestep 2005: 21307
Vocabulary size of timestep 1999: 20306
Vocabulary size of timestep 1998: 20269
Vocabulary size of timestep 1993: 19114
Vocabulary size of timestep 1992: 19152


In [36]:
tppmi_model = TPPMIModel(ppmi_models, dates="years")

## Normalize (currently normalized otherwise)

In [37]:
# tppmi_model.normalize_tppmi_scores()

## Calulate & Save

In [40]:
output_dir = DATA_DIR / "ppmi-matrices" / "nyt-data"
output_dir.mkdir(parents=True, exist_ok=True)

# save common context-words

with open(output_dir / f"context-words.pkl", "wb") as f:
    pickle.dump(context_words, f)

# save vocab and ppmi-matrices
for key, ppmi_model in tqdm(tppmi_model.ppmi_models.items()):
    ppmi_model.compute_ppmi_matrix(window_size=window_size, normalize=True)
    ppmi_model.save(key, output_dir)
    print("---------------------")

print("--------------- done ---------------")
print(f"PPMI-Data stored to: {str(output_dir)}")
print(f"Number of context-words: {len(context_words)}")

  0%|          | 0/27 [00:00<?, ?it/s]

PPMI data for 2000 saved successfully.
Vocabulary Size: 20680
---------------------
PPMI data for 2001 saved successfully.
Vocabulary Size: 20547
---------------------
PPMI data for 1997 saved successfully.
Vocabulary Size: 17841
---------------------
PPMI data for 1996 saved successfully.
Vocabulary Size: 19706
---------------------
PPMI data for 2016 saved successfully.
Vocabulary Size: 17469
---------------------
PPMI data for 1990 saved successfully.
Vocabulary Size: 18567
---------------------
PPMI data for 1991 saved successfully.
Vocabulary Size: 19057
---------------------
PPMI data for 2010 saved successfully.
Vocabulary Size: 20049
---------------------
PPMI data for 2011 saved successfully.
Vocabulary Size: 20557
---------------------
PPMI data for 2007 saved successfully.
Vocabulary Size: 21273
---------------------
PPMI data for 2006 saved successfully.
Vocabulary Size: 11661
---------------------
PPMI data for 1994 saved successfully.
Vocabulary Size: 19225
--------------