# Creation of Sparse PPMI Embeddings for the NYT Dataset

In [1]:
%load_ext autoreload

In [2]:
import glob
import pickle
import pandas as pd
from pathlib import Path
from pprint import pprint
from tqdm.notebook import tqdm

import sys
sys.path.append('../../')

from src.packages.TPPMI.ppmi_model import PPMIModel
from src.packages.TPPMI.tppmi_model import TPPMIModel
import src.packages.TPPMI.util as tppmi_util

import nltk
# nltk.download('punkt');

## Setup

In [3]:
DATA_DIR = Path("../../data")
input_dir =  DATA_DIR / "processed" / "nyt-data"
min_freq = 0 # infrequent words have been removed in preprocessing
window_size = 5 # default word2vec window-size
number_of_context_words = 500

## Sample Context Words

In [4]:
df = pd.read_csv(input_dir / "corpus_all.csv")
df = df[["text"]]

In [5]:
df = df.dropna()

In [6]:
corpus = ' '.join(df["text"])

In [7]:
top_n = max(2 * number_of_context_words, 2000)
context_words = tppmi_util.sample_from_most_common_words(corpus, top_n=top_n,
                                                         sample_size=number_of_context_words, remove_stopwords=False)

In [8]:
pprint(context_words, compact=True, width=100)

['salad', 'jr', 'score', 'jersey', 'goal', 'tickets', '1997', 'cars', 'focused', 'mike', 'sell',
 'international', 'treatment', 'declined', 'weeks', 'spot', 'nations', 'little', 'worried',
 'traffic', 'decide', 'model', 'questions', 'nice', 'imagine', 'married', 'spending', 'statement',
 'fall', 'known', 'article', 'streets', 'hes', 'agency', 'recently', 'fewer', 'scott', 'changes',
 'came', 'test', 'prove', 'interesting', 'term', 'feet', 'dining', 'primary', 'scene', 'votes',
 'red', 'institute', 'girls', 'elections', 'issued', 'struggle', 'senate', 'talent', 'opening',
 'day', 'nothing', 'meet', 'unusual', 'sides', 'reason', 'community', 'proved', 'letters',
 'quality', 'prosecutors', 'block', 'starts', 'car', 'century', 'four', 'animals', 'unlikely',
 'faced', 'earned', 'followed', 'various', 'something', '70', 'night', 'feb', 'drew', 'ready',
 'bills', 'issue', 'camp', 'planning', 'think', 'generally', 'fear', 'republicans', 'knicks', '18',
 'friends', 'cause', 'rare', '15', 'matte

## Build Model

In [9]:
csv_files = glob.glob(str(input_dir /  "*data.csv"))

# use dict to be able to identify the dfs later on
splits = {}

In [10]:
for filename in csv_files:
    df = pd.read_csv(filename)
    splits[filename.split("_data")[0][-4:]] = df

In [11]:
ppmi_models = {key: PPMIModel.construct_from_texts(value, context_words=context_words, min_freq=min_freq) for key, value in tqdm(splits.items())}

  0%|          | 0/27 [00:00<?, ?it/s]

## Control

In [12]:
ppmi_models.keys()

dict_keys(['2000', '2001', '1997', '1996', '2016', '1990', '1991', '2010', '2011', '2007', '2006', '1994', '1995', '2014', '2015', '2009', '2008', '2003', '2002', '2004', '2005', '1999', '1998', '1993', '1992', '2013', '2012'])

In [13]:
ppmi_model = ppmi_models[next(iter(ppmi_models))]

In [14]:
ppmi_model.ppmi_matrix.shape

(21756, 500)

## Vocabulary

In [15]:
for key, ppmi_model in ppmi_models.items():
    print(f"Vocabulary size of timestep {key}: {ppmi_model.get_vocabulary_size()}")

Vocabulary size of timestep 2000: 21756
Vocabulary size of timestep 2001: 21780
Vocabulary size of timestep 1997: 21357
Vocabulary size of timestep 1996: 21602
Vocabulary size of timestep 2016: 21334
Vocabulary size of timestep 1990: 21238
Vocabulary size of timestep 1991: 21387
Vocabulary size of timestep 2010: 21798
Vocabulary size of timestep 2011: 21820
Vocabulary size of timestep 2007: 21931
Vocabulary size of timestep 2006: 19104
Vocabulary size of timestep 1994: 21503
Vocabulary size of timestep 1995: 21520
Vocabulary size of timestep 2014: 21856
Vocabulary size of timestep 2015: 21853
Vocabulary size of timestep 2009: 21810
Vocabulary size of timestep 2008: 21935
Vocabulary size of timestep 2003: 21836
Vocabulary size of timestep 2002: 21822
Vocabulary size of timestep 2004: 21906
Vocabulary size of timestep 2005: 21906
Vocabulary size of timestep 1999: 21709
Vocabulary size of timestep 1998: 21699
Vocabulary size of timestep 1993: 21439
Vocabulary size of timestep 1992: 21437


## Normalize (optional)

In [18]:
tppmi_model = TPPMIModel(ppmi_models, dates="years")

In [19]:
tppmi_model.normalize_tppmi_scores()

  0%|          | 0/27 [00:00<?, ?it/s]

  0%|          | 0/27 [00:01<?, ?it/s]

## Calulate & Save

In [22]:
output_dir = DATA_DIR / "ppmi-matrices" / "nyt-data"
output_dir.mkdir(parents=True, exist_ok=True)

# save common context-words

with open(output_dir / f"context-words.pkl", "wb") as f:
    pickle.dump(context_words, f)

# save vocab and ppmi-matrices
for key, ppmi_model in tqdm(tppmi_model.ppmi_models.items()):
    ppmi_model.compute_ppmi_matrix(window_size=window_size)
    ppmi_model.save(key, output_dir)

print("--------------- done ---------------")
print(f"PPMI-Data stored to: {str(output_dir)}")
print(f"Number of context-words: {len(context_words)}")

  0%|          | 0/27 [00:00<?, ?it/s]

PPMI data for 2000 saved successfully.
Vocabulary Size: 21756
PPMI data for 2001 saved successfully.
Vocabulary Size: 21780
PPMI data for 1997 saved successfully.
Vocabulary Size: 21357
PPMI data for 1996 saved successfully.
Vocabulary Size: 21602
PPMI data for 2016 saved successfully.
Vocabulary Size: 21334
PPMI data for 1990 saved successfully.
Vocabulary Size: 21238
PPMI data for 1991 saved successfully.
Vocabulary Size: 21387
PPMI data for 2010 saved successfully.
Vocabulary Size: 21798
PPMI data for 2011 saved successfully.
Vocabulary Size: 21820
PPMI data for 2007 saved successfully.
Vocabulary Size: 21931
PPMI data for 2006 saved successfully.
Vocabulary Size: 19104
PPMI data for 1994 saved successfully.
Vocabulary Size: 21503
PPMI data for 1995 saved successfully.
Vocabulary Size: 21520
PPMI data for 2014 saved successfully.
Vocabulary Size: 21856
PPMI data for 2015 saved successfully.
Vocabulary Size: 21853
PPMI data for 2009 saved successfully.
Vocabulary Size: 21810
PPMI dat