# Creation of Sparse PPMI Embeddings for the NYT Dataset

In [31]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
import glob
import pickle
import pandas as pd
from pathlib import Path
from pprint import pprint
from tqdm.notebook import tqdm

import sys
sys.path.append('../../')

from src.packages.TPPMI.ppmi_model import PPMIModel
from src.packages.TPPMI.tppmi_model import TPPMIModel
import src.packages.TPPMI.util as tppmi_util

import nltk
# nltk.download('punkt');

## Setup

In [33]:
# Relative location of the project's data folder.
DATA_DIR = Path("../../data")
# Folder holding the preprocessed NYT corpus files read below.
input_dir = DATA_DIR.joinpath("processed", "nyt-data")

# infrequent words have been removed in preprocessing
min_freq = 5
# default word2vec window-size
window_size = 5
# number of context words to sample
number_of_context_words = 8000

## Sample Context Words

In [34]:
# Only the "text" column is used, so let the parser skip every other column
# instead of loading the full file and discarding the rest afterwards.
df = pd.read_csv(input_dir / "corpus_all.csv", usecols=["text"])

In [35]:
# Discard every row that contains a missing value.
df = df.dropna(axis=0, how="any")

In [36]:
# Concatenate all texts into one space-separated string.
corpus = " ".join(df["text"].tolist())

In [37]:
# top_n is at least twice the requested sample size and never below 12000.
top_n = max(12000, number_of_context_words * 2)
# NOTE(review): no seed is passed here — confirm whether the sampling is
# reproducible across runs.
context_words = tppmi_util.sample_from_most_common_words(
    corpus,
    top_n=top_n,
    sample_size=number_of_context_words,
    remove_stopwords=False,
)

In [38]:
# Preview only the first 100 sampled words; printing the whole list floods
# the notebook output.
pprint(context_words[:100], compact=True, width=100)

['enthusiastic', 'garland', 'espresso', 'jacobson', 'storm', 'excuses', 'cruising', 'everybodys',
 'ozone', 'consortium', 'supports', 'candidate', '1912', 'documented', 'amnesty', 'voices',
 'sergei', 'battling', 'patches', 'issued', '8', 'theodore', 'particular', 'easily', 'spite',
 'salvador', 'accessibility', 'jaws', 'subpoenas', 'larry', 'insulting', 'households', 'threerun',
 'creature', 'paved', 'batter', 'incarceration', 'tournaments', 'recognized', 'paragraph', 'except',
 'parallel', 'objects', 'underwriting', 'brown', 'mckay', 'spilled', 'favored', 'wineries',
 'texture', 'counselor', 'larger', 'dolan', 'ireland', 'club', 'sandra', 'canon', 'briefing',
 'brilliantly', 'withdrawal', 'picasso', 'rodham', 'speeches', 'atomic', 'explosion', '18thcentury',
 'amid', 'guilt', 'mellow', 'administrator', 'alienated', 'textured', 'suing', 'precious', 'cloth',
 'packaged', 'mentally', 'rooms', 'depths', 'tested', 'entree', 'jolt', 'columbus', 'spate',
 'driven', 'remembered', 'parkway', 

## Build Model

In [39]:
# glob returns matches in arbitrary order; sort so the splits are always
# loaded (and later processed) in the same file-name order.
csv_files = sorted(glob.glob(str(input_dir / "*data.csv")))

# use dict to be able to identify the dfs later on
splits = {}

In [40]:
for filename in csv_files:
    # Key each split by the four characters preceding "_data" in the file
    # name (not the full path, so directory names cannot affect the key).
    split_key = Path(filename).name.split("_data")[0][-4:]
    # Store the frame directly so the corpus frame `df` used by the earlier
    # cells is not overwritten.
    splits[split_key] = pd.read_csv(filename)

In [None]:
# Build one PPMI model per split, all using the same sampled context words.
ppmi_models = {
    timestep: PPMIModel.construct_from_texts(
        split_df,
        context_words=context_words,
        min_freq=min_freq,
    )
    for timestep, split_df in tqdm(splits.items())
}

  0%|          | 0/27 [00:00<?, ?it/s]

## Control

In [None]:
# Sanity check: show the keys for which a PPMI model was built.
ppmi_models.keys()

## Vocabulary

In [None]:
# Report the vocabulary size of every per-timestep model.
for timestep, model in ppmi_models.items():
    vocab_size = model.get_vocabulary_size()
    print(f"Vocabulary size of timestep {timestep}: {vocab_size}")

## Calculate & Save

In [None]:
# Bundle the per-split PPMI models into a single TPPMIModel.
# NOTE(review): dates="years" presumably declares the keys to be years —
# confirm against TPPMIModel.
tppmi_model = TPPMIModel(ppmi_models, dates="years")

In [None]:
# Results are grouped in a folder named after the number of context words.
output_dir = DATA_DIR.joinpath("ppmi-matrices", "nyt-data", str(number_of_context_words))
output_dir.mkdir(parents=True, exist_ok=True)

# save common context-words
with open(output_dir / "context-words.pkl", "wb") as handle:
    pickle.dump(context_words, handle)

# compute each model's PPMI matrix, then save vocab and ppmi-matrices
for timestep, model in tqdm(tppmi_model.ppmi_models.items()):
    model.compute_ppmi_matrix(window_size=window_size, normalize=True)
    model.save(timestep, output_dir)
    print("----------------------")

print("--------------- done ---------------")
print(f"PPMI-Data stored to: {output_dir}")
print(f"Number of context-words: {len(context_words)}")