# Creation of Sparse PPMI Embeddings

In [1]:
import glob
import pickle
import pandas as pd
from pathlib import Path
from pprint import pprint
from tqdm.notebook import tqdm
from ppmi_model import PPMIModel
import util as tppmi_util

import nltk
nltk.download('punkt');

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/paulschmitt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Setup

In [66]:
DIR = Path("../../data")
split_type = "monthly"
min_freq = 5
window_size = 5 # default word2vec window-size
number_of_context_words = 5000
months = ["Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "Jan", "Feb", "Mar", "Apr"]
month_codes = {"Jun": "06", "Jul": "07", "Aug": "08", "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12", "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04"}

## Build Corpus

 /1. Texts of the entire dataset split into sentences

In [67]:
df = pd.read_csv(DIR / "processed_data.csv")
df = df[["text"]]

In [68]:
df = df.dropna()

In [69]:
corpus = ' '.join(df["text"])

In [70]:
top_n = max(2 * number_of_context_words, 2000)
context_words = tppmi_util.sample_from_most_common_words(corpus, top_n=top_n,
                                                         sample_size=number_of_context_words, remove_stopwords=False)

In [71]:
pprint(context_words, compact=True, width=100)

['persistent', 'charities', 'shameful', 'apr', 'flash', 'womens', 'misdemeanor', 'university',
 'questioning', 'ceremonies', 'doctoral', 'demanded', 'anywhere', 'paycheck', 'edit', 'attributed',
 'disclosure', 'momlife', 'creates', 'transgenderism', 'bills', 'treating', 'bradley',
 'exceptional', 'fraud', 'ellis', 'errors', 'trading', 'lcps', 'business', 'reconciliation',
 'severe', 'restrictive', 'minors', 'bullshit', 'bathroom', 'coal', 'blind', 'as', '101',
 'kitchener', 'la', 'asserted', 'litter', 'isd', 'beacon', 'merit', 'industrial', 'distress',
 'destroys', 'rosa', 'robbed', 'expecting', 'contributing', 'tho', 'choices', 'lawful', 'playbook',
 'followers', 'prisons', 'sufficient', 'indianapolis', 'fundraising', 'watched', 'remedy',
 'apparent', 'communicating', 'webster', 'suppression', 'pfizers', 'celebrations', 'done', 'boring',
 'very', 'swiftly', 'purposes', 'confirms', 'beto', 'entertainment', 'whistleblower', 'debunked',
 'suzanne', 'candy', 'recall', 'measure', '24', 'me

/2. Method for reading posts of specified months of the dataset

In [72]:
input_dir = DIR / f"split/{split_type}"
csv_files = glob.glob(str(input_dir / "*.csv"))

# filter for starting-months
csv_files = [filename for filename in csv_files if filename.split("_to_")[0][-3:] in months]

# use dict to be able to identify the dfs later on
splits = {}

In [73]:
for filename in csv_files:
    df = pd.read_csv(filename)
    splits[filename.split("_to_")[0][-3:]] = df

## Build Model

In [74]:
ppmi_models = {key: PPMIModel.construct_from_texts(value, context_words=context_words, min_freq=min_freq) for key, value in tqdm(splits.items())}

  0%|          | 0/11 [00:00<?, ?it/s]

### Control

In [75]:
ppmi_model = ppmi_models[next(iter(ppmi_models))]

In [76]:
ppmi_model.compute_ppmi_matrix()

array([[0.        , 0.        , 0.        , ..., 0.        , 2.05667708,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.8169862 ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 2.21386267,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 1.10116564,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 2.37513081,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.43903746,
        0.        ]])

In [77]:
ppmi_model.ppmi_matrix.shape

(12243, 5000)

In [78]:
ppmi_models.keys()

dict_keys(['Mar', 'Jan', 'Feb', 'Jun', 'Sep', 'Apr', 'Oct', 'Aug', 'Dec', 'Nov', 'Jul'])

## Vocab

In [79]:
for key, ppmi_model in ppmi_models.items():
    print(f"Vocabulary size of timestep {key}: {ppmi_model.get_vocabulary_size()}")

Vocabulary size of timestep Mar: 12243
Vocabulary size of timestep Jan: 11687
Vocabulary size of timestep Feb: 10446
Vocabulary size of timestep Jun: 9760
Vocabulary size of timestep Sep: 10444
Vocabulary size of timestep Apr: 10208
Vocabulary size of timestep Oct: 13448
Vocabulary size of timestep Aug: 11428
Vocabulary size of timestep Dec: 11992
Vocabulary size of timestep Nov: 13232
Vocabulary size of timestep Jul: 10295


## Calulate & Save

In [80]:
output_dir = DIR / f"ppmi-matrices/{split_type}/{number_of_context_words}"
output_dir.mkdir(parents=True, exist_ok=True)

# save common context-words
with open(output_dir / f"context-words.pkl", "wb") as f:
    pickle.dump(context_words, f)

# save vocab and ppmi-matrices
for key, ppmi_model in ppmi_models.items():
    ppmi_model.compute_ppmi_matrix(window_size=window_size)
    ppmi_model.save(month_codes[key], output_dir)

print("--------------- done ---------------")
print(f"PPMI-Data stored to: {str(output_dir)}")
print(f"Number of context-words: {len(context_words)}")

PPMI data for 03 saved successfully.
Vocabulary Size: 12243
PPMI data for 01 saved successfully.
Vocabulary Size: 11687
PPMI data for 02 saved successfully.
Vocabulary Size: 10446
PPMI data for 06 saved successfully.
Vocabulary Size: 9760
PPMI data for 09 saved successfully.
Vocabulary Size: 10444
PPMI data for 04 saved successfully.
Vocabulary Size: 10208
PPMI data for 10 saved successfully.
Vocabulary Size: 13448
PPMI data for 08 saved successfully.
Vocabulary Size: 11428
PPMI data for 12 saved successfully.
Vocabulary Size: 11992
PPMI data for 11 saved successfully.
Vocabulary Size: 13232
PPMI data for 07 saved successfully.
Vocabulary Size: 10295
--------------- done ---------------
PPMI-Data stored to: ../../data/ppmi-matrices/monthly/5000
Number of context-words: 5000
