# Creation of Sparse PPMI Embeddings

In [1]:
import glob
import pickle
import pandas as pd
from pathlib import Path
import util as tppmi_util
from pprint import pprint
from tqdm.notebook import tqdm
from ppmi_model import PPMIModel

import nltk
nltk.download('punkt');

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/paulschmitt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Setup

In [2]:
DIR = Path("../../data")
size = "medium"
split_type = "monthly"
min_freq = 5 # large = 2, medium = 5, short = 500
window_size = 5 # default word2vec window-size
number_of_context_words = 500
months = ["Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "Jan", "Feb", "Mar", "Apr"]
month_codes = {"Jun": "06", "Jul": "07", "Aug": "08", "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12", "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04"}

## Build Corpus

 /1. Texts of the entire dataset split into sentences

In [3]:
df = pd.read_csv(DIR / "processed_data.csv")
df = df[["text"]]

In [4]:
df = df.dropna()

In [5]:
corpus = ' '.join(df["text"])

In [10]:
context_words = tppmi_util.get_most_common_words(corpus, top_n=number_of_context_words, remove_stopwords=False)

In [11]:
pprint(context_words, compact=True, width=100)

['the', 'to', 'and', 'of', 'a', 'in', 'for', 'that', 'school', 'is', 'on', 'i', 'with', 'are', 'it',
 'as', 'have', 'at', 'this', 'be', 'you', 'by', 'not', 'was', 'they', 'from', 'students', 'their',
 'board', 'or', 'schools', 'but', 'an', 'we', 'has', 'said', 'who', 'about', 'its', 'all', 'will',
 'my', 'he', 'if', 'more', 'were', 'people', 'our', 'so', 'what', 'race', 'can', 'parents',
 'education', 'one', 'your', 'would', 'when', 'out', 'also', 'been', 'his', 'no', 'crt', 'children',
 'up', 'do', 'them', 'new', 'like', 'which', 'just', 'she', 'public', 'teachers', 'had', 'her',
 'covid', 'how', 'kids', 'other', 'after', 'some', 'there', 'year', 'state', 'because', 'get',
 'dont', 'covid19', 'district', 'being', 'time', 'student', 'over', 'trans', 'than', 'us', 'now',
 'me', 'rights', 'years', 'these', 'into', 'only', 'many', 'should', 'first', 'critical', 'high',
 'any', 'theory', 'even', 'health', 'those', 'during', 'post', 'teacher', 'know', 'make', 'most',
 'work', 'against', 'tw

/2. Method for reading posts of specified months of the dataset

In [9]:
input_dir = DIR / f"split/{split_type}"
csv_files = glob.glob(str(input_dir / "*.csv"))

# filter for starting-months
csv_files = [filename for filename in csv_files if filename.split("_to_")[0][-3:] in months]

# use dict to be able to identify the dfs later on
splits = {}

In [10]:
for filename in csv_files:
    df = pd.read_csv(filename)
    splits[filename.split("_to_")[0][-3:]] = df

## Build Model

In [11]:
ppmi_models = {key: PPMIModel.construct_from_texts(value, context_words=context_words, min_freq=min_freq) for key, value in tqdm(splits.items())}

  0%|          | 0/11 [00:00<?, ?it/s]

In [12]:
ppmi_model = ppmi_models[next(iter(ppmi_models))]

In [13]:
ppmi_model.compute_ppmi_matrix()

array([[1.93726214e-01, 0.00000000e+00, 2.63862636e-03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.05091684e-01, 0.00000000e+00, ...,
        1.67940210e+00, 1.67940210e+00, 1.67940210e+00],
       [9.18289548e-01, 0.00000000e+00, 1.11161366e+00, ...,
        2.84119326e+00, 2.84119326e+00, 2.84119326e+00],
       ...,
       [2.03636184e-01, 0.00000000e+00, 1.09278223e-01, ...,
        2.12653990e+00, 2.12653990e+00, 2.12653990e+00],
       [0.00000000e+00, 5.41647357e-01, 0.00000000e+00, ...,
        2.62678340e+00, 2.62678340e+00, 2.62678340e+00],
       [0.00000000e+00, 0.00000000e+00, 1.45204632e+00, ...,
        2.17002501e+00, 2.17002501e+00, 2.17002501e+00]])

In [14]:
ppmi_model.ppmi_matrix.shape

(12243, 20000)

In [15]:
ppmi_models.keys()

dict_keys(['Mar', 'Jan', 'Feb', 'Jun', 'Sep', 'Apr', 'Oct', 'Aug', 'Dec', 'Nov', 'Jul'])

Quick Check

In [16]:
test = ppmi_models["Sep"]
#test.get_as_df().head(10)

In [17]:
test.get_shape() # vocab x context_words

(10444, 20000)

## Vocab

In [18]:
for key, ppmi_model in ppmi_models.items():
    print(f"Vocabulary size of timestep {key}: {ppmi_model.get_vocabulary_size()}")

Vocabulary size of timestep Mar: 12243
Vocabulary size of timestep Jan: 11687
Vocabulary size of timestep Feb: 10446
Vocabulary size of timestep Jun: 9760
Vocabulary size of timestep Sep: 10444
Vocabulary size of timestep Apr: 10208
Vocabulary size of timestep Oct: 13448
Vocabulary size of timestep Aug: 11428
Vocabulary size of timestep Dec: 11992
Vocabulary size of timestep Nov: 13232
Vocabulary size of timestep Jul: 10295


## Compute PPMI Matrix

### Calulate & Save

In [19]:
output_dir = DIR / f"ppmi-matrices/split/{number_of_context_words}"
output_dir.mkdir(parents=True, exist_ok=True)

# save common context-words
print(f"Number of context-words: {len(context_words)}")
with open(output_dir / f"context-words.pkl", "wb") as f:
    pickle.dump(context_words, f)

# save vocab and ppmi-matrices
for key, ppmi_model in ppmi_models.items():
    ppmi_model.compute_ppmi_matrix(window_size=window_size)
    ppmi_model.save(month_codes[key], output_dir)
print("--------------- done ---------------")

Number of context-words: 20000
PPMI data for 03 saved successfully.
Vocabulary Size: 12243
PPMI data for 01 saved successfully.
Vocabulary Size: 11687
PPMI data for 02 saved successfully.
Vocabulary Size: 10446
PPMI data for 06 saved successfully.
Vocabulary Size: 9760
PPMI data for 09 saved successfully.
Vocabulary Size: 10444
PPMI data for 04 saved successfully.
Vocabulary Size: 10208
PPMI data for 10 saved successfully.
Vocabulary Size: 13448
PPMI data for 08 saved successfully.
Vocabulary Size: 11428
PPMI data for 12 saved successfully.
Vocabulary Size: 11992
PPMI data for 11 saved successfully.
Vocabulary Size: 13232
PPMI data for 07 saved successfully.
Vocabulary Size: 10295
--------------- done ---------------


 ### Control

In [20]:
'''vocab_all = set()

for _, ppmi_model in ppmi_models.items():
    vocab_all.update(ppmi_model.vocab)''';

In [21]:
# len(vocab_all)

In [22]:
'''manual_dir = Path("../../data/ppmi-matrices/old_quarter")
with open(manual_dir / f"context-words.pkl", "wb") as f:
    pickle.dump(list(vocab_all), f)''';

## Save

In [23]:
''''# Add quotechars to row and column names
ppmi_matrix_df.index = '"' + ppmi_matrix_df.index + '"'
ppmi_matrix_df.columns = '"' + ppmi_matrix_df.columns + '"''';

In [24]:
'''filename = f"ppmi-2022-{month_code}-01.txt"
ppmi_path = DIR / "ppmi-matrices"

ppmi_path.mkdir(parents=True, exist_ok=True)
#ppmi_matrix_df.to_csv(ppmi_path / filename, sep=" ", index=True, quoting=3)''';

In [25]:
'''print(f"{month}({month_code}) - done")
   print(f"Vocabulary-Size: {ppmi_model.get_vocabulary_size()}")''';