# PPMI

In [127]:
import glob
import pickle
import pandas as pd
from pathlib import Path
import util as tppmi_util
from pprint import pprint
from tqdm.notebook import tqdm
from ppmi_model import PPMIModel

import nltk
# Fetch the NLTK 'punkt' package (reported as already up to date in the recorded run).
# The trailing ';' suppresses the cell's return-value output.
# NOTE(review): presumably required by the tokenization done in `util` / `ppmi_model` — confirm.
nltk.download('punkt');

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/paulschmitt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Setup

In [128]:
# Root of the data directory, relative to this notebook.
DIR = Path("../../data")

# Which dataset variant is processed and how it is split into timesteps.
size = "medium"
split_type = "quarter"

# Model parameters.
min_freq = 5  # large = 2, medium = 5, short = 500
window_size = 5  # default word2vec window-size
number_of_context_words = 5000

# Three-letter month abbreviation -> two-digit month code, listed in the
# order Jun .. Apr (there is no entry for May).
month_codes = {
    "Jun": "06",
    "Jul": "07",
    "Aug": "08",
    "Sep": "09",
    "Oct": "10",
    "Nov": "11",
    "Dec": "12",
    "Jan": "01",
    "Feb": "02",
    "Mar": "03",
    "Apr": "04",
}

# Month abbreviations in the same order as the keys of `month_codes`.
months = list(month_codes)

## Build Corpus

1. Texts of the entire dataset split into sentences

In [129]:
# Read the processed dataset and keep only its "text" column (still as a DataFrame).
df = pd.read_csv(DIR / "processed_data.csv")[["text"]]

In [130]:
# Discard every row that contains a missing value (the pandas defaults, spelled out).
df = df.dropna(axis=0, how="any")

In [131]:
# Concatenate all texts into one space-separated string.
corpus = " ".join(df["text"].tolist())

In [132]:
# Select the shared context words from the full corpus; stopwords are kept
# (remove_stopwords=False) and the list is capped at `number_of_context_words`.
context_words = tppmi_util.get_most_common_words(
    corpus,
    remove_stopwords=False,
    top_n=number_of_context_words,
)

In [133]:
# Pretty-print the selected context words, packed into lines of at most 100 characters.
# NOTE(review): this prints the complete list — consider slicing (e.g. the first 100 entries) to keep the output short.
pprint(context_words, compact=True, width=100)

['the', 'to', 'and', 'of', 'a', 'in', 'for', 'that', 'school', 'is', 'on', 'i', 'with', 'are', 'it',
 'as', 'have', 'at', 'this', 'be', 'you', 'by', 'not', 'was', 'they', 'from', 'students', 'their',
 'board', 'or', 'schools', 'but', 'an', 'we', 'has', 'said', 'who', 'about', 'its', 'all', 'will',
 'my', 'he', 'if', 'more', 'were', 'people', 'our', 'so', 'what', 'race', 'can', 'parents',
 'education', 'one', 'your', 'would', 'when', 'out', 'also', 'been', 'his', 'no', 'crt', 'children',
 'up', 'do', 'them', 'new', 'like', 'which', 'just', 'she', 'public', 'teachers', 'had', 'her',
 'covid', 'how', 'kids', 'other', 'after', 'some', 'there', 'year', 'state', 'because', 'get',
 'dont', 'covid19', 'district', 'being', 'time', 'student', 'over', 'trans', 'than', 'us', 'now',
 'me', 'rights', 'years', 'these', 'into', 'only', 'many', 'should', 'first', 'critical', 'high',
 'any', 'theory', 'even', 'health', 'those', 'during', 'post', 'teacher', 'know', 'make', 'most',
 'work', 'against', 'tw

2. Method for reading posts of specified months of the dataset

In [134]:
# Directory holding one CSV file per timestep for the configured split type.
input_dir = DIR / f"split/{split_type}"
csv_files = glob.glob(str(input_dir / "*.csv"))

# filter for starting-months
# The starting month is read as the three characters in front of "_to_" in the
# file name (the directory part is ignored, so a "_to_" in a folder name
# cannot confuse the lookup).
# NOTE(review): presumably files are named like "..Jun_to_Aug.csv" — confirm.
csv_files = [filename for filename in csv_files if Path(filename).name.split("_to_")[0][-3:] in months]

# glob returns matches in arbitrary, file-system dependent order. Sort the
# files chronologically (by position in `months`) so that the order of the
# dicts built from this list, and therefore the "first" model, is reproducible.
csv_files.sort(key=lambda filename: months.index(Path(filename).name.split("_to_")[0][-3:]))

# use dict to be able to identify the dfs later on
splits = {}

In [135]:
# Load every selected split and store it under its starting month
# (the three characters in front of "_to_" in the path).
for filename in csv_files:
    start_month = filename.split("_to_")[0][-3:]
    # Assign directly instead of going through `df`: that name still refers to
    # the full-dataset frame used to build the corpus, and overwriting it would
    # make a re-run of the corpus cells silently operate on the last split.
    splits[start_month] = pd.read_csv(filename)

## Build Model

In [136]:
# Construct one PPMI model per split, keyed like `splits`; every model receives
# the same context words and minimum frequency. tqdm shows the progress.
ppmi_models = {
    month: PPMIModel.construct_from_texts(
        split_df,
        context_words=context_words,
        min_freq=min_freq,
    )
    for month, split_df in tqdm(splits.items())
}

  0%|          | 0/4 [00:00<?, ?it/s]

In [137]:
# Take the first model (in dict insertion order) for inspection.
ppmi_model = next(iter(ppmi_models.values()))

In [138]:
# Display the inspected model's mapping from context word to integer index.
# NOTE(review): presumably the index is the word's column in the PPMI matrix — confirm.
ppmi_model.context_word2ind

{'the': 0,
 'to': 1,
 'and': 2,
 'of': 3,
 'a': 4,
 'in': 5,
 'for': 6,
 'that': 7,
 'school': 8,
 'is': 9,
 'on': 10,
 'i': 11,
 'with': 12,
 'are': 13,
 'it': 14,
 'as': 15,
 'have': 16,
 'at': 17,
 'this': 18,
 'be': 19,
 'you': 20,
 'by': 21,
 'not': 22,
 'was': 23,
 'they': 24,
 'from': 25,
 'students': 26,
 'their': 27,
 'board': 28,
 'or': 29,
 'schools': 30,
 'but': 31,
 'an': 32,
 'we': 33,
 'has': 34,
 'said': 35,
 'who': 36,
 'about': 37,
 'its': 38,
 'all': 39,
 'will': 40,
 'my': 41,
 'he': 42,
 'if': 43,
 'more': 44,
 'were': 45,
 'people': 46,
 'our': 47,
 'so': 48,
 'what': 49,
 'race': 50,
 'can': 51,
 'parents': 52,
 'education': 53,
 'one': 54,
 'your': 55,
 'would': 56,
 'when': 57,
 'out': 58,
 'also': 59,
 'been': 60,
 'his': 61,
 'no': 62,
 'crt': 63,
 'children': 64,
 'up': 65,
 'do': 66,
 'them': 67,
 'new': 68,
 'like': 69,
 'which': 70,
 'just': 71,
 'she': 72,
 'public': 73,
 'teachers': 74,
 'had': 75,
 'her': 76,
 'covid': 77,
 'how': 78,
 'kids': 79,
 'ot

In [139]:
# Display the inspected model's PPMI matrix as it is before compute_ppmi_matrix()
# is called in the next cell (the recorded output shows only zeros).
ppmi_model.ppmi_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [140]:
# Compute the PPMI matrix of the inspected model and display the returned array.
# NOTE(review): no window_size is passed here, whereas the save cell calls
# compute_ppmi_matrix(window_size=window_size) — confirm the default matches.
ppmi_model.compute_ppmi_matrix()

  prob_cols_given_row = (co_matrix.T / center_counts).T
  ratio = prob_cols_given_row / prob_of_cols


array([[0.        , 0.        , 1.29527448, ..., 0.        , 0.        ,
        0.        ],
       [0.28670733, 0.        , 0.10742906, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 1.49426345, 0.49170348, ..., 0.        , 0.        ,
        0.        ],
       [0.24073661, 0.99348816, 0.3963933 , ..., 0.        , 0.        ,
        0.        ],
       [0.40503966, 0.        , 1.07152197, ..., 0.        , 0.        ,
        0.        ]])

In [141]:
# List the keys (starting months) for which a model was built.
ppmi_models.keys()

dict_keys(['Jun', 'Sep', 'Dec', 'Mar'])

Quick Check

In [142]:
# Pick the model stored under the key "Sep" for a spot check.
test = ppmi_models["Sep"]
# Disabled: preview of the first 10 rows of the model as a DataFrame.
#test.get_as_df().head(10)

In [143]:
# Display the shape reported by the spot-checked model.
test.get_shape() # vocab x context_words

(23099, 5000)

## Vocab

In [144]:
# Report the vocabulary size of every timestep.
# The loop variables are deliberately not called `ppmi_model`: that name holds
# the model selected for inspection, and rebinding it here would silently
# switch the inspected model to the last timestep.
for month, model in ppmi_models.items():
    print(f"Vocabulary size of timestep {month}: {model.get_vocabulary_size()}")

Vocabulary size of timestep Jun: 20024
Vocabulary size of timestep Sep: 23099
Vocabulary size of timestep Dec: 21632
Vocabulary size of timestep Mar: 16743


## Compute PPMI Matrix

### Calculate & Save

In [145]:
# All artefacts of this run are written to one folder, named after the split
# type and the number of context words. `split_type` is used instead of a
# hard-coded "quarter" so the output always matches the input read from
# split/{split_type}.
output_dir = DIR / f"ppmi-matrices/{split_type}/{len(context_words)}"
# Create the folder up front; opening a file for writing below fails with
# FileNotFoundError if the directory does not exist yet.
output_dir.mkdir(parents=True, exist_ok=True)

# save common context-words
print(f"Number of context-words: {len(context_words)}")
with open(output_dir / "context-words.pkl", "wb") as f:
    pickle.dump(context_words, f)

# save vocab and ppmi-matrices
# Loop variables are not named `ppmi_model` so that the model selected for
# inspection earlier in the notebook is not rebound by this cell.
for month, model in ppmi_models.items():
    # (Re)compute with the configured window size, then persist under the
    # two-digit month code of the split's starting month.
    model.compute_ppmi_matrix(window_size=window_size)
    model.save(month_codes[month], output_dir)
print("--------------- done ---------------")

Number of context-words: 5000
PPMI data for 06 saved successfully.
Vocabulary Size: 20024
PPMI data for 09 saved successfully.
Vocabulary Size: 23099
PPMI data for 12 saved successfully.
Vocabulary Size: 21632
PPMI data for 03 saved successfully.
Vocabulary Size: 16743
--------------- done ---------------


### Control

In [146]:
#ppmi_model.save(month_code, DIR / "ppmi-matrices" / size)

In [147]:
# Disabled control snippet: it is wrapped in a string literal, so nothing in it runs;
# the trailing ';' suppresses the cell output.
# NOTE(review): `sp` and `month_code` are not defined in the visible notebook —
# import/define them before re-enabling.
'''loaded_sparse_ppmi_matrix = sp.load_npz(f"ppmi-{month_code}-01.npz")
dense_ppmi_matrix = loaded_sparse_ppmi_matrix.toarray()

with open(f"ppmi-{month_code}-01.pkl", "rb") as f:
    index_to_word = pickle.load(f)''';

In [148]:
# ppmi_matrix_df = ppmi_model.get_as_df()

## Save

In [149]:
# Disabled snippet kept as a string literal (not executed); the trailing ';' hides the output.
# NOTE(review): the opening delimiter has four quotes, so the text starts with a stray "'" —
# fix it before re-enabling. `ppmi_matrix_df` is only assigned in a commented-out line above.
''''# Add quotechars to row and column names
ppmi_matrix_df.index = '"' + ppmi_matrix_df.index + '"'
ppmi_matrix_df.columns = '"' + ppmi_matrix_df.columns + '"''';

In [150]:
# Disabled snippet kept as a string literal (not executed); the trailing ';' hides the output.
# NOTE(review): `month_code` is not defined in the visible notebook — define it before re-enabling.
'''filename = f"ppmi-2022-{month_code}-01.txt"
ppmi_path = DIR / "ppmi-matrices"

ppmi_path.mkdir(parents=True, exist_ok=True)
#ppmi_matrix_df.to_csv(ppmi_path / filename, sep=" ", index=True, quoting=3)''';

In [151]:
# Disabled snippet kept as a string literal (not executed); the trailing ';' hides the output.
# NOTE(review): `month` and `month_code` are not defined in the visible notebook, and the
# second line is indented — adjust both before re-enabling.
'''print(f"{month}({month_code}) - done")
   print(f"Vocabulary-Size: {ppmi_model.get_vocabulary_size()}")''';