# PPMI

In [18]:
import glob
import pandas as pd
from pathlib import Path
from ppmi_model import PPMIModel

import nltk
nltk.download('punkt');

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/paulschmitt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Setup

In [19]:
DIR = Path("../../data")
size = "short"
min_freq = 500 # large = 2, medium = 5, short = 500
window_size = 5 # default word2vec window-size
months = ["Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "Jan", "Feb", "Mar", "Apr"]
month_codes = {"Jun": "06", "Jul": "07", "Aug": "08", "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12", "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04"}

## Build Corpus

 /1. Method for reading the entire dataset

In [20]:
df = pd.read_csv(DIR / "processed_data.csv")
df = df[["text"]]

/2. Method for reading posts of specified months of the dataset

In [21]:
input_dir = DIR / "split/monthly"
csv_files = glob.glob(str(input_dir / "*.csv"))

# filter for starting-months
csv_files = [filename for filename in csv_files if filename.split("_to_")[0][-3:] in months]

# use dict to be able to identify the dfs later on
monthly_splits = {}

In [22]:
for filename in csv_files:
    df = pd.read_csv(filename)
    monthly_splits[filename.split("_to_")[0][-3:]] = df#[["text"]]

## Build Model

In [23]:
ppmi_models = {key: PPMIModel.construct_from_texts(value, min_freq=min_freq) for key, value in monthly_splits.items()}

In [24]:
ppmi_models.keys()

dict_keys(['Mar', 'Jan', 'Feb', 'Jun', 'Sep', 'Apr', 'Oct', 'Aug', 'Dec', 'Nov', 'Jul'])

In [25]:
test = ppmi_models["Mar"]
test.get_as_df().head(10)

Unnamed: 0,2021,2023,a,about,according,across,after,against,all,also,...,why,will,with,woke,work,would,year,years,you,your
2021,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
a,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
about,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
according,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
across,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
after,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
against,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
all,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
also,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
test.get_vocabulary_size()

238

In [27]:
test.get_shape()

(238, 238)

## Vocab

In [28]:
for key, ppmi_model in ppmi_models.items():
    print(f"Vocabulary size of timestep {key}: {ppmi_model.get_vocabulary_size()}")

Vocabulary size of timestep Mar: 238
Vocabulary size of timestep Jan: 188
Vocabulary size of timestep Feb: 179
Vocabulary size of timestep Jun: 155
Vocabulary size of timestep Sep: 178
Vocabulary size of timestep Apr: 169
Vocabulary size of timestep Oct: 268
Vocabulary size of timestep Aug: 208
Vocabulary size of timestep Dec: 201
Vocabulary size of timestep Nov: 256
Vocabulary size of timestep Jul: 168


## Compute PMI Matrix

### Calulate & Save

In [29]:
for key, ppmi_model in ppmi_models.items():
    ppmi_model.compute_ppmi_matrix(window_size=window_size)
    ppmi_model.save(month_codes[key], DIR / "ppmi-matrices" / size)

PPMI data for 03 saved successfully.
Vocabulary Size: 238
PPMI data for 01 saved successfully.
Vocabulary Size: 188
PPMI data for 02 saved successfully.
Vocabulary Size: 179
PPMI data for 06 saved successfully.
Vocabulary Size: 155
PPMI data for 09 saved successfully.
Vocabulary Size: 178
PPMI data for 04 saved successfully.
Vocabulary Size: 169
PPMI data for 10 saved successfully.
Vocabulary Size: 268
PPMI data for 08 saved successfully.
Vocabulary Size: 208
PPMI data for 12 saved successfully.
Vocabulary Size: 201
PPMI data for 11 saved successfully.
Vocabulary Size: 256
PPMI data for 07 saved successfully.
Vocabulary Size: 168


### Control

In [30]:
#ppmi_model.save(month_code, DIR / "ppmi-matrices" / size)

In [31]:
'''loaded_sparse_ppmi_matrix = sp.load_npz(f"ppmi-{month_code}-01.npz")
dense_ppmi_matrix = loaded_sparse_ppmi_matrix.toarray()

with open(f"ppmi-{month_code}-01.pkl", "rb") as f:
    index_to_word = pickle.load(f)''';

In [32]:
# ppmi_matrix_df = ppmi_model.get_as_df()

## Save

In [33]:
''''# Add quotechars to row and column names
ppmi_matrix_df.index = '"' + ppmi_matrix_df.index + '"'
ppmi_matrix_df.columns = '"' + ppmi_matrix_df.columns + '"''';

In [34]:
'''filename = f"ppmi-2022-{month_code}-01.txt"
ppmi_path = DIR / "ppmi-matrices"

ppmi_path.mkdir(parents=True, exist_ok=True)
#ppmi_matrix_df.to_csv(ppmi_path / filename, sep=" ", index=True, quoting=3)''';

In [35]:
'''print(f"{month}({month_code}) - done")
   print(f"Vocabulary-Size: {ppmi_model.get_vocabulary_size()}")''';