# PPMI

In [1]:
import glob
import pandas as pd
from pathlib import Path
from ppmi_model import PPMIModel

import nltk
# Fetch the NLTK "punkt" package; presumably needed for tokenization inside
# PPMIModel — TODO confirm. The trailing ";" hides the call's return value
# from the cell output.
nltk.download('punkt');

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/paulschmitt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Setup

In [6]:
# Relative path to the data directory.
DIR = Path("../../data")

# Dataset variant and the kind of time split to load from DIR / "split".
size = "medium"
split_type = "quarter"

# min_freq handed to PPMIModel.construct_from_texts.
# Values used per dataset size: large = 2, medium = 5, short = 500
min_freq = 5

# default word2vec window-size
window_size = 5

# Three-letter month abbreviation -> two-digit month number (used when saving).
month_codes = {
    "Jun": "06",
    "Jul": "07",
    "Aug": "08",
    "Sep": "09",
    "Oct": "10",
    "Nov": "11",
    "Dec": "12",
    "Jan": "01",
    "Feb": "02",
    "Mar": "03",
    "Apr": "04",
}

# Month abbreviations accepted as the start of a split, in the order listed above.
months = list(month_codes)

## Build Corpus

Method 1: Read the entire dataset

In [5]:
# Load the full processed dataset, keeping only its "text" column.
# `usecols` makes pandas skip every other column while parsing, instead of
# reading the whole file and discarding the unused columns afterwards.
df = pd.read_csv(DIR / "processed_data.csv", usecols=["text"])

Method 2: Read only the posts from the specified months of the dataset

In [9]:
# Directory holding one CSV per time split of the chosen split type.
input_dir = DIR / f"split/{split_type}"

# filter for starting-months: the three characters right before "_to_" in the
# file name (the name only, so a "_to_" in a directory name cannot interfere)
csv_files = [
    filename
    for filename in glob.glob(str(input_dir / "*.csv"))
    if Path(filename).name.split("_to_")[0][-3:] in months
]

# glob returns matches in arbitrary order; sort by the position of the start
# month in `months` (ties broken by path) so every run processes the splits
# in the same order
csv_files.sort(
    key=lambda filename: (months.index(Path(filename).name.split("_to_")[0][-3:]), filename)
)

# use dict to be able to identify the dfs later on
splits = {}

In [10]:
# Read every selected split file and store its DataFrame under the three
# characters that precede "_to_" in the file path (the start month).
for filename in csv_files:
    month_key = filename.split("_to_")[0][-3:]
    df = pd.read_csv(filename)
    splits[month_key] = df

In [12]:
# Show which start months were loaded into `splits`.
splits.keys()

dict_keys(['Jun', 'Sep', 'Dec', 'Mar'])

## Build Model

In [13]:
# Build one model per loaded split, keyed by the same start-month key as `splits`.
ppmi_models = {
    month: PPMIModel.construct_from_texts(split_df, min_freq=min_freq)
    for month, split_df in splits.items()
}

In [14]:
# Show the keys of the built models; they mirror the keys of `splits`.
ppmi_models.keys()

dict_keys(['Jun', 'Sep', 'Dec', 'Mar'])

### Quick Check

In [15]:
# Spot-check the model built from the split keyed "Jun": display the first
# 10 rows of what get_as_df() returns. `test` is reused by the checks below.
test = ppmi_models["Jun"]
test.get_as_df().head(10)

Unnamed: 0,0,00,000,0001,001,002,003,005,007,008,...,zones,zong,zoning,zoo,zoom,zoomer,zoomers,zopiclone,zu,zubymusic
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Vocabulary size reported by the "Jun" model.
test.get_vocabulary_size()

20024

In [17]:
# Shape reported by the "Jun" model, for comparison with its vocabulary size.
test.get_shape()

(20024, 20024)

## Vocab

In [18]:
# Report the vocabulary size of every time split's model.
for key, ppmi_model in ppmi_models.items():
    vocabulary_size = ppmi_model.get_vocabulary_size()
    print(f"Vocabulary size of timestep {key}: {vocabulary_size}")

Vocabulary size of timestep Jun: 20024
Vocabulary size of timestep Sep: 23099
Vocabulary size of timestep Dec: 21632
Vocabulary size of timestep Mar: 16743


## Compute PPMI Matrix

### Calculate & Save

In [20]:
# Write the matrices into the sub-directory named after the configured split
# type, so the output location always matches the splits that were loaded
# (previously hard-coded to "quarter" regardless of `split_type`).
output_dir = DIR / "ppmi-matrices" / split_type

# Compute each model's PPMI matrix with the configured window size and save
# it under the two-digit month code of the split's start month.
for key, ppmi_model in ppmi_models.items():
    ppmi_model.compute_ppmi_matrix(window_size=window_size)
    ppmi_model.save(month_codes[key], output_dir)

PPMI data for 06 saved successfully.
Vocabulary Size: 20024
PPMI data for 09 saved successfully.
Vocabulary Size: 23099
PPMI data for 12 saved successfully.
Vocabulary Size: 21632
PPMI data for 03 saved successfully.
Vocabulary Size: 16743


### Control

In [30]:
#ppmi_model.save(month_code, DIR / "ppmi-matrices" / size)

In [31]:
# Disabled code kept as an inert string literal; the trailing ";" keeps the
# string out of the cell output.
# NOTE(review): `sp`, `pickle` and `month_code` are not imported/defined in the
# visible cells, so this would need them before it could be re-enabled. Only
# unpickle files this project wrote itself.
'''loaded_sparse_ppmi_matrix = sp.load_npz(f"ppmi-{month_code}-01.npz")
dense_ppmi_matrix = loaded_sparse_ppmi_matrix.toarray()

with open(f"ppmi-{month_code}-01.pkl", "rb") as f:
    index_to_word = pickle.load(f)''';

In [32]:
# ppmi_matrix_df = ppmi_model.get_as_df()

## Save

In [33]:
# Disabled code kept as an inert string literal; the trailing ";" keeps the
# string out of the cell output.
# NOTE(review): `ppmi_matrix_df` is only assigned in commented-out code in the
# visible cells, and the last line inside the string lacks its closing quote.
''''# Add quotechars to row and column names
ppmi_matrix_df.index = '"' + ppmi_matrix_df.index + '"'
ppmi_matrix_df.columns = '"' + ppmi_matrix_df.columns + '"''';

In [34]:
# Disabled code kept as an inert string literal; the trailing ";" keeps the
# string out of the cell output.
# NOTE(review): `month_code` is not defined in the visible cells (only the
# `month_codes` dict is), and the to_csv call inside is itself commented out.
'''filename = f"ppmi-2022-{month_code}-01.txt"
ppmi_path = DIR / "ppmi-matrices"

ppmi_path.mkdir(parents=True, exist_ok=True)
#ppmi_matrix_df.to_csv(ppmi_path / filename, sep=" ", index=True, quoting=3)''';

In [35]:
# Disabled progress output kept as an inert string literal; the trailing ";"
# keeps the string out of the cell output.
# NOTE(review): `month` and `month_code` are not defined in the visible cells.
'''print(f"{month}({month_code}) - done")
   print(f"Vocabulary-Size: {ppmi_model.get_vocabulary_size()}")''';