# PPMI

In [2]:
import glob
import pandas as pd
from pathlib import Path
from ppmi_model import PPMIModel

import nltk
# Make sure the NLTK "punkt" package is available locally (the call reports
# "already up-to-date" when it is). The trailing semicolon keeps the call's
# return value out of the cell output.
# NOTE(review): presumably needed for tokenization inside ppmi_model — confirm.
nltk.download('punkt');

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/paulschmitt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Setup

In [19]:
# Root of the data directory, relative to this notebook.
DIR = Path("../../data")

# Month to process, as the three-letter abbreviation used in the monthly split
# filenames ("<...>Apr_to_<...>.csv").
month = "Apr"

# Two-digit month number used in the output filename. It is derived from
# `month` so the two values cannot drift apart when `month` is edited.
MONTH_CODES = {
    "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04",
    "May": "05", "Jun": "06", "Jul": "07", "Aug": "08",
    "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12",
}
month_code = MONTH_CODES[month]

min_freq = 2  # large = 2, medium = 5, short = 100

# Start months whose monthly CSV files are loaded.
# Further candidates: "Sep", "Oct", "Nov", "Dec", "Jan", "Feb", "Mar", "Apr"
months = ["Jun", "Jul", "Aug"]

# The month being processed must be among the loaded ones; otherwise the
# later lookup `monthly_splits[month]` fails with a KeyError.
if month not in months:
    months.append(month)

## Build Corpus

**Method 1:** read the entire dataset

In [20]:
# Method 1: read the complete processed dataset and keep only the "text" column.
processed_csv = DIR / "processed_data.csv"
df = pd.read_csv(processed_csv)[["text"]]

**Method 2:** read only the posts of the specified months

In [21]:
# Mapping from start-month abbreviation to that month's DataFrame; it is
# filled later, and the keys let each frame be identified again.
monthly_splits = {}

# All monthly split files live in one directory.
input_dir = DIR / "split/monthly"
csv_pattern = str(input_dir / "*.csv")

# Keep only files whose start month is selected in `months`. The start month
# is read as the three characters directly before the first "_to_" in the path.
csv_files = [
    path
    for path in glob.glob(csv_pattern)
    if path.split("_to_")[0][-3:] in months
]

In [22]:
# Read each selected CSV and store its "text" column under the file's start
# month (the three characters directly before the first "_to_" in the path).
for filename in csv_files:
    start_month = filename.split("_to_")[0][-3:]
    df = pd.read_csv(filename)
    monthly_splits[start_month] = df[["text"]]

In [23]:
# Pick the split of the month being processed. If that month was never loaded,
# fail with an actionable message instead of a bare KeyError.
if month not in monthly_splits:
    raise KeyError(
        f"No monthly split loaded for month {month!r}; "
        f"loaded months: {sorted(monthly_splits)}. "
        "Make sure `month` is listed in `months` and a matching CSV file exists."
    )
df_test = monthly_splits[month]

In [24]:
# Use the selected month's posts as the corpus for the model built next.
# This rebinds `df`, replacing whatever frame it referred to before.
df = df_test

## Build Model

In [25]:
# Construct the model from the corpus frame, using `min_freq` from the Setup cell.
# NOTE(review): presumably `min_freq` is the minimum token frequency for a word
# to enter the vocabulary — confirm in ppmi_model.
ppmi_model = PPMIModel.construct_from_texts(df, min_freq=min_freq)

## Vocab

In [26]:
# Report the vocabulary size of the freshly built model.
vocab_size = ppmi_model.get_vocabulary_size()
print(f"Vocabulary-Size: {vocab_size}")

Vocabulary-Size: 20380


## Compute PPMI Matrix

### Calculation

In [None]:
# Compute the PPMI matrix on the model with window_size=3. The trailing
# semicolon keeps the call's return value out of the cell output.
# NOTE(review): presumably window_size is the co-occurrence context window — confirm in ppmi_model.
ppmi_model.compute_ppmi_matrix(window_size=3);

### Control

In [None]:
# Display the shape reported by the model as a quick check.
# NOTE(review): presumably the shape of the computed PPMI matrix — confirm.
ppmi_model.get_shape()

In [None]:
# Fetch the model's data as a DataFrame; its index and columns are relabelled
# and it is written to disk in the Save section below.
ppmi_matrix_df = ppmi_model.get_as_df()

## Save

In [None]:
# Add quotechars to row and column names
ppmi_matrix_df.index = '"' + ppmi_matrix_df.index + '"'
ppmi_matrix_df.columns = '"' + ppmi_matrix_df.columns + '"'

In [None]:
# Target directory for the exported matrices; created on first use.
ppmi_path = DIR / "ppmi-matrices"
ppmi_path.mkdir(parents=True, exist_ok=True)

# One space-separated text file per month, named after the month code.
filename = f"ppmi-2022-{month_code}-01.txt"
ppmi_matrix_df.to_csv(
    ppmi_path / filename,
    sep=" ",
    index=True,
    quoting=3,  # numeric value of csv.QUOTE_NONE: the writer adds no quoting of its own
)

In [None]:
# Final status: which month was processed and how large the vocabulary is.
print(
    f"{month}({month_code}) - done",
    f"Vocabulary-Size: {ppmi_model.get_vocabulary_size()}",
    sep="\n",
)