# Creation of Sparse PPMI Embeddings for the Social Media Dataset

In [1]:
import glob
import pickle
import pandas as pd
from pathlib import Path
from pprint import pprint
from tqdm.notebook import tqdm

import sys
sys.path.append('../../')

from src.packages.TPPMI.ppmi_model import PPMIModel
import src.packages.TPPMI.util as tppmi_util

import nltk
#nltk.download('punkt');

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/paulschmitt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Setup

In [2]:
DATA_DIR = Path("../../data")
input_dir =  DATA_DIR / "processed" / "social-media-data"
split_type = "monthly"
min_freq = 5
window_size = 5 # default word2vec window-size
number_of_context_words = 500
months = ["Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", "Jan", "Feb", "Mar", "Apr"]
month_codes = {"Jun": "06", "Jul": "07", "Aug": "08", "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12", "Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04"}

## Sample context words

In [163]:
df = pd.read_csv(input_dir / "processed_data.csv")
df = df[["text"]]

In [164]:
df = df.dropna()

In [165]:
corpus = ' '.join(df["text"])

In [166]:
top_n = max(2 * number_of_context_words, 2000)
context_words = tppmi_util.sample_from_most_common_words(corpus, top_n=top_n,
                                                         sample_size=number_of_context_words, remove_stopwords=False)

In [167]:
pprint(context_words, compact=True, width=100)

['dam', 'excuses', 'honestly', 'arbor', 'prevalence', 'wasted', 'unlike', 'method', 'gift', 'juicy',
 'tries', 'immigrant', 'grace', 'rico', 'bikes', 'remarks', 'failed', 'backs', 'entity', 'scores',
 'au', 'police', 'highest', 'relationship', 'declines', 'child', 'genitalia', 'indeed', 'money',
 'marks', 'slot', '75', 'component', 'seattle', 'viewed', '78', 'dear', 'homosexual', 'lead',
 'delivering', 'inclusive', 'edge', 'locking', 'cycles', 'sins', 'susan', 'coast', 'commitments',
 'vaxxed', 'rolled', 'ed', 'diana', 'medicine', 'systems', 'transportation', 'applies', 'closet',
 'prevention', '14', 'combine', 'original', 'narratives', 'recognizes', 'fit', 'museums',
 'fascinating', 'another', 'alone', 'medium', 'ones', 'exclude', 'explore', 'relentless', 'sending',
 'cbs', 'maam', 'secretary', 'hb', 'pose', 'political', 'luck', 'transport', 'junior',
 'therapeutic', 'sources', 'fascist', 'inner', 'organizational', 'milf', 'origins', 'dirty',
 'teeth', 'intended', 'b', 'weight', 'lite

## Build Model

In [168]:
csv_files = glob.glob(str(input_dir / split_type / "*.csv"))

# filter for starting-months
csv_files = [filename for filename in csv_files if filename.split("_to_")[0][-3:] in months]

# use dict to be able to identify the dfs later on
splits = {}

In [169]:
for filename in csv_files:
    df = pd.read_csv(filename)
    splits[filename.split("_to_")[0][-3:]] = df

In [170]:
ppmi_models = {key: PPMIModel.construct_from_texts(value, context_words=context_words, min_freq=min_freq) for key, value in tqdm(splits.items())}

  0%|          | 0/11 [00:00<?, ?it/s]

### Control

In [171]:
ppmi_model = ppmi_models[next(iter(ppmi_models))]

In [172]:
ppmi_model.compute_ppmi_matrix()

array([[2.85473166, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [2.39519934, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [2.39519934, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [2.24859587, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.38839462, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.03557325, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [173]:
ppmi_model.ppmi_matrix.shape

(12108, 5000)

In [174]:
ppmi_models.keys()

dict_keys(['Mar', 'Jan', 'Feb', 'Jun', 'Sep', 'Apr', 'Oct', 'Aug', 'Dec', 'Nov', 'Jul'])

## Vocabulary

In [175]:
for key, ppmi_model in ppmi_models.items():
    print(f"Vocabulary size of timestep {key}: {ppmi_model.get_vocabulary_size()}")

Vocabulary size of timestep Mar: 12108
Vocabulary size of timestep Jan: 11553
Vocabulary size of timestep Feb: 10313
Vocabulary size of timestep Jun: 9629
Vocabulary size of timestep Sep: 10310
Vocabulary size of timestep Apr: 10076
Vocabulary size of timestep Oct: 13314
Vocabulary size of timestep Aug: 11294
Vocabulary size of timestep Dec: 11857
Vocabulary size of timestep Nov: 13099
Vocabulary size of timestep Jul: 10162


## Calulate & Save

In [176]:
output_dir = DATA_DIR / "ppmi-matrices" / "social-media-data" / split_type / str(number_of_context_words)
output_dir.mkdir(parents=True, exist_ok=True)

# save common context-words
with open(output_dir / f"context-words.pkl", "wb") as f:
    pickle.dump(context_words, f)

# save vocab and ppmi-matrices
for key, ppmi_model in ppmi_models.items():
    ppmi_model.compute_ppmi_matrix(window_size=window_size)
    ppmi_model.save(month_codes[key], output_dir)

print("--------------- done ---------------")
print(f"PPMI-Data stored to: {str(output_dir)}")
print(f"Number of context-words: {len(context_words)}")

PPMI data for 03 saved successfully.
Vocabulary Size: 12108
PPMI data for 01 saved successfully.
Vocabulary Size: 11553
PPMI data for 02 saved successfully.
Vocabulary Size: 10313
PPMI data for 06 saved successfully.
Vocabulary Size: 9629
PPMI data for 09 saved successfully.
Vocabulary Size: 10310
PPMI data for 04 saved successfully.
Vocabulary Size: 10076
PPMI data for 10 saved successfully.
Vocabulary Size: 13314
PPMI data for 08 saved successfully.
Vocabulary Size: 11294
PPMI data for 12 saved successfully.
Vocabulary Size: 11857
PPMI data for 11 saved successfully.
Vocabulary Size: 13099
PPMI data for 07 saved successfully.
Vocabulary Size: 10162
--------------- done ---------------
PPMI-Data stored to: ../../data/ppmi-matrices/social-media-data/monthly/5000
Number of context-words: 5000
