In [6]:
import csv
import json
import random
import jsonlines
import collections
import numpy as np

from glob import glob
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

# Collect count stats

## For all docs

In [20]:
# Tally, over every document found in the preds_args JSONL files, how often
# each '<a0>_<a1>_<ix>' key and each selected single string occurs.
all_preds_args_cntr = collections.Counter()
all_preds_cntr = collections.Counter()

# glob() yields paths in arbitrary order; sorting keeps runs reproducible.
for pred_arg_path in tqdm(sorted(glob('../data/cgw/preds_args/*.jsonl'))):
    with jsonlines.open(pred_arg_path) as reader:
        for doc in reader:
            # Sanity check on the expected file naming of each record.
            assert doc['filename'].endswith('.comm')
            # Lower-case both string fields so counting is case-insensitive.
            preds_args = [(a0.lower(), a1.lower(), ix) for (a0, a1, ix) in doc['preds_args']]
            doc_preds_args = [f'{a0}_{a1}_{ix}' for a0, a1, ix in preds_args]
            # ix == 0 picks a0, any other value picks a1.
            # NOTE(review): presumably ix marks which slot holds the predicate -- confirm.
            doc_preds = [(a0 if ix == 0 else a1) for a0, a1, ix in preds_args]
            all_preds_args_cntr.update(doc_preds_args)
            all_preds_cntr.update(doc_preds)

100%|██████████| 197/197 [04:52<00:00,  1.48s/it]


## For schema related docs

In [71]:
# Schema whose related documents are analysed in the following cells.
schema_name = 'election'
schema_docs_path = f'../data/cgw/schema_related/pos/{schema_name}.jsonl'

# Gather the 'id' field of every record in the schema's positive-docs file.
schema_related_docs_ids = set()

with jsonlines.open(schema_docs_path) as reader:
    schema_related_docs_ids.update(doc['id'] for doc in reader)

In [72]:
# Same tallies as for all docs, restricted to documents whose id is in
# schema_related_docs_ids.
schema_preds_args_cntr = collections.Counter()
schema_preds_cntr = collections.Counter()

# glob() yields paths in arbitrary order; sorting fixes the Counter insertion
# order, which later decides how ties are ordered in the ranked list.
for pred_arg_path in tqdm(sorted(glob('../data/cgw/preds_args/*.jsonl'))):
    with jsonlines.open(pred_arg_path) as reader:
        for doc in reader:
            assert doc['filename'].endswith('.comm')
            # Strip the 5-character '.comm' suffix to obtain the document id.
            doc_id = doc['filename'][:-5]
            if doc_id not in schema_related_docs_ids:
                continue
            # Lower-case both string fields so counting is case-insensitive.
            preds_args = [(a0.lower(), a1.lower(), ix) for (a0, a1, ix) in doc['preds_args']]
            doc_preds_args = [f'{a0}_{a1}_{ix}' for a0, a1, ix in preds_args]
            # ix == 0 picks a0, any other value picks a1.
            # NOTE(review): presumably ix marks which slot holds the predicate -- confirm.
            doc_preds = [(a0 if ix == 0 else a1) for a0, a1, ix in preds_args]
            schema_preds_args_cntr.update(doc_preds_args)
            schema_preds_cntr.update(doc_preds)

100%|██████████| 197/197 [01:08<00:00,  2.86it/s]


# Discover schema related events

In [73]:
# Rank strings by how over-represented they are in the schema-related docs:
# (relative frequency in schema docs) / (relative frequency in all docs).

# A string must occur at least this many times in the schema docs to be considered.
MIN_SCHEMA_PRED_COUNT = 100
# Only keep strings whose frequency ratio is strictly above this value.
MIN_RATE_RATIO = 1

# Normalisers: total number of counted occurrences in each collection.
z_all = sum(all_preds_cntr.values())
z_schema = sum(schema_preds_cntr.values())

schema_top_preds = [k for k, v in schema_preds_cntr.items() if v >= MIN_SCHEMA_PRED_COUNT]

# A Counter returns 0 for absent keys, which would make the ratio below divide
# by zero. That can only happen when all_preds_cntr is stale or was built from
# different data, so fail with an explanatory message instead.
missing_preds = [k for k in schema_top_preds if all_preds_cntr[k] == 0]
assert not missing_preds, (
    f'{len(missing_preds)} schema preds are absent from all_preds_cntr '
    f'(e.g. {missing_preds[:5]}); re-run the all-docs counting cell'
)

schema_top_preds_rates = [
    (k, (schema_preds_cntr[k] / z_schema) / (all_preds_cntr[k] / z_all))
    for k in schema_top_preds
]
schema_top_preds_rates = [(k, v) for k, v in schema_top_preds_rates if v > MIN_RATE_RATIO]
# Highest ratio first; reverse=True keeps the sort stable, so ties stay in
# their original relative order.
schema_top_preds_rates.sort(key=lambda x: x[1], reverse=True)

In [74]:
# TODO: make sure the selected pred-only events typically expect similar types of args, i.e., that a single event does not conflate multiple args

In [75]:
# Number of entries that passed both the minimum-count and the rate > 1 filters.
print(len(schema_top_preds_rates))

710


In [76]:
# Display the (string, rate) pairs, sorted by rate in descending order.
schema_top_preds_rates

[('voter', 36.35676704731012),
 ('election', 36.296766938715294),
 ('elect', 27.218845134364706),
 ('disenfranchise', 21.786111604650532),
 ('elected', 16.412711988050173),
 ('outspent', 14.790466055812104),
 ('unseated', 14.529726697755576),
 ('candidate', 14.449733668279983),
 ('democratic', 13.017742505574814),
 ('vote', 11.932886370967145),
 ('undecided', 11.739155350931489),
 ('democrats', 11.566640588198213),
 ('republican', 10.82555870220436),
 ('republicans', 9.877199316117094),
 ('democrat', 9.301091124691375),
 ('nominee', 9.27133760880697),
 ('campaign', 9.213812169607703),
 ('unseat', 8.728223590213258),
 ('campaigning', 8.493878784218463),
 ('boycott', 8.001994530854793),
 ('governor', 7.936002773489225),
 ('mayor', 7.4758093563552),
 ('senator', 7.044897425075981),
 ('poll', 6.965648171817239),
 ('re-elected', 6.681994613003907),
 ('cast', 6.666003030888622),
 ('energize', 6.630760227558706),
 ('trounce', 6.462291726602186),
 ('dissent', 6.423548970447496),
 ('contest', 6

---