In [1]:
import os
import csv
import json
import random
import jsonlines
import numpy as np

from glob import glob
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer

# Process all schemas

In [3]:
schema_descr_path = '../data/schemas/descrs'
cgw_schema_rel_docs_path = '../data/cgw/schema_related/pos'
cgw_preds_args_path = '../data/cgw/preds_args'

for filename in os.listdir(schema_descr_path):
    if not filename.endswith('.json'):
        continue
    schema_name = filename[:-5]

    subset_docs_ids = set()

    # Collect schema-related docs' ids
    with jsonlines.open(f'{cgw_schema_rel_docs_path}/{schema_name}.jsonl') as reader:
        for doc in reader:
            subset_docs_ids.add(doc['id'])
    
    # Load schema events
    with open(f'{schema_descr_path}/{schema_name}.json') as fin:
        schema_descr = json.load(fin)
        filtered_events = schema_descr['predpatt']
    
    # Load schema-related docs
    subset_raw_docs_ids = []
    subset_raw_docs = []

    for pred_arg_path in tqdm(glob(f'{cgw_preds_args_path}/*.jsonl')):
        with jsonlines.open(pred_arg_path) as reader:
            for doc in reader:
                assert doc['filename'].endswith('.comm')
                doc_id = doc['filename'][:-5]
                if doc_id in subset_docs_ids:
                    # pred_only_doc = [f'{a0 if ix == 0 else a1}_{ix}' for a0, a1, ix in doc['preds_args']]
                    pred_only_doc = [(a0 if ix == 0 else a1) for a0, a1, ix in doc['preds_args']]
                    subset_raw_docs_ids.append(doc_id)
                    subset_raw_docs.append(pred_only_doc)
    
    subset_raw_docs_ids = np.array(subset_raw_docs_ids)
    assert len(subset_docs_ids) == len(subset_raw_docs) == len(subset_raw_docs_ids)

    # Vectorize docs
    vectorizer = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False, vocabulary=filtered_events)
    subset_tfidf = vectorizer.fit_transform(subset_raw_docs)
    subset_tfidf = subset_tfidf.toarray()

    vocab_relevant_mask = (subset_tfidf.sum(axis=1) > 0)
    subset_tfidf = subset_tfidf[vocab_relevant_mask]
    subset_tfidf_ids = subset_raw_docs_ids[vocab_relevant_mask]
    
    # Output to CSV
    header = [v for k, v in sorted([(v, k) for k, v in vectorizer.vocabulary_.items()])]
    vocab_size = len(vectorizer.vocabulary_)

    with open(f'{cgw_schema_rel_docs_path}/{schema_name}_binary_{vocab_size}.csv', 'w') as fout:
        writer = csv.writer(fout)
        writer.writerow(header)
        for doc_id, row in zip(subset_tfidf_ids, np.int32(subset_tfidf > 0)):
            writer.writerow(list(row))

100%|██████████| 197/197 [00:58<00:00,  3.35it/s]
100%|██████████| 197/197 [00:56<00:00,  3.48it/s]
100%|██████████| 197/197 [00:56<00:00,  3.47it/s]
100%|██████████| 197/197 [00:56<00:00,  3.46it/s]
100%|██████████| 197/197 [00:57<00:00,  3.45it/s]


# Process narrative chains

In [None]:
filtered_events = ['convict_0', 'face_1', 'sentence_0', 'arrest_0', 'accuse_1',
                   'plead_1', 'acquit_0', 'indict_0', 'testify_1', 'charge_0']

filtered_events_set = set(filtered_events)

In [None]:
subset_raw_docs_ids = []
subset_raw_docs = []

with open('../data/cgw/nytimes_chains.txt') as reader:
    for ix, chain in tqdm(enumerate(reader)):
        pred_only_chain = [x.replace('->nsubj', '_1').replace('->dobj', '_0') for x in chain.strip().split()]
        subset_raw_docs_ids.append(ix)
        subset_raw_docs.append(pred_only_chain)

In [None]:
subset_raw_docs_ids = np.array(subset_raw_docs_ids)

In [None]:
%%time

vectorizer = TfidfVectorizer(tokenizer=lambda x: x, lowercase=False, vocabulary=filtered_events)
subset_tfidf = vectorizer.fit_transform(subset_raw_docs)
subset_tfidf = subset_tfidf.toarray()

vocab_relevant_mask = (subset_tfidf.sum(axis=1) > 0)
subset_tfidf = subset_tfidf[vocab_relevant_mask]
subset_tfidf_ids = subset_raw_docs_ids[vocab_relevant_mask]

In [None]:
# Output to CSV

header = [v for k, v in sorted([(v, k) for k, v in vectorizer.vocabulary_.items()])]
vocab_size = len(vectorizer.vocabulary_)

with open(f'../data/cgw/schema_related/pos/{schema_name}_tfidf_cj_{vocab_size}.csv', 'w') as fout:
    writer = csv.writer(fout)
    writer.writerow(header)
    for doc_id, row in zip(subset_tfidf_ids, subset_tfidf):
        writer.writerow(list(row))

---