In [1]:
import sys
# Extend the module search path by the directory two levels up; presumably this
# is the repository root that holds the `eventx` package imported in the next
# cell — TODO confirm.
sys.path.append("../../")

In [2]:
import eventx
import pandas as pd
import numpy as np

from eventx.util import corpus_statistics, utils
from eventx import SD4M_RELATION_TYPES, ROLE_LABELS, NEGATIVE_TRIGGER_LABEL, NEGATIVE_ARGUMENT_LABEL

In [3]:
# Root directory of the corpus. Every path below is built by plain string
# concatenation, so the trailing slash is required.
DATASET_PATH = "../../data/daystream_corpus/"

# SD4M train/dev/test splits ("with events" per the file names).
SD4M_Train, SD4M_Dev, SD4M_Test = (
    DATASET_PATH + relative_path
    for relative_path in (
        "train/train_sd4m_with_events.jsonl",
        "dev/dev_sd4m_with_events.jsonl",
        "test/test_sd4m_with_events.jsonl",
    )
)

# The same three splits in the files this notebook calls "Snorkel format".
SD4M_Train_Snorkel_Format, SD4M_Dev_Snorkel_Format, SD4M_Test_Snorkel_Format = (
    DATASET_PATH + relative_path
    for relative_path in (
        "train/train_with_events_and_defaults.jsonl",
        "dev/dev_with_events_and_defaults.jsonl",
        "test/test_with_events_and_defaults.jsonl",
    )
)

# Daystream file (Snorkel-labeled, abstains included, per the file name).
daystream_snorkeled = DATASET_PATH + "daystream_snorkeled_with_abstains.jsonl"

In [4]:
# Read the three SD4M splits (JSON Lines: one JSON record per line) and stack
# them in train/dev/test order. `pd.concat` keeps each split's own row labels.
sd4m_train, sd4m_dev, sd4m_test = (
    pd.read_json(split_path, lines=True, encoding='utf8')
    for split_path in (SD4M_Train, SD4M_Dev, SD4M_Test)
)
sd4m = pd.concat([sd4m_train, sd4m_dev, sd4m_test])

# Snorkel-labeled Daystream documents.
# NOTE: a later cell rebinds `daystream` to the contents of "daystream.jsonl".
daystream = pd.read_json(daystream_snorkeled, lines=True, encoding='utf8')

In [5]:
# Read the Snorkel-format SD4M splits (JSON Lines) and stack them in
# train/dev/test order. `pd.concat` keeps each split's own row labels.
sd4m_train_sf, sd4m_dev_sf, sd4m_test_sf = (
    pd.read_json(split_path, lines=True, encoding='utf8')
    for split_path in (
        SD4M_Train_Snorkel_Format,
        SD4M_Dev_Snorkel_Format,
        SD4M_Test_Snorkel_Format,
    )
)
sd4m_sf = pd.concat([sd4m_train_sf, sd4m_dev_sf, sd4m_test_sf])

## Filter out documents with no triggers

It might make more sense to filter out documents that do not contain any triggers.
These documents are of no interest here because the model filters them out anyway.
For the Daystream data, documents with no triggers or with only abstains are already filtered out in the Snorkel pipeline.

|            | # Documents | # Documents with triggers |
|------------|-------------|---------------------------|
| SD4M Train | 1273        | 567                       |
| SD4M Dev   | 147         | 55                        |
| SD4M Test  | 166         | 68                        |
| Total      | 1586        | 690                       |

In [6]:
# Filters
# Row-wise (axis=1) application of `corpus_statistics.has_triggers`; the
# resulting Series are used as row masks when the `*_filtered` frames are built.
# The function is passed directly: wrapping it in `lambda document: f(document)`
# adds nothing.
# NOTE(review): presumably returns True for a document with at least one
# trigger — confirm in eventx.util.corpus_statistics.
daystream_with_events = daystream.apply(corpus_statistics.has_triggers, axis=1)
sd4m_train_with_events = sd4m_train.apply(corpus_statistics.has_triggers, axis=1)
sd4m_dev_with_events = sd4m_dev.apply(corpus_statistics.has_triggers, axis=1)
sd4m_test_with_events = sd4m_test.apply(corpus_statistics.has_triggers, axis=1)

In [7]:
# Filters
# Row-wise (axis=1) application of `corpus_statistics.has_triggers` to the
# Snorkel-format splits; the resulting Series are used as row masks when the
# `*_filtered_sf` frames are built. The function is passed directly instead of
# through a redundant lambda wrapper.
sd4m_train_with_events_sf = sd4m_train_sf.apply(corpus_statistics.has_triggers, axis=1)
sd4m_dev_with_events_sf = sd4m_dev_sf.apply(corpus_statistics.has_triggers, axis=1)
sd4m_test_with_events_sf = sd4m_test_sf.apply(corpus_statistics.has_triggers, axis=1)

In [8]:
# Keep only the rows selected by each frame's `*_with_events` mask, then
# rebuild the combined SD4M frame from the filtered splits (train, dev, test).
daystream_filtered, sd4m_train_filtered, sd4m_dev_filtered, sd4m_test_filtered = (
    documents[keep_row]
    for documents, keep_row in (
        (daystream, daystream_with_events),
        (sd4m_train, sd4m_train_with_events),
        (sd4m_dev, sd4m_dev_with_events),
        (sd4m_test, sd4m_test_with_events),
    )
)
sd4m_filtered = pd.concat([sd4m_train_filtered, sd4m_dev_filtered, sd4m_test_filtered])

In [9]:
# Snorkel-format counterpart: select the masked rows of each split, then
# concatenate the filtered splits in train/dev/test order.
sd4m_train_filtered_sf, sd4m_dev_filtered_sf, sd4m_test_filtered_sf = (
    documents[keep_row]
    for documents, keep_row in zip(
        (sd4m_train_sf, sd4m_dev_sf, sd4m_test_sf),
        (sd4m_train_with_events_sf, sd4m_dev_with_events_sf, sd4m_test_with_events_sf),
    )
)
sd4m_filtered_sf = pd.concat([sd4m_train_filtered_sf, sd4m_dev_filtered_sf, sd4m_test_filtered_sf])

In [10]:
# Compute `corpus_statistics.get_dataset_stats` for every filtered frame, in
# the same order as the names on the left. The results displayed further down
# have one row per docType (MIXED, RSS_XML, TWITTER_JSONL).
(
    daystream_filtered_stats,
    sd4m_train_filtered_stats,
    sd4m_dev_filtered_stats,
    sd4m_test_filtered_stats,
    sd4m_filtered_stats,
) = map(
    corpus_statistics.get_dataset_stats,
    (
        daystream_filtered,
        sd4m_train_filtered,
        sd4m_dev_filtered,
        sd4m_test_filtered,
        sd4m_filtered,
    ),
)

In [11]:
# Same statistics for the filtered Snorkel-format frames, computed in the
# order train, dev, test, combined.
(
    sd4m_train_filtered_stats_sf,
    sd4m_dev_filtered_stats_sf,
    sd4m_test_filtered_stats_sf,
    sd4m_filtered_stats_sf,
) = map(
    corpus_statistics.get_dataset_stats,
    (
        sd4m_train_filtered_sf,
        sd4m_dev_filtered_sf,
        sd4m_test_filtered_sf,
        sd4m_filtered_sf,
    ),
)

In [12]:
# General stats: LFs, Positive (Labeled positive vs. Abstains+negative), Documents, DataPoints
# LF counts written as unique + replicated, so the totals are self-explanatory.
# (LF presumably stands for Snorkel labeling function — confirm.)
trigger_lfs = 18 + 3  # replicated LFs: CanceledRoute, CanceledStop, RailReplacementService
role_lfs = 39 + 3  # replicated LFs: start & end date, cause

In [41]:
# Role class frequencies of the first stats row (docType MIXED in the table
# displayed for this frame), selecting the column first and then the position.
sd4m_filtered_stats_sf['Role class frequencies'].iloc[0]

{'location': 713,
 'delay': 109,
 'direction': 340,
 'start_loc': 447,
 'end_loc': 421,
 'start_date': 46,
 'end_date': 48,
 'cause': 127,
 'jam_length': 160,
 'route': 27,
 'no_arg': 6181}

In [25]:
# Display the statistics of the trigger-filtered SD4M corpus in Snorkel format
# (train + dev + test combined).
sd4m_filtered_stats_sf

Unnamed: 0,docType,# Docs,# Tokens,# Entities,# Triggers,# Docs with event triggers,# Event triggers with positive label,# Event triggers with negative label,# Event triggers with abstain,Trigger class frequencies,# Docs with event roles,# Event role with positive label,# Event roles with negative label,# Event roles with abstain,Role class frequencies
0,MIXED,690,20609,5365,994,502,591,354,0,"{'Accident': 74, 'CanceledRoute': 73, 'Cancele...",502,2438,6181,0,"{'location': 713, 'delay': 109, 'direction': 3..."
1,RSS_XML,353,14500,3700,567,330,405,147,0,"{'Accident': 39, 'CanceledRoute': 60, 'Cancele...",330,1818,5057,0,"{'location': 464, 'delay': 58, 'direction': 28..."
2,TWITTER_JSONL,337,6109,1665,427,172,186,207,0,"{'Accident': 35, 'CanceledRoute': 13, 'Cancele...",172,620,1124,0,"{'location': 249, 'delay': 51, 'direction': 53..."


In [43]:
# Display the statistics of the trigger-filtered SD4M corpus
# (train + dev + test combined).
sd4m_filtered_stats

Unnamed: 0,docType,# Docs,# Tokens,# Entities,# Triggers,# Docs with event triggers,# Event triggers,Trigger class frequencies,# Docs with event roles,# Event roles,Role class frequencies
0,MIXED,690,20609,5365,994,502,632,"{'Accident': 74, 'CanceledRoute': 82, 'Cancele...",502,2529,"{'location': 730, 'delay': 114, 'direction': 3..."
1,RSS_XML,353,14500,3700,567,330,413,"{'Accident': 39, 'CanceledRoute': 62, 'Cancele...",330,1842,"{'location': 469, 'delay': 62, 'direction': 28..."
2,TWITTER_JSONL,337,6109,1665,427,172,219,"{'Accident': 35, 'CanceledRoute': 20, 'Cancele...",172,687,"{'location': 261, 'delay': 52, 'direction': 58..."


In [15]:
# Display the statistics of the trigger-filtered SD4M test split only.
sd4m_test_filtered_stats

Unnamed: 0,docType,# Docs,# Tokens,# Entities,# Triggers,# Docs with event triggers,# Event triggers,Trigger class frequencies,# Docs with event roles,# Event roles,Role class frequencies
0,MIXED,68,1966,504,102,48,62,"{'Accident': 9, 'CanceledRoute': 11, 'Canceled...",48,250,"{'location': 84, 'delay': 8, 'direction': 31, ..."
1,RSS_XML,33,1332,329,56,31,38,"{'Accident': 6, 'CanceledRoute': 8, 'CanceledS...",31,172,"{'location': 44, 'delay': 3, 'direction': 29, ..."
2,TWITTER_JSONL,35,634,175,46,17,24,"{'Accident': 3, 'CanceledRoute': 3, 'CanceledS...",17,78,"{'location': 40, 'delay': 5, 'direction': 2, '..."


In [16]:
# Role class frequencies of the third stats row (docType TWITTER_JSONL in the
# table displayed for this frame), selecting the column first and then the position.
sd4m_filtered_stats['Role class frequencies'].iloc[2]

{'location': 261,
 'delay': 52,
 'direction': 58,
 'start_loc': 89,
 'end_loc': 75,
 'start_date': 60,
 'end_date': 28,
 'cause': 53,
 'jam_length': 10,
 'route': 1,
 'no_arg': 0}

In [17]:
# Statistics for the unfiltered corpora.
# NOTE(review): this rebinds `daystream`, which an earlier cell loaded from the
# Snorkel-labeled file, to the contents of "daystream.jsonl". Masks computed
# from the earlier frame (`daystream_with_events`) no longer belong to this
# frame, so re-running the filtering cells after this one is unsafe.
daystream = pd.read_json(DATASET_PATH + "daystream.jsonl", lines=True, encoding='utf8')
# NOTE(review): same concatenation as in the loading cell; kept so this cell
# does not depend on that assignment.
sd4m = pd.concat([sd4m_train, sd4m_dev, sd4m_test])
daystream_stats, sd4m_stats = map(
    corpus_statistics.get_dataset_stats,
    (daystream, sd4m),
)

In [18]:
# Display the statistics of the unfiltered Daystream data read from
# "daystream.jsonl". In the recorded output every positive-label count is 0.
daystream_stats

Unnamed: 0,docType,# Docs,# Tokens,# Entities,# Triggers,# Docs with event triggers,# Event triggers with positive label,# Event triggers with negative label,# Event triggers with abstain,Trigger class frequencies,# Docs with event roles,# Event role with positive label,# Event roles with negative label,# Event roles with abstain,Role class frequencies
0,MIXED,1955,66360,15708,3076,0,0,3076,0,"{'Accident': 0, 'CanceledRoute': 0, 'CanceledS...",0,0,47376,0,"{'location': 0, 'delay': 0, 'direction': 0, 's..."
1,RSS_XML,141,15222,3381,601,0,0,601,0,"{'Accident': 0, 'CanceledRoute': 0, 'CanceledS...",0,0,27847,0,"{'location': 0, 'delay': 0, 'direction': 0, 's..."
2,TWITTER_JSONL,1814,51138,12327,2475,0,0,2475,0,"{'Accident': 0, 'CanceledRoute': 0, 'CanceledS...",0,0,19529,0,"{'location': 0, 'delay': 0, 'direction': 0, 's..."


In [19]:
# Display the statistics of the unfiltered SD4M corpus (train + dev + test).
sd4m_stats

Unnamed: 0,docType,# Docs,# Tokens,# Entities,# Triggers,# Docs with event triggers,# Event triggers,Trigger class frequencies,# Docs with event roles,# Event roles,Role class frequencies
0,MIXED,1586,37458,7695,994,502,632,"{'Accident': 74, 'CanceledRoute': 82, 'Cancele...",502,2529,"{'location': 730, 'delay': 114, 'direction': 3..."
1,RSS_XML,575,19408,4733,567,330,413,"{'Accident': 39, 'CanceledRoute': 62, 'Cancele...",330,1842,"{'location': 469, 'delay': 62, 'direction': 28..."
2,TWITTER_JSONL,1011,18050,2962,427,172,219,"{'Accident': 35, 'CanceledRoute': 20, 'Cancele...",172,687,"{'location': 261, 'delay': 52, 'direction': 58..."


In [20]:
# Flatten the per-document entity lists of `daystream` into one list,
# keeping document order and the order within each document.
daystream_entities = [
    annotation
    for document_annotations in daystream['entities']
    for annotation in document_annotations
]

In [21]:
# Number of entities whose 'entity_type' is 'trigger', counted without
# building an intermediate list.
sum(annotation['entity_type'] == 'trigger' for annotation in daystream_entities)

3076

In [22]:
# Read the Snorkel-labeled Daystream file again under its own name: the
# `daystream` variable that first held this data has since been rebound to the
# contents of "daystream.jsonl".
daystream_snorkel_labeled = pd.read_json(daystream_snorkeled, lines=True, encoding='utf8')

In [23]:
# Flatten the per-document entity lists of the Snorkel-labeled Daystream data
# into one list, keeping document order and the order within each document.
daystream_filtered_entities = [
    annotation
    for document_annotations in daystream_snorkel_labeled['entities']
    for annotation in document_annotations
]

In [24]:
# Number of entities whose 'entity_type' is 'trigger' in the Snorkel-labeled
# data, counted without building an intermediate list.
sum(annotation['entity_type'] == 'trigger' for annotation in daystream_filtered_entities)

3076