In [8]:
import copy
import json

import numpy as np
import pandas as pd

In [4]:
# Collect per-event statistics from the parsed event list in `file`
# (loaded from ../data/source/events-00001.json in a later cell, so that
# cell has to be executed first).
events, concepts, counts = set(), set(), []

for event in file:
    info = event['info']
    events.add(info['uri'])
    counts.append(info['articleCounts']['total'])
    # URIs of the entries under event['similarEvents']['similarEvents']
    # are not added to `events`.
    concepts.update(concept['uri'] for concept in info['concepts'])

In [5]:
# Number of distinct event URIs collected by the loop above.
len(events)

1000

In [6]:
# Number of distinct concept URIs seen across all events.
len(concepts)

8729

In [52]:
# Turn the per-event article totals into an ndarray so they can be filtered
# with boolean masks below.
counts = np.array(counts)

In [54]:
# How many events have an article total greater than zero.
counts[counts > 0].shape

(608,)

In [53]:
# Distribution of the positive article totals: the distinct totals and, for
# each of them, the number of events that have it.
np.unique(counts[counts > 0], return_counts=True)

(array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  35,  37,  39,  40,  45,  46,
         48,  51,  52,  56,  57,  58,  60,  62,  69,  79,  89,  92,  94,
        100, 111, 121, 122, 123, 131, 137, 141, 144, 150, 152, 164, 170,
        186, 221, 233, 313, 335, 394, 431, 447, 607]),
 array([97, 61, 44, 41, 38, 37, 33, 19, 30, 21, 25, 22, 16, 12,  6,  4,  7,
         4,  3,  7,  1,  5,  2,  4,  2,  2,  4,  1,  1,  1,  3,  2,  3,  3,
         4,  1,  1,  1,  1,  2,  1,  1,  1,  1,  1,  1,  3,  2,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1]))

# Preprocessed

In [11]:
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that come from a trusted source.
# `prep_df` is rebound further down to the result of filter_json(file).
prep_df = pd.read_pickle('../data/preprocessed/events-00001.pkl')

In [15]:
# Row count of the frame loaded from the pickle.
print("events:", prep_df.shape[0])

events: 608


In [18]:
from datetime import datetime as dt


def generate_id(orig_id: str, prefix: str) -> str:
    """
    Builds a namespaced identifier of the form "<prefix>_<orig_id>".
    :param orig_id: the original ID of the concept or event
    :param prefix: the marker placed in front of the ID (e.g. 'c' for concept, 'e' for event)
    :return: the prefix and the original ID joined by an underscore
    """
    return "{}_{}".format(prefix, orig_id)


def event_date_to_timestamp(event: dict, similar_event: bool, tz=None) -> float:
    """
    Converts the "eventDate" of an event into a POSIX timestamp.
    :param event: the event dict; the date is read from event["eventDate"] when
        similar_event is True and from event["info"]["eventDate"] otherwise
    :param similar_event: whether `event` keeps "eventDate" at its top level
    :param tz: optional tzinfo the date is interpreted in (e.g. timezone.utc).
        With the default None the date is parsed as a naive datetime, which
        datetime.timestamp() interprets in the machine's local timezone, so
        the result differs between machines in different timezones.
    :return: 0 if the date is empty ("" or None), otherwise the timestamp of
        midnight on that date as a float
    :raises ValueError: if the date is not in YYYY-MM-DD form
    """
    event_date = event["eventDate"] if similar_event else event["info"]["eventDate"]
    # "" and None both mean "no date"; None would otherwise crash strptime.
    if not event_date:
        return 0
    parsed = dt.strptime(event_date, "%Y-%m-%d")
    if tz is not None:
        # Attach the requested zone so the result no longer depends on the
        # local timezone of the machine running the notebook.
        parsed = parsed.replace(tzinfo=tz)
    return parsed.timestamp()

In [46]:
# Parse the raw event export into `file` (the list of event dicts used by the
# other cells). The handle gets its own name instead of being shadowed by the
# parsed data, and the encoding is fixed to UTF-8 so the read does not depend
# on the platform's default encoding.
with open('../data/source/events-00001.json', "r", encoding="utf-8") as source_fh:
    file = json.load(source_fh)


def filter_json(json_content):
    """
    Normalises raw event records and collects them in a DataFrame.

    Every event is deep-copied first, so `json_content` is left untouched and
    the function can be called again on the same data. On the copy:
    - "info"/"uri" and the "uri" of every similar event get the "e" prefix,
      the "id" of every concept gets the "c" prefix (see generate_id)
    - "info"/"eventDate" and the "eventDate" of every similar event are
      replaced by timestamps (see event_date_to_timestamp)
    - an "info"/"articleCounts"/"total" of 0 is replaced by -1
    - the nested event["similarEvents"]["similarEvents"] list is lifted to
      event["similarEvents"]
    - "eventDateEnd" and "categories" are removed from "info"

    :param json_content: iterable of event dicts
    :return: DataFrame with one row per event and one column per top-level
        key of the event dicts
    :raises Exception: any error hit while normalising an event is re-raised
        unchanged after the position of the offending event has been printed,
        so a malformed record can no longer truncate the result silently
    """
    filtered_events = []
    for position, raw_event in enumerate(json_content):
        # Work on a private copy: mutating the caller's records would make a
        # second call see already-prefixed ids and already-flattened lists.
        event = copy.deepcopy(raw_event)
        try:
            info = event["info"]

            # generate unique ids
            info["uri"] = generate_id(info["uri"], "e")
            # convert to timestamp
            info["eventDate"] = event_date_to_timestamp(event, False)

            if info["articleCounts"]["total"] == 0:
                info["articleCounts"]["total"] = -1

            # fix json structure
            similar_events = event["similarEvents"]["similarEvents"]
            event["similarEvents"] = similar_events

            for se in similar_events:
                # generate unique ids
                se["uri"] = generate_id(se["uri"], "e")
                # convert to timestamp
                se["eventDate"] = event_date_to_timestamp(se, True)

            # Concepts are prefixed through their "id" field; their other
            # fields are left as they are.
            for c in info["concepts"]:
                c["id"] = generate_id(c["id"], "c")
        except Exception:
            # Say which record failed, then let the original error propagate.
            print(f"filter_json: could not normalise the event at position {position}")
            raise

        info.pop("eventDateEnd", None)
        info.pop("categories", None)
        filtered_events.append(event)

    return pd.DataFrame(filtered_events)

In [47]:
# Rebuild the frame from the raw JSON. This rebinds `prep_df`, replacing the
# frame that was read from the pickle earlier.
prep_df = filter_json(file)

In [48]:
# (rows, columns) of the rebuilt frame.
prep_df.shape

(1000, 2)

In [49]:
# Preview of the first rows of the rebuilt frame.
prep_df.head()

Unnamed: 0,info,similarEvents
0,"{'uri': 'e_11', 'articleCounts': {'total': 7, ...","[{'eventDateEnd': '', 'uri': 'e_1206914', 'sim..."
1,"{'uri': 'e_10', 'articleCounts': {'total': 221...","[{'eventDateEnd': '', 'uri': 'e_957210', 'sim'..."
2,"{'uri': 'e_13', 'articleCounts': {'total': 9, ...","[{'eventDateEnd': '', 'uri': 'e_2070817', 'sim..."
3,"{'uri': 'e_12', 'articleCounts': {'total': 1, ...","[{'eventDateEnd': '', 'uri': 'e_570885', 'sim'..."
4,"{'uri': 'e_15', 'articleCounts': {'total': 8, ...","[{'eventDateEnd': '', 'uri': 'e_27569', 'sim':..."


In [50]:
# The row with index label 0, shown as a Series.
prep_df.loc[0]

info             {'uri': 'e_11', 'articleCounts': {'total': 7, ...
similarEvents    [{'eventDateEnd': '', 'uri': 'e_1206914', 'sim...
Name: 0, dtype: object