In [18]:
import pickle
import pandas as pd
import numpy as np
import os
from glob import glob
from tqdm import tqdm

In [21]:
# Load the precomputed batch selections and the event index from disk.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only point these paths at files produced by this project.

# B_random_10 is indexed by file name (e.g. 'events-00023') further down.
with open('../../data/batch/B_random_10.pkl', 'rb') as f:
    B_random_10 = pickle.load(f)
    
# B_recent_10 is loaded here but not referenced by the cells visible below.
with open('../../data/batch/B_recent_10.pkl', 'rb') as f:
    B_recent_10 = pickle.load(f)
    
# event_index is consumed by find_event_file, which iterates .items() and
# tests membership: it maps a file name to a container of event ids.
with open('../../data/preprocessed/event_index.pkl', 'rb') as f:
    event_index = pickle.load(f)

In [14]:
# Materialize the entries stored under 'events-00023' in the random batch as a
# list; the cells below pass each element to get_event / check_event_date as
# an event id.
ids = list(B_random_10['events-00023'])

In [20]:
# Directory holding the preprocessed "events-*.pkl" files read by the helpers.
directory_path = "../../data/preprocessed"
# Directory of the batch pickles; not used by the helpers defined in this cell.
output_dir = "../../data/batch"


def iterate_files(action, n=3000):
    """Load the first ``n`` event files and pass each one to ``action``.

    Files matching ``events-*.pkl`` inside ``directory_path`` are visited in
    sorted path order.

    Args:
        action: callable invoked as ``action(file_name, df)``, where
            ``file_name`` is the bare file name without directory or
            extension (e.g. ``'events-00023'``) and ``df`` is the object
            returned by ``pd.read_pickle`` for that file.
        n: maximum number of files to process.
    """
    paths = sorted(glob(os.path.join(directory_path, "events-*.pkl")))[:n]

    for file_path in tqdm(paths, ncols=100, desc="Processing"):
        # glob already returns paths that include directory_path, so the path
        # is used as-is; joining directory_path again only resolves when the
        # prefix happens to cancel out (as "../.." does).
        # basename keeps file_name consistent with the bare names that
        # get_file / iterate_file expect.
        file_name = os.path.splitext(os.path.basename(file_path))[0]

        df = pd.read_pickle(file_path)
        action(file_name, df)

In [25]:
def get_file(file_name):
    """Read one preprocessed event file and return its contents.

    ``file_name`` is the bare name without extension; ``".pkl"`` is appended
    and the result is resolved against ``directory_path``. Returns whatever
    ``pd.read_pickle`` yields for that path.
    """
    pickle_name = file_name + ".pkl"
    return pd.read_pickle(os.path.join(directory_path, pickle_name))

In [24]:
def iterate_file(file_name, action):
    """Load a single event file and invoke ``action(file_name, df)`` on it.

    ``file_name`` is given without the ``".pkl"`` extension and is looked up
    inside ``directory_path``. The return value of ``action`` is discarded.
    """
    source = os.path.join(directory_path, file_name + ".pkl")
    loaded = pd.read_pickle(source)
    action(file_name, loaded)

In [23]:
def find_event_file(event_id):
    """Return the name of the first file whose id set contains ``event_id``.

    Entries of ``event_index`` are scanned in iteration order. ``None`` is
    returned when no entry lists the id.
    """
    matches = (
        name for name, members in event_index.items() if event_id in members
    )
    return next(matches, None)

In [31]:
def get_event(event_id):
    """Return the record for ``event_id`` from the file that contains it.

    The file is located through ``find_event_file`` and loaded with
    ``get_file``; the record is then selected with ``.loc[event_id]``.

    Raises:
        KeyError: if ``event_id`` is not listed in ``event_index`` (or is
            missing from the loaded file's index).
    """
    file_name = find_event_file(event_id)
    if file_name is None:
        # Without this guard the failure shows up later as an opaque
        # TypeError from concatenating None with ".pkl" inside get_file.
        raise KeyError(f"event {event_id!r} is not listed in event_index")
    return get_file(file_name).loc[event_id]

In [74]:
def check_event_date(event_id, tolerance=30):
    """Check that no similar event is dated much earlier than the event.

    A similar event passes when ``similar_date + tolerance >= event_date``,
    i.e. it is dated at most ``tolerance`` units before the event, or any time
    after it. With ``tolerance=0`` this is the strict "not before the event"
    check.

    Args:
        event_id: id of the event to inspect.
        tolerance: slack added to each similar event's date before comparing.
            Dates must support ``+`` with this value and ``>=`` ordering;
            presumably they are day counts, which would make the default
            30 days (TODO confirm the unit).

    Returns:
        True if every similar event passes; also True when the event has no
        similar events.
    """
    event = get_event(event_id)
    event_date = event['info']['eventDate']

    return all(
        similar['eventDate'] + tolerance >= event_date
        for similar in event['similarEvents']
    )

In [70]:
# Spot-check: display the record stored for the first sampled event id.
get_event(ids[0])

info             {'uri': 'e_22846', 'articleCounts': {'total': ...
similarEvents    [{'eventDateEnd': '', 'uri': 'e_2204068', 'sim...
Name: e_22846, dtype: object

In [75]:
# Run the date check on every sampled event; prints one True/False per id,
# in the same order as `ids`.
for event_id in ids:
    print(check_event_date(event_id))

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
False
True


In [68]:
# Inspect the date check for the second sampled event on its own.
# NOTE(review): the recorded output below (printed dates, result False) looks
# like it came from an earlier revision of check_event_date, when the debug
# prints were active; this cell's execution count (68) predates the current
# definition (74). Re-run to confirm.
check_event_date(ids[1])

16062
16242
16093
16103
16150
16058
16102
16103
16118
16063
16061
16079
16172
16216
16087
16254
16145
16086
16094
16466
16151


False

In [81]:
def get_similar_events(event_id):
    """Return the similar-event entries dated strictly before the event.

    The event is fetched with ``get_event``. Each entry of its
    ``'similarEvents'`` list is kept only when its ``'eventDate'`` is less
    than the event's own ``info['eventDate']``. Original order is preserved.
    """
    record = get_event(event_id)
    cutoff = record['info']['eventDate']

    earlier = []
    for candidate in record['similarEvents']:
        # Strict comparison: entries sharing the event's date are dropped.
        if candidate['eventDate'] < cutoff:
            earlier.append(candidate)
    return earlier

In [86]:
# Fetch the full records of every earlier-dated similar event of ids[1].
# NOTE(review): `id` shadows the Python builtin of the same name; it is also
# read by a later cell (get_similar_events(id)), so rename both together.
id = ids[1]
event = get_event(id)
# event and event_date are bound for interactive inspection; the loop below
# does not read them.
event_date = event['info']['eventDate']
similar_events = []
for similar in get_similar_events(id):
    # Each entry only carries summary fields; look up the full record by uri.
    similar_events.append(get_event(similar['uri']))

In [87]:
# Display the records collected for the similar events.
similar_events

[info             {'uri': 'e_23914', 'articleCounts': {'spa': 46...
 similarEvents    [{'eventDateEnd': '', 'uri': 'e_888983', 'sim'...
 Name: e_23914, dtype: object,
 info             {'uri': 'e_21730', 'articleCounts': {'spa': 11...
 similarEvents    [{'eventDateEnd': '', 'uri': 'e_199246', 'sim'...
 Name: e_21730, dtype: object]

In [84]:
# Show the filtered similar-event entries for the event currently bound to `id`.
# NOTE(review): the recorded output is [] although the cell that sets
# id = ids[1] collected two records; this cell's execution count (84) is lower,
# so `id` was probably bound to a different event when it ran. Re-run to confirm.
get_similar_events(id)

[]

In [88]:
def get_k_hop(event_id, k=1):
    """Collect the similar-event entries reached after exactly ``k`` hops.

    One hop goes from an event to the entries of its ``'similarEvents'`` list
    that are dated strictly before it (see ``get_similar_events``). For
    ``k > 1`` the walk continues from each of those entries via its ``'uri'``.

    Args:
        event_id: id of the starting event.
        k: number of hops to follow.

    Returns:
        A list of similar-event entries. Entries reachable along several
        paths appear once per path (no de-duplication). Returns ``[]`` when
        ``k < 1``.
    """
    if k < 1:
        # The recursion only stops at k == 1, so a smaller k used to walk the
        # whole predecessor graph (loading every file on the way) and still
        # return an empty list. Return that result immediately instead.
        return []

    predecessors = get_similar_events(event_id)
    if k == 1:
        return predecessors

    k_hop = []
    for similar in predecessors:
        k_hop.extend(get_k_hop(similar['uri'], k - 1))
    return k_hop

In [90]:
# For every sampled event, print how many entries its 2-hop walk returns.
# Counts include repeats, since get_k_hop does not de-duplicate.
for event_id in ids:
    khop = get_k_hop(event_id, 2)
    print(len(khop))

0
2
0
0
38
0
18
0
3
0
0
0
0
0
0
0
0
7
0
0
0
0
1
0
14
0
1
0
0
0
0
69
0
0
1
0
105
66
36
0
0
0
3
0
0
0
0
1
0
3
2
1
2
0
0
0
0
2
3
0
8
0
71
0
0
6
1
0
0
0
1
0
0
0
17
0
0
0
0
1
1
0
0
0
0
0
0
0
0
2
30
