### Processing LIAD-Framework ready data

In [1]:
# Cap the native thread pools (OpenMP / MKL / OpenBLAS / numexpr) at one thread
# each. These variables are typically read once, when the numerical libraries
# are loaded, so they are set BEFORE numpy/pandas are imported; setting them
# afterwards may have no effect.
import os
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

import math
from math import radians, sin, cos, sqrt, atan2

import numpy as np
import pandas as pd
from joblib import Parallel, delayed


def split_by_time_bins(df, time_bins=None):
    """
    Split every [started_at, finished_at) interval at midnight and at the
    boundaries of the daily time bins.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain datetime-typed 'started_at' and 'finished_at' columns.
        All other columns are carried over unchanged onto every segment.
        The input frame is not modified; its index is ignored.
    time_bins : sequence of (start, end[, label]), optional
        Offsets from midnight, in any form accepted by ``pd.to_timedelta``.
        The label is informational only. Defaults to the six bins
        00:00-06:00, 06:00-09:00, 09:00-14:00, 14:00-17:30, 17:30-21:30
        and 21:30-24:00.

    Returns
    -------
    pandas.DataFrame
        One row per (interval, day, bin) piece with clipped 'started_at' /
        'finished_at', sorted by 'started_at' with a fresh RangeIndex.
        Rows whose interval is empty, reversed, or has a missing timestamp
        produce no segments.
    """
    if time_bins is None:
        time_bins = [
            ('00:00:00', '06:00:00', 'Early Morning'),  # 0:00 - 5:59
            ('06:00:00', '09:00:00', 'Morning Rush'),   # 6:00 - 8:59
            ('09:00:00', '14:00:00', 'Mid Day'),        # 9:00 - 13:59
            ('14:00:00', '17:30:00', 'Afternoon'),      # 14:00 - 17:29
            ('17:30:00', '21:30:00', 'Evening'),        # 17:30 - 21:29
            ('21:30:00', '1 day',    'Night')           # 21:30 - 23:59
        ]

    # Work on a fresh frame with a unique RangeIndex. The label-based
    # repeat/cumcount below is only correct when index labels are unique;
    # with duplicated labels `.loc` would cross-multiply the rows.
    df = df.reset_index(drop=True)

    # Number of calendar days each interval touches. Missing timestamps and
    # reversed intervals get 0 so that they are skipped instead of making
    # `repeat` raise.
    day_span = (df['finished_at'].dt.normalize() - df['started_at'].dt.normalize()).dt.days + 1
    df['num_days'] = day_span.fillna(0).clip(lower=0).astype(int)

    # One row per (interval, calendar day).
    df_exploded = df.loc[df.index.repeat(df['num_days'])].copy()
    df_exploded['day_offset'] = df_exploded.groupby(level=0).cumcount()

    df_exploded['current_day_midnight'] = df_exploded['started_at'].dt.normalize() + pd.to_timedelta(df_exploded['day_offset'], unit='D')

    # Clip each copy to its own day: [max(start, midnight), min(end, next midnight)).
    df_exploded['started_at'] = df_exploded[['started_at', 'current_day_midnight']].max(axis=1)
    df_exploded['day_end_boundary'] = df_exploded['current_day_midnight'] + pd.to_timedelta(1, unit='D')
    df_exploded['finished_at'] = df_exploded[['finished_at', 'day_end_boundary']].min(axis=1)

    df_daily = df_exploded[df_exploded['started_at'] < df_exploded['finished_at']].reset_index(drop=True)

    final_segments = []

    for bin_spec in time_bins:
        temp_df = df_daily.copy()

        bin_abs_start = temp_df['current_day_midnight'] + pd.to_timedelta(bin_spec[0])
        bin_abs_end = temp_df['current_day_midnight'] + pd.to_timedelta(bin_spec[1])

        # Intersect the daily piece with this bin.
        temp_df['started_at'] = pd.concat([temp_df['started_at'], bin_abs_start], axis=1).max(axis=1)
        temp_df['finished_at'] = pd.concat([temp_df['finished_at'], bin_abs_end], axis=1).min(axis=1)

        # Keep only non-empty intersections.
        final_segments.append(temp_df[temp_df['started_at'] < temp_df['finished_at']])

    if final_segments:
        # Stable sort keeps the output order deterministic when several
        # segments share the same start time.
        df_split = (
            pd.concat(final_segments)
            .sort_values(by=['started_at'], kind='mergesort')
            .reset_index(drop=True)
        )
    else:
        # No bins were supplied: nothing to emit, but keep the schema.
        df_split = df_daily.iloc[0:0].copy()

    cols_to_drop = ['num_days', 'day_offset', 'current_day_midnight', 'day_end_boundary']
    df_split = df_split.drop(columns=[c for c in cols_to_drop if c in df_split.columns])

    return df_split

def assign_time_segment(dt):
    """
    Map a timestamp to the label of the daily time segment it falls in.

    Only ``dt.hour`` and ``dt.minute`` are read. Segment labels:
    0-5.59, 6-8.59, 9-13.59, 14-17.29, 17.30-21.29, 21.30-23.59
    """
    minutes_into_day = dt.hour * 60 + dt.minute

    # (exclusive upper bound in minutes since midnight, label), ascending.
    upper_bounds = (
        (6 * 60, '0-5.59'),
        (9 * 60, '6-8.59'),
        (14 * 60, '9-13.59'),
        (17 * 60 + 30, '14-17.29'),
        (21 * 60 + 30, '17.30-21.29'),
    )

    for limit, label in upper_bounds:
        if minutes_into_day < limit:
            return label

    # Anything not below the last bound belongs to the final segment.
    return '21.30-23.59'
    

def merge_consecutive_locations(
    df: pd.DataFrame,
    agent_col: str = "agent",
    loc_col: str = "location_id",
    start_col: str = "started_at",
    end_col: str = "finished_at",
) -> pd.DataFrame:
    """
    For each agent, merge consecutive rows with the same location into a single row.

    Rows are ordered per agent by (start, end) before runs are detected, so the
    input does not need to be pre-sorted. The input frame is not modified.

    The result has one row per run with these columns (names are fixed,
    regardless of the ``*_col`` arguments, except for the agent column):
      - <agent_col>
      - started_at   = first start of the run
      - finished_at  = latest end within the run, so the merged stay always
                       covers every row of the run, even when an earlier row
                       ends after a later one (nested / overlapping stays)
      - latitude / longitude / location_id / poi_category = first value in the run

    Start and end values that cannot be parsed as datetimes become NaT.
    The input must also contain 'latitude', 'longitude' and 'poi_category'.
    """

    out = df.copy()
    out[start_col] = pd.to_datetime(out[start_col], errors="coerce")
    out[end_col]   = pd.to_datetime(out[end_col], errors="coerce")

    # Stable sort so that rows with identical keys keep their input order.
    out = out.sort_values([agent_col, start_col, end_col], kind="mergesort")

    # A run starts wherever the location differs from the agent's previous row
    # (the first row of every agent compares against NaN and so starts a run).
    new_run = out[loc_col].ne(out.groupby(agent_col)[loc_col].shift())

    # Per-agent running count of run starts = run identifier.
    out["_run_id"] = new_run.groupby(out[agent_col]).cumsum()
    merged = (
        out.groupby([agent_col, "_run_id"], sort=False, as_index=False)
           .agg(
               started_at=(start_col, "first"),
               finished_at=(end_col, "max"),
               latitude=("latitude", "first"),
               longitude=("longitude", "first"),
               location_id=(loc_col, "first"),
               poi_category=("poi_category", "first"),
           )
           .drop(columns="_run_id")
    )

    return merged


### Train / Test data Processing

Train and Test data should contain:

1. "agent"        : Id of the agent

2. "started_at"   : Start date and time. The data you provide is assumed to be in UTC and is then converted to Asia/Tokyo.

3. "finished_at"  : End date and time. The data you provide is assumed to be in UTC and is then converted to Asia/Tokyo.

4. "location_id"  : A unique Id of the location (we use this to uniquely define the agent's home)

5. "latitude"       

6. "longitude"

7. "poi_category" : POI label of the location

Store the data as train.csv and test.csv in the processed folder

In [17]:
### Some of our data processing steps may or may not be important for you

# Raw stay-point inputs for the sim2_evalb scenario.
train = '/Users/chanuka/Desktop/codespaces/liad/data/sim2_evalb/sim2_evalb_stay_points_train.parquet'
test = '/Users/chanuka/Desktop/codespaces/liad/data/sim2_evalb/sim2_evalb_stay_points_test.parquet'

# NOTE(review): despite its name, `train_data` is loaded from the `test` path
# here - confirm which split this pass is meant to process.
train_data = pd.read_parquet(test)

# Length of every stay in minutes.
train_data['duration'] = (
    pd.to_datetime(train_data['end_datetime'])
    .sub(pd.to_datetime(train_data['start_datetime']))
    .dt.total_seconds()
    .div(60)
)



In [18]:
# Keep only stays lasting more than 15 minutes.
train_data = train_data.loc[train_data['duration'].gt(15)]

In [19]:
# Rename the raw columns to the schema listed above
# (agent, started_at, finished_at, latitude, longitude, location_id, poi_category).
train_data.rename(
    columns={
        'agent_id': 'agent',
        'start_datetime': 'started_at',
        'end_datetime': 'finished_at',
        'latitude_sp': 'latitude',
        'longitude_sp': 'longitude',
        'poi_id': 'location_id',
        'category': 'poi_category',
    },
    inplace=True,
)

In [20]:
# Remove the 'distance_m' and 'duration' columns in place; 'duration' was only
# needed for the short-stay filter above.
train_data.drop(columns=['distance_m', 'duration'], inplace=True)

In [21]:
# Collapse consecutive stays of the same agent at the same location_id into one row.
train_data = merge_consecutive_locations(train_data)

In [22]:
# Write the processed frame to CSV without the index column.
train_data.to_csv('../data/sim2_evalb/test.csv', index=False)

In [None]:
# `test` holds the path of a .parquet file, so it has to be read with
# read_parquet; read_csv cannot parse that format.
test_data = pd.read_parquet(test)

# Anomalous test stay points, enriched with POI metadata via a left join on poi_id.
eval_b_train = pd.read_parquet('/Users/chanuka/Desktop/codespaces/liad/data/files/evalb_stay_points_test_anomalous.parquet')
poi_data = pd.read_parquet('/Users/chanuka/Desktop/codespaces/liad/data/files/evalb_poi.parquet')
eval_b_train = eval_b_train.merge(poi_data, on='poi_id', how='left')
eval_b_train

# Bring the frame to the common schema and remove the labelling columns.
eval_b_train.rename(columns={'agent_id': 'agent', 'start_datetime':'started_at', 'end_datetime':'finished_at', 'poi_id': 'location_id', 'category':'poi_category'}, inplace=True)
eval_b_train.drop(columns=['source', 'anomaly_type', 'anomaly'], inplace=True)

gt = pd.read_csv('/Users/chanuka/Desktop/codespaces/liad/data/trail_4_sim1_fis/anomalous_temporal_sim1.csv')
ground_truth = pd.read_csv('/Users/chanuka/Desktop/codespaces/liad/data/trail_4_sim1_fis/anomalous_agent_sim1.csv')

gt

# Re-read the raw file, keep only the rows flagged as anomalous and export them
# with anomaly_start_time / anomaly_end_time columns.
eval_b_train = pd.read_parquet('/Users/chanuka/Desktop/codespaces/liad/data/files/evalb_stay_points_test_anomalous.parquet')
# rename() returns a new frame, so nothing is modified in place on a filtered slice.
eval_b_train = eval_b_train[eval_b_train['anomaly'] == True].rename(
    columns={'agent_id': 'agent', 'start_datetime': 'anomaly_start_time', 'end_datetime': 'anomaly_end_time'}
)
eval_b_train.to_csv('../processed/anomalous_temporal.csv', index= False)

### Ground Truth Processing

For evaluation purposes, you need an anomalous_agents.csv file saved in the processed folder.
Ideally the file contains a single column called 'agent', which holds the ids of all the anomalous agents in the simulation.

In [2]:
# Per-team ground-truth files of anomalous events.
team_gt_paths = [
    '/Users/chanuka/Desktop/codespaces/liad/processed/sim2_evalb/gt/TeamA_anomalous_temporal.csv',
    '/Users/chanuka/Desktop/codespaces/liad/processed/sim2_evalb/gt/TeamB_anomalous_temporal.csv',
    '/Users/chanuka/Desktop/codespaces/liad/processed/sim2_evalb/gt/TeamC_anomalous_temporal.csv',
    '/Users/chanuka/Desktop/codespaces/liad/processed/sim2_evalb/gt/TeamD_anomalous_temporal.csv',
]
data1, data2, data3, data4 = (pd.read_csv(path) for path in team_gt_paths)

# Stack the four teams into one frame with a fresh 0..n-1 index.
gt = pd.concat([data1, data2, data3, data4], ignore_index=True)

In [16]:
# Align the ground-truth column names with the stay-point schema.
gt = gt.rename(columns={'event_start_time': 'started_at', 'event_end_time': 'finished_at'})

# Take an explicit copy of the column subset so the assignments below write to
# an independent frame (avoids pandas' SettingWithCopyWarning).
gt = gt[['agent', 'started_at', 'finished_at']].copy()
gt['started_at'] = pd.to_datetime(gt['started_at'])
gt['finished_at'] = pd.to_datetime(gt['finished_at'])


In [4]:
# Number of distinct agents that appear in the ground truth.
gt['agent'].nunique()

274

In [None]:
# NOTE(review): this overwrites `gt` with a raw CSV from another project
# directory - confirm that is intended before running the cells that follow.
gt = pd.read_csv('/Users/chanuka/Desktop/codespaces/neural_reeb/data/trail_4_sim1_fis/anomalous_temporal_sim1.csv')
gt_new = pd.read_parquet('/Users/chanuka/Desktop/codespaces/liad/data/files/evalb_stay_points_test_anomalous.parquet')

# Interpret both timestamps as UTC and express them in Asia/Tokyo.
for ts_col in ('start_datetime', 'end_datetime'):
    gt_new[ts_col] = pd.to_datetime(gt_new[ts_col], utc=True).dt.tz_convert('Asia/Tokyo')

# Should end up with: agent, started_at, finished_at
gt_new = gt_new.rename(
    columns={
        'agent_id': 'agent',
        'start_datetime': 'started_at',
        'end_datetime': 'finished_at',
    }
)
gt_new = gt_new.loc[gt_new['anomaly'] == True, ['agent', 'started_at', 'finished_at']]

# One row per distinct anomalous agent.
gt_new_agents = pd.DataFrame({'agent': gt_new['agent'].unique()})

# format 1
gt_new_agents.to_csv('../processed/anomalous_agents.csv', index=False)

In [18]:
# Cut every anomalous interval at midnight and at the time-bin boundaries.
gt = split_by_time_bins(gt)

# Label each segment by the time-of-day bin of its start and by day type.
gt['time_segment'] = gt['started_at'].map(assign_time_segment)
gt['day_of_week'] = gt['started_at'].dt.dayofweek
# dayofweek 5 and 6 (Saturday/Sunday) count as weekend.
gt['day_type'] = gt['day_of_week'].ge(5).map({True: 'weekend', False: 'weekday'})

In [19]:
# Write the segmented ground truth to CSV without the index column.
gt.to_csv('../processed/sim2_evalb/anomalous_segmented.csv', index=False)