In [1]:
import numpy as np
import pandas as pd
import ast


def _to_set(x):
    if x is None or (isinstance(x, float) and np.isnan(x)):
        return set()
    if isinstance(x, (list, tuple)):
        return set(x)
    if isinstance(x, str):
        try:
            v = ast.literal_eval(x)
            if isinstance(v, (list, tuple)):
                return set(v)
        except Exception:
            pass
        return set(x.split(","))
    return set()


def _jaccard_distance(left: set, right: set) -> float:
    """Return the Jaccard distance (one minus intersection-over-union) of two sets.

    Two empty sets are identical, so their distance is defined as 0.0 rather
    than the 1.0 that a naive "0/0 -> similarity 0" rule would give.
    """
    union_size = len(left | right)
    if union_size == 0:
        return 0.0
    return 1.0 - len(left & right) / union_size


def build_agent_features(train: pd.DataFrame, test: pd.DataFrame) -> pd.DataFrame:
    """Build one feature row per agent from nearest-train-row deviations.

    For every test row of an agent, the candidate train rows are that agent's
    train rows with the same ``dominant_poi`` (or all of the agent's train
    rows when none share it). Per feature, the smallest deviation between the
    test row and any candidate is taken; the agent's feature value is the
    largest such per-row minimum (floored at 0.0).

    Features
    --------
    f_duration, f_dist, f_speed_1, f_speed_2
        Absolute differences of ``duration_min``, ``max_distance_from_home``,
        ``avg_speed_first_half`` and ``avg_speed_second_half``.
    f_trans_jaccard
        Jaccard distance between the ``transformation`` sets.

    Parameters
    ----------
    train, test : pd.DataFrame
        Must contain ``agent``, ``dominant_poi``, ``transformation`` and the
        four numeric columns above; ``test`` must also contain ``label``.

    Returns
    -------
    pd.DataFrame
        Columns ``agent``, the five features, and ``label`` (the maximum
        ``label`` over the agent's test rows, as int). Agents that have no
        train rows are omitted. The columns are present even when no agent
        qualifies.
    """
    numeric_sources = {
        "f_duration": "duration_min",
        "f_dist": "max_distance_from_home",
        "f_speed_1": "avg_speed_first_half",
        "f_speed_2": "avg_speed_second_half",
    }
    feature_names = [*numeric_sources, "f_trans_jaccard"]

    # Split train once; filtering the whole frame per test agent is quadratic.
    train_by_agent = {
        agent: group for agent, group in train.groupby("agent", sort=False)
    }

    rows = []

    for agent, test_g in test.groupby("agent", sort=False):
        train_g = train_by_agent.get(agent)
        if train_g is None or train_g.empty:
            continue

        # Per-agent arrays, computed once and reused for every test row.
        train_poi = train_g["dominant_poi"]
        train_vals = {
            feat: train_g[col].to_numpy(dtype=float)
            for feat, col in numeric_sources.items()
        }
        train_sets = [_to_set(cell) for cell in train_g["transformation"]]
        every_train_row = np.arange(len(train_g))

        test_vals = {
            feat: test_g[col].to_numpy(dtype=float)
            for feat, col in numeric_sources.items()
        }
        test_sets = [_to_set(cell) for cell in test_g["transformation"]]

        max_feats = dict.fromkeys(feature_names, 0.0)

        for pos, poi in enumerate(test_g["dominant_poi"]):
            # Prefer train rows with the same dominant_poi; otherwise fall
            # back to all of the agent's train rows.
            same_poi = np.flatnonzero((train_poi == poi).to_numpy())
            cand = same_poi if same_poi.size else every_train_row

            for feat in numeric_sources:
                gaps = np.abs(train_vals[feat][cand] - test_vals[feat][pos])
                # NaN gaps are not comparable and are skipped; when nothing
                # is comparable the minimum is left at +inf so the gap stays
                # visible instead of being silently treated as zero.
                gaps = gaps[~np.isnan(gaps)]
                best = float(gaps.min()) if gaps.size else np.inf
                max_feats[feat] = max(max_feats[feat], best)

            best_jac = min(
                _jaccard_distance(test_sets[pos], train_sets[i]) for i in cand
            )
            max_feats["f_trans_jaccard"] = max(max_feats["f_trans_jaccard"], best_jac)

        rows.append({
            "agent": agent,
            **max_feats,
            "label": int(test_g["label"].max()),
        })

    return pd.DataFrame(rows, columns=["agent", *feature_names, "label"])

In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

def fit_logistic_weights(agent_df: pd.DataFrame, random_state=0):
    """Fit a standardized, L1-penalized logistic regression on agent features.

    Parameters
    ----------
    agent_df : pd.DataFrame
        Must contain the columns ``f_duration``, ``f_dist``, ``f_speed_1``,
        ``f_speed_2``, ``f_trans_jaccard`` and ``label``.
    random_state : int or None, default 0
        Seed forwarded to ``LogisticRegression``. The liblinear solver is
        randomized, so a fixed seed makes the learned weights reproducible
        between runs; pass ``None`` for unseeded behaviour.

    Returns
    -------
    weights : dict
        Feature name -> coefficient (on the standardized feature scale,
        because the classifier is fitted after ``StandardScaler``).
    intercept : float
        Fitted intercept of the classifier.
    pipe : sklearn.pipeline.Pipeline
        The fitted scaler + classifier pipeline.
    """
    feature_cols = [
        "f_duration",
        "f_dist",
        "f_speed_1",
        "f_speed_2",
        "f_trans_jaccard",
    ]

    X = agent_df[feature_cols].to_numpy()
    y = agent_df["label"].to_numpy()

    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(
            penalty="l1",
            solver="liblinear",
            class_weight="balanced",
            max_iter=2000,
            random_state=random_state,
        ))
    ])

    pipe.fit(X, y)

    clf = pipe.named_steps["clf"]
    # Binary problem: coef_ has a single row and intercept_ a single entry.
    weights = clf.coef_[0]
    intercept = clf.intercept_[0]

    return dict(zip(feature_cols, weights)), intercept, pipe

In [3]:
# Root of the processed simulation outputs; edit this one line to retarget the notebook.
DATA_ROOT = '/Users/chanuka/Desktop/codespaces/liad/processed/trial5/sim1'
TIME_COLS = ('started_at', 'finished_at')

train = pd.read_csv(f'{DATA_ROOT}/10k/ore/or_train.csv')
test = pd.read_csv(f'{DATA_ROOT}/10k/ore/or_test.csv')
gt = pd.read_csv(f'{DATA_ROOT}/gt/anomalous_temporal.csv')

# Parse the trip boundaries of both splits into datetimes.
for frame in (test, train):
    for col in TIME_COLS:
        frame[col] = pd.to_datetime(frame[col])

# Ground-truth intervals are expressed in Tokyo time.
# NOTE(review): tz_convert only works on tz-aware values, so the ground-truth
# timestamps are presumably stored with a UTC offset -- confirm.
for col in TIME_COLS:
    gt[col] = pd.to_datetime(gt[col]).dt.tz_convert('Asia/Tokyo')

# Add ground truth: everything starts as normal (0); test trips that overlap
# a ground-truth anomaly interval of the same agent are flagged (1).
train['label'] = 0
test['label'] = 0

for agent, gt_agent in gt.groupby('agent'):
    # Compare against this agent's trips only, not the whole test frame.
    agent_trips = test.loc[test['agent'] == agent, list(TIME_COLS)]

    if agent_trips.empty:
        continue

    overlaps = pd.Series(False, index=agent_trips.index)
    for anomaly_start_time, anomaly_end_time in zip(gt_agent['started_at'], gt_agent['finished_at']):
        # Strict interval overlap: the trip starts before the anomaly ends
        # and finishes after the anomaly starts.
        overlaps |= (
            (agent_trips['started_at'] < anomaly_end_time) &
            (agent_trips['finished_at'] > anomaly_start_time)
        )

    # Label-based write; relies on the default unique index from read_csv.
    test.loc[overlaps.index[overlaps.to_numpy()], 'label'] = 1

In [4]:
# Collapse the trips into one feature row per agent, then fit the classifier.
agent_features = build_agent_features(train, test)
weights, intercept, model = fit_logistic_weights(agent_features)

# Report the fitted coefficients, one feature per line, then the intercept.
print("Learned weights:")
for feature_name, coefficient in weights.items():
    print(f"{feature_name:20s}: {coefficient:+.4f}")

print("Intercept:", intercept)

Learned weights:
f_duration          : +0.9954
f_dist              : +1.0792
f_speed_1           : +0.0466
f_speed_2           : +0.3712
f_trans_jaccard     : -0.3132
Intercept: -0.6771675175820481
