In [1]:
import pandas as pd
import numpy as np
from math import radians, sin, cos, sqrt, atan2
from joblib import Parallel, delayed
import math
import os
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

In [2]:
train = pd.read_csv('/Users/chanuka/Desktop/codespaces/liad/processed/sim2_evalb/train_monthly.csv')
test = pd.read_csv('/Users/chanuka/Desktop/codespaces/liad/processed/sim2_evalb/test_monthly.csv')

print(f"Training profiles: {len(train)} entries for {train['agent'].nunique()} agents")
print(f"Testing profiles: {len(test)} entries for {test['agent'].nunique()} agents")
print(f"\nCommon agents: {len(set(train['agent'].unique()) & set(test['agent'].unique()))}")

gt = pd.read_csv('/Users/chanuka/Desktop/codespaces/liad/processed/sim2_evalb/anomalous_segmented.csv')

Training profiles: 6991244 entries for 583452 agents
Testing profiles: 6667615 entries for 559366 agents

Common agents: 559366


In [3]:
train['label'] = 0
test['label'] = 0

In [4]:
gt_agents = set(gt['agent'].unique())
train_agents = set(train['agent'].unique())
normal_agents = np.array(list(train_agents - gt_agents))

print("GT agents:", len(gt_agents))
print("Available normal agents:", len(normal_agents))
np.random.seed(42)
sampled_normals = np.random.choice(normal_agents, size=1000, replace=False)

train = pd.concat([
    train[train['agent'].isin(gt_agents)],          # anomalous agents
    train[train['agent'].isin(sampled_normals)]     # clean agents
]).reset_index(drop=True)


GT agents: 274
Available normal agents: 583199


In [5]:
test = test[test.agent.isin(train.agent.unique())]

In [8]:
# train.to_csv('temp.csv', index=False)
# test.to_csv('temp_test.csv', index=False)

train = pd.read_csv('/Users/chanuka/Desktop/codespaces/liad/notebooks/temp.csv')
test = pd.read_csv('/Users/chanuka/Desktop/codespaces/liad/notebooks/temp_test.csv')

In [6]:
for agent, gt_agent in gt.groupby('agent'):
    agent_mask = test['agent'] == agent

    if not agent_mask.any():
        continue

    for _, row in gt_agent.iterrows():
        anomaly_time_segment = row['time_segment']
        anomaly_day_type = row['day_type']

        overlap_mask = (
            agent_mask &
            (test['day_type'] == anomaly_day_type) &
            (test['time_segment'] == anomaly_time_segment)
        )

        test.loc[overlap_mask, 'label'] = 1

In [9]:
def build_anomaly_features(train_profiles, test_profiles):
    # Merge train/test profiles (don't filter by agent here; keep everything)
    merged = pd.merge(
        test_profiles,
        train_profiles,
        on=['day_type', 'time_segment'],
        suffixes=('_test', '_train'),
        how='left'
    )

    # Fill numeric training columns when no history exists
    numeric_cols = [
        'unique_location_ids_train',
        'avg_distance_from_home_km_train',
        'avg_speed_kmh_train',
        'max_stay_duration_train',
        'transformations_train',
        'max_distance_from_home_train'
    ]
    merged[numeric_cols] = merged[numeric_cols].fillna(0)

    # Component 1: Count difference
    merged['f_count_diff'] = (merged['unique_location_ids_test'] -
                              merged['unique_location_ids_train']).abs()

    # Component 2: Distance difference
    merged['f_dist_diff'] = (merged['avg_distance_from_home_km_test'] -
                             merged['avg_distance_from_home_km_train']).abs()

    # Component 3: Speed difference
    merged['f_speed_diff'] = (merged['avg_speed_kmh_test'] -
                              merged['avg_speed_kmh_train']).abs()

    # Component 4: New locations
    def get_new_loc_count(row):
        locs_train = row['unique_locs_train']
        locs_test = row['unique_locs_test']
        set_train = set(locs_train) if isinstance(locs_train, list) else set()
        set_test = set(locs_test) if isinstance(locs_test, list) else set()
        return len(set_test - set_train)

    merged['f_new_locs'] = merged.apply(get_new_loc_count, axis=1)

    # Component 5: max stay duration
    merged['f_max_stay_diff'] = (
        merged['max_stay_duration_test'] -
        merged['max_stay_duration_train']
    ).abs()

    # Component 6: number of transformations
    merged['f_transforms_diff'] = (
        merged['transformations_test'] -
        merged['transformations_train']
    ).abs()

    # Component 7: max distance from home
    merged['f_max_dist_diff'] = (
        merged['max_distance_from_home_test'] -
        merged['max_distance_from_home_train']
    ).abs()

    # Component 8: dominant poi changed
    merged['f_dom_poi_changed'] = (
        merged['dominent_poi_test'] != merged['dominent_poi_train']
    ).astype(int)

    # Component 9: new POI categories
    def get_new_poi_count(row):
        pois_train = row['poi_dict_train']
        pois_test = row['poi_dict_test']
        set_train = set(pois_train) if isinstance(pois_train, list) else set()
        set_test = set(pois_test) if isinstance(pois_test, list) else set()
        return len(set_test - set_train)

    merged['f_new_pois'] = merged.apply(get_new_poi_count, axis=1)

    return merged

def fit_anomaly_weight_model(train_profiles, test_profiles):
    merged = build_anomaly_features(train_profiles, test_profiles)

    feature_cols = [
        'f_count_diff',
        'f_dist_diff',
        'f_speed_diff',
        'f_new_locs',
        'f_max_stay_diff',
        'f_transforms_diff',
        'f_max_dist_diff',
        'f_dom_poi_changed',
        'f_new_pois',
    ]

    X = merged[feature_cols]
    y = merged['label_test']  # 0/1 anomalous row

    model = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(
            class_weight='balanced',  # you likely have few anomalies
            max_iter=1000
        ))
    ])

    model.fit(X, y)
    return model, feature_cols

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

def fit_anomaly_weight_model(train_profiles, test_profiles):
    merged = build_anomaly_features(train_profiles, test_profiles)

    feature_cols = [
        'f_count_diff',
        'f_dist_diff',
        'f_speed_diff',
        'f_new_locs',
        'f_max_stay_diff',
        'f_transforms_diff',
        'f_max_dist_diff',
        'f_dom_poi_changed',
        'f_new_pois',
    ]

    X = merged[feature_cols]
    y = merged['label_test']  # 0/1 anomalous row

    model = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(
            class_weight='balanced',  # you likely have few anomalies
            max_iter=1000
        ))
    ])

    model.fit(X, y)
    return model, feature_cols

In [11]:
model, feature_cols = fit_anomaly_weight_model(train, test)
scaler = model.named_steps['scaler']
clf = model.named_steps['clf']

weights = clf.coef_[0]
for name, w in zip(feature_cols, weights):
    print(name, w)

f_count_diff 0.09441474976202284
f_dist_diff -0.11272085849683175
f_speed_diff 0.379662030687741
f_new_locs 0.0
f_max_stay_diff 0.18430967719595884
f_transforms_diff -0.008703571242394305
f_max_dist_diff 0.34405856529748496
f_dom_poi_changed -0.001464858802034066
f_new_pois 0.0


In [5]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

EPS = 1e-6

NUM_COLS = [
    "unique_location_ids",
    "avg_distance_from_home_km",
    "avg_speed_kmh",
    "max_stay_duration",
    "transformations",
    "max_distance_from_home",
]

def _safe_set(x):
    if isinstance(x, list):
        return set(x)
    if isinstance(x, (set, tuple)):
        return set(x)
    return set()

def build_weekly_features(weekly_profiles: pd.DataFrame) -> pd.DataFrame:
    df = weekly_profiles.copy()

    # --- Sort for lag features ---
    df = df.sort_values(["agent", "day_type", "time_segment", "week_id"])

    # --- Training stats per slot (level baseline) ---
    train = df[df["phase"] == "train"].copy()

    level_mu = (
        train.groupby(["agent", "day_type", "time_segment"])[NUM_COLS]
        .mean()
        .add_suffix("_mu_train")
        .reset_index()
    )
    level_sd = (
        train.groupby(["agent", "day_type", "time_segment"])[NUM_COLS]
        .std(ddof=0)
        .fillna(0)
        .add_suffix("_sd_train")
        .reset_index()
    )

    # --- Training deltas per slot (drift baseline) ---
    # delta = this_week - prev_week within training weeks
    train_d = train.sort_values(["agent", "day_type", "time_segment", "week_id"]).copy()
    for c in NUM_COLS:
        train_d[c + "_delta"] = train_d.groupby(["agent","day_type","time_segment"])[c].diff()

    delta_cols = [c + "_delta" for c in NUM_COLS]
    delta_mu = (
        train_d.groupby(["agent", "day_type", "time_segment"])[delta_cols]
        .mean()
        .add_suffix("_mu_train")
        .reset_index()
    )
    delta_sd = (
        train_d.groupby(["agent", "day_type", "time_segment"])[delta_cols]
        .std(ddof=0)
        .fillna(0)
        .add_suffix("_sd_train")
        .reset_index()
    )

    # --- Merge baselines into all rows ---
    out = df.merge(level_mu, on=["agent","day_type","time_segment"], how="left") \
            .merge(level_sd, on=["agent","day_type","time_segment"], how="left") \
            .merge(delta_mu, on=["agent","day_type","time_segment"], how="left") \
            .merge(delta_sd, on=["agent","day_type","time_segment"], how="left")

    # Fill missing baselines (agents/slots with no train history)
    out = out.fillna(0)

    # --- Lag (previous week) from combined timeline (train+test) ---
    for c in NUM_COLS:
        out[c + "_prev"] = out.groupby(["agent","day_type","time_segment"])[c].shift(1)
        out[c + "_delta_now"] = out[c] - out[c + "_prev"]

    # --- Feature construction: level z + drift z ---
    feature_cols = []
    for c in NUM_COLS:
        mu = out[c + "_mu_train"]
        sd = out[c + "_sd_train"]
        out[f"f_{c}_level_z"] = (out[c] - mu) / (sd + EPS)
        feature_cols.append(f"f_{c}_level_z")

        dmu = out[c + "_delta_mu_train"]
        dsd = out[c + "_delta_sd_train"]
        out[f"f_{c}_drift_z"] = (out[c + "_delta_now"] - dmu) / (dsd + EPS)
        feature_cols.append(f"f_{c}_drift_z")

        out[f"f_{c}_abs_level"] = (out[c] - mu).abs()
        out[f"f_{c}_abs_drift"] = (out[c + "_delta_now"] - dmu).abs()
        feature_cols += [f"f_{c}_abs_level", f"f_{c}_abs_drift"]

    # --- Set novelty features ---
    # Union sets over training for each agent (global baseline)
    train_locs_union = train.groupby("agent")["unique_locs"].apply(
        lambda s: set().union(*[_safe_set(x) for x in s])
    )
    train_pois_union = train.groupby("agent")["poi_dict"].apply(
        lambda s: set().union(*[_safe_set(x) for x in s])
    )

    def new_count(row, union_series, col):
        base = union_series.get(row["agent"], set())
        cur = _safe_set(row.get(col))
        return len(cur - base)

    out["f_new_locs_vs_train"] = out.apply(lambda r: new_count(r, train_locs_union, "unique_locs"), axis=1)
    out["f_new_pois_vs_train"] = out.apply(lambda r: new_count(r, train_pois_union, "poi_dict"), axis=1)
    feature_cols += ["f_new_locs_vs_train", "f_new_pois_vs_train"]

    # Optional: dominant POI change vs train mode (if you have it reliably)
    if "dominant_poi" in df.columns:
        poi_mode = train.groupby(["agent","day_type","time_segment"])["dominant_poi"] \
                        .agg(lambda x: x.mode().iloc[0] if len(x.mode()) else None) \
                        .reset_index() \
                        .rename(columns={"dominant_poi": "dominant_poi_mode_train"})
        out = out.merge(poi_mode, on=["agent","day_type","time_segment"], how="left")
        out["f_dom_poi_changed"] = (out["dominant_poi"] != out["dominant_poi_mode_train"]).astype(int)
        feature_cols.append("f_dom_poi_changed")

    out.attrs["feature_cols"] = feature_cols
    return out

def fit_weekly_row_model(features_df: pd.DataFrame, label_col="label"):
    feature_cols = features_df.attrs["feature_cols"]
    train_rows = features_df[features_df["phase"].isin(["train","test"])].copy()

    # IMPORTANT: train on TRAIN+TEST? Usually you train on train+some validation.
    # Here we'll fit on whatever rows have labels.
    train_rows = train_rows[train_rows[label_col].notna()].copy()

    X = train_rows[feature_cols].astype(float)
    y = train_rows[label_col].astype(int)

    model = Pipeline([
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
        ("clf", LogisticRegression(class_weight="balanced", max_iter=2000))
    ])
    model.fit(X, y)
    return model, feature_cols

def score_rows(model, features_df: pd.DataFrame, feature_cols):
    X = features_df[feature_cols].astype(float)
    # probability of anomaly
    p = model.predict_proba(X)[:, 1]
    out = features_df.copy()
    out["anomaly_prob"] = p
    return out

def pool_week_score(scored_rows: pd.DataFrame, k=10):
    # top-k mean pooling per agent-week
    def topk_mean(x):
        x = np.sort(x)[::-1]
        return float(np.mean(x[:min(k, len(x))])) if len(x) else 0.0

    return scored_rows.groupby(["agent","week_id"], as_index=False)["anomaly_prob"].agg(
        week_score=topk_mean,
        max_score="max",
        mean_score="mean"
    )


In [12]:
# test 'chunk' if 0 -> 5, 1-> 6, 2-> 7, 3-> 8,4-> 9
test["chunk"] = test["chunk"].replace({0: 5, 1: 6, 2: 7, 3: 8, 4: 9})

In [18]:
test.rename(columns={'chunk': 'week_id'}, inplace=True)
test['phase'] = 'test'
train['phase'] = 'train'
# append test dataframe to train at the end
new_df = pd.concat([train, test], ignore_index=True)

In [19]:
out = build_weekly_features(new_df)

In [16]:
model, feature_cols = fit_weekly_row_model(new_df)

KeyError: 'feature_cols'