In [1]:
import os
import ast
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

def parse_maybe_list(x):
    """
    Safely parse list-like values that may be stored as strings in CSV.
    Returns a Python list.
    Handles: NaN/None, list, tuple, set, np.ndarray, strings like "[1,2]".
    """
    # Fast path for common nulls
    if x is None:
        return []

    # If it's already list-like, return it as a list
    if isinstance(x, list):
        return x
    if isinstance(x, (set, tuple)):
        return list(x)
    if isinstance(x, np.ndarray):
        return x.tolist()

    # Handle scalar NaN (only safe for scalars)
    if isinstance(x, (float, np.floating)) and np.isnan(x):
        return []

    # Strings: try literal_eval
    if isinstance(x, str):
        s = x.strip()
        if s == "" or s.lower() in ("nan", "none", "null"):
            return []
        try:
            v = ast.literal_eval(s)
            if isinstance(v, (list, set, tuple)):
                return list(v)
            if isinstance(v, np.ndarray):
                return v.tolist()
            if isinstance(v, dict):
                # choose keys (adjust if you want values instead)
                return list(v.keys())
            return []
        except Exception:
            return []

    # For anything else, try a safe pandas scalar-null check
    try:
        if pd.isna(x):
            return []
    except Exception:
        pass

    return []



def ensure_columns_exist(df, cols, fill_value=np.nan):
    for c in cols:
        if c not in df.columns:
            df[c] = fill_value
    return df


def build_anomaly_features(train_profiles: pd.DataFrame, test_profiles: pd.DataFrame) -> pd.DataFrame:
    """
    Build per-row anomaly features by comparing test slot vs train slot
    for the SAME agent + day_type + time_segment.

    Prevents many-to-many merges by aggregating to unique keys first.
    """
    keys = ['agent', 'day_type', 'time_segment']

    base_numeric = [
        'unique_location_ids',
        'avg_distance_from_home_km',
        'avg_speed_kmh',
        'max_stay_duration',
        'transformations',
        'max_distance_from_home',
        'label'
    ]
    base_misc = ['unique_locs', 'poi_dict', 'dominent_poi']

    train_profiles = ensure_columns_exist(train_profiles.copy(), base_numeric + base_misc)
    test_profiles  = ensure_columns_exist(test_profiles.copy(),  base_numeric + base_misc)

    # Aggregate to ensure 1 row per key in each split
    agg_num = {
        'unique_location_ids': 'mean',
        'avg_distance_from_home_km': 'mean',
        'avg_speed_kmh': 'mean',
        'max_stay_duration': 'max',
        'transformations': 'mean',
        'max_distance_from_home': 'max',
        'label': 'max'
    }
    # For list-like / categorical fields: take first non-null occurrence
    agg_misc = {
        'unique_locs': 'first',
        'poi_dict': 'first',
        'dominent_poi': 'first'
    }

    train_agg = train_profiles.groupby(keys, as_index=False).agg({**agg_num, **agg_misc})
    test_agg  = test_profiles.groupby(keys, as_index=False).agg({**agg_num, **agg_misc})

    # Merge on agent+slot (THE actual intended join)
    merged = pd.merge(
        test_agg,
        train_agg,
        on=keys,
        suffixes=('_test', '_train'),
        how='left'
    )

    numeric_train_cols = [
        'unique_location_ids_train',
        'avg_distance_from_home_km_train',
        'avg_speed_kmh_train',
        'max_stay_duration_train',
        'transformations_train',
        'max_distance_from_home_train'
    ]
    for c in numeric_train_cols:
        if c not in merged.columns:
            merged[c] = 0.0
    merged[numeric_train_cols] = merged[numeric_train_cols].fillna(0)


    for c in ['unique_locs_train', 'unique_locs_test', 'poi_dict_train', 'poi_dict_test']:
        if c not in merged.columns:
            merged[c] = [[]] * len(merged)
        merged[c] = merged[c].apply(parse_maybe_list)

    merged['f_count_diff'] = (merged['unique_location_ids_test'] - merged['unique_location_ids_train']).abs()
    merged['f_dist_diff']  = (merged['avg_distance_from_home_km_test'] - merged['avg_distance_from_home_km_train']).abs()
    merged['f_speed_diff'] = (merged['avg_speed_kmh_test'] - merged['avg_speed_kmh_train']).abs()

    def get_new_loc_count(row):
        set_train = set(row['unique_locs_train']) if isinstance(row['unique_locs_train'], list) else set()
        set_test  = set(row['unique_locs_test'])  if isinstance(row['unique_locs_test'], list)  else set()
        return len(set_test - set_train)

    merged['f_new_locs'] = merged.apply(get_new_loc_count, axis=1)

    merged['f_max_stay_diff'] = (merged['max_stay_duration_test'] - merged['max_stay_duration_train']).abs()
    merged['f_transforms_diff'] = (merged['transformations_test'] - merged['transformations_train']).abs()
    merged['f_max_dist_diff'] = (merged['max_distance_from_home_test'] - merged['max_distance_from_home_train']).abs()

    if 'dominent_poi_test' not in merged.columns:
        merged['dominent_poi_test'] = np.nan
    if 'dominent_poi_train' not in merged.columns:
        merged['dominent_poi_train'] = np.nan

    merged['f_dom_poi_changed'] = (merged['dominent_poi_test'] != merged['dominent_poi_train']).astype(int)

    def get_new_poi_count(row):
        set_train = set(row['poi_dict_train']) if isinstance(row['poi_dict_train'], list) else set()
        set_test  = set(row['poi_dict_test'])  if isinstance(row['poi_dict_test'], list)  else set()
        return len(set_test - set_train)

    merged['f_new_pois'] = merged.apply(get_new_poi_count, axis=1)

    return merged


def fit_anomaly_weight_model(train_profiles: pd.DataFrame, test_profiles: pd.DataFrame):
    merged = build_anomaly_features(train_profiles, test_profiles)

    feature_cols = [
        'f_count_diff',
        'f_dist_diff',
        'f_speed_diff',
        'f_new_locs',
        'f_max_stay_diff',
        'f_transforms_diff',
        'f_max_dist_diff',
        'f_dom_poi_changed',
        'f_new_pois',
    ]

    X = merged[feature_cols]
    y = merged['label_test']  # 0/1 anomalous row in test

    model = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(
            class_weight='balanced',
            max_iter=1000,
            n_jobs=1
        ))
    ])

    model.fit(X, y)
    return model, feature_cols

In [6]:
gt = pd.read_csv("/Users/chanuka/Desktop/codespaces/liad/processed/trial5/anomalous_temporal.csv")

TRAIN_DIR = Path("../processed/trial5/train_monthly")
TEST_DIR  = Path("../processed/trial5/test_monthly")

OUT_TRAIN_DIR = Path("../processed/trial5/train_monthly_subsampled")
OUT_TEST_DIR  = Path("../processed/trial5/test_monthly_subsampled")
OUT_TRAIN_DIR.mkdir(parents=True, exist_ok=True)
OUT_TEST_DIR.mkdir(parents=True, exist_ok=True)

RANDOM_AGENT_COUNT = 200_000
SEED = 42

def bucket_id_from_path(p: Path) -> int:
    return int(p.name.split("agent_bucket=")[1].split(".csv")[0])

gt_agents = set(gt["agent"].unique())

train_files = {bucket_id_from_path(p): p for p in TRAIN_DIR.glob("agent_bucket=*.csv")}
test_files  = {bucket_id_from_path(p): p for p in TEST_DIR.glob("agent_bucket=*.csv")}
common_buckets = sorted(set(train_files).intersection(test_files))

if not common_buckets:
    raise RuntimeError("No matching agent_bucket=*.csv files found between train and test.")

rng = np.random.default_rng(SEED)

for b in common_buckets:
    train_path = train_files[b]
    test_path  = test_files[b]

    print(f"\n=== Bucket {b} ===")
    train_df = pd.read_csv(train_path)
    test_df  = pd.read_csv(test_path)

    # Agents available in each split
    train_agents = set(train_df["agent"].unique())
    test_agents  = set(test_df["agent"].unique())

    # GT agents present in this bucket (keep all of them)
    gt_in_bucket = sorted((train_agents | test_agents) & gt_agents)

    # Candidate random agents = agents NOT in gt, ideally present in BOTH train and test
    # (so your background exists in both splits)
    candidates = sorted((train_agents & test_agents) - gt_agents)

    if len(candidates) == 0:
        print("WARNING: No non-gt candidate agents present in BOTH train & test for this bucket.")
        sampled_random = []
    else:
        k = min(RANDOM_AGENT_COUNT, len(candidates))
        sampled_random = rng.choice(candidates, size=k, replace=False).tolist()

    keep_agents = set(gt_in_bucket) | set(sampled_random)

    print(f"GT agents kept: {len(gt_in_bucket)}")
    print(f"Random agents kept: {len(sampled_random)} (requested {RANDOM_AGENT_COUNT})")
    print(f"Total kept agents: {len(keep_agents)}")

    # Filter rows by kept agents
    train_keep = train_df[train_df["agent"].isin(keep_agents)].copy()
    test_keep  = test_df[test_df["agent"].isin(keep_agents)].copy()

    # Save
    out_train = OUT_TRAIN_DIR / f"agent_bucket={b}.parquet"
    out_test  = OUT_TEST_DIR  / f"agent_bucket={b}.parquet"

    train_keep.to_parquet(out_train, index=False)
    test_keep.to_parquet(out_test, index=False)

    print(f"Saved train: {out_train} | rows={len(train_keep):,}")
    print(f"Saved test : {out_test}  | rows={len(test_keep):,}")



=== Bucket 0 ===
GT agents kept: 1
Random agents kept: 782 (requested 200000)
Total kept agents: 783
Saved train: ../processed/trial5/train_monthly_subsampled/agent_bucket=0.parquet | rows=9,396
Saved test : ../processed/trial5/test_monthly_subsampled/agent_bucket=0.parquet  | rows=9,396

=== Bucket 1 ===
GT agents kept: 0
Random agents kept: 789 (requested 200000)
Total kept agents: 789
Saved train: ../processed/trial5/train_monthly_subsampled/agent_bucket=1.parquet | rows=9,468
Saved test : ../processed/trial5/test_monthly_subsampled/agent_bucket=1.parquet  | rows=9,468

=== Bucket 2 ===
GT agents kept: 0
Random agents kept: 785 (requested 200000)
Total kept agents: 785
Saved train: ../processed/trial5/train_monthly_subsampled/agent_bucket=2.parquet | rows=9,420
Saved test : ../processed/trial5/test_monthly_subsampled/agent_bucket=2.parquet  | rows=9,420

=== Bucket 3 ===
GT agents kept: 1
Random agents kept: 787 (requested 200000)
Total kept agents: 788
Saved train: ../processed/tr

In [2]:
gt = pd.read_csv('/Users/chanuka/Desktop/codespaces/liad/processed/trial5/anomalous_temporal.csv')


TRAIN_DIR = Path("../processed/trial5/train_monthly_subsampled")
TEST_DIR  = Path("../processed/trial5/test_monthly_subsampled")


def bucket_id_from_path(p: Path) -> int:
    return int(p.name.split("agent_bucket=")[1].split(".csv")[0])

train_files = {bucket_id_from_path(p): p for p in TRAIN_DIR.glob("agent_bucket=*.csv")}
test_files  = {bucket_id_from_path(p): p for p in TEST_DIR.glob("agent_bucket=*.csv")}
common_buckets = sorted(set(train_files).intersection(test_files))

train_list = []
test_list = []

for b in common_buckets:
    train_path = train_files[b]
    test_path  = test_files[b]

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    train_list.append(train_df)
    test_list.append(test_df)

train = pd.concat(train_list, ignore_index=True)
test = pd.concat(test_list, ignore_index=True)

train['label'] = 0
test['label'] = 0

gt_agents = set(gt['agent'].unique())
train_agents = set(train['agent'].unique())
normal_agents = np.array(list(train_agents - gt_agents))

print("GT agents:", len(gt_agents))
print("Available normal agents:", len(normal_agents))

np.random.seed(42)
sample_size = 200000
if len(normal_agents) < sample_size:
    raise ValueError(f"Not enough normal agents to sample {sample_size}. Only {len(normal_agents)} available.")

GT agents: 613
Available normal agents: 1577048


In [3]:
sampled_normals = np.random.choice(normal_agents, size=sample_size, replace=False)

train = pd.concat([
    train[train['agent'].isin(gt_agents)],
    train[train['agent'].isin(sampled_normals)]
], ignore_index=True)

test = test[test['agent'].isin(train['agent'].unique())].copy()

gt_keys = set(zip(gt['agent'], gt['day_type'], gt['time_segment']))
test_keys = list(zip(test['agent'], test['day_type'], test['time_segment']))
test['label'] = np.fromiter((k in gt_keys for k in test_keys), dtype=np.int8, count=len(test))

for col in ['unique_locs', 'poi_dict']:
    if col in train.columns:
        train[col] = train[col].apply(parse_maybe_list)
    if col in test.columns:
        test[col] = test[col].apply(parse_maybe_list)

In [6]:
train.to_csv('../processed/trial5/train_monthly_subsampled.csv', index=False)
test.to_csv('../processed/trial5/test_monthly_subsampled.csv', index=False)

In [11]:
print('fitting the model')
model, feature_cols = fit_anomaly_weight_model(train, test)

clf = model.named_steps['clf']
weights = clf.coef_[0]

# print weights
for name, w in zip(feature_cols, weights):
    print(name, w)

# save weights
weights_df = pd.DataFrame({
    "feature": feature_cols,
    "weight": weights
}).sort_values("weight", key=abs, ascending=False)

out_path = "sim2_evalb_model_weights.csv"
weights_df.to_csv(out_path, index=False)
print(f"\nSaved weights to: {out_path}")

fitting the model
f_count_diff 0.15302082531276692
f_dist_diff -0.1913591142485288
f_speed_diff 0.24952645751043032
f_new_locs -0.5090649992846829
f_max_stay_diff 0.0780266674132837
f_transforms_diff -0.03548937193854445
f_max_dist_diff 0.4501840772380571
f_dom_poi_changed 0.029679350593119064
f_new_pois 0.22113424571069545

Saved weights to: sim2_evalb_model_weights.csv


In [5]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

EPS = 1e-6

NUM_COLS = [
    "unique_location_ids",
    "avg_distance_from_home_km",
    "avg_speed_kmh",
    "max_stay_duration",
    "transformations",
    "max_distance_from_home",
]

def _safe_set(x):
    if isinstance(x, list):
        return set(x)
    if isinstance(x, (set, tuple)):
        return set(x)
    return set()

def build_weekly_features(weekly_profiles: pd.DataFrame) -> pd.DataFrame:
    df = weekly_profiles.copy()

    # --- Sort for lag features ---
    df = df.sort_values(["agent", "day_type", "time_segment", "week_id"])

    # --- Training stats per slot (level baseline) ---
    train = df[df["phase"] == "train"].copy()

    level_mu = (
        train.groupby(["agent", "day_type", "time_segment"])[NUM_COLS]
        .mean()
        .add_suffix("_mu_train")
        .reset_index()
    )
    level_sd = (
        train.groupby(["agent", "day_type", "time_segment"])[NUM_COLS]
        .std(ddof=0)
        .fillna(0)
        .add_suffix("_sd_train")
        .reset_index()
    )

    # --- Training deltas per slot (drift baseline) ---
    # delta = this_week - prev_week within training weeks
    train_d = train.sort_values(["agent", "day_type", "time_segment", "week_id"]).copy()
    for c in NUM_COLS:
        train_d[c + "_delta"] = train_d.groupby(["agent","day_type","time_segment"])[c].diff()

    delta_cols = [c + "_delta" for c in NUM_COLS]
    delta_mu = (
        train_d.groupby(["agent", "day_type", "time_segment"])[delta_cols]
        .mean()
        .add_suffix("_mu_train")
        .reset_index()
    )
    delta_sd = (
        train_d.groupby(["agent", "day_type", "time_segment"])[delta_cols]
        .std(ddof=0)
        .fillna(0)
        .add_suffix("_sd_train")
        .reset_index()
    )

    # --- Merge baselines into all rows ---
    out = df.merge(level_mu, on=["agent","day_type","time_segment"], how="left") \
            .merge(level_sd, on=["agent","day_type","time_segment"], how="left") \
            .merge(delta_mu, on=["agent","day_type","time_segment"], how="left") \
            .merge(delta_sd, on=["agent","day_type","time_segment"], how="left")

    # Fill missing baselines (agents/slots with no train history)
    out = out.fillna(0)

    # --- Lag (previous week) from combined timeline (train+test) ---
    for c in NUM_COLS:
        out[c + "_prev"] = out.groupby(["agent","day_type","time_segment"])[c].shift(1)
        out[c + "_delta_now"] = out[c] - out[c + "_prev"]

    # --- Feature construction: level z + drift z ---
    feature_cols = []
    for c in NUM_COLS:
        mu = out[c + "_mu_train"]
        sd = out[c + "_sd_train"]
        out[f"f_{c}_level_z"] = (out[c] - mu) / (sd + EPS)
        feature_cols.append(f"f_{c}_level_z")

        dmu = out[c + "_delta_mu_train"]
        dsd = out[c + "_delta_sd_train"]
        out[f"f_{c}_drift_z"] = (out[c + "_delta_now"] - dmu) / (dsd + EPS)
        feature_cols.append(f"f_{c}_drift_z")

        out[f"f_{c}_abs_level"] = (out[c] - mu).abs()
        out[f"f_{c}_abs_drift"] = (out[c + "_delta_now"] - dmu).abs()
        feature_cols += [f"f_{c}_abs_level", f"f_{c}_abs_drift"]

    # --- Set novelty features ---
    # Union sets over training for each agent (global baseline)
    train_locs_union = train.groupby("agent")["unique_locs"].apply(
        lambda s: set().union(*[_safe_set(x) for x in s])
    )
    train_pois_union = train.groupby("agent")["poi_dict"].apply(
        lambda s: set().union(*[_safe_set(x) for x in s])
    )

    def new_count(row, union_series, col):
        base = union_series.get(row["agent"], set())
        cur = _safe_set(row.get(col))
        return len(cur - base)

    out["f_new_locs_vs_train"] = out.apply(lambda r: new_count(r, train_locs_union, "unique_locs"), axis=1)
    out["f_new_pois_vs_train"] = out.apply(lambda r: new_count(r, train_pois_union, "poi_dict"), axis=1)
    feature_cols += ["f_new_locs_vs_train", "f_new_pois_vs_train"]

    # Optional: dominant POI change vs train mode (if you have it reliably)
    if "dominant_poi" in df.columns:
        poi_mode = train.groupby(["agent","day_type","time_segment"])["dominant_poi"] \
                        .agg(lambda x: x.mode().iloc[0] if len(x.mode()) else None) \
                        .reset_index() \
                        .rename(columns={"dominant_poi": "dominant_poi_mode_train"})
        out = out.merge(poi_mode, on=["agent","day_type","time_segment"], how="left")
        out["f_dom_poi_changed"] = (out["dominant_poi"] != out["dominant_poi_mode_train"]).astype(int)
        feature_cols.append("f_dom_poi_changed")

    out.attrs["feature_cols"] = feature_cols
    return out

def fit_weekly_row_model(features_df: pd.DataFrame, label_col="label"):
    feature_cols = features_df.attrs["feature_cols"]
    train_rows = features_df[features_df["phase"].isin(["train","test"])].copy()

    # IMPORTANT: train on TRAIN+TEST? Usually you train on train+some validation.
    # Here we'll fit on whatever rows have labels.
    train_rows = train_rows[train_rows[label_col].notna()].copy()

    X = train_rows[feature_cols].astype(float)
    y = train_rows[label_col].astype(int)

    model = Pipeline([
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
        ("clf", LogisticRegression(class_weight="balanced", max_iter=2000))
    ])
    model.fit(X, y)
    return model, feature_cols

def score_rows(model, features_df: pd.DataFrame, feature_cols):
    X = features_df[feature_cols].astype(float)
    # probability of anomaly
    p = model.predict_proba(X)[:, 1]
    out = features_df.copy()
    out["anomaly_prob"] = p
    return out

def pool_week_score(scored_rows: pd.DataFrame, k=10):
    # top-k mean pooling per agent-week
    def topk_mean(x):
        x = np.sort(x)[::-1]
        return float(np.mean(x[:min(k, len(x))])) if len(x) else 0.0

    return scored_rows.groupby(["agent","week_id"], as_index=False)["anomaly_prob"].agg(
        week_score=topk_mean,
        max_score="max",
        mean_score="mean"
    )


In [12]:
# test 'chunk' if 0 -> 5, 1-> 6, 2-> 7, 3-> 8,4-> 9
test["chunk"] = test["chunk"].replace({0: 5, 1: 6, 2: 7, 3: 8, 4: 9})

In [18]:
test.rename(columns={'chunk': 'week_id'}, inplace=True)
test['phase'] = 'test'
train['phase'] = 'train'
# append test dataframe to train at the end
new_df = pd.concat([train, test], ignore_index=True)

In [19]:
out = build_weekly_features(new_df)

In [16]:
model, feature_cols = fit_weekly_row_model(new_df)

KeyError: 'feature_cols'