In [6]:
import pandas as pd
import numpy as np
import ast

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


# ----------------- LOAD -----------------
train = pd.read_csv('../../processed/train_monthly.csv')
test  = pd.read_csv('../../processed/test_monthly.csv')

gt_segmented = pd.read_csv('../../processed/anomalous_segmented.csv')   # agent, day_type, time_segment (and maybe more)
gt_agents    = pd.read_csv('../../processed/anomalous_agents.csv')      # not used below, but keep if you want agent-level eval


# ----------------- FIX TYPES: list columns -----------------
def parse_list_col(x):
    if pd.isna(x) or x == 0:
        return []
    if isinstance(x, list):
        return x
    if isinstance(x, str):
        s = x.strip()
        if not s:
            return []
        try:
            v = ast.literal_eval(s)
            if isinstance(v, (list, tuple, set)):
                return list(v)
            return []
        except Exception:
            return []
    return []

for col in ["unique_locs", "poi_dict"]:
    if col in train.columns:
        train[col] = train[col].apply(parse_list_col)
    if col in test.columns:
        test[col]  = test[col].apply(parse_list_col)


# ----------------- LABEL TEST FAST (NO LOOPS) -----------------
# Goal: label test rows as 1 if (agent, day_type, time_segment) is in gt_segmented.

test["label"] = 0

key_cols = ["agent", "day_type", "time_segment"]
gt_keys = gt_segmented[key_cols].drop_duplicates()
gt_keys["label"] = 1

# left-join keys onto test
test = test.merge(gt_keys, on=key_cols, how="left", suffixes=("", "_gt"))
test["label"] = test["label_gt"].fillna(0).astype(int)
test.drop(columns=["label_gt"], inplace=True)

# optional: train labels all 0 (explicit)
train["label"] = 0


# ----------------- BUILD FEATURES (CORRECT MERGE ON AGENT) -----------------
def build_anomaly_features(train_profiles: pd.DataFrame, test_profiles: pd.DataFrame) -> pd.DataFrame:
    # Merge per agent + slot (this is the correct alignment)
    merged = test_profiles.merge(
        train_profiles,
        on=["agent", "day_type", "time_segment"],
        suffixes=("_test", "_train"),
        how="left"
    )

    # Fill numeric train columns when no history exists
    numeric_cols = [
        'unique_location_ids_train',
        'avg_distance_from_home_km_train',
        'avg_speed_kmh_train',
        'max_stay_duration_train',
        'transformations_train',
        'max_distance_from_home_train'
    ]
    for c in numeric_cols:
        if c in merged.columns:
            merged[c] = merged[c].fillna(0)

    # Simple numeric diffs (vectorized)
    merged['f_count_diff'] = (merged['unique_location_ids_test'] - merged['unique_location_ids_train']).abs()
    merged['f_dist_diff']  = (merged['avg_distance_from_home_km_test'] - merged['avg_distance_from_home_km_train']).abs()
    merged['f_speed_diff'] = (merged['avg_speed_kmh_test'] - merged['avg_speed_kmh_train']).abs()

    merged['f_max_stay_diff'] = (merged['max_stay_duration_test'] - merged['max_stay_duration_train']).abs()
    merged['f_transforms_diff'] = (merged['transformations_test'] - merged['transformations_train']).abs()
    merged['f_max_dist_diff'] = (merged['max_distance_from_home_test'] - merged['max_distance_from_home_train']).abs()

    merged['f_dom_poi_changed'] = (merged['dominent_poi_test'] != merged['dominent_poi_train']).astype(int)

    # Set-diff features (do NOT use DataFrame.apply(axis=1) â€” use list comprehension)
    tl = merged['unique_locs_train'].tolist()
    te = merged['unique_locs_test'].tolist()
    merged['f_new_locs'] = np.fromiter((len(set(b) - set(a)) for a, b in zip(tl, te)),
                                       dtype=np.int32, count=len(merged))

    pt = merged['poi_dict_train'].tolist()
    pe = merged['poi_dict_test'].tolist()
    merged['f_new_pois'] = np.fromiter((len(set(b) - set(a)) for a, b in zip(pt, pe)),
                                       dtype=np.int32, count=len(merged))

    return merged


def fit_anomaly_weight_model(train_profiles: pd.DataFrame, test_profiles: pd.DataFrame):
    merged = build_anomaly_features(train_profiles, test_profiles)

    feature_cols = [
        'f_count_diff',
        'f_dist_diff',
        'f_speed_diff',
        'f_new_locs',
        'f_max_stay_diff',
        'f_transforms_diff',
        'f_max_dist_diff',
        'f_dom_poi_changed',
        'f_new_pois',
    ]

    # y is test label (because merged rows are test slots with attached train stats)
    y = merged['label_test'].astype(int).values
    X = merged[feature_cols].replace([np.inf, -np.inf], np.nan).fillna(0)

    model = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(
            class_weight='balanced',
            max_iter=2000,
            n_jobs=1  # keep deterministic; parallelism here isn't your bottleneck
        ))
    ])

    model.fit(X, y)
    return model, feature_cols, merged


# ----------------- TRAIN MODEL -----------------
model, feature_cols, merged = fit_anomaly_weight_model(train, test)

clf = model.named_steps['clf']
weights = clf.coef_[0]

print("\nLearned weights:")
for name, w in sorted(zip(feature_cols, weights), key=lambda x: -abs(x[1])):
    print(f"{name:18s} {w: .6f}")



Learned weights:
f_max_dist_diff     0.340422
f_dom_poi_changed   0.209055
f_new_locs         -0.179196
f_speed_diff        0.169116
f_new_pois         -0.105039
f_dist_diff         0.096859
f_count_diff        0.058848
f_transforms_diff  -0.025947
f_max_stay_diff     0.015560


In [None]:
train = pd.read_csv('../../processed/train_monthly.csv')
test = pd.read_csv('../../processed/test_monthly.csv')
gt = pd.read_csv('../../processed/anomalous_segmented.csv')
ground_truth = pd.read_csv('../../processed/anomalous_agents.csv')
train['label'] = 0
test['label'] = 0

Training profiles: 7680000 entries for 640000 agents
Testing profiles: 7680000 entries for 640000 agents

Common agents: 640000
Ground truth: 285 anomalous agents
Unique agents in ground truth: 285


In [5]:
for agent, gt_agent in gt.groupby('agent'):
    agent_mask = test['agent'] == agent

    if not agent_mask.any():
        continue

    for _, row in gt_agent.iterrows():
        anomaly_time_segment = row['time_segment']
        anomaly_day_type = row['day_type']

        overlap_mask = (
            agent_mask &
            (test['day_type'] == anomaly_day_type) &
            (test['time_segment'] == anomaly_time_segment)
        )

        test.loc[overlap_mask, 'label'] = 1

KeyboardInterrupt: 

In [20]:
test[test['agent'] == 6307]

Unnamed: 0,s_date,e_date,agent,day_type,time_segment,unique_location_ids,avg_distance_from_home_km,avg_speed_kmh,unique_locs,max_stay_duration,transformations,max_distance_from_home,dominent_poi,poi_dict,label
636,2024-11-18 00:00:00+09:00,2024-12-20 00:00:00+09:00,6307,weekday,0-5.59,3,0.28,4.88,[ 0 3 15],352.17,2,6.149692,residential,['residential' 'religion:restaurant:workplace'...,1
637,2024-11-18 06:00:00+09:00,2024-12-20 06:00:00+09:00,6307,weekday,6-8.59,4,0.19,0.76,[ 0 11 3 5],177.73,2,1.825269,residential,['residential' 'workplace' 'religion:restauran...,0
638,2024-11-18 09:00:00+09:00,2024-12-20 09:44:50+09:00,6307,weekday,9-13.59,5,0.81,6.34,[ 0 1 3 11 5],253.13,2,1.825269,residential,['residential' 'recreation' 'religion:restaura...,0
639,2024-11-18 14:00:00+09:00,2024-12-20 17:17:10+09:00,6307,weekday,14-17.29,6,1.4,3.68,[1 2 3 7 9 0],192.56,2,2.130139,religion:restaurant:workplace,['recreation' 'religion:restaurant:workplace' ...,0
640,2024-11-18 17:30:00+09:00,2024-12-20 17:30:00+09:00,6307,weekday,17.30-21.29,11,1.03,5.54,[ 2 0 3 4 5 7 8 6 10 12 14],173.25,3,2.783658,residential,['recreation' 'residential' 'religion:restaura...,0
641,2024-11-18 21:30:00+09:00,2024-12-20 21:30:00+09:00,6307,weekday,21.30-23.59,1,0.0,0.0,[0],149.73,1,0.000401,residential,['residential'],0
642,2024-11-23 00:00:00+09:00,2024-12-15 00:00:00+09:00,6307,weekend,0-5.59,1,0.0,0.0,[0],360.0,1,0.000376,residential,['residential'],0
643,2024-11-23 06:00:00+09:00,2024-12-15 06:00:00+09:00,6307,weekend,6-8.59,1,0.0,0.0,[0],180.0,1,0.000376,residential,['residential'],0
644,2024-11-23 09:00:00+09:00,2024-12-15 09:00:00+09:00,6307,weekend,9-13.59,3,0.65,2.94,[0 6 3],245.17,2,1.631794,residential,['residential' 'religion:restaurant:workplace'],0
645,2024-11-23 14:00:00+09:00,2024-12-15 14:00:00+09:00,6307,weekend,14-17.29,3,1.09,1.34,[0 3 6],202.58,2,1.631794,religion:restaurant:workplace,['residential' 'religion:restaurant:workplace'],0


In [21]:
train[train['agent'] == 6307]

Unnamed: 0,s_date,e_date,agent,day_type,time_segment,unique_location_ids,avg_distance_from_home_km,avg_speed_kmh,unique_locs,max_stay_duration,transformations,max_distance_from_home,dominent_poi,poi_dict,label
636,2024-10-16 00:00:00+09:00,2024-11-15 00:00:00+09:00,6307,weekday,0-5.59,1,0.0,0.0,[0],360.0,1,0.000665,residential,['residential'],0
637,2024-10-16 06:00:00+09:00,2024-11-15 06:00:00+09:00,6307,weekday,6-8.59,1,0.0,0.0,[0],180.0,1,0.000665,residential,['residential'],0
638,2024-10-16 09:00:00+09:00,2024-11-15 09:41:20+09:00,6307,weekday,9-13.59,2,0.82,4.41,[0 1],254.17,2,1.632314,residential,['residential' 'religion:restaurant:workplace'],0
639,2024-10-16 14:00:00+09:00,2024-11-15 14:00:00+09:00,6307,weekday,14-17.29,5,1.54,1.73,[ 1 0 8 3 13],201.38,2,1.632314,religion:restaurant:workplace,['religion:restaurant:workplace' 'residential'...,0
640,2024-10-16 17:30:00+09:00,2024-11-15 17:30:00+09:00,6307,weekday,17.30-21.29,11,1.04,5.42,[ 1 2 0 6 7 3 10 11 12 13 14],177.09,3,2.631937,religion:restaurant:workplace,['religion:restaurant:workplace' 'residential:...,0
641,2024-10-16 21:30:00+09:00,2024-11-14 21:30:00+09:00,6307,weekday,21.30-23.59,2,0.1,0.42,[ 0 12],148.86,2,2.341336,residential,['residential' 'recreation'],0
642,2024-10-19 00:00:00+09:00,2024-11-17 00:00:00+09:00,6307,weekend,0-5.59,1,0.0,0.0,[0],360.0,1,0.000399,residential,['residential'],0
643,2024-10-19 06:00:00+09:00,2024-11-17 06:00:00+09:00,6307,weekend,6-8.59,1,0.0,0.0,[0],178.78,1,0.000399,residential,['residential'],0
644,2024-10-19 09:00:00+09:00,2024-11-17 09:47:10+09:00,6307,weekend,9-13.59,5,0.68,4.02,[0 3 4 1 9],256.0,2,1.774736,residential,['residential' 'religion:restaurant:workplace'...,0
645,2024-10-19 14:00:00+09:00,2024-11-17 14:00:00+09:00,6307,weekend,14-17.29,4,0.95,1.18,[0 4 5 1],199.92,2,2.130042,residential,['residential' 'recreation' 'religion:restaura...,0


In [22]:
# selecting random 60% of agents

unique_agents = gt['agent'].unique()
random_agents = pd.Series(unique_agents).sample(frac=0.8)

train = train[train['agent'].isin(random_agents)]
test = test[test['agent'].isin(random_agents)]

In [None]:
def build_anomaly_features(train_profiles, test_profiles):
    # Merge train/test profiles (don't filter by agent here; keep everything)
    merged = pd.merge(
        test_profiles,
        train_profiles,
        on=['day_type', 'time_segment'],
        suffixes=('_test', '_train'),
        how='left'
    )

    # Fill numeric training columns when no history exists
    numeric_cols = [
        'unique_location_ids_train',
        'avg_distance_from_home_km_train',
        'avg_speed_kmh_train',
        'max_stay_duration_train',
        'transformations_train',
        'max_distance_from_home_train'
    ]
    merged[numeric_cols] = merged[numeric_cols].fillna(0)

    # Component 1: Count difference
    merged['f_count_diff'] = (merged['unique_location_ids_test'] -
                              merged['unique_location_ids_train']).abs()

    # Component 2: Distance difference
    merged['f_dist_diff'] = (merged['avg_distance_from_home_km_test'] -
                             merged['avg_distance_from_home_km_train']).abs()

    # Component 3: Speed difference
    merged['f_speed_diff'] = (merged['avg_speed_kmh_test'] -
                              merged['avg_speed_kmh_train']).abs()

    # Component 4: New locations
    def get_new_loc_count(row):
        locs_train = row['unique_locs_train']
        locs_test = row['unique_locs_test']
        set_train = set(locs_train) if isinstance(locs_train, list) else set()
        set_test = set(locs_test) if isinstance(locs_test, list) else set()
        return len(set_test - set_train)

    merged['f_new_locs'] = merged.apply(get_new_loc_count, axis=1)

    # Component 5: max stay duration
    merged['f_max_stay_diff'] = (
        merged['max_stay_duration_test'] -
        merged['max_stay_duration_train']
    ).abs()

    # Component 6: number of transformations
    merged['f_transforms_diff'] = (
        merged['transformations_test'] -
        merged['transformations_train']
    ).abs()

    # Component 7: max distance from home
    merged['f_max_dist_diff'] = (
        merged['max_distance_from_home_test'] -
        merged['max_distance_from_home_train']
    ).abs()

    # Component 8: dominant poi changed
    merged['f_dom_poi_changed'] = (
        merged['dominent_poi_test'] != merged['dominent_poi_train']
    ).astype(int)

    # Component 9: new POI categories
    def get_new_poi_count(row):
        pois_train = row['poi_dict_train']
        pois_test = row['poi_dict_test']
        set_train = set(pois_train) if isinstance(pois_train, list) else set()
        set_test = set(pois_test) if isinstance(pois_test, list) else set()
        return len(set_test - set_train)

    merged['f_new_pois'] = merged.apply(get_new_poi_count, axis=1)

    return merged


In [None]:


def fit_anomaly_weight_model(train_profiles, test_profiles):
    merged = build_anomaly_features(train_profiles, test_profiles)

    feature_cols = [
        'f_count_diff',
        'f_dist_diff',
        'f_speed_diff',
        'f_new_locs',
        'f_max_stay_diff',
        'f_transforms_diff',
        'f_max_dist_diff',
        'f_dom_poi_changed',
        'f_new_pois',
    ]

    X = merged[feature_cols]
    y = merged['label_test']  # 0/1 anomalous row

    model = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(
            class_weight='balanced',  # you likely have few anomalies
            max_iter=1000
        ))
    ])

    model.fit(X, y)
    return model, feature_cols


In [25]:
model, feature_cols = fit_anomaly_weight_model(train, test)
scaler = model.named_steps['scaler']
clf = model.named_steps['clf']

weights = clf.coef_[0]
for name, w in zip(feature_cols, weights):
    print(name, w)


f_count_diff 0.15275051180191404
f_dist_diff -0.1298183458063062
f_speed_diff 0.07990499369616784
f_new_locs 0.0
f_max_stay_diff -0.07054805627861394
f_transforms_diff 0.17161455932528552
f_max_dist_diff 0.6046523416127123
f_dom_poi_changed 0.09907394577832318
f_new_pois 0.0
