In [1]:
import os

# Cap native thread pools at one thread each. These variables are typically
# read once, when the BLAS / OpenMP / numexpr shared libraries are first
# loaded, so they have to be set BEFORE numpy, pandas or scikit-learn are
# imported -- setting them afterwards generally has no effect. That is why
# this block deliberately sits between `import os` and the other imports.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

import ast
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from typing import Tuple, List

# Feature columns that scale_train_test() rescales when the caller does not
# supply its own `numeric_cols`. Order is preserved when indexing the frames.
NUMERIC_COLS: List[str] = [
    "unique_location_ids", "avg_distance_from_home_km",
    "avg_speed_kmh", "max_stay_duration",
    "transformations", "max_distance_from_home",
]

def scale_train_test(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    numeric_cols: List[str] = NUMERIC_COLS,
) -> Tuple[pd.DataFrame, pd.DataFrame, RobustScaler]:
    """
    Fit a RobustScaler on the train numeric columns and apply it to both splits.

    The scaler is fitted on ``train_df`` only and then used to transform the
    same columns of ``train_df`` and ``test_df``, so no statistics from the
    test split enter the fit. RobustScaler centres on the median and scales
    by the 25th-75th percentile range, which is less sensitive to outliers
    than mean/variance scaling.

    Args:
        train_df: Training frame; must contain every column in ``numeric_cols``.
        test_df: Test frame; must contain every column in ``numeric_cols``.
        numeric_cols: Names of the columns to scale. Any sequence of column
            names is accepted; it is not modified.

    Returns:
        scaled_train_df: Copy of ``train_df`` with ``numeric_cols`` scaled.
        scaled_test_df: Copy of ``test_df`` with ``numeric_cols`` scaled.
        fitted_scaler: The RobustScaler fitted on the train columns.

    Raises:
        ValueError: If either frame lacks one of ``numeric_cols``.
    """
    # Normalise to a list: pandas would treat a tuple as a single column key.
    cols = list(numeric_cols)

    # Validate both splits up front so a schema mismatch is reported with a
    # clear message before any fitting work is done.
    for split_name, frame in (("train", train_df), ("test", test_df)):
        missing = [c for c in cols if c not in frame.columns]
        if missing:
            raise ValueError(f"Missing numeric columns in {split_name}: {missing}")

    # Work on copies so the caller's frames are left untouched.
    train = train_df.copy()
    test = test_df.copy()

    scaler = RobustScaler(quantile_range=(25.0, 75.0))
    scaler.fit(train[cols])

    train[cols] = scaler.transform(train[cols])
    test[cols] = scaler.transform(test[cols])

    return train, test, scaler

In [None]:
# Root of the processed trial-5 data. Every input below is resolved against
# it, so relocating the data only requires editing this one line.
DATA_ROOT = Path('/Users/chanuka/Desktop/codespaces/liad/processed/trial5')
SIM_DIR = DATA_ROOT / 'sim2'
WEEKLY_DIR = SIM_DIR / '10k' / 'whole'

# Ground truth: keep the rows with label_x == 1 and expose `user_id` as
# `agent` so it lines up with the key columns used for matching below.
# Alternative ground-truth source used previously:
#   DATA_ROOT / 'gt' / 'anomalous_temporal.csv'
gt = pd.read_csv(SIM_DIR / 'ta5_vae_sim2_KSP_NT.csv')
gt = gt.loc[gt['label_x'] == 1].rename(columns={'user_id': 'agent'})

train = pd.read_csv(WEEKLY_DIR / 'train_weekly' / 'agent_bucket=0.csv')
test = pd.read_csv(WEEKLY_DIR / 'test_weekly' / 'agent_bucket=0.csv')

# Restrict both splits to agents classified as residents in both periods.
# NOTE(review): the classification file is named "Sim1" while the data above
# comes from the sim2 directory -- confirm this is the intended file.
res = pd.read_csv(DATA_ROOT / 'Sim1_Trial5_Agent_Classification.csv', low_memory=False)
res = res[(res['past_classification'] == 'residents') & (res['future_classification'] == 'residents')]
resident_agents = res['agent'].values
train = train[train['agent'].isin(resident_agents)]
test = test[test['agent'].isin(resident_agents)]

# Scale after filtering so the scaler is fitted only on the retained train rows.
train, test, scaler = scale_train_test(train, test)

# Every train row is labelled 0.
train['label'] = 0

# A test row is labelled 1 when its (agent, day_type, time_segment) triple
# appears among the ground-truth rows kept above, otherwise 0.
gt_keys = set(zip(gt['agent'], gt['day_type'], gt['time_segment']))
test_keys = list(zip(test['agent'], test['day_type'], test['time_segment']))
test['label'] = np.fromiter((k in gt_keys for k in test_keys), dtype=np.int8, count=len(test))

In [33]:
# Write the scaled, labelled splits to CSV without the index column.
# NOTE(review): these relative targets live under ../processed/trial5/10k,
# whereas the inputs were read from an absolute .../trial5/sim2/10k tree --
# confirm the destination (and working directory) are the intended ones.
for _frame, _dest in (
    (train, '../processed/trial5/10k/train_weekly_subsampled.csv'),
    (test, '../processed/trial5/10k/test_weekly_subsampled.csv'),
):
    _frame.to_csv(_dest, index=False)