In [12]:
import os
import ast
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"

def parse_maybe_list(x):
    """
    Safely parse list-like values that may be stored as strings in CSV.
    Returns a Python list.
    Handles: NaN/None, list, tuple, set, np.ndarray, strings like "[1,2]".
    """
    # Fast path for common nulls
    if x is None:
        return []

    # If it's already list-like, return it as a list
    if isinstance(x, list):
        return x
    if isinstance(x, (set, tuple)):
        return list(x)
    if isinstance(x, np.ndarray):
        return x.tolist()

    # Handle scalar NaN (only safe for scalars)
    if isinstance(x, (float, np.floating)) and np.isnan(x):
        return []

    # Strings: try literal_eval
    if isinstance(x, str):
        s = x.strip()
        if s == "" or s.lower() in ("nan", "none", "null"):
            return []
        try:
            v = ast.literal_eval(s)
            if isinstance(v, (list, set, tuple)):
                return list(v)
            if isinstance(v, np.ndarray):
                return v.tolist()
            if isinstance(v, dict):
                # choose keys (adjust if you want values instead)
                return list(v.keys())
            return []
        except Exception:
            return []

    # For anything else, try a safe pandas scalar-null check
    try:
        if pd.isna(x):
            return []
    except Exception:
        pass

    return []



def ensure_columns_exist(df, cols, fill_value=np.nan):
    for c in cols:
        if c not in df.columns:
            df[c] = fill_value
    return df


def build_anomaly_features(train_profiles: pd.DataFrame, test_profiles: pd.DataFrame) -> pd.DataFrame:
    """
    Build per-row anomaly features by comparing test slot vs train slot
    for the SAME agent + day_type + time_segment.

    Prevents many-to-many merges by aggregating to unique keys first.
    """
    keys = ['agent', 'day_type', 'time_segment']

    base_numeric = [
        'unique_location_ids',
        'avg_distance_from_home_km',
        'avg_speed_kmh',
        'max_stay_duration',
        'transformations',
        'max_distance_from_home',
        'label'
    ]
    base_misc = ['unique_locs', 'poi_dict', 'dominent_poi']

    train_profiles = ensure_columns_exist(train_profiles.copy(), base_numeric + base_misc)
    test_profiles  = ensure_columns_exist(test_profiles.copy(),  base_numeric + base_misc)

    # Aggregate to ensure 1 row per key in each split
    agg_num = {
        'unique_location_ids': 'mean',
        'avg_distance_from_home_km': 'mean',
        'avg_speed_kmh': 'mean',
        'max_stay_duration': 'max',
        'transformations': 'mean',
        'max_distance_from_home': 'max',
        'label': 'max'
    }
    # For list-like / categorical fields: take first non-null occurrence
    agg_misc = {
        'unique_locs': 'first',
        'poi_dict': 'first',
        'dominent_poi': 'first'
    }

    train_agg = train_profiles.groupby(keys, as_index=False).agg({**agg_num, **agg_misc})
    test_agg  = test_profiles.groupby(keys, as_index=False).agg({**agg_num, **agg_misc})

    # Merge on agent+slot (THE actual intended join)
    merged = pd.merge(
        test_agg,
        train_agg,
        on=keys,
        suffixes=('_test', '_train'),
        how='left'
    )

    numeric_train_cols = [
        'unique_location_ids_train',
        'avg_distance_from_home_km_train',
        'avg_speed_kmh_train',
        'max_stay_duration_train',
        'transformations_train',
        'max_distance_from_home_train'
    ]
    for c in numeric_train_cols:
        if c not in merged.columns:
            merged[c] = 0.0
    merged[numeric_train_cols] = merged[numeric_train_cols].fillna(0)


    for c in ['unique_locs_train', 'unique_locs_test', 'poi_dict_train', 'poi_dict_test']:
        if c not in merged.columns:
            merged[c] = [[]] * len(merged)
        merged[c] = merged[c].apply(parse_maybe_list)

    merged['f_count_diff'] = (merged['unique_location_ids_test'] - merged['unique_location_ids_train']).abs()
    merged['f_dist_diff']  = (merged['avg_distance_from_home_km_test'] - merged['avg_distance_from_home_km_train']).abs()
    merged['f_speed_diff'] = (merged['avg_speed_kmh_test'] - merged['avg_speed_kmh_train']).abs()

    def get_new_loc_count(row):
        set_train = set(row['unique_locs_train']) if isinstance(row['unique_locs_train'], list) else set()
        set_test  = set(row['unique_locs_test'])  if isinstance(row['unique_locs_test'], list)  else set()
        return len(set_test - set_train)

    merged['f_new_locs'] = merged.apply(get_new_loc_count, axis=1)

    merged['f_max_stay_diff'] = (merged['max_stay_duration_test'] - merged['max_stay_duration_train']).abs()
    merged['f_transforms_diff'] = (merged['transformations_test'] - merged['transformations_train']).abs()
    merged['f_max_dist_diff'] = (merged['max_distance_from_home_test'] - merged['max_distance_from_home_train']).abs()

    if 'dominent_poi_test' not in merged.columns:
        merged['dominent_poi_test'] = np.nan
    if 'dominent_poi_train' not in merged.columns:
        merged['dominent_poi_train'] = np.nan

    merged['f_dom_poi_changed'] = (merged['dominent_poi_test'] != merged['dominent_poi_train']).astype(int)

    def get_new_poi_count(row):
        set_train = set(row['poi_dict_train']) if isinstance(row['poi_dict_train'], list) else set()
        set_test  = set(row['poi_dict_test'])  if isinstance(row['poi_dict_test'], list)  else set()
        return len(set_test - set_train)

    merged['f_new_pois'] = merged.apply(get_new_poi_count, axis=1)

    return merged


def fit_anomaly_weight_model(train_profiles: pd.DataFrame, test_profiles: pd.DataFrame):
    merged = build_anomaly_features(train_profiles, test_profiles)

    feature_cols = [
        'f_count_diff',
        'f_dist_diff',
        'f_speed_diff',
        'f_new_locs',
        'f_max_stay_diff',
        'f_transforms_diff',
        'f_max_dist_diff',
        'f_dom_poi_changed',
        'f_new_pois',
    ]

    X = merged[feature_cols]
    y = merged['label_test']  # 0/1 anomalous row in test

    model = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', LogisticRegression(
            class_weight='balanced',
            max_iter=1000,
            n_jobs=1
        ))
    ])

    model.fit(X, y)
    return model, feature_cols

In [13]:
train = pd.read_csv('/Users/chanuka/Desktop/codespaces/liad/processed/sim2_evalb/train_monthly.csv')
test  = pd.read_csv('/Users/chanuka/Desktop/codespaces/liad/processed/sim2_evalb/test_monthly.csv')

In [15]:
train[train['agent'] == 1043494]

Unnamed: 0,s_date,e_date,agent,day_type,time_segment,unique_location_ids,avg_distance_from_home_km,avg_speed_kmh,unique_locs,max_stay_duration,transformations,max_distance_from_home,dominent_poi,poi_dict
6593595,2024-10-16 00:01:30+09:00,2024-11-15 05:37:30+09:00,1043494,weekday,0-5.59,1,0.0,0.0,[3383176],66.1,5,0.0,residential,['residential']
6593596,2024-10-16 06:00:00+09:00,2024-11-15 06:56:20+09:00,1043494,weekday,6-8.59,2,1.55,1.52,[3383176 460735],68.99,3,3.88,residential,['residential' 'office_building']
6593597,2024-10-16 09:00:00+09:00,2024-11-15 12:30:50+09:00,1043494,weekday,9-13.59,4,2.79,1.03,[3383176 899164 460735 369884],113.92,4,3.88,office_building,['residential' 'food' 'office_building']
6593598,2024-10-16 14:04:00+09:00,2024-11-15 16:48:10+09:00,1043494,weekday,14-17.29,3,3.26,1.05,[ 460735 3383176 899164],106.17,3,3.88,office_building,['office_building' 'residential' 'food']
6593599,2024-10-16 17:30:00+09:00,2024-11-14 20:40:00+09:00,1043494,weekday,17.30-21.29,3,0.97,0.92,[ 460735 544334 3383176],81.05,3,3.88,residential,['office_building' 'outdoor_recreation' 'resid...
6593600,2024-10-16 21:30:00+09:00,2024-11-15 23:46:40+09:00,1043494,weekday,21.30-23.59,1,0.0,0.0,[3383176],43.56,3,0.0,residential,['residential']
6593601,2024-10-20 00:13:10+09:00,2024-11-17 04:38:30+09:00,1043494,weekend,0-5.59,1,0.0,0.0,[3383176],73.26,6,0.0,residential,['residential']
6593602,2024-10-19 06:26:00+09:00,2024-11-17 06:00:00+09:00,1043494,weekend,6-8.59,2,0.48,0.98,[3383176 460735],70.88,3,3.88,residential,['residential' 'office_building']
6593603,2024-10-19 09:00:00+09:00,2024-11-17 13:52:10+09:00,1043494,weekend,9-13.59,3,1.11,0.65,[3383176 460735 899164],94.6,5,3.88,residential,['residential' 'office_building' 'food']
6593604,2024-10-19 14:13:30+09:00,2024-11-17 17:25:20+09:00,1043494,weekend,14-17.29,4,1.03,1.14,[ 393240 3383176 460735 899164],44.0,4,6.85,residential,['filling_stations' 'residential' 'office_buil...


In [18]:
data = pd.read_csv('/Users/chanuka/Desktop/codespaces/liad/processed/test_monthly.csv')
data[data['agent'] == 1599899]

Unnamed: 0,s_date,e_date,agent,day_type,time_segment,unique_location_ids,avg_distance_from_home_km,avg_speed_kmh,unique_locs,max_stay_duration,transformations,max_distance_from_home,dominent_poi,poi_dict
7679988,2024-11-18 00:00:00+09:00,2024-12-20 00:00:00+09:00,1599899,weekday,0-5.59,1,0.0,0.0,[2367390],317.79,2,0.0,residential,['residential']
7679989,2024-11-18 06:00:00+09:00,2024-12-20 08:56:40+09:00,1599899,weekday,6-8.59,3,2.72,5.32,[2367390 4203898 2981214],120.12,3,6.0,workplace,['residential' 'train_stop' 'workplace']
7679990,2024-11-18 09:00:00+09:00,2024-12-20 09:00:00+09:00,1599899,weekday,9-13.59,2,5.93,0.21,[2981214 3051384],248.58,2,6.0,workplace,['workplace' 'restaurant:workplace']
7679991,2024-11-18 14:00:00+09:00,2024-12-20 17:26:50+09:00,1599899,weekday,14-17.29,12,3.8,4.03,[2981214 2367390 4203739 2426544 3051384 16298...,133.57,3,6.1,workplace,['workplace' 'residential' 'train_stop' 'resta...
7679992,2024-11-18 17:30:00+09:00,2024-12-20 18:33:40+09:00,1599899,weekday,17.30-21.29,8,0.84,2.95,[2367390 2981214 1629820 2426544 4203739 31076...,186.01,2,6.02,residential,['residential' 'workplace' 'restaurant:workpla...
7679993,2024-11-18 21:30:00+09:00,2024-12-20 21:30:00+09:00,1599899,weekday,21.30-23.59,1,0.0,0.0,[2367390],146.74,2,0.0,residential,['residential']
7679994,2024-11-23 00:00:00+09:00,2024-12-15 00:00:00+09:00,1599899,weekend,0-5.59,1,0.0,0.0,[2367390],340.48,2,0.0,residential,['residential']
7679995,2024-11-23 06:00:00+09:00,2024-12-15 06:00:00+09:00,1599899,weekend,6-8.59,3,1.21,3.22,[2367390 4203898 2981214],149.58,3,6.0,residential,['residential' 'train_stop' 'workplace']
7679996,2024-11-23 09:00:00+09:00,2024-12-15 09:00:00+09:00,1599899,weekend,9-13.59,4,2.34,1.27,[2367390 105199 2981214 670112],236.75,2,6.0,residential,['residential' 'recreation' 'workplace'\n 'edu...
7679997,2024-11-23 14:00:00+09:00,2024-12-15 14:00:00+09:00,1599899,weekend,14-17.29,8,2.36,4.03,[ 105199 948915 2367390 74480 2981214 42037...,161.27,2,6.1,residential,['recreation' 'restaurant:workplace' 'resident...


In [16]:
test[test['agent'] == 1043494]

Unnamed: 0,s_date,e_date,agent,day_type,time_segment,unique_location_ids,avg_distance_from_home_km,avg_speed_kmh,unique_locs,max_stay_duration,transformations,max_distance_from_home,dominent_poi,poi_dict
6288005,2024-11-18 00:10:10+09:00,2024-12-20 05:41:20+09:00,1043494,weekday,0-5.59,1,0.0,0.0,[3383176],82.58,4,0.0,residential,['residential']
6288006,2024-11-18 06:00:00+09:00,2024-12-20 06:00:00+09:00,1043494,weekday,6-8.59,2,0.32,0.48,[3383176 460735],54.97,3,3.88,residential,['residential' 'office_building']
6288007,2024-11-18 09:00:00+09:00,2024-12-20 13:49:20+09:00,1043494,weekday,9-13.59,4,2.44,0.28,[3383176 460735 2736464 998407],65.07,4,6.67,office_building,['residential' 'office_building' 'store']
6288008,2024-11-18 14:25:30+09:00,2024-12-20 16:58:10+09:00,1043494,weekday,14-17.29,5,3.64,0.7,[3383176 460735 109481 998407 3940258],106.64,3,16.68,office_building,['residential' 'office_building' 'store']
6288009,2024-11-18 17:30:00+09:00,2024-12-20 20:38:00+09:00,1043494,weekday,17.30-21.29,3,1.3,1.04,[3383176 544334 460735],77.51,3,3.88,residential,['residential' 'outdoor_recreation' 'office_bu...
6288010,2024-11-18 21:30:00+09:00,2024-12-20 22:01:30+09:00,1043494,weekday,21.30-23.59,3,0.13,1.55,[3383176 544334 460735],58.13,3,3.88,residential,['residential' 'outdoor_recreation' 'office_bu...
6288011,2024-11-23 00:00:00+09:00,2024-12-15 00:00:00+09:00,1043494,weekend,0-5.59,1,0.0,0.0,[3383176],145.42,4,0.0,residential,['residential']
6288012,2024-11-23 06:08:10+09:00,2024-12-15 06:00:00+09:00,1043494,weekend,6-8.59,2,0.6,0.88,[3383176 460735],91.11,3,3.88,residential,['residential' 'office_building']
6288013,2024-11-23 09:00:00+09:00,2024-12-15 13:25:20+09:00,1043494,weekend,9-13.59,3,2.8,0.13,[3383176 460735 998407],89.81,3,3.88,office_building,['residential' 'office_building' 'store']
6288014,2024-11-23 14:00:00+09:00,2024-12-15 16:03:10+09:00,1043494,weekend,14-17.29,3,3.09,0.0,[ 460735 3383176 998407],78.92,3,3.88,office_building,['office_building' 'residential' 'store']


In [9]:
train = pd.read_csv('/Users/chanuka/Desktop/codespaces/liad/processed/sim2_evalb/train_monthly.csv')
test  = pd.read_csv('/Users/chanuka/Desktop/codespaces/liad/processed/sim2_evalb/test_monthly.csv')

gt = pd.read_csv('/Users/chanuka/Desktop/codespaces/liad/processed/sim2_evalb/anomalous_segmented.csv')
residents = pd.read_csv('/Users/chanuka/Desktop/codespaces/liad/processed/sim2_evalb/residents.csv')

train = train[train['agent'].isin(residents['agent'].unique())].copy()

train['label'] = 0
test['label'] = 0

gt_agents = set(gt['agent'].unique())
train_agents = set(train['agent'].unique())
normal_agents = np.array(list(train_agents - gt_agents))

print("GT agents:", len(gt_agents))
print("Available normal agents:", len(normal_agents))

np.random.seed(42)
sample_size = 100
if len(normal_agents) < sample_size:
    raise ValueError(f"Not enough normal agents to sample {sample_size}. Only {len(normal_agents)} available.")

GT agents: 274
Available normal agents: 371183


In [10]:
sampled_normals = np.random.choice(normal_agents, size=sample_size, replace=False)

train = pd.concat([
    train[train['agent'].isin(gt_agents)],
    train[train['agent'].isin(sampled_normals)]
], ignore_index=True)

test = test[test['agent'].isin(train['agent'].unique())].copy()

gt_keys = set(zip(gt['agent'], gt['day_type'], gt['time_segment']))
test_keys = list(zip(test['agent'], test['day_type'], test['time_segment']))
test['label'] = np.fromiter((k in gt_keys for k in test_keys), dtype=np.int8, count=len(test))

for col in ['unique_locs', 'poi_dict']:
    if col in train.columns:
        train[col] = train[col].apply(parse_maybe_list)
    if col in test.columns:
        test[col] = test[col].apply(parse_maybe_list)

In [11]:
print('fitting the model')
model, feature_cols = fit_anomaly_weight_model(train, test)

clf = model.named_steps['clf']
weights = clf.coef_[0]

# print weights
for name, w in zip(feature_cols, weights):
    print(name, w)

# save weights
weights_df = pd.DataFrame({
    "feature": feature_cols,
    "weight": weights
}).sort_values("weight", key=abs, ascending=False)

out_path = "sim2_evalb_model_weights.csv"
weights_df.to_csv(out_path, index=False)
print(f"\nSaved weights to: {out_path}")

fitting the model
f_count_diff 0.15302082531276692
f_dist_diff -0.1913591142485288
f_speed_diff 0.24952645751043032
f_new_locs -0.5090649992846829
f_max_stay_diff 0.0780266674132837
f_transforms_diff -0.03548937193854445
f_max_dist_diff 0.4501840772380571
f_dom_poi_changed 0.029679350593119064
f_new_pois 0.22113424571069545

Saved weights to: sim2_evalb_model_weights.csv


In [5]:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

EPS = 1e-6

NUM_COLS = [
    "unique_location_ids",
    "avg_distance_from_home_km",
    "avg_speed_kmh",
    "max_stay_duration",
    "transformations",
    "max_distance_from_home",
]

def _safe_set(x):
    if isinstance(x, list):
        return set(x)
    if isinstance(x, (set, tuple)):
        return set(x)
    return set()

def build_weekly_features(weekly_profiles: pd.DataFrame) -> pd.DataFrame:
    df = weekly_profiles.copy()

    # --- Sort for lag features ---
    df = df.sort_values(["agent", "day_type", "time_segment", "week_id"])

    # --- Training stats per slot (level baseline) ---
    train = df[df["phase"] == "train"].copy()

    level_mu = (
        train.groupby(["agent", "day_type", "time_segment"])[NUM_COLS]
        .mean()
        .add_suffix("_mu_train")
        .reset_index()
    )
    level_sd = (
        train.groupby(["agent", "day_type", "time_segment"])[NUM_COLS]
        .std(ddof=0)
        .fillna(0)
        .add_suffix("_sd_train")
        .reset_index()
    )

    # --- Training deltas per slot (drift baseline) ---
    # delta = this_week - prev_week within training weeks
    train_d = train.sort_values(["agent", "day_type", "time_segment", "week_id"]).copy()
    for c in NUM_COLS:
        train_d[c + "_delta"] = train_d.groupby(["agent","day_type","time_segment"])[c].diff()

    delta_cols = [c + "_delta" for c in NUM_COLS]
    delta_mu = (
        train_d.groupby(["agent", "day_type", "time_segment"])[delta_cols]
        .mean()
        .add_suffix("_mu_train")
        .reset_index()
    )
    delta_sd = (
        train_d.groupby(["agent", "day_type", "time_segment"])[delta_cols]
        .std(ddof=0)
        .fillna(0)
        .add_suffix("_sd_train")
        .reset_index()
    )

    # --- Merge baselines into all rows ---
    out = df.merge(level_mu, on=["agent","day_type","time_segment"], how="left") \
            .merge(level_sd, on=["agent","day_type","time_segment"], how="left") \
            .merge(delta_mu, on=["agent","day_type","time_segment"], how="left") \
            .merge(delta_sd, on=["agent","day_type","time_segment"], how="left")

    # Fill missing baselines (agents/slots with no train history)
    out = out.fillna(0)

    # --- Lag (previous week) from combined timeline (train+test) ---
    for c in NUM_COLS:
        out[c + "_prev"] = out.groupby(["agent","day_type","time_segment"])[c].shift(1)
        out[c + "_delta_now"] = out[c] - out[c + "_prev"]

    # --- Feature construction: level z + drift z ---
    feature_cols = []
    for c in NUM_COLS:
        mu = out[c + "_mu_train"]
        sd = out[c + "_sd_train"]
        out[f"f_{c}_level_z"] = (out[c] - mu) / (sd + EPS)
        feature_cols.append(f"f_{c}_level_z")

        dmu = out[c + "_delta_mu_train"]
        dsd = out[c + "_delta_sd_train"]
        out[f"f_{c}_drift_z"] = (out[c + "_delta_now"] - dmu) / (dsd + EPS)
        feature_cols.append(f"f_{c}_drift_z")

        out[f"f_{c}_abs_level"] = (out[c] - mu).abs()
        out[f"f_{c}_abs_drift"] = (out[c + "_delta_now"] - dmu).abs()
        feature_cols += [f"f_{c}_abs_level", f"f_{c}_abs_drift"]

    # --- Set novelty features ---
    # Union sets over training for each agent (global baseline)
    train_locs_union = train.groupby("agent")["unique_locs"].apply(
        lambda s: set().union(*[_safe_set(x) for x in s])
    )
    train_pois_union = train.groupby("agent")["poi_dict"].apply(
        lambda s: set().union(*[_safe_set(x) for x in s])
    )

    def new_count(row, union_series, col):
        base = union_series.get(row["agent"], set())
        cur = _safe_set(row.get(col))
        return len(cur - base)

    out["f_new_locs_vs_train"] = out.apply(lambda r: new_count(r, train_locs_union, "unique_locs"), axis=1)
    out["f_new_pois_vs_train"] = out.apply(lambda r: new_count(r, train_pois_union, "poi_dict"), axis=1)
    feature_cols += ["f_new_locs_vs_train", "f_new_pois_vs_train"]

    # Optional: dominant POI change vs train mode (if you have it reliably)
    if "dominant_poi" in df.columns:
        poi_mode = train.groupby(["agent","day_type","time_segment"])["dominant_poi"] \
                        .agg(lambda x: x.mode().iloc[0] if len(x.mode()) else None) \
                        .reset_index() \
                        .rename(columns={"dominant_poi": "dominant_poi_mode_train"})
        out = out.merge(poi_mode, on=["agent","day_type","time_segment"], how="left")
        out["f_dom_poi_changed"] = (out["dominant_poi"] != out["dominant_poi_mode_train"]).astype(int)
        feature_cols.append("f_dom_poi_changed")

    out.attrs["feature_cols"] = feature_cols
    return out

def fit_weekly_row_model(features_df: pd.DataFrame, label_col="label"):
    feature_cols = features_df.attrs["feature_cols"]
    train_rows = features_df[features_df["phase"].isin(["train","test"])].copy()

    # IMPORTANT: train on TRAIN+TEST? Usually you train on train+some validation.
    # Here we'll fit on whatever rows have labels.
    train_rows = train_rows[train_rows[label_col].notna()].copy()

    X = train_rows[feature_cols].astype(float)
    y = train_rows[label_col].astype(int)

    model = Pipeline([
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
        ("clf", LogisticRegression(class_weight="balanced", max_iter=2000))
    ])
    model.fit(X, y)
    return model, feature_cols

def score_rows(model, features_df: pd.DataFrame, feature_cols):
    X = features_df[feature_cols].astype(float)
    # probability of anomaly
    p = model.predict_proba(X)[:, 1]
    out = features_df.copy()
    out["anomaly_prob"] = p
    return out

def pool_week_score(scored_rows: pd.DataFrame, k=10):
    # top-k mean pooling per agent-week
    def topk_mean(x):
        x = np.sort(x)[::-1]
        return float(np.mean(x[:min(k, len(x))])) if len(x) else 0.0

    return scored_rows.groupby(["agent","week_id"], as_index=False)["anomaly_prob"].agg(
        week_score=topk_mean,
        max_score="max",
        mean_score="mean"
    )


In [12]:
# test 'chunk' if 0 -> 5, 1-> 6, 2-> 7, 3-> 8,4-> 9
test["chunk"] = test["chunk"].replace({0: 5, 1: 6, 2: 7, 3: 8, 4: 9})

In [18]:
test.rename(columns={'chunk': 'week_id'}, inplace=True)
test['phase'] = 'test'
train['phase'] = 'train'
# append test dataframe to train at the end
new_df = pd.concat([train, test], ignore_index=True)

In [19]:
out = build_weekly_features(new_df)

In [16]:
model, feature_cols = fit_weekly_row_model(new_df)

KeyError: 'feature_cols'