In [None]:
import os
import numpy as np
import pandas as pd

# Config
CITY = "chicago"

DIR = "data/LENS-2023-11-CSV/LENS-2023-11-CSV/inside-out/active"
VALUE_COLUMN = "rtt"
TIME_COLUMN = "timestamp"

CLUSTER_CSV = "data/2024-01-24/cluster_results.csv"
FREQ = "10ms"
BLOCK_SECONDS = 15

In [None]:
# 1) Reuse your df_raw construction (adapted here for completeness, but keep yours)
df_hours = []
start_seconds = [12, 27, 42, 57]

for folder in os.listdir(f"{DIR}/{CITY}"):
    for hour in range(24):
        # csv_file_name = f"data/2024-01-24/irtt-10ms-1h-2024-01-24-{hour:0{2}d}-00-00.csv"

        csv_file_name = f"{DIR}/{CITY}/{folder}/irtt-10ms-1h-2024-01-24-{hour:0{2}d}-00-00.csv"
        if not os.path.exists(csv_file_name):
            continue
        try:
            df_hour = pd.read_csv(csv_file_name)
        except Exception as e:
            print(f"Skipping {csv_file_name}: {e}")
            continue

        # Convert ns to datetime and round to 10ms bins
        df_hour[TIME_COLUMN] = pd.to_datetime(df_hour[TIME_COLUMN], unit="ns").dt.round(FREQ)
        # Filter out non-positive RTT
        df_hour = df_hour[df_hour[VALUE_COLUMN] > 0].copy()
        # Keep only relevant columns to reduce memory
        df_hour = df_hour[[TIME_COLUMN, VALUE_COLUMN]]
        df_hours.append(df_hour)

if not df_hours:
    raise RuntimeError("No hourly CSVs loaded. Check paths.")

df_raw = pd.concat(df_hours, ignore_index=True)
df_raw = df_raw.drop_duplicates(subset=[TIME_COLUMN]).sort_values(TIME_COLUMN)

# Align to first starting second in [12, 27, 42, 57]
start_time = df_raw.loc[df_raw[TIME_COLUMN].dt.second.isin(start_seconds), TIME_COLUMN].min()
if pd.isna(start_time):
    raise RuntimeError("Could not find a start_time matching seconds in [12, 27, 42, 57].")

df_raw = df_raw.loc[df_raw[TIME_COLUMN] >= start_time].copy()

# Compute 15s block id and filter last, too-short blocks
df_raw["id"] = ((df_raw[TIME_COLUMN] - start_time).dt.total_seconds() // BLOCK_SECONDS).astype(int)
df_raw = df_raw.loc[df_raw["id"] != df_raw["id"].max()].copy()
# Optional: keep only substantially populated blocks (your original threshold)
df_raw = df_raw.groupby('id').filter(lambda x: len(x) > 1300).copy()
display(df_raw)

Unnamed: 0,timestamp,rtt,id
1070,2024-01-24 00:00:12.000,396.884962,0
1071,2024-01-24 00:00:12.010,400.207015,0
1072,2024-01-24 00:00:12.020,389.998357,0
1073,2024-01-24 00:00:12.030,379.528565,0
1074,2024-01-24 00:00:12.070,340.219383,0
...,...,...,...
6120245,2024-01-24 23:38:41.840,127.328116,5673
6120246,2024-01-24 23:38:41.850,143.254740,5673
6120247,2024-01-24 23:38:41.860,133.201958,5673
6120248,2024-01-24 23:38:41.870,124.388290,5673


In [3]:
# 2) Load cluster labels; only process ids that exist in this file
clusters = pd.read_csv(CLUSTER_CSV)
# Ensure timestamp is parsed (optional but useful)
if "timestamp" in clusters.columns:
    clusters["timestamp"] = pd.to_datetime(clusters["timestamp"])
# Keep only fields we need
clusters = clusters[["id", "cluster"]].drop_duplicates()

# Restrict df_raw to ids present in clusters
df_raw = df_raw[df_raw["id"].isin(clusters["id"])].copy()

# 3) Helper: build canonical index for a given block id
def expected_index_for_id(id_val: int) -> pd.DatetimeIndex:
    start = start_time + pd.Timedelta(seconds=BLOCK_SECONDS * id_val)
    # Inclusive end at 15s - one step (so 1500 points for 10ms frequency)
    end = start + pd.Timedelta(seconds=BLOCK_SECONDS) - pd.Timedelta(milliseconds=10)
    return pd.date_range(start, end, freq=FREQ)


In [4]:
# 4) Helper: nearest-neighbor imputation choosing last/next by closest distance
def nearest_neighbor_fill_uniform(s: pd.Series) -> pd.Series:
    """
    s is a Series indexed by a uniformly spaced DatetimeIndex (10ms).
    Missing values are NaN.
    We choose the nearest observed sample (prev or next). Ties: prefer previous.
    """
    n = len(s)
    mask = s.isna().values
    if not mask.any():
        return s

    pos = np.arange(n)

    # Indices of last observed sample at or before each position
    prev_obs_pos = np.where(~mask, pos, np.nan)
    prev_obs_pos = pd.Series(prev_obs_pos).ffill().values  # positions or NaN at leading missing

    # Indices of next observed sample at or after each position
    next_obs_pos = np.where(~mask, pos, np.nan)
    next_obs_pos = pd.Series(next_obs_pos).bfill().values  # positions or NaN at trailing missing

    # Distances
    # Use large numbers where no prev/next exists
    dist_prev = pos - prev_obs_pos
    dist_prev = np.where(np.isnan(dist_prev), np.inf, dist_prev)
    dist_next = next_obs_pos - pos
    dist_next = np.where(np.isnan(dist_next), np.inf, dist_next)

    # Choose prev if strictly closer or tie (<=), else next
    use_prev = dist_prev <= dist_next

    s_ffill = s.ffill()
    s_bfill = s.bfill()

    filled_values = np.where(use_prev, s_ffill.values, s_bfill.values)

    out = s.copy()
    out[mask] = filled_values[mask]
    return out

In [10]:
# 5) Process each id into aligned sequences (long format)
records = []
feature_rows = []

grouped = df_raw.groupby("id", sort=True)
for id_val, grp in grouped:
    # Compute canonical index
    idx = expected_index_for_id(id_val)

    # Align this group's RTT to the canonical timeline
    s = grp.set_index(TIME_COLUMN)[VALUE_COLUMN].sort_index()
    s = s.reindex(idx)  # Missing samples become NaN
    # Token with -1 for missing
    s_token = s.fillna(-1)

    # Nearest neighbor fill (last or next whichever is closer; ties -> last)
    s_nearest = nearest_neighbor_fill_uniform(s)

    # Missing mask (based on original aligned series)
    missing_mask = s.isna()

    # Cluster label
    cluster = clusters.loc[clusters["id"] == id_val, "cluster"]
    cluster = int(cluster.iloc[0]) if len(cluster) else None

    # Build long-form rows
    df_block = pd.DataFrame({
        "id": id_val,
        "timestamp": idx,
        "rtt": s.values,
        "rtt_token": s_token.values,      # -1 indicates missing
        "rtt_nearest": s_nearest.values,  # nearest neighbor imputation
        "is_missing": missing_mask.values
    })

    # Append cluster if present
    df_block["cluster"] = cluster

    records.append(df_block)

    # Compute simple per-block features (good for training/eval)
    # Feel free to expand with more robust statistics
    valid = ~missing_mask
    valid_vals = s[valid]
    features = {
        "id": id_val,
        "cluster": cluster,
        "samples_expected": len(s),
        "samples_observed": int(valid.sum()),
        "missing_count": int(missing_mask.sum()),
        "missing_ratio": float(missing_mask.mean()),
        "rtt_mean": float(valid_vals.mean()) if len(valid_vals) else np.nan,
        "rtt_std": float(valid_vals.std(ddof=1)) if len(valid_vals) > 1 else np.nan,
        "rtt_median": float(valid_vals.median()) if len(valid_vals) else np.nan,
        "rtt_p10": float(valid_vals.quantile(0.10)) if len(valid_vals) else np.nan,
        "rtt_p90": float(valid_vals.quantile(0.90)) if len(valid_vals) else np.nan,
        # Compare imputation behavior
        "nearest_fill_delta_mean": float((s_nearest - s).abs().mean(skipna=True)),
        "nearest_fill_delta_max": float((s_nearest - s).abs().max(skipna=True)),
    }
    feature_rows.append(features)

# Concatenate and save
df_long = pd.concat(records, ignore_index=True)
df_features = pd.DataFrame(feature_rows)

# Optionally, inspect a couple of blocks
display(df_features.sort_values("missing_ratio", ascending=False).head(10))

Unnamed: 0,id,cluster,samples_expected,samples_observed,missing_count,missing_ratio,rtt_mean,rtt_std,rtt_median,rtt_p10,rtt_p90,nearest_fill_delta_mean,nearest_fill_delta_max
1931,3408,2,1500,1302,198,0.132,211.304389,62.724034,242.718247,120.034292,262.741334,0.0,0.0
2173,3652,1,1500,1303,197,0.131333,195.410981,47.873257,185.598667,166.603514,222.911011,0.0,0.0
1470,2918,0,1500,1303,197,0.131333,111.74654,18.319269,108.083478,98.014673,126.703893,0.0,0.0
1708,3172,3,1500,1305,195,0.13,157.156896,6.409719,156.298391,151.707951,162.004716,0.0,0.0
1461,2888,6,1500,1305,195,0.13,133.726868,15.81717,129.715732,120.30689,150.1733,0.0,0.0
1452,2859,5,1500,1305,195,0.13,107.852606,20.668037,104.056336,93.449438,122.540518,0.0,0.0
2214,3699,2,1500,1306,194,0.129333,219.303079,65.124284,189.429763,163.297454,343.920526,0.0,0.0
1449,2855,2,1500,1307,193,0.128667,273.464655,19.415389,271.621304,253.938686,293.811632,0.0,0.0
1454,2866,0,1500,1309,191,0.127333,119.395677,14.750677,116.576421,105.3886,135.346662,0.0,0.0
1457,2871,6,1500,1309,191,0.127333,129.260875,17.595357,125.414879,112.518733,148.239144,0.0,0.0


In [14]:
df_long_clean = df_long[["id", "timestamp", "rtt_token", "rtt_nearest"]]
df_long_clean.head()
df_long_clean.to_csv('data/2024-01-24/clustering_data.csv', float_format='%.4f', index=False)