# 02 — Data Preparation and Splitting

In this notebook we:
- Preprocess mouse dynamics data from two sources:
  - **PMC dataset (Boun Mouse Dynamics)** — public dataset.
  - **OUR dataset** — collected locally using the logging script.
- Engineer features (Δx, Δy, Δt, speed).
- Split into **train/validation/test** sets.
- Demonstrate a baseline model training.

In [None]:
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
import json
import glob
import matplotlib.pyplot as plt

## 1. Helper Functions

We define reusable helpers for:
- Feature engineering (`add_deltas`).
- Splitting sessions into segments (`sessionize`).
- Building sequences of fixed length for deep learning.
- Constructing labeled datasets from sessions.

In [2]:
def add_deltas(df, xcol="x", ycol="y", tcol="timestamp_ms"):
    df = df.copy()
    df["dt"] = df[tcol].diff().fillna(1).clip(lower=1)
    df["dx"] = df[xcol].diff().fillna(0)
    df["dy"] = df[ycol].diff().fillna(0)
    df["speed"] = np.sqrt(df["dx"]**2 + df["dy"]**2) / df["dt"]
    return df

def sessionize(df, tcol="timestamp_ms", gap_ms=5*60*1000):
    df = df.sort_values(tcol).copy()
    df["gap"] = df[tcol].diff().fillna(0)
    df["session_id"] = (df["gap"] > gap_ms).cumsum()
    return df

def build_sequence(df, seq_len=128, cols=("dx","dy","dt","speed")):
    feats = df[list(cols)].replace([np.inf,-np.inf],0).fillna(0).to_numpy()
    if len(feats) >= seq_len:
        return feats[-seq_len:]
    else:
        pad = np.zeros((seq_len - len(feats), feats.shape[1]))
        return np.vstack([pad, feats])

def make_dataset(df, label, seq_len=128):
    X, y = [], []
    for sid, g in df.groupby("session_id"):
        seq = build_sequence(g, seq_len=seq_len)
        X.append(seq)
        y.append(label)
    return np.array(X), np.array(y)

## 2. PMC Dataset Loading

Demo with 10 sessions

In [8]:
pmc_files = glob.glob("../data/raw/boun-mouse-dynamics-dataset/users/*/*/*.csv")

frames = []
for f in pmc_files[:10]:
    df = pd.read_csv(f)
    if "client_timestamp" in df.columns:
        df.rename(columns={"client_timestamp": "timestamp_ms"}, inplace=True)
    frames.append(df[["timestamp_ms","x","y"]])

pmc = pd.concat(frames, ignore_index=True)
print("PMC shape:", pmc.shape)
pmc.head()

PMC shape: (53712, 3)


Unnamed: 0,timestamp_ms,x,y
0,1563886000.0,827,198
1,1563886000.0,827,200
2,1563886000.0,825,200
3,1563886000.0,821,204
4,1563886000.0,819,206


In [None]:
pmc_proc = add_deltas(pmc, xcol="x", ycol="y", tcol="timestamp_ms")
pmc_proc = sessionize(pmc_proc, tcol="timestamp_ms")

X_pmc, y_pmc = make_dataset(pmc_proc, label=0, seq_len=128)

print("PMC dataset shapes:", X_pmc.shape, y_pmc.shape)


PMC dataset shapes: (7, 128, 4) (7,)


In [12]:
# Split
X_train_pmc, X_tmp_pmc, y_train_pmc, y_tmp_pmc = train_test_split(
    X_pmc, y_pmc, test_size=0.30, stratify=y_pmc, random_state=42
)
X_val_pmc, X_test_pmc, y_val_pmc, y_test_pmc = train_test_split(
    X_tmp_pmc, y_tmp_pmc, test_size=0.50, stratify=y_tmp_pmc, random_state=42
)

print("PMC Train:", X_train_pmc.shape,
      "Val:", X_val_pmc.shape,
      "Test:", X_test_pmc.shape)

PMC Train: (4, 128, 4) Val: (1, 128, 4) Test: (2, 128, 4)


## 3. OUR Dataset Loading

In [10]:
our_files = glob.glob("../data/raw/our/v1/mouse_events_*.jsonl")

frames = []
for f in our_files:
    df = pd.read_json(f, lines=True)
    df.rename(columns={
        "wall_time_ms": "timestamp_ms",
        "x_screen": "x",
        "y_screen": "y"
    }, inplace=True)
    frames.append(df[["timestamp_ms","x","y"]])

our = pd.concat(frames, ignore_index=True)
print("OUR shape:", our.shape)
our.head()


OUR shape: (155012, 3)


Unnamed: 0,timestamp_ms,x,y
0,1760684986648,1108,546
1,1760684986653,1094,534
2,1760684986659,1079,519
3,1760684986666,1069,507
4,1760684986674,1060,495


In [None]:
our_proc = add_deltas(our, xcol="x", ycol="y", tcol="timestamp_ms")
our_proc = sessionize(our_proc, tcol="timestamp_ms")

X_our, y_our = make_dataset(our_proc, label=0, seq_len=128)

print("OUR dataset shapes:", X_our.shape, y_our.shape)


OUR dataset shapes: (9, 128, 4) (9,)


In [13]:
# Split
X_train_our, X_tmp_our, y_train_our, y_tmp_our = train_test_split(
    X_our, y_our, test_size=0.30, stratify=y_our, random_state=42
)
X_val_our, X_test_our, y_val_our, y_test_our = train_test_split(
    X_tmp_our, y_tmp_our, test_size=0.50, stratify=y_tmp_our, random_state=42
)

print("OUR Train:", X_train_our.shape,
      "Val:", X_val_our.shape,
      "Test:", X_test_our.shape)

OUR Train: (6, 128, 4) Val: (1, 128, 4) Test: (2, 128, 4)


## 6. Baseline Model Training (Demo)

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models

seq_len = X_train_our.shape[1]
num_features = X_train_our.shape[2]

model = models.Sequential([
    layers.Input(shape=(seq_len, num_features)),
    layers.Flatten(),
    layers.Dense(32, activation="relu"),
    layers.Dense(1, activation="sigmoid")
])

model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()


In [43]:
history = model.fit(
    X_train_our, y_train_our,
    validation_data=(X_val_our, y_val_our),
    epochs=5,
    batch_size=32,
    verbose=1
)

test_loss, test_acc = model.evaluate(X_test_our, y_test_our, verbose=0)
print(f"OUR dataset → Test accuracy: {test_acc:.3f}")

Epoch 1/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 425ms/step - accuracy: 0.6667 - loss: 48.6629 - val_accuracy: 0.0000e+00 - val_loss: 762.1984
Epoch 2/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - accuracy: 0.6667 - loss: 1.6130 - val_accuracy: 0.0000e+00 - val_loss: 696.9526
Epoch 3/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - accuracy: 1.0000 - loss: 3.6813e-08 - val_accuracy: 0.0000e+00 - val_loss: 646.4330
Epoch 4/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 1.0000 - loss: 3.2475e-09 - val_accuracy: 0.0000e+00 - val_loss: 605.0789
Epoch 5/5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 1.0000 - loss: 4.8001e-10 - val_accuracy: 0.0000e+00 - val_loss: 570.1461
OUR dataset → Test accuracy: 1.000
