In [1]:
import pandas as pd

df = pd.read_csv('recording_20250201_202233.csv')
df = df.sort_values(by='time').reset_index(drop=True)
df = df.drop(columns=['timestamp'])
event_types = df['event_type'].unique()
row_count, unique_event_count = len(df), len(event_types)
print(f"{row_count} rows of event types: {event_types}")
df.head(10)

681940 rows of event types: ['mouse_moved' 'mouse_pressed' 'mouse_clicked' 'mouse_released'
 'key_pressed' 'key_released' 'mouse_dragged' 'mouse_wheel']


Unnamed: 0,event_source,event_type,time,x,y,button,clicks,keycode,rawcode,char,mask,wheel_amount,wheel_direction,wheel_rotation
0,local,mouse_moved,34301500,2482.0,21.0,0.0,0.0,,,,0,,,
1,local,mouse_moved,34301500,2482.0,20.0,0.0,0.0,,,,0,,,
2,local,mouse_moved,34301500,2483.0,20.0,0.0,0.0,,,,0,,,
3,local,mouse_moved,34301500,2482.0,21.0,0.0,0.0,,,,0,,,
4,local,mouse_moved,34301546,2482.0,21.0,0.0,0.0,,,,0,,,
5,local,mouse_moved,34301546,2482.0,21.0,0.0,0.0,,,,0,,,
6,local,mouse_moved,34301546,2482.0,21.0,0.0,0.0,,,,0,,,
7,local,mouse_moved,34301562,2483.0,18.0,0.0,0.0,,,,0,,,
8,local,mouse_moved,34301562,2484.0,18.0,0.0,0.0,,,,0,,,
9,local,mouse_moved,34301562,2483.0,19.0,0.0,0.0,,,,0,,,


In [2]:
df = df[~df["event_type"].isin(["mouse_clicked", "mouse_dragged"])]
df["delta_time"] = df["time"].diff()
df["delta_x"] = df["x"].diff()
df["delta_y"] = df["y"].diff()

df[["time", "delta_time", "event_type", "x", "y", "delta_x", "delta_y"]].head(10)

Unnamed: 0,time,delta_time,event_type,x,y,delta_x,delta_y
0,34301500,,mouse_moved,2482.0,21.0,,
1,34301500,0.0,mouse_moved,2482.0,20.0,0.0,-1.0
2,34301500,0.0,mouse_moved,2483.0,20.0,1.0,0.0
3,34301500,0.0,mouse_moved,2482.0,21.0,-1.0,1.0
4,34301546,46.0,mouse_moved,2482.0,21.0,0.0,0.0
5,34301546,0.0,mouse_moved,2482.0,21.0,0.0,0.0
6,34301546,0.0,mouse_moved,2482.0,21.0,0.0,0.0
7,34301562,16.0,mouse_moved,2483.0,18.0,1.0,-3.0
8,34301562,0.0,mouse_moved,2484.0,18.0,1.0,0.0
9,34301562,0.0,mouse_moved,2483.0,19.0,-1.0,1.0


In [3]:
# count how many rows of mouse_moved and otherwise
mouse_moved_count = df[df["event_type"] == "mouse_moved"].shape[0]
other_count = df[df["event_type"] != "mouse_moved"].shape[0]
print(f"mouse_moved_count: {mouse_moved_count}")
print(f"other_count: {other_count}")

mouse_moved_count: 650364
other_count: 3994


In [4]:
filtered_df = []
pressed_keys = set()
previous_time = None
smallest_time_unit = 16  # ms

# NOTE:
#   smallest_time_unit is the minimum amount of time to perform 1 observation, 1 inference and 1 action.
#
#   for simplicity, in data post-processing we put each event at least 1 smallest_time_unit apart.
#   this means the data will not perfectly replicate original trajectory, but we'll have a consistent
#   sampling rate.
#
#   we may tune this hyperparameter to get the best performance.

for index, row in df.iterrows():
    event_type = row["event_type"]
    key_or_button = row["keycode"] if pd.notna(row["keycode"]) else row["button"]

    # If current event is a press event, skip if key/button is already pressed
    if "pressed" in str(event_type) and key_or_button in pressed_keys:
        continue

    # Otherwise, update pressed keys set
    if "pressed" in str(event_type):
        pressed_keys.add(key_or_button)
    elif "released" in str(event_type):
        pressed_keys.discard(key_or_button)

    # If this is the first row, set the previous_time to the current time
    if previous_time is None:
        filtered_df.append(row)
        previous_time = row["time"]
        continue

    # Otherwise, check if the time difference is less than the minimum time difference
    time_difference = row["time"] - previous_time
    if time_difference < smallest_time_unit:
        if event_type == "mouse_moved":
            # we'll just take the next mouse position
            continue
        row["time"] = previous_time + smallest_time_unit

    # Finally, append the row to the filtered_df
    filtered_df.append(row)
    previous_time = row["time"]

# add delta_time to the filtered_df and update index
filtered_df = pd.DataFrame(filtered_df)
filtered_df["delta_time"] = filtered_df["time"].diff()
filtered_df = filtered_df.reset_index(drop=True)

event_types = filtered_df["event_type"].unique()

In [5]:
print(f"{len(filtered_df)} rows of event types: {event_types}") #37229
filtered_df.head(10)

37229 rows of event types: ['mouse_moved' 'mouse_pressed' 'mouse_released' 'key_pressed'
 'key_released' 'mouse_wheel']


Unnamed: 0,event_source,event_type,time,x,y,button,clicks,keycode,rawcode,char,mask,wheel_amount,wheel_direction,wheel_rotation,delta_time,delta_x,delta_y
0,local,mouse_moved,34301500,2482.0,21.0,0.0,0.0,,,,0,,,,,,
1,local,mouse_moved,34301546,2482.0,21.0,0.0,0.0,,,,0,,,,46.0,0.0,0.0
2,local,mouse_moved,34301562,2483.0,18.0,0.0,0.0,,,,0,,,,16.0,1.0,-3.0
3,local,mouse_moved,34301578,2485.0,17.0,0.0,0.0,,,,0,,,,16.0,2.0,-2.0
4,local,mouse_moved,34301609,2492.0,11.0,0.0,0.0,,,,0,,,,31.0,4.0,-3.0
5,local,mouse_moved,34301625,2506.0,2.0,0.0,0.0,,,,0,,,,16.0,17.0,-11.0
6,local,mouse_moved,34301656,2523.0,0.0,0.0,0.0,,,,0,,,,31.0,12.0,0.0
7,local,mouse_moved,34301687,2537.0,-1.0,0.0,0.0,,,,0,,,,31.0,1.0,-1.0
8,local,mouse_moved,34301750,2537.0,0.0,0.0,0.0,,,,0,,,,63.0,0.0,0.0
9,local,mouse_moved,34301781,2526.0,0.0,0.0,0.0,,,,0,,,,31.0,-7.0,0.0


In [6]:
# print length of original df and filtered df
print(len(df))
print(len(filtered_df))

# find out the none zero minimum delta_time in filtered_df and the index of the row, then print the row and its previous row
min_delta_time = filtered_df['delta_time'].min()
print(f"min_delta_time: {min_delta_time}")

654358
37229
min_delta_time: 16.0


In [7]:
# difference in time between the first and last row
df_time_difference = df["time"].iloc[-1] - df["time"].iloc[0]
print(f"df_time_difference: {df_time_difference}")

filtered_df_time_difference = filtered_df["time"].iloc[-1] - filtered_df["time"].iloc[0]
print(f"filtered_df_time_difference: {filtered_df_time_difference}")

df_time_difference: 1294734
filtered_df_time_difference: 1294734
