In [25]:
# import libraries
import pandas as pd
import numpy as np
from math import *
import os
import glob
import time

# Data Processing

In this section, we take data from the mouse tracker:
- perform calculations on data
- segment data (identify mouse actions)

## Calculate metrics
In this section, we are going to calculate metrics for `df`. The math shows how each value will be calculated

Variables to know
- $\Delta$: change in specific quantity. Final data point minus initial data point
    - example: $\Delta v = v - v_0$ where $v$ is final velocity and $v_0$ is initial velocity
- $t$: time
- $x$: data points in the `x` direction
- $y$: data points in the `y` direction

Horizontal velocity of mouse:

$v_x = \Delta x \div \Delta t$

Vertical velocity of mouse:

$v_y = \Delta y \div \Delta t$

Total speed of mouse:

$v = \sqrt{(v_x)^2 + (v_y)^2}$

- Note, the term speed is used instead of velocity because speed is purely a magnitude

Acceleration:

$a = \Delta v \div \Delta t$

Jerk: how fast acceleration changes

$j = \Delta a \div \Delta t$

Angle that mouse is moving at (in radians):

$\theta = \tan^{-1} \Bigl( \Delta y \div \Delta x \Bigr)$

Angular velocity: how fast the angle is changing

$\omega = \Delta \theta \div \Delta t$

Curvature: how fast a curve is changing direction at a given point

$\kappa = \omega \div v$



In [26]:
# finds change in each element, appends to list, and adds it to another DataFrame
def find_change_in(column):
    # store change in quantity
    change = []

    for i, x in enumerate(column):
        # skip first iteration because there is nothing before the first item in list
        if i == 0:
            continue

        final = column[i]
        initial = column[i-1]

        change.append(final - initial)
        
    # returns all change in quantities
    return pd.Series(change, dtype='float64')

In [27]:
def calculate_metrics(df):
    # create new dataframe to store calculations
    calc_col = ['Change in Time', 'Change in X', 'Change in Y', 'Horizontal Velocity', 'Vertical Velocity', 'Speed',
               'Acceleration', 'Jerk', 'Angle of Movement', 'Angular Velocity']
    calculations = pd.DataFrame(columns=calc_col)

    # find change in time, horizontal velocity, and vertical velocity
    calculations['Change in Time'] = find_change_in(df['Time Since Start of Test'])
    calculations['Change in X'] = find_change_in(df['Mouse X Position'])
    calculations['Change in Y'] = find_change_in(df['Mouse Y Position'])

    # find speed and velocity components
    calculations['Horizontal Velocity'] = calculations['Change in X'] / calculations['Change in Time']
    calculations['Vertical Velocity'] = calculations['Change in Y'] / calculations['Change in Time']
    calculations['Speed'] = np.sqrt(calculations['Horizontal Velocity'] ** 2 + calculations['Vertical Velocity'] ** 2)

    #######

    # find acceleration
    change_in_speed = find_change_in(calculations['Speed'])
    calculations['Acceleration'] = change_in_speed / calculations['Change in Time'][:len(change_in_speed)]

    # find jerk
    change_in_acceleration = find_change_in(calculations['Acceleration'])
    calculations['Jerk'] = change_in_acceleration / calculations['Change in Time'][:len(change_in_acceleration)]

    #######

    # find angle that mouse was moved
    trig_ratios = calculations['Change in Y'] / calculations['Change in X']
    angles = []

    for i, x in enumerate(trig_ratios):
        angle = atan(x)
        # if angle is negative, make it positive
        if angle < 0:
            angle += 2 * pi
        angles.append(angle)
    calculations['Angle of Movement'] = angles

    # find angular velocity
    change_in_angles = find_change_in(calculations['Angle of Movement'])
    calculations['Angular Velocity'] = change_in_angles / calculations['Change in Time'][:len(change_in_angles)]

    # approximates curvature curvature
    calculations['Curvature'] = calculations['Angular Velocity'] / calculations['Speed']

    ########

    # replace NaN vars with 0
    calculations.fillna(0, inplace=True)
    
    return calculations

## Segmentation
In this section, we determine what a user has done based on data. The columns are `Mouse Button Pressed` (affected button) and `Pressed` (whether or not affected button was pressed)
- single-click: [(`Button` and `True`), (`Button` and `False`)] and no change in mouse position. change in time must be less than 0.15 seconds
- drag and drop: (`Button` and `TRUE`) and `change_in_time` > 0.15 seconds and change in coordinates
- mouse movement: (`FALSE` and `FALSE`) and change in mouse position. click is more than 0.5 seconds after
- point-click: mouse movement followed by single-click less than 0.5 seconds after
    - sets all mouse movement inside mouse movement segment to point-click

### Impossible Values
Values below can never be `True` together
- Mouse Movement, Single Click
- Point Click, Drag and Drop
- Single Click, Drag and Drop

Check the above by calling `segmentation.value_counts()`

Note, some entries *may* violate the above rules. However, they are very rare and may not affect the program that much.

In [28]:
def segmentation(df, calculations):
    # create segmentation dataframe
    seg_col = ['Mouse Movement', 'Single Click', 'Point Click', 'Drag and Drop']
    segmentation = pd.DataFrame(columns=seg_col)

    #########################

    # find mouse clicks
    for i, x in enumerate(df['Mouse Button Clicked']):
        if i == 0:
            continue
            
        if x != 'False':
            # checking if data switches from true to false
            if df['Pressed'][i-1] == True and df['Pressed'][i] == False and calculations['Change in Time'][i-1] <= 0.15:
                segmentation.at[i-1, 'Single Click'] = 1
                segmentation.at[i, 'Single Click'] = 1
            else:
                segmentation.at[i, 'Single Click'] = 0
        else:
            segmentation.at[i, 'Single Click'] = 0

    #########################

    # find mouse movement
    for i, x in enumerate(calculations['Change in X']):
            if (calculations['Change in X'][i] != 0 or calculations['Change in Y'][i] != 0) and df['Mouse Button Clicked'][i] == 'False':
                segmentation.at[i, 'Mouse Movement'] = 1
            else:
                segmentation.at[i, 'Mouse Movement'] = 0

    #########################           

    # find drag and drop actions
    for i, x in enumerate(df['Mouse Button Clicked']):
        if i == 0:
            continue

        if (segmentation['Mouse Movement'][i] == 1 and df['Mouse Button Clicked'][i] != 'False') or (df['Pressed'][i-1] == True and df['Pressed'][i] == True):
            segmentation.at[i-1, 'Drag and Drop'] = 1
        else:
            segmentation.at[i-1, 'Drag and Drop'] = 0

    #########################

    """
    Find point click

    The following code:
    1. Finds all single-click indices
    2. Loops backwards from each single-click indice.
        - If mouse movement is true and change in time is less than 0.5 (rounded to nearest tenth), set Point Click to True
        - Else, set to Point Click to False
    """

    single_click_indices = segmentation[segmentation['Single Click'] == 1].index

    for i, x in enumerate(single_click_indices):
        # skip first iteration
        if i == 0:
            continue
        # check 0th entry in first iteration
        if i == 1:
            # start at first item and head to 0
            for j, x in enumerate(list(range(single_click_indices[i], 0, -1))):
                if round(calculations['Change in Time'][x-1], 1) <= 0.5:
                    segmentation.at[x, 'Point Click'] = 1
                else:
                    segmentation.at[x, 'Point Click'] = 0

        for j, x in enumerate(list(range(single_click_indices[i], single_click_indices[i-1], -1))):
            if round(calculations['Change in Time'][x-1], 1) <= 0.5 and (segmentation['Single Click'][x] == 1):
                segmentation.at[x, 'Point Click'] = 1
            else:
                segmentation.at[x, 'Point Click'] = 0
    
    segmentation.fillna(0, inplace=True)
    # return data when done
    return segmentation

## One-Hot Encoding
Some data is still not numerical. This section will make all data numeric.

In [29]:
# all data should be in one dataframe by now
def one_hot_encode(df):
    col=['Button.left', 'Button.middle', 'Button.right', 'FALSE']
    # one hot encode 'Mouse Button Clicked' column
    dummies = pd.get_dummies(df['Mouse Button Clicked'], columns=col)
    for column in col:
        try:
            # check if data exists in column
            dummies[column].iloc[0]
        except:
            # set all values in column to False if there is no data there
            dummies[column] = [0] * len(dummies.iloc[:, 0])
    
    df = pd.concat([df, dummies], axis=1)
    df.drop('Mouse Button Clicked', inplace=True, axis=1)
    
    try:
        df.drop('False', axis=1, inplace=True)
    except:
        pass
    
    # one hot encode 'Pressed' column
    dummies = pd.get_dummies(df['Pressed'], drop_first=True)
    dummies.columns = ['Button Pressed']
    df = pd.concat([df, dummies], axis=1)
    df.drop('Pressed', axis=1, inplace=True)
    
    return df

## Categorize Data
Adds `Cheater` column based on whether file name has `cheater` or `innocent` in it. This will help the AI find the correct answers later on.

In [30]:
def data_categorizer(file, is_cheater):
    if is_cheater:
        file['Cheater'] = 1
    else:
        file['Cheater'] = 0
    
    return file

## Import File
Import all CSV files in the `Mouse Logs` folder and export them to the `Output` folder.

In [31]:
def get_all_files_in_folder(folder, cwd):  
    # get current path and requested folder
    path = str(cwd + f'\\{folder}')

    # find all csv files
    csv_files = glob.glob(os.path.join(path, "*.csv"))

    return csv_files

In [32]:
def data_processing(file, id_num, folder='Mouse Logs'):
    df = pd.read_csv(f'{folder}\\{file}')
    df.columns = ['Time Since Start of Test', 'Mouse X Position', 'Mouse Y Position', 'Mouse Button Clicked', 'Pressed']
    
    # check if df is empty
    try:
        df.iloc[0]
    except:
        print(f'ERROR: {file} is empty!')
        # return empty Series to show that file is empty
        return pd.Series(dtype='float64')

    calc = calculate_metrics(df)  # perform calculations
    segment = segmentation(df, calc)  # segment mouse movements

    # combine original data with calculations and segmentation
    output = pd.concat([df, calc, segment], axis=1)
    
    # add final touches to data
    output = one_hot_encode(output)
    output.fillna(0, inplace=True)
    
    # add ID number to file
    output['File ID'] = id_num

    # replace infinity values with highest max value in column
    for i, x in enumerate(output.columns):
        max_value = output.loc[output[x] != np.inf, x].max()
        output[x].replace(np.inf, max_value, inplace=True)
        output[x].replace(-1 * np.inf, max_value, inplace=True)
    
    return output

In [33]:
def check_folder(folder, path='./'):
    # if folder does not exist, make folder
    if folder in os.listdir():
        print(f'{folder} directory already exists!')
    else:
        os.mkdir(f'{path}{folder}/')
        print(f'Created {folder}')


def create_output_file(file, file_name, folder='Processed Data'):
    file_location = os.path.join(output_folder, file_name)        
    file.to_csv(file_location)

In [44]:
# Yield successive chunks of size n from length l.
def divide_chunks(l, n):
    # looping till length l
    for i in range(0, len(l), n):
        yield l[i:i + n]

In [46]:
# get file names from input_folder
input_folder = 'Useful Logs/4tests Algebra Test'
output_folder = 'Useful Processed Data'
cwd = os.getcwd()
files = [i.split('\\')[-1] for i in get_all_files_in_folder(input_folder, cwd)]

check_folder(output_folder)

# calculate data
for i, file in enumerate(files):
    start = time.time()
    input_file = data_processing(file, i, input_folder)
    
    # if file is empty, skip the file
    if input_file.empty:
        continue
    
    # determine if file is cheater or not
    if 'innocent' in file:
        data_categorizer(input_file, is_cheater=False)
    elif 'cheater' in file:
        data_categorizer(input_file, is_cheater=True)
    
    # output to file in output_folder
    file_location = os.path.join(output_folder, file)        
    input_file.to_csv(file_location)
    
    end = time.time()
    total_time = round(end-start, 4)
    print(f'File {i+1} of {len(files)} completed. Time: {total_time}. File: {file}')
    
# create training and test folders in output_folder
print()
check_folder('test')
check_folder('training')

folder = os.chdir(output_folder)

output_files = [i.split('\\')[-1] for i in get_all_files_in_folder(output_folder, cwd)]

# split files into innocent and cheating
innocent = []
cheater = []
for i, file in enumerate(output_files):
    if 'innocent' in file:
        innocent.append(file)
    elif 'cheater' in file:
        cheater.append(file)

    # move 2/3 of files to training
    # 1. split list into 3
    # move two parts to training
    # move one part to test
    innocent_split = list(divide_chunks(innocent, 3))
    cheater_split = list(divide_chunks(cheater, 3))
    print(innocent_split)
    # todo
    # move rest to test
    
    
os.chdir('..')
os.getcwd()

Useful Processed Data directory already exists!
File 1 of 4 completed. Time: 0.1047. File: 2023-04-20T13.57.20_mouse_log_innocent.csv
File 2 of 4 completed. Time: 0.2155. File: 2023-04-20T14.11.26_mouse_log_cheater.csv
File 3 of 4 completed. Time: 0.1572. File: 2023-04-20T14.22.32_mouse_log_innocent.csv
File 4 of 4 completed. Time: 0.3471. File: 2023-04-20T14.48.59_mouse_log_cheater.csv

test directory already exists!
training directory already exists!
[['2023-04-20T13.57.20_mouse_log_innocent.csv']]
[['2023-04-20T13.57.20_mouse_log_innocent.csv']]
[['2023-04-20T13.57.20_mouse_log_innocent.csv', '2023-04-20T14.22.32_mouse_log_innocent.csv']]
[['2023-04-20T13.57.20_mouse_log_innocent.csv', '2023-04-20T14.22.32_mouse_log_innocent.csv']]


'C:\\Users\\92.89\\OneDrive - Pinellas County Schools\\2022-23 School Year\\6 EIP\\Mouse Tracker'

## Random Code Snippets
Following code snippets may help when debugging program

### Checking Point Click

```
from random import randint
j = randint(0, len(segmentation['Single Click']))
j = 171
print(f'j={j}')

print(segmentation['Mouse Movement'][j] or segmentation['Single Click'][j])
print(round(calculations['Change in Time'][j-1], 1))

print(segmentation['Point Click'][j])

print('\n')

print('Calculations')
display(calculations.iloc[j-5:j+5])
print()

print('Segmentation')
display(segmentation.iloc[j-5:j+5])
print()

print('Raw Data')
display(df.iloc[j-5:j+5])
print()

print((segmentation['Mouse Movement'][j] == True and x != 'False') or (df['Pressed'][j-1] == True and df['Pressed'][j] == True))
```

## Future Debugging
Below code is used for future debugging. This will only process for *one* file.

Please remove the backticks in the snippets.

```python
import pandas as pd
df = pd.read_csv('./Mouse Logs/2023-03-31T13.16.09_mouse_log.csv')
```

```python
# create new dataframe to store calculations
calc_col = ['Change in Time', 'Change in X', 'Change in Y', 'Horizontal Velocity', 'Vertical Velocity', 'Speed',
           'Acceleration', 'Jerk', 'Angle of Movement', 'Angular Velocity']
calculations = pd.DataFrame(columns=calc_col)

# find change in time, horizontal velocity, and vertical velocity
calculations['Change in Time'] = find_change_in(df['Time Since Start of Test'])
calculations['Change in X'] = find_change_in(df['Mouse X Position'])
calculations['Change in Y'] = find_change_in(df['Mouse Y Position'])

# find speed and velocity components
calculations['Horizontal Velocity'] = calculations['Change in X'] / calculations['Change in Time']
calculations['Vertical Velocity'] = calculations['Change in Y'] / calculations['Change in Time']
calculations['Speed'] = np.sqrt(calculations['Horizontal Velocity'] ** 2 + calculations['Vertical Velocity'] ** 2)

#######

# find acceleration
change_in_speed = find_change_in(calculations['Speed'])
calculations['Acceleration'] = change_in_speed / calculations['Change in Time'][:len(change_in_speed)]

# find jerk
change_in_acceleration = find_change_in(calculations['Acceleration'])
calculations['Jerk'] = change_in_acceleration / calculations['Change in Time'][:len(change_in_acceleration)]

#######

# find angle that mouse was moved
trig_ratios = calculations['Change in Y'] / calculations['Change in X']
angles = []

for i, x in enumerate(trig_ratios):
    angle = atan(x)
    # if angle is negative, make it positive
    if angle < 0:
        angle += 2 * pi
    angles.append(angle)
calculations['Angle of Movement'] = angles

# find angular velocity
change_in_angles = find_change_in(calculations['Angle of Movement'])
calculations['Angular Velocity'] = change_in_angles / calculations['Change in Time'][:len(change_in_angles)]

# approximates curvature curvature
calculations['Curvature'] = calculations['Angular Velocity'] / calculations['Speed']

########

# replace NaN vars with 0
calculations.fillna(0, inplace=True)

calculations
```

```python
# create segmentation dataframe
seg_col = ['Mouse Movement', 'Single Click', 'Point Click', 'Drag and Drop']
segmentation = pd.DataFrame(columns=seg_col)

#########################

# find mouse clicks
for i, x in enumerate(df['Mouse Button Clicked']):
    if i == 0:
        continue

    if x != 'False':
        # checking if data switches from true to false
        if df['Pressed'][i-1] == True and df['Pressed'][i] == False and calculations['Change in Time'][i-1] <= 0.15:
            segmentation.at[i-1, 'Single Click'] = 1
            segmentation.at[i, 'Single Click'] = 1
        else:
            segmentation.at[i, 'Single Click'] = 0
    else:
        segmentation.at[i, 'Single Click'] = 0

#########################

# find mouse movement
for i, x in enumerate(calculations['Change in X']):
        if (calculations['Change in X'][i] != 0 or calculations['Change in Y'][i] != 0) and df['Mouse Button Clicked'][i] == 'False':
            segmentation.at[i, 'Mouse Movement'] = 1
        else:
            segmentation.at[i, 'Mouse Movement'] = 0

#########################           

# find drag and drop actions
for i, x in enumerate(df['Mouse Button Clicked']):
    if i == 0:
        continue

    if (segmentation['Mouse Movement'][i] == 1 and df['Mouse Button Clicked'][i] != 'False') or (df['Pressed'][i-1] == True and df['Pressed'][i] == True):
        segmentation.at[i-1, 'Drag and Drop'] = 1
    else:
        segmentation.at[i-1, 'Drag and Drop'] = 0

#########################

"""
Find point click

The following code:
1. Finds all single-click indices
2. Loops backwards from each single-click indice.
    - If mouse movement is true and change in time is less than 0.5 (rounded to nearest tenth), set Point Click to True
    - Else, set to Point Click to False
"""

single_click_indices = segmentation[segmentation['Single Click'] == 1].index

for i, x in enumerate(single_click_indices):
    # skip first iteration
    if i == 0:
        continue
    # check 0th entry in first iteration
    if i == 1:
        # start at first item and head to 0
        for j, x in enumerate(list(range(single_click_indices[i], 0, -1))):
            if round(calculations['Change in Time'][x-1], 1) <= 0.5:
                segmentation.at[x, 'Point Click'] = 1
            else:
                segmentation.at[x, 'Point Click'] = 0

    for j, x in enumerate(list(range(single_click_indices[i], single_click_indices[i-1], -1))):
        if round(calculations['Change in Time'][x-1], 1) <= 0.5 and (segmentation['Single Click'][x] == 1):
            segmentation.at[x, 'Point Click'] = 1
        else:
            segmentation.at[x, 'Point Click'] = 0

segmentation.fillna(0, inplace=True)
# return data when done
segmentation
```