# MouseMovementClassifier

The goal of this notebook is to classify users based on their mouse movements. In the "data" directory you will find .csv files that contain the Y coordinates describing how a user moved the mouse in order to press a button. The file names are used as labels for the data. 

For the sake of simplicity, we are omitting X coordinates. We will use simple Deep Neural Networks to classify users. We can discuss some ways to improve our data collection and classification afterwards.

In [None]:
import csv
import numpy as np
from collections import Counter
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint
from keras.layers import Input, Dense, Dropout, BatchNormalization
from keras.utils import to_categorical
from keras.models import Model
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import os

In [None]:
def get_data():
    """Read every CSV file below ``./data`` and build the sample lists.

    Each non-empty CSV row is treated as one recorded movement (a sequence of
    integer Y coordinates). The row is converted into a feature vector by
    ``transform_input`` and labelled with the name of the file it came from,
    without the ``.csv`` extension.

    Returns:
        A tuple ``(inputs, labels)`` of two equally long lists: the feature
        vectors and the matching label strings.
    """
    inputs = []
    labels = []

    for root, dirs, files in os.walk("./data"):
        # os.walk yields entries in arbitrary order; sorting makes the sample
        # order (and therefore any seeded split) reproducible across machines.
        dirs.sort()
        for filename in sorted(files):
            if not filename.endswith('.csv'):
                # Ignore stray files such as .DS_Store or editor backups.
                continue
            label = filename[:-len('.csv')]
            # Join with root: os.walk also descends into sub-directories, so
            # the file does not necessarily live directly in ./data.
            with open(os.path.join(root, filename), newline='') as csv_file:
                for row in csv.reader(csv_file, delimiter=','):
                    if not row:
                        # Blank lines carry no movement data.
                        continue
                    inputs.append(transform_input([int(elem) for elem in row]))
                    labels.append(label)
    return inputs, labels

# Data Transformation

In order to classify users based on our data, we need to extract meaningful features. We could treat each sample as a sequence, but plain fully connected Deep Neural Networks are not well suited to sequence data (Convolutional or Recurrent Neural Networks would be better for this task).

We will try to come up with some features that describe a user's movement in general. 

For example, we could use common measures such as the mean or median. Furthermore, we could split each data entry into two halves to capture more information about the movement style in the first and second half of the movement. 

You can edit the function below if you want. "transform_input" is called while the data is being read in "get_data" (see above).

In [None]:
# modify features if needed

def transform_input(input):
    """Summarise one recorded movement as a fixed-length feature vector.

    The sequence is split at its midpoint into a first and a second half and
    the same statistics are computed for both halves.

    Args:
        input: Sequence of numeric Y coordinates with at least two entries.

    Returns:
        A list of 12 values: initial position, end position, and then
        (first half, second half) pairs of length, maximum, minimum, mean
        and median.

    Raises:
        ValueError: If ``input`` has fewer than two entries, because one of
            the halves would be empty.
    """
    if len(input) < 2:
        raise ValueError(
            'transform_input needs at least 2 coordinates, got {}'.format(len(input)))

    middle = round(len(input) / 2)
    # The slice up to the midpoint is the *first* half of the movement.
    first_half = input[:middle]
    second_half = input[middle:]

    # initial position, end position
    output = [input[0], input[-1]]

    # number of movements, highest Y, lowest Y, avg Y, median Y -- each
    # reported for the first half and then for the second half
    for summarise in (len, max, min, np.mean, np.median):
        output.append(summarise(first_half))
        output.append(summarise(second_half))

    return output

The libraries that we are using require us to encode our labels. Furthermore, we use a MinMaxScaler to normalize the inputs to a common range, so that features with very large or very small values do not dominate the others.

In [None]:
def encode_labels(labels):
    """Turn string labels into one-hot vectors.

    Prints the list of distinct classes found by the encoder.

    Args:
        labels: Sequence of label values, one per sample.

    Returns:
        A tuple ``(one_hot_labels, label_encoder)``: the one-hot encoded
        labels and the fitted ``LabelEncoder`` needed to map class indices
        back to the original label values.
    """
    label_encoder = LabelEncoder()
    # Fitting and transforming the same data in one step.
    class_indices = label_encoder.fit_transform(labels)
    one_hot_labels = to_categorical(class_indices)
    print(label_encoder.classes_.tolist())

    return one_hot_labels, label_encoder

In [None]:
def scale_inputs(inputs):
    """Rescale the feature vectors with a freshly fitted ``MinMaxScaler``.

    Args:
        inputs: Feature vectors, one per sample.

    Returns:
        The scaled features as returned by ``MinMaxScaler.fit_transform``.
    """
    # The scaler itself is not needed afterwards, so it is not kept around.
    return MinMaxScaler().fit_transform(inputs)

# Deep Neural Network

The function below is used to dynamically generate a DNN. We can specify the number of layers, the number of neurons per layer and the dropout rate of each layer.

In [None]:
def create_DNN_model(
    input_dim,
    output_dim,
    optimizer='adam',
    dense_neurons=(150,150,150),
    dropout=(0.1,0.1,0.1),
    model_name='DNN_model'):
    """Build and compile a fully connected softmax classifier.

    Every hidden layer is a ``Dense`` layer with ReLU activation followed by
    ``Dropout`` and ``BatchNormalization``. Hidden layers are named
    ``dense_1``, ``dense_2``, ... and the output layer ``dense_out``. The
    model summary is printed before the model is returned.

    Args:
        input_dim: Number of input features.
        output_dim: Number of output classes.
        optimizer: Optimizer passed to ``model.compile``.
        dense_neurons: Number of units of each hidden layer; its length
            determines the number of hidden layers.
        dropout: Dropout rate of each hidden layer. Entries beyond the number
            of hidden layers are ignored.
        model_name: Name given to the model.

    Returns:
        The compiled Keras ``Model``.

    Raises:
        ValueError: If ``dense_neurons`` is empty or ``dropout`` has fewer
            entries than ``dense_neurons``.
    """
    if len(dense_neurons) == 0:
        raise ValueError('dense_neurons must contain at least one layer size')
    if len(dropout) < len(dense_neurons):
        # zip() below would otherwise silently drop the trailing layers.
        raise ValueError(
            'dropout has {} entries but dense_neurons defines {} layers'.format(
                len(dropout), len(dense_neurons)))

    model_input = Input(shape=(input_dim,), dtype='float32')

    x = model_input
    for layer_count, (neurons, rate) in enumerate(zip(dense_neurons, dropout), start=1):
        x = Dense(neurons, name='dense_' + str(layer_count), activation='relu')(x)
        x = Dropout(rate)(x)
        x = BatchNormalization()(x)

    model_output = Dense(output_dim, name='dense_out', activation='softmax')(x)

    # Pass the name to the constructor: assigning model.name afterwards fails
    # on Keras versions where the attribute is read-only.
    model = Model(inputs=model_input, outputs=model_output, name=model_name)
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()
    return model

In [None]:
def evaluate_model(model,weights,label_encoder,test_inputs,test_labels):
    """Load saved weights, report test accuracy and plot a confusion matrix.

    Prints the share of the most frequent true class (the accuracy of always
    predicting that class) and the accuracy reported by ``model.evaluate``,
    then shows the confusion matrix of true versus predicted labels.

    Args:
        model: Compiled Keras model whose architecture matches ``weights``.
        weights: Path of the weights file to load into ``model``.
        label_encoder: Fitted ``LabelEncoder`` used to create the labels.
        test_inputs: 2-D array of test features, one row per sample.
        test_labels: 2-D array of one-hot encoded test labels.

    Raises:
        ValueError: If the test set is empty.
    """
    if len(test_labels) == 0:
        raise ValueError('evaluate_model needs at least one test sample')

    model.load_weights(weights)

    classes = label_encoder.classes_

    # Predict the whole test set in one call instead of one call per sample.
    probabilities = model.predict(np.asarray(test_inputs))
    y_pred = classes[np.argmax(probabilities, axis=1)].tolist()
    y_true = classes[np.argmax(test_labels, axis=1)].tolist()

    max_group = Counter(y_true).most_common(1)[0][1]
    print('Most common class: {}'.format(max_group/len(y_true)))
    score = model.evaluate(test_inputs, test_labels, verbose=0)
    print("Accuracy: ", score[1])

    cm_labels = classes.tolist()

    # Shorten the x tick labels so they do not overlap.
    x_labels = [l[:7] for l in cm_labels]
    cm = confusion_matrix(y_true,y_pred,labels=cm_labels)
    fig = plt.figure(figsize=(8,8))
    ax = fig.add_subplot(111)
    cax = ax.matshow(cm)
    plt.title('Confusion Matrix')
    fig.colorbar(cax)
    ax.set_xticks(np.arange(len(x_labels)))
    ax.set_xticklabels(x_labels)
    ax.set_yticks(np.arange(len(cm_labels)))
    ax.set_yticklabels(cm_labels)
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.show()

Call defined functions to transform our data

In [None]:
# Load the raw samples first; both derived arrays below depend only on them.
inputs, labels = get_data()

# One-hot targets plus the encoder needed to decode predictions later on.
encoded_labels, label_encoder = encode_labels(labels)

# Features rescaled by scale_inputs.
scaled_inputs = scale_inputs(inputs)

# GridSearch (optional)

In order to find good parameters for our model, we can perform a grid search. Parameter tuning can take a lot of time and we could speed it up significantly through parallel processing (Hello Spark!)

In [None]:
# Wrap the model factory so scikit-learn can build one network per
# parameter combination; the data dimensions are fixed for all of them.
model = KerasClassifier(
    build_fn=create_DNN_model,
    input_dim=scaled_inputs.shape[1],
    output_dim=encoded_labels.shape[1],
)

# Candidate values for every tunable parameter.
param_grid = {
    'epochs': [50],
    'batch_size': [8],
    'dense_neurons': [(64,), (32, 16)],
    'dropout': [(0.25, 0.25)],
    'optimizer': ['adam'],
}

In [None]:
# Shuffled 4-fold cross-validation with a fixed seed.
cv = KFold(n_splits=4, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=cv,
    n_jobs=1,
    verbose=2,
)

# The extra keyword arguments are passed on to the estimator's fit call.
grid_result = grid.fit(
    scaled_inputs,
    encoded_labels,
    validation_split=0.2,
    shuffle=True,
)

In [None]:
# Best combination first, then one line per evaluated parameter set.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

means, stds, params = (
    grid_result.cv_results_[key]
    for key in ('mean_test_score', 'std_test_score', 'params')
)
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

# Training the final model

Once we have a good idea of the parameters that we want to use, we can train and evaluate our final model. The final model can easily be exported and deployed to production.

In [None]:
# Hold out 20% of the samples for the final evaluation; stratifying on the
# original string labels keeps the class proportions equal in both parts.
train_inputs, test_inputs, train_labels, test_labels = train_test_split(
    scaled_inputs,
    encoded_labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Single hidden layer network.
model = create_DNN_model(
    input_dim=scaled_inputs.shape[1],
    output_dim=encoded_labels.shape[1],
    optimizer='adam',
    dense_neurons=(64,),
    dropout=(0.25,),
    model_name='DNN_model',
)

# With save_best_only=True the file is only overwritten when the
# checkpoint's monitored quantity improves.
checkpointer = ModelCheckpoint(
    filepath='best_model.h5',
    verbose=1,
    save_best_only=True,
)

hist = model.fit(
    train_inputs,
    train_labels,
    batch_size=16,
    epochs=200,
    callbacks=[checkpointer],
    validation_split=0.2,
    verbose=1,
)

In [None]:
# Score the checkpointed weights on the held-out test split.
evaluate_model(
    model=model,
    weights='best_model.h5',
    label_encoder=label_encoder,
    test_inputs=test_inputs,
    test_labels=test_labels,
)

# How can we improve our data and classifier?

* ...