# User Authentication Based on Mouse Characteristics #

## Load Packages ##

In [1]:
import pandas as pd
import numpy as np

import os
import pickle
import copy

# preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# algorithms
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
# from xgboost import XGBClassifier

# optimization
from sklearn.model_selection import RandomizedSearchCV

# performance
from sklearn.metrics import roc_auc_score

# random seed 
np.random.seed(0)

  return f(*args, **kwds)
  return f(*args, **kwds)


## Load Data ##

In [2]:
data_dir = '/home/lee/Documents/DatasetsForGitHub/balabit_mouse_dynamics_challenge/'

In [3]:
all_train = pd.read_pickle(data_dir + 'all_training_aggregation.pickle')

In [4]:
file_paths = []

for root, dirs, files in os.walk(data_dir + "training_files/"):
    for file in files:
        file_paths.append(os.path.join(root, file))

# randomly pick 66% of all training sessions, use these sessions to train classification models
draw_train = np.random.randint(low=0, high=len(file_paths), size=np.floor(len(file_paths)*0.66).astype('int'))
train_users = list(map(lambda x: x.split(os.path.sep)[-2], [file_paths[y] for y in draw_train]))
train_sessions = list(map(lambda x: x.split(os.path.sep)[-1], [file_paths[y] for y in draw_train]))
df_train = all_train[all_train['user'].isin(train_users) & all_train['session'].isin(train_sessions)]

# the rest of the sessions are validation data
draw_val = list(set(range(len(file_paths))) - set(draw_train))
val_users = list(map(lambda x: x.split(os.path.sep)[-2], [file_paths[y] for y in draw_val]))
val_sessions = list(map(lambda x: x.split(os.path.sep)[-1], [file_paths[y] for y in draw_val]))
df_val = all_train[all_train['user'].isin(val_users) & all_train['session'].isin(val_sessions)]

## Process Data ##

In [5]:
le_user = LabelEncoder()
le_categ = LabelEncoder()

oh_user = OneHotEncoder()
oh_categ = OneHotEncoder()

In [6]:
y_train = le_user.fit_transform(df_train['user'])

# label encode
df_train['categ_le'] = le_categ.fit_transform(df_train['categ_agg'])

# one-hot encode
vec_size = df_train['categ_agg'].nunique()
df_train[['oh_categ{}'.format(i) \
          for i in range(vec_size)]] = \
        pd.DataFrame(oh_categ.fit_transform(\
                df_train['categ_le'].values.reshape(len(df_train['categ_le']), 1)).todense(), \
             index=df_train.index)

X_train = df_train.drop(['categ_agg', 'session', 'categ_le', 'user'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In [7]:
y_val = le_user.transform(df_val['user'])

# label encode
df_val['categ_le'] = le_categ.transform(df_val['categ_agg'])

# one-hot encode
df_val[['oh_categ{}'.format(i) \
          for i in range(vec_size)]] = \
        pd.DataFrame(oh_categ.transform(\
                df_val['categ_le'].values.reshape(len(df_val['categ_le']), 1)).todense(), \
             index=df_val.index)

X_val = df_val.drop(['categ_agg', 'session', 'categ_le', 'user'], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


## Fit Models ##

Define a few classification models. 

In [8]:
clf_lgb = LGBMClassifier(random_state=0)
# clf_xgb = XGBClassifier(random_state=0)
# clf_rf = RandomForestClassifier(random_state=0)
# clf_lr = LogisticRegression(random_state=0)

For a given user in the training sessions, label their mouse actions as legal (`is_illegal`=0). All the other users' mouse actions are labeled illegal (`is_illegal`=1). Loop over all users. 

In [9]:
for user in le_user.classes_:
    df = df_train.copy()
    df['is_illegal'] = 0
    # 1 = illegal session, 0 = legal session 
    df.loc[df['user'] != user, 'is_illegal'] = 1
    X = df.drop(['categ_agg', 'session', 'categ_le', 'user', 'is_illegal'], axis=1)
    y = df['is_illegal']

    exec('clf_lgb_' + user + " = LGBMClassifier(random_state=0)")
    exec('clf_lgb_' + user + ".fit(X, y)")
    
    auc = eval('roc_auc_score(y, clf_lgb_' + user + ".predict_proba(X)[:, 1])")

    print("ROC AUC in training data for {0}: {1:0.4}".format(user, auc))
    
    del df, X, y

ROC AUC in training data for user12: 0.9014
ROC AUC in training data for user15: 0.9653
ROC AUC in training data for user16: 0.871
ROC AUC in training data for user20: 0.9884
ROC AUC in training data for user21: 0.9131
ROC AUC in training data for user23: 0.8894
ROC AUC in training data for user29: 0.9166
ROC AUC in training data for user35: 0.8926
ROC AUC in training data for user7: 0.9868
ROC AUC in training data for user9: 0.9959


In [10]:
for user in le_user.classes_:
    df = df_val.copy()
    df['is_illegal'] = 0
    # 1 = illegal session, 0 = legal session 
    df.loc[df['user'] != user, 'is_illegal'] = 1
    X = df.drop(['categ_agg', 'session', 'categ_le', 'user', 'is_illegal'], axis=1)
    y = df['is_illegal']

    auc = eval('roc_auc_score(y, clf_lgb_' + user + ".predict_proba(X)[:, 1])")

    print("ROC AUC in validation data for {0}: {1:0.4}".format(user, auc))
    
    del df, X, y

ROC AUC in validation data for user12: 0.784
ROC AUC in validation data for user15: 0.769
ROC AUC in validation data for user16: 0.7294
ROC AUC in validation data for user20: 0.6121
ROC AUC in validation data for user21: 0.7616
ROC AUC in validation data for user23: 0.7258
ROC AUC in validation data for user29: 0.7612
ROC AUC in validation data for user35: 0.7348
ROC AUC in validation data for user7: 0.9869
ROC AUC in validation data for user9: 0.9909


For users other than 7 and 9 we seem to have overfit to training data. Now use cross-validation to correct overfitting.

In [11]:
# candidate hyperparameters
gridParams = {
    'num_leaves': [6, 8, 12, 16, 24],
    'min_data_in_leaf': [24, 32, 40], 
    'max_bin': [32, 64, 128],
    'max_depth': [8, 16, 32]
    }

In [12]:
for user in le_user.classes_:
    df = all_train.copy()

    # encode
    df['categ_le'] = le_categ.transform(df['categ_agg'])

    df[['oh_categ{}'.format(i) \
              for i in range(vec_size)]] = \
            pd.DataFrame(oh_categ.transform(\
                    df['categ_le'].values.reshape(len(df['categ_le']), 1)).todense(), \
                 index=df.index)
    
    # define target label
    df['is_illegal'] = 0

    df.loc[df['user'] != user, 'is_illegal'] = 1
    X = df.drop(['categ_agg', 'session', 'categ_le', 'user', 'is_illegal'], axis=1)
    y = df['is_illegal']
    
    # randomized grid search
    clf_lgb = LGBMClassifier(random_state=0)

    random_search = RandomizedSearchCV(clf_lgb, scoring='roc_auc', param_distributions=gridParams)
    random_search.fit(X, y)

    # best hyperparameters
    params = dict()    
    params['num_leaves'] = random_search.best_params_['num_leaves']
    params['min_data_in_leaf'] = random_search.best_params_['min_data_in_leaf']
    params['max_bin'] = random_search.best_params_['max_bin']
    params['max_depth'] = random_search.best_params_['max_depth']

    # re-fit models
    clf_lgb = LGBMClassifier(random_state=0, **params)
    clf_lgb.fit(X, y)
    
    # AUC score
    auc = roc_auc_score(y, clf_lgb.predict_proba(X)[:, 1])
    print("ROC AUC for {0}: {1:0.4}".format(user, auc))

    # save models for each user
    exec('clf_lgb_' + user + " = copy.deepcopy(clf_lgb)")
    
    del df, X, y, random_search, clf_lgb, auc

ROC AUC for user12: 0.8313
Finished loading model, total used 100 iterations
ROC AUC for user15: 0.8676
Finished loading model, total used 100 iterations
ROC AUC for user16: 0.7991
Finished loading model, total used 100 iterations
ROC AUC for user20: 0.8384
Finished loading model, total used 100 iterations
ROC AUC for user21: 0.8466
Finished loading model, total used 100 iterations
ROC AUC for user23: 0.8222
Finished loading model, total used 100 iterations
ROC AUC for user29: 0.8481
Finished loading model, total used 100 iterations
ROC AUC for user35: 0.8279
Finished loading model, total used 100 iterations
ROC AUC for user7: 0.9829
Finished loading model, total used 100 iterations
ROC AUC for user9: 0.9943
Finished loading model, total used 100 iterations


## Apply to Test Data ##

Owner of the challenge dataset did not release the true labels of their entire test set, only the subset that was used to calculate the public leaderboard score. I use this subset to calculate my public leaderboard score. 

In [13]:
# all test data
all_test = pd.read_pickle(data_dir + 'all_testing_aggregation.pickle')

# public leaderboard subset
public_test_labels = pd.read_csv(data_dir + 'public_labels.csv')
public_test = all_test[all_test['session'].isin(public_test_labels['filename'])]

public_test['categ_le'] = le_categ.fit_transform(public_test['categ_agg'])
vec_size = public_test['categ_agg'].nunique()
public_test[['oh_categ{}'.format(i) for i in range(vec_size)]] = \
        pd.DataFrame(oh_categ.fit_transform(\
        public_test['categ_le'].values.reshape(len(public_test['categ_le']), 1)).todense(), index=public_test.index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


The submission requirement for the challenge was that each test mouse session has an anomaly score between 0 and 1 that tells how unlikely the remote session was carried out by the respective user account, i.e., a measure of `is_illegal`=1. My classification model gives a predicted probability of `is_illegal`=1 for each mouse action in a given session; then the anomaly score of the session is the mean of the predicted probability of all its actions. 

In [14]:
session_proba = dict()

for session in public_test['session'].unique():
    user_test = public_test.loc[public_test['session'] == session, 'user'].unique()[0]
    data_test = public_test[(public_test['session'] == session)]\
                .drop(['categ_agg', 'session', 'categ_le', 'user'], axis=1)
    
    proba = eval('clf_lgb_' + user_test + ".predict_proba(data_test)[:, 1]")
    session_proba[session] = np.mean(proba)

Now calculate final public score. 

In [15]:
results = pd.DataFrame.from_dict(session_proba, orient='index', columns=['pred_proba'])
public_test_labels.set_index('filename', inplace=True)
compare_to_label = public_test_labels.join(results, sort=False)
print('Final ROC AUC (public score): {0:0.4}'.format(roc_auc_score(compare_to_label['is_illegal'], compare_to_label['pred_proba'], average='macro')))

Final ROC AUC (public score): 0.9075
