In [1]:
import psycopg2
import pandas as pd
import sys
sys.path.append("..")
import credentials
import numpy as np
from datetime import datetime
import time
import string
from tqdm import tnrange, tqdm_notebook, tqdm
from tqdm._tqdm_notebook import tqdm_notebook
import numpy.ma as ma
from multiprocessing import Pool
from scipy.optimize import minimize
from biokey.data import DataInterface
import biokey.tools
# This makes plots render inline

  (fname, cnt))
  (fname, cnt))


In [2]:
import itertools
import multiprocessing

In [2]:
% matplotlib inline

In [4]:
data = DataInterface(credentials.postgres)

Loading Data
	- Attempting cache load
	- Loaded strokes from cache
Processing Data
	- Attempting cache load
	- Loaded dwell and flight from cache
Done Loading



In [5]:
key_press_count = data.get_dwells().key.value_counts()
mask = list(string.ascii_uppercase)
mask.extend(['ESCAPE', 'ENTER', 'SPACE', 'CONTROL', 'META', 'BACKSPACE', 'SHIFT'])

In [6]:
datasets = data.get_all_sets()
# for u in datasets:
#    datasets[u] = biokey.tools.filter_sets(datasets[u], to_include=mask)

In [7]:
dwells = data.get_dwells()

In [8]:
def assign_user_sections(df, seq_threshold):
    df.loc[:,'sect'] = (seq_threshold < df.down.shift(-1) - df.down).cumsum()
    return df

In [9]:
SECT_BREAK_THRESHOLD = 1000*60*10
SECT_KEY_COUNT_THRESHOLD = 100
SEQ_INTERVAL_THRESHOLD = 200
A_THRESHOLD = 1.25

In [10]:
dwells = dwells.groupby('user_id').apply(lambda x: assign_user_sections(x, SECT_BREAK_THRESHOLD)).reset_index(drop=True)

In [11]:
dwells.groupby(['user_id', 'sect']).dwell.count().groupby(level=0).count()

user_id
1d63b44d-a7cb-4ee6-b228-b0ff5b7d086f    393
28513a33-02a3-4b09-88f6-92c9fdb9dcdf    259
3b57c0d9-2b14-4ec2-8b29-ad4e94811962    160
5aa5405e-4574-4dc1-be50-b31e5bafc4cb    247
62042708-5989-4bbf-a106-d7a66281b367     73
6708d19e-6765-4f96-99eb-d94b342aa665    177
72cc4371-4288-4c84-b163-e441a55ea1bf    250
7bb179ed-9f9f-46ce-a403-57e88a92f216    116
91147071-c19d-485f-993e-d1cd2dc889cd    115
df2d03db-469c-4269-bf41-e320ac4c42ad    291
e85b3844-5af0-47ea-bf51-62da4b4efc39    204
f212c9bc-824a-4011-9a70-55a7d2f86ba2     72
fbd73057-951a-406d-8c5a-67f551ad0e80    245
ffbe1c9c-8dab-4ca2-8613-69e0bbc22cc9    170
Name: dwell, dtype: int64

In [12]:
dwells = dwells.groupby(['user_id', 'sect']).apply(lambda x: x if x.dwell.count() > SECT_KEY_COUNT_THRESHOLD else None).reset_index(drop=True)

In [13]:
dwells.groupby(['user_id', 'sect']).dwell.count().groupby(level=0).count()

user_id
1d63b44d-a7cb-4ee6-b228-b0ff5b7d086f    259
28513a33-02a3-4b09-88f6-92c9fdb9dcdf    164
5aa5405e-4574-4dc1-be50-b31e5bafc4cb    201
62042708-5989-4bbf-a106-d7a66281b367     24
6708d19e-6765-4f96-99eb-d94b342aa665     75
72cc4371-4288-4c84-b163-e441a55ea1bf    158
7bb179ed-9f9f-46ce-a403-57e88a92f216     64
91147071-c19d-485f-993e-d1cd2dc889cd     52
df2d03db-469c-4269-bf41-e320ac4c42ad     90
e85b3844-5af0-47ea-bf51-62da4b4efc39    116
f212c9bc-824a-4011-9a70-55a7d2f86ba2     39
fbd73057-951a-406d-8c5a-67f551ad0e80     42
ffbe1c9c-8dab-4ca2-8613-69e0bbc22cc9    106
Name: dwell, dtype: int64

In [14]:
class Section(object):
    def __init__(self, sect, max_depth=4):
        self.graphs = {}
        keys = sect.dropna()[['key', 'down']].values
        for depth in range(2, max_depth+1):
            self.graphs[depth] = {}
            for i in range(len(keys)-depth+1):
                seq = keys[i][0]
                start = keys[i][1]
                include = True
                for j in range(1, depth):
                    seq += '-'+ str(keys[i+j][0])
                    if keys[i+j][1] - keys[i+j-1][1] > SEQ_INTERVAL_THRESHOLD:
                        include = False
                if include:
                    duration = keys[i+depth-1][1] - start
                    if seq not in self.graphs[depth]:
                        self.graphs[depth][seq] = {'mean': duration, 'count': 1}
                    else: 
                        count = self.graphs[depth][seq]['count']
                        mean = self.graphs[depth][seq]['mean']
                        self.graphs[depth][seq]['mean'] = (mean * count + duration)/(count+1)
                        self.graphs[depth][seq]['count'] += 1

In [15]:
sects = dwells.groupby(['user_id', 'sect']).apply(lambda x: Section(x)).to_dict()

In [16]:
def calc_disorder(s1, s2):
    graph_disorder = {}
    # Loop through the various n-graphs
    for depth in (set(s1.keys()) & set(s2.keys())):
        disorder = 0
        # Copy n-graphs by value
        n_graph_1 = {**s1[depth]}
        n_graph_2 = {**s2[depth]}
        
        # Get mutual shared components
        intersection = (set(n_graph_1.keys()) & set(n_graph_2.keys()))
        n_graph_1 = {k: n_graph_1[k] for k in intersection}
        n_graph_2 = {k: n_graph_2[k] for k in intersection}
        
        sorted_graph_1 = sorted(n_graph_1, key=lambda x : n_graph_1[x]['mean'])
        sorted_graph_2 = sorted(n_graph_2, key=lambda x : n_graph_2[x]['mean'])
        
        index_1 = {sorted_graph_1[i]: i for i in range(len(sorted_graph_1))}
        disorders = {sorted_graph_2[i]: np.abs(index_1[sorted_graph_2[i]] - i) for i in range(len(sorted_graph_2))}
        
        a_values = []
        for i in intersection:
            comp_1 = n_graph_1[i]['mean']
            comp_2 = n_graph_2[i]['mean']
                   
            a_values.append(max(comp_1, comp_2)/max(min(comp_1, comp_2),1) <  A_THRESHOLD)
        
        
        count = len(disorders)
        if count == 0:
            count = 1
        
        disorder = np.sum([disorders[d] for d in disorders]) / (count**2/2 if count % 2 == 0 else (count**2-1)/2)
        
        graph_disorder[depth] = {'disorder': disorder, 'count': count, 'a': 1-np.sum(a_values)/count}
        
    df = pd.DataFrame(graph_disorder).T
    max_count = df['count'].max()
    return {'R': (df['count']/max_count*df.disorder).sum(), 'A': (df['count']/max_count*df.a).sum()}

In [17]:
calc_disorder(sects[('1d63b44d-a7cb-4ee6-b228-b0ff5b7d086f', 0)].graphs, sects[('1d63b44d-a7cb-4ee6-b228-b0ff5b7d086f', 2)].graphs)

{'A': 0.5952380952380953, 'R': 0.6528894386037244}

In [18]:
section_keys = list(sects.keys())
combinations = []
for i in range(len(section_keys)):
    for j in range(i, len(section_keys)):
        combinations.append((section_keys[i], section_keys[j]))

In [19]:
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed

def parallel_process(array, function, n_jobs=36, use_kwargs=False, front_num=3):
    """
        A parallel version of the map function with a progress bar. 

        Args:
            array (array-like): An array to iterate over.
            function (function): A python function to apply to the elements of array
            n_jobs (int, default=16): The number of cores to use
            use_kwargs (boolean, default=False): Whether to consider the elements of array as dictionaries of 
                keyword arguments to function 
            front_num (int, default=3): The number of iterations to run serially before kicking off the parallel job. 
                Useful for catching bugs
        Returns:
            [function(array[0]), function(array[1]), ...]
    """
    #We run the first few iterations serially to catch bugs
    if front_num > 0:
        front = [function(**a) if use_kwargs else function(a) for a in array[:front_num]]
    #If we set n_jobs to 1, just run a list comprehension. This is useful for benchmarking and debugging.
    if n_jobs==1:
        return front + [function(**a) if use_kwargs else function(a) for a in tqdm(array[front_num:])]
    #Assemble the workers
    with ProcessPoolExecutor(max_workers=n_jobs) as pool:
        #Pass the elements of array into function
        if use_kwargs:
            futures = [pool.submit(function, **a) for a in array[front_num:]]
        else:
            futures = [pool.submit(function, a) for a in array[front_num:]]
        kwargs = {
            'total': len(futures),
            'unit': 'it',
            'unit_scale': True,
            'leave': True
        }
        #Print out the progress as tasks complete
        for f in tqdm(as_completed(futures), **kwargs):
            pass
    out = []
    #Get the results from the futures. 
    for i, future in tqdm(enumerate(futures)):
        try:
            out.append(future.result())
        except Exception as e:
            out.append(e)
    return front + out

In [20]:
def job(params):
    return calc_disorder(sects[params[0]].graphs, sects[params[1]].graphs)

In [21]:
np.seterr(divide='ignore', invalid='ignore')
results = parallel_process(combinations[:], job, front_num = 1)

100%|██████████| 967k/967k [00:05<00:00, 176kit/s]    
966744it [00:02, 451111.03it/s]


In [28]:
df = pd.DataFrame(results)

In [29]:
df['combo'] = pd.Series(combinations)

In [32]:
df.to_csv('results.csv')

# Start from CSV

In [3]:
df = pd.read_csv('results.csv')

In [11]:
from ast import literal_eval as make_tuple

In [19]:
def split_combo(xr):
    x = make_tuple(xr.combo)
    return pd.Series({'p1': x[0][0], 's1': x[0][1], 'p2': x[1][0], 's2': x[1][1], 'A':xr.A, 'R': xr.R})

In [23]:
vals = df.apply((lambda x: split_combo(x)), axis=1)

Unnamed: 0,A,R,p1,p2,s1,s2
0,0.000000,0.000000,1d63b44d-a7cb-4ee6-b228-b0ff5b7d086f,1d63b44d-a7cb-4ee6-b228-b0ff5b7d086f,0,0
1,0.574661,0.765108,1d63b44d-a7cb-4ee6-b228-b0ff5b7d086f,1d63b44d-a7cb-4ee6-b228-b0ff5b7d086f,0,1
2,0.595238,0.652889,1d63b44d-a7cb-4ee6-b228-b0ff5b7d086f,1d63b44d-a7cb-4ee6-b228-b0ff5b7d086f,0,2
3,0.735211,0.897023,1d63b44d-a7cb-4ee6-b228-b0ff5b7d086f,1d63b44d-a7cb-4ee6-b228-b0ff5b7d086f,0,3
4,0.586538,0.747267,1d63b44d-a7cb-4ee6-b228-b0ff5b7d086f,1d63b44d-a7cb-4ee6-b228-b0ff5b7d086f,0,4
5,0.827004,0.921862,1d63b44d-a7cb-4ee6-b228-b0ff5b7d086f,1d63b44d-a7cb-4ee6-b228-b0ff5b7d086f,0,6
6,0.730216,0.912048,1d63b44d-a7cb-4ee6-b228-b0ff5b7d086f,1d63b44d-a7cb-4ee6-b228-b0ff5b7d086f,0,7
7,0.557987,0.862666,1d63b44d-a7cb-4ee6-b228-b0ff5b7d086f,1d63b44d-a7cb-4ee6-b228-b0ff5b7d086f,0,8
8,0.772059,0.853403,1d63b44d-a7cb-4ee6-b228-b0ff5b7d086f,1d63b44d-a7cb-4ee6-b228-b0ff5b7d086f,0,10
9,0.510204,0.742704,1d63b44d-a7cb-4ee6-b228-b0ff5b7d086f,1d63b44d-a7cb-4ee6-b228-b0ff5b7d086f,0,11


In [157]:
vals['AR'] = vals.A + vals.R
inverse_vals = vals.loc[(vals.p1 != vals.p2) | (vals.s1 != vals.s2)].rename({'p1': 'p2', 'p2': 'p1', 's1': 's2', 's2': 's1'}, axis='columns')
vals = pd.concat([vals, inverse_vals], ignore_index=True)

In [158]:
mean_profile_score = vals.loc[(vals.p1 == vals.p2) & (vals.s1 != vals.s2),['p1', 'A', 'R', 'AR']].groupby('p1').mean()

In [241]:
def test_self(test_user, section):
    test_vals = section.loc[(section.p1 != section.p2) | (section.s1 != section.s2), ['p2','s2', 'A', 'R', 'AR']]
    md_section_to_profiles = test_vals.groupby('p2').AR.mean()
    
    profile_distance = md_section_to_profiles.min() - mean_profile_score.loc[test_user, 'AR']
    
    is_user_min_user = (md_section_to_profiles.idxmin() == test_user)
    is_under_avg = (profile_distance < 0)
    is_closer_to_avg = (md_section_to_profiles.min() < mean_profile_score.loc[test_user, 'AR'] + 0.5*(md_section_to_profiles.nsmallest(2).max()-mean_profile_score.loc[test_user, 'AR']))
    
    return (is_user_min_user & (is_under_avg | is_closer_to_avg))

In [242]:
accept_self = vals.groupby(['p1', 's1']).apply(lambda x: test_self(x.name[0], x)).reset_index()

In [243]:
true_positive_rate = (accept_self[0] == True).mean()
print('True Positive Rate %s' % str(true_positive_rate))

True Positive Rate 0.5848920863309353


In [259]:
def test_others(section):
    test_vals = section.loc[(section.p1 != section.p2), ['p2','s2', 'A', 'R', 'AR']]
    md_section_to_profiles = test_vals.groupby('p2').AR.mean()
    
    test_user = md_section_to_profiles.idxmin()
    profile_distance = md_section_to_profiles.min() - mean_profile_score.loc[test_user, 'AR']
    
    is_under_avg = (profile_distance < 0)
    is_closer_to_avg = (md_section_to_profiles.min() < mean_profile_score.loc[test_user, 'AR'] + 0.5*(md_section_to_profiles.nsmallest(2).max()-mean_profile_score.loc[test_user, 'AR']))
    
    return pd.Series({u: (test_user == u) & (is_under_avg | is_closer_to_avg)  for u in md_section_to_profiles.index})

In [260]:
accept_other = vals.groupby(['p1', 's1']).apply(test_others).reset_index()

In [261]:
false_accept_rate = (accept_other[0] == True).mean()
print('False Positive Rate %s' % str(false_accept_rate))

False Positive Rate 0.03405275779376499


In [255]:
false_accept_rate

0.03405275779376499