# Setup

Initial module setup.

In [27]:
import numpy as np
import pandas as pd
import dataclasses
import typing
import random
import auth_biohash.hash
import auth_biohash.random_token
import feature_encoding.threshold

from eeg_auth_models_framework import data, pre_process, features, processor
from eeg_auth_models_framework.utils import conversion

# Constants

In [28]:
# --- Reproducibility -------------------------------------------------------
# Seed for the shared generator that draws the test cases in make_hash_tests.
RANDOM_SEED = 100_000_000_000
RANDOM_GENERATOR = random.Random(RANDOM_SEED)

# --- Dataset ---------------------------------------------------------------
# Sampling frequency and channel names handed to the MNE data-frame converter.
DATASET_SAMPLE_FREQ_HZ = 200
DATA_CHANNEL_NAMES = ['T7', 'F8', 'Cz', 'P4']

# --- Pre-processing --------------------------------------------------------
# Frequency bands given to EEGBandpassFilterStep; list order is kept as-is.
FREQUENCIES = [
    pre_process.FrequencyBand(label='Alpha', lower=8.0, upper=12.0),
    pre_process.FrequencyBand(label='Beta', lower=12.0, upper=35.0),
    pre_process.FrequencyBand(label='Theta', lower=4.0, upper=8.0),
    pre_process.FrequencyBand(label='Gamma', lower=35.0, upper=None),
    pre_process.FrequencyBand(label='Raw', lower=None, upper=None),
]
# Arguments for DataWindowStep (used by the sample pipeline only).
# NOTE(review): presumably expressed in samples (1200 samples = 6 s at 200 Hz) - confirm.
WINDOW_SIZE = 1200
WINDOW_OVERLAP = 0

# --- Hashing ---------------------------------------------------------------
# Value passed to ThresholdBinaryEncoder when encoding vectors into hashes.
BINARY_THRESHOLD = 50
# Authentication thresholds swept by the experiment; each one is passed to
# BioHash.generate_hash and, as str(threshold), used as a dictionary key.
AUTHENTICATION_THRESHOLDS = [
    0.1, 0.2, 0.3,
    0.4, 0.5, 0.6,
    0.7, 0.8, 0.9,
]

# Utilities

In [29]:
@dataclasses.dataclass
class HashTest:
    """A pair of hashes together with the comparison outcome that is expected."""
    # Whether the two hashes are expected to compare equal.
    expected_result: bool
    # The two hashes that run_test compares with ``==``.
    hashes: typing.Tuple[auth_biohash.hash.BioHash, auth_biohash.hash.BioHash]

    def run_test(self):
        """Compare the two hashes and check the outcome against the expectation.

        Returns:
            True when the equality of ``hashes[0]`` and ``hashes[1]`` agrees
            with ``expected_result``; False otherwise.
        """
        left_hash = self.hashes[0]
        right_hash = self.hashes[1]
        observed = (left_hash == right_hash)
        # The test passes exactly when the observation does not differ from the expectation.
        return not (observed != self.expected_result)

@dataclasses.dataclass
class ThresholdTestSet:
    """Template hash and candidate sample hashes gathered for one authentication threshold."""
    # Threshold this test set belongs to.
    threshold: str
    # Template hash of the target subject, generated with this threshold.
    template_hash: auth_biohash.hash.BioHash
    # Sample hashes that belong to the target subject.
    positive_cases: typing.List[auth_biohash.hash.BioHash]
    # Sample hashes that belong to any other subject.
    negative_cases: typing.List[auth_biohash.hash.BioHash]

@dataclasses.dataclass
class SubjectTestSet:
    """All per-threshold test sets assembled for a single target subject."""
    # Identifier of the target subject (a key of the subject hash maps).
    subject_id: str
    # Test sets for this subject, one per authentication threshold.
    threshold_tests: typing.List[ThresholdTestSet]

# Configuration

In [30]:
# Retrieves the dataset and exposes its local path via retrieve().
downloader = data.AuditoryDataDownloader()

# Turns the downloaded files into the per-subject data map via format_data().
reader = data.AuditoryDataReader()

# Converter handed to the band-pass filter steps of both pipelines.
converter = conversion.MNEDataFrameConverter(
    sample_frequency=DATASET_SAMPLE_FREQ_HZ,
    channels=DATA_CHANNEL_NAMES,
)

# Data Processing Setup

## Template Hash Processor

### Pre-Processing Steps

In [31]:
# The template pipeline consists of a single band-pass filter step.
template_bandpass_step = pre_process.EEGBandpassFilterStep(FREQUENCIES, converter)
template_pre_process_steps = pre_process.PreProcessingPipeline(
    [template_bandpass_step]
)

### Feature Extraction Steps

In [32]:
# Statistical features extracted from the template data, in extraction order.
template_statistical_features = [
    features.StatisticalFeature.MIN,
    features.StatisticalFeature.MAX,
    features.StatisticalFeature.MEAN,
    features.StatisticalFeature.ZERO_CROSSING_RATE,
]
template_feature_extraction_steps = features.FeatureExtractPipeline(
    [features.StatisticalFeatureExtractor(template_statistical_features)]
)

### Processor

In [33]:
# Processor used to turn each subject's data into template feature vectors:
# it combines the template pre-processing and feature extraction pipelines.
template_data_processor = processor.DataProcessor(
    pre_process=template_pre_process_steps,
    feature_extraction=template_feature_extraction_steps
)

## Sample Hash Processor

### Pre-Processing Steps

In [34]:
# The sample pipeline applies the band-pass filter step and then a DataWindowStep.
sample_bandpass_step = pre_process.EEGBandpassFilterStep(FREQUENCIES, converter)
sample_window_step = pre_process.DataWindowStep(WINDOW_SIZE, WINDOW_OVERLAP)
sample_pre_process_steps = pre_process.PreProcessingPipeline(
    [sample_bandpass_step, sample_window_step]
)

### Feature Extraction Steps

In [35]:
# Statistical features extracted from the sample data, in extraction order.
sample_statistical_features = [
    features.StatisticalFeature.MIN,
    features.StatisticalFeature.MAX,
    features.StatisticalFeature.MEAN,
    features.StatisticalFeature.ZERO_CROSSING_RATE,
]
sample_feature_extraction_steps = features.FeatureExtractPipeline(
    [features.StatisticalFeatureExtractor(sample_statistical_features)]
)

### Processor

In [36]:
# Processor used to turn each subject's data into sample feature vectors:
# it combines the sample pre-processing and feature extraction pipelines.
sample_data_processor = processor.DataProcessor(
    pre_process=sample_pre_process_steps,
    feature_extraction=sample_feature_extraction_steps
)

# Subject Data

In [37]:
# Retrieve the dataset, then let the reader format it.
# The resulting map is iterated by subject and indexed with subject keys below.
data_path = downloader.retrieve()
subject_data_map = reader.format_data(data_path)

## Token Setup

In [38]:
# One token per subject; the same token is later used to normalize both the
# template vectors and the sample vectors of that subject.
# NOTE(review): generate_token() is called without a seed here - if tokens are
# random, results may differ between runs despite RANDOM_SEED; confirm.
subject_tokens_map = {subject: auth_biohash.random_token.generate_token() for subject in subject_data_map}

## Template Hash Setup

### Processing

In [39]:
# Run every subject's data through the template processor.
processed_template_data_map = {
    subject: template_data_processor.process(subject_data)
    for subject, subject_data in subject_data_map.items()
}

Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=40114
    Range : 0 ... 40113 =      0.000 ...   200.565 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ..

### Hashing

#### Token Normalization

In [40]:
def normalize_vectors(vectors_to_normalize: typing.List[np.ndarray], token: str) -> typing.List[np.ndarray]:
    """Normalize every vector with a token-derived matrix normalization.

    A ``MatrixGenerator`` is built from ``token`` and wrapped in a
    ``TokenMatrixNormalization``; that single normalization object is then
    applied to each vector in turn.

    Args:
        vectors_to_normalize: Vectors to normalize.
        token: Token the matrix generator is created from.

    Returns:
        The normalized vectors, in the same order as the input.
    """
    token_normalization = auth_biohash.hash.TokenMatrixNormalization(
        auth_biohash.random_token.MatrixGenerator(token)
    )
    normalized = []
    for vector in vectors_to_normalize:
        normalized.append(token_normalization.normalize(vector))
    return normalized

In [41]:
# Normalize each subject's template vectors with that subject's own token.
normalized_template_data_map = {
    subject: normalize_vectors(template_vectors, subject_tokens_map[subject])
    for subject, template_vectors in processed_template_data_map.items()
}

#### Encoding

In [42]:
def hash_vectors(vectors_to_hash: typing.List[np.ndarray], threshold: float) -> typing.List[auth_biohash.hash.BioHash]:
    """Generate one BioHash per vector for the given threshold.

    A single ``ThresholdBinaryEncoder`` built from ``BINARY_THRESHOLD`` is
    shared by all ``BioHash.generate_hash`` calls.

    Args:
        vectors_to_hash: Vectors to turn into hashes.
        threshold: Threshold forwarded to ``BioHash.generate_hash``.

    Returns:
        The generated hashes, in the same order as the input vectors.
    """
    binary_encoder = feature_encoding.threshold.ThresholdBinaryEncoder(BINARY_THRESHOLD)
    hashes = []
    for vector in vectors_to_hash:
        hashes.append(
            auth_biohash.hash.BioHash.generate_hash(vector, threshold, binary_encoder)
        )
    return hashes


def make_map_of_threshold_hashes(vectors_to_hash: typing.List[np.ndarray]) -> typing.Dict[str, typing.List[auth_biohash.hash.BioHash]]:
    """Hash the vectors once per entry in ``AUTHENTICATION_THRESHOLDS``.

    Args:
        vectors_to_hash: Vectors to hash with every threshold.

    Returns:
        A dictionary keyed by ``str(threshold)`` whose values are the hashes
        produced by ``hash_vectors`` for that threshold.
    """
    return {
        str(auth_threshold): hash_vectors(vectors_to_hash, auth_threshold)
        for auth_threshold in AUTHENTICATION_THRESHOLDS
    }


def normalize_template_hashes_map(raw_hashes_map: typing.Dict[str, typing.Dict[str, typing.List[auth_biohash.hash.BioHash]]]) -> typing.Dict[str, typing.Dict[str, auth_biohash.hash.BioHash]]:
    """Collapse each per-threshold list of template hashes to its first element.

    Args:
        raw_hashes_map: Mapping of subject -> threshold -> list of template hashes.

    Returns:
        Mapping of subject -> threshold -> the first hash of the corresponding list.
        When a list holds more than one hash, a warning is printed and the first
        hash is still used.

    Raises:
        IndexError: If a list holds no hash at all. The message names the subject
            and threshold instead of the bare "list index out of range".
    """
    result = {}
    for subject, hashes_by_threshold in raw_hashes_map.items():
        result[subject] = {}
        for threshold, template_hash_list in hashes_by_threshold.items():
            hash_count = len(template_hash_list)
            if hash_count == 0:
                # Nothing to pick: fail with a message that identifies the offending entry.
                raise IndexError(
                    f'subject {subject} has no template hash for threshold {threshold}'
                )
            if hash_count > 1:
                print(
                    f'Warning: there should be only one template hash per subject, '
                    f'but subject {subject} has {hash_count} template hashes'
                )
            result[subject][threshold] = template_hash_list[0]
    return result

In [43]:
# Hash every subject's normalized template vectors for all thresholds, then
# reduce each per-threshold list to the single template hash.
raw_template_hashes_map = {
    subject: make_map_of_threshold_hashes(template_vectors)
    for subject, template_vectors in normalized_template_data_map.items()
}
subject_template_hashes_map = normalize_template_hashes_map(raw_template_hashes_map)

## Sample Hash Setup

### Processing

In [44]:
# Run every subject's data through the sample processor.
processed_data_map = {
    subject: sample_data_processor.process(subject_data)
    for subject, subject_data in subject_data_map.items()
}

Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=40114
    Range : 0 ... 40113 =      0.000 ...   200.565 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ..

### Hashing

#### Token Normalization

In [45]:
# Normalize each subject's sample vectors with that subject's own token.
normalized_data_map = {
    subject: normalize_vectors(sample_vectors, subject_tokens_map[subject])
    for subject, sample_vectors in processed_data_map.items()
}

#### Encoding

In [46]:
# Hash every subject's normalized sample vectors once per authentication threshold.
subject_hashes_map = {
    subject: make_map_of_threshold_hashes(sample_vectors)
    for subject, sample_vectors in normalized_data_map.items()
}

# Test Set Assembly

## Gathering Test Sets

In [47]:
def make_threshold_test_sets(sample_hashes_map: typing.Dict[str, typing.Dict[str, typing.List[auth_biohash.hash.BioHash]]],
                             template_hashes_map: typing.Dict[str, typing.Dict[str, auth_biohash.hash.BioHash]],
                             target_subject: str) -> typing.List[ThresholdTestSet]:
    """Build one ThresholdTestSet per authentication threshold for a target subject.

    Args:
        sample_hashes_map: Mapping of subject -> threshold key -> sample hashes.
        template_hashes_map: Mapping of subject -> threshold key -> template hash.
        target_subject: Subject whose template hash is under test.

    Returns:
        One ThresholdTestSet per entry of ``AUTHENTICATION_THRESHOLDS``. Sample
        hashes of ``target_subject`` become positive cases; sample hashes of
        every other subject become negative cases.

    Raises:
        KeyError: If a threshold key is missing from the template map, or a
            sample map contains a threshold key that is not derived from
            ``AUTHENTICATION_THRESHOLDS``.
    """
    threshold_test_sets: typing.Dict[str, ThresholdTestSet] = {}
    for threshold in AUTHENTICATION_THRESHOLDS:
        threshold_key = str(threshold)
        threshold_test_sets[threshold_key] = ThresholdTestSet(
            # Store the string key so the value matches the field's declared
            # ``str`` type and the keys used by the hash maps.
            threshold=threshold_key,
            template_hash=template_hashes_map[target_subject][threshold_key],
            positive_cases=[],
            negative_cases=[],
        )
    for subject, hashes_by_threshold in sample_hashes_map.items():
        for threshold_key, sample_hashes in hashes_by_threshold.items():
            test_set = threshold_test_sets[threshold_key]
            if subject == target_subject:
                test_set.positive_cases.extend(sample_hashes)
            else:
                test_set.negative_cases.extend(sample_hashes)
    return list(threshold_test_sets.values())

In [48]:
# Build the per-threshold test sets for every subject, each subject in turn
# acting as the target whose template hash is tested.
subject_test_sets = [
    SubjectTestSet(
        subject_id=subject,
        threshold_tests=make_threshold_test_sets(
            subject_hashes_map, subject_template_hashes_map, subject
        ),
    )
    for subject in subject_hashes_map
]

## Generating Hash Tests

In [49]:
def make_hash_tests(test_set: ThresholdTestSet) -> typing.List[HashTest]:
    """Create a balanced list of hash tests from a threshold test set.

    The same number of positive and negative cases is drawn without
    replacement using the shared ``RANDOM_GENERATOR``; every drawn case is
    paired with the template hash of the test set.

    Args:
        test_set: Test set providing the template hash and candidate cases.

    Returns:
        The tests expected to match, followed by the tests expected not to match.
    """
    template = test_set.template_hash
    positive_pool = test_set.positive_cases
    negative_pool = test_set.negative_cases
    # Balance both populations by capping at the smaller one
    # (negative cases will most likely outnumber positive ones).
    per_class_count = min(len(positive_pool), len(negative_pool))
    # Positives are drawn before negatives so the shared generator is consumed in a fixed order.
    drawn_positives = RANDOM_GENERATOR.sample(positive_pool, per_class_count)
    drawn_negatives = RANDOM_GENERATOR.sample(negative_pool, per_class_count)
    expected_matches = [HashTest(True, (template, candidate)) for candidate in drawn_positives]
    expected_mismatches = [HashTest(False, (template, candidate)) for candidate in drawn_negatives]
    return expected_matches + expected_mismatches

def make_threshold_tests_map(subject_tests: typing.List[SubjectTestSet]) -> typing.Dict[str, typing.List[HashTest]]:
    """Group the hash tests of all subjects by threshold.

    Args:
        subject_tests: Per-subject test sets to draw hash tests from.

    Returns:
        A dictionary keyed by ``str(threshold)`` for every entry of
        ``AUTHENTICATION_THRESHOLDS``; each value collects the tests produced by
        ``make_hash_tests`` for that threshold across all subjects.
    """
    tests_by_threshold = {}
    for auth_threshold in AUTHENTICATION_THRESHOLDS:
        tests_by_threshold[str(auth_threshold)] = []
    for subject_entry in subject_tests:
        for threshold_set in subject_entry.threshold_tests:
            threshold_key = str(threshold_set.threshold)
            tests_by_threshold[threshold_key].extend(make_hash_tests(threshold_set))
    return tests_by_threshold

In [50]:
# Hash tests of all subjects grouped by threshold key; test cases are sampled
# with the shared RANDOM_GENERATOR inside make_hash_tests.
threshold_tests_map = make_threshold_tests_map(subject_test_sets)

# Execute Tests

In [51]:
def run_threshold_tests(test_data: typing.Dict[str, typing.List[HashTest]]) -> typing.Dict[str, float]:
    """Run every test and compute the percentage of passing tests per threshold.

    Args:
        test_data: Mapping of threshold key -> tests to run for that threshold.

    Returns:
        Mapping of threshold key -> percentage (0-100) of tests whose
        ``run_test()`` returned a truthy value. A threshold with no tests maps
        to NaN, because its accuracy is undefined.
    """
    results = {}
    for threshold, tests in test_data.items():
        total = len(tests)
        if total == 0:
            # Avoid ZeroDivisionError: report an undefined accuracy instead of crashing.
            results[threshold] = float('nan')
            continue
        hits = sum(1 for test in tests if test.run_test())
        results[threshold] = (hits / total) * 100
    return results

In [52]:
# Execute all tests, then tabulate the accuracy obtained for each threshold.
test_results_map = run_threshold_tests(threshold_tests_map)
data_results = [
    [threshold_type, accuracy]
    for threshold_type, accuracy in test_results_map.items()
]
test_results = pd.DataFrame(data_results, columns=['Threshold', 'Accuracy'])
test_results

Unnamed: 0,Threshold,Accuracy
0,0.1,94.915254
1,0.2,96.125908
2,0.3,96.125908
3,0.4,94.188862
4,0.5,69.491525
5,0.6,50.242131
6,0.7,50.0
7,0.8,50.0
8,0.9,50.0
