# Setup

Import the standard-library, third-party, and project modules used throughout this notebook.

In [None]:
import numpy as np
import pandas as pd
import copy
import dataclasses
import enum
import typing
import random
import secrets
import time
import statistics
import auth_biohash.bio_hash
import feature_encoding.base
import feature_encoding.threshold
import feature_encoding.direct
import feature_encoding.gray

from eeg_auth_models_framework import data, pre_process, features, processor, normalization
from eeg_auth_models_framework.utils import conversion

# Constants

In [2]:
# Authentication thresholds under evaluation. HashTest.run_test treats a
# comparison result less than or equal to the threshold as a match.
AUTHENTICATION_THRESHOLDS = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
# Sampling frequency handed to the MNEDataFrameConverter.
DATASET_SAMPLE_FREQ_HZ = 200
# Channel names handed to the MNEDataFrameConverter.
DATA_CHANNEL_NAMES = ['T7','F8','Cz','P4']
# Frequency bands given to every EEGBandpassFilterStep.
# NOTE(review): a bound of None presumably leaves that side of the band open
# (so 'Raw' would be unfiltered) -- confirm against pre_process.FrequencyBand.
FREQUENCIES = [
    pre_process.FrequencyBand(lower=8.0, upper=12.0, label='Alpha'),
    pre_process.FrequencyBand(lower=12.0, upper=35.0, label='Beta'),
    pre_process.FrequencyBand(lower=4.0, upper=8.0, label='Theta'),
    pre_process.FrequencyBand(lower=35.0, upper=None, label='Gamma'),
    pre_process.FrequencyBand(lower=None, upper=None, label='Raw'),
]
# Bounds passed to the rescale and histogram-equalization normalization steps.
RESCALE_LOWER = 0
RESCALE_UPPER = 255
# Arguments for the DataWindowStep of the sample pipeline.
# NOTE(review): presumably counted in samples -- confirm against DataWindowStep.
WINDOW_SIZE = 1200
WINDOW_OVERLAP = 0
# Threshold given to ThresholdBinaryEncoder.
BINARY_THRESHOLD = 50
# Seeded generator used by make_hash_tests to sample positive/negative cases.
RANDOM_SEED = 100000000000
RANDOM_GENERATOR = random.Random(RANDOM_SEED)
# Row stride of the train/test split: every SYSTEMIC_SAMPLE_RATE-th row goes to
# the training frame and the remaining rows go to the test frame.
SYSTEMIC_SAMPLE_RATE = 3

# Utilities

In [3]:
class TestResultType(enum.Enum):
    """Outcome of one hash comparison relative to its expected result."""
    TRUE_POSITIVE = enum.auto()
    FALSE_POSITIVE = enum.auto()
    FALSE_NEGATIVE = enum.auto()
    TRUE_NEGATIVE = enum.auto()


@dataclasses.dataclass
class TestResultsSummary:
    """Confusion-matrix counters for a batch of hash tests.

    The derived rates (``accuracy``, ``false_accept_rate``,
    ``false_reject_rate``) evaluate to ``nan`` instead of raising
    ``ZeroDivisionError`` when no relevant test has been recorded yet.
    """
    true_positives: int = 0
    false_positives: int = 0
    false_negatives: int = 0
    true_negatives: int = 0

    @classmethod
    def merge_summaries(cls,
                        summary_a: 'TestResultsSummary',
                        summary_b: 'TestResultsSummary') -> 'TestResultsSummary':
        """Return a new summary holding the field-wise sum of both inputs.

        Neither input is modified. The result is built through ``cls`` so a
        subclass calling this method gets an instance of the subclass.
        """
        return cls(
            true_positives=summary_a.true_positives + summary_b.true_positives,
            false_positives=summary_a.false_positives + summary_b.false_positives,
            false_negatives=summary_a.false_negatives + summary_b.false_negatives,
            true_negatives=summary_a.true_negatives + summary_b.true_negatives
        )

    def increment_count(self, result_type: TestResultType):
        """Add one to the counter that corresponds to ``result_type``.

        Raises:
            ValueError: if ``result_type`` is not a ``TestResultType`` member.
        """
        if result_type == TestResultType.TRUE_POSITIVE:
            self.true_positives += 1
        elif result_type == TestResultType.FALSE_POSITIVE:
            self.false_positives += 1
        elif result_type == TestResultType.FALSE_NEGATIVE:
            self.false_negatives += 1
        elif result_type == TestResultType.TRUE_NEGATIVE:
            self.true_negatives += 1
        else:
            # Refuse unknown values instead of silently counting them as
            # true negatives, which would distort every derived rate.
            raise ValueError(f'Unsupported test result type: {result_type!r}')

    @staticmethod
    def _ratio(numerator: int, denominator: int) -> float:
        """Return ``numerator / denominator``, or ``nan`` for a zero denominator."""
        if denominator == 0:
            return float('nan')
        return numerator / denominator

    @property
    def accuracy(self) -> float:
        """Share of all recorded tests that were true positives or true negatives."""
        hits = self.true_positives + self.true_negatives
        total = (
            self.true_positives + self.true_negatives +
            self.false_positives + self.false_negatives
        )
        return self._ratio(hits, total)

    @property
    def false_accept_rate(self) -> float:
        """False positives divided by (false positives + true negatives)."""
        return self._ratio(
            self.false_positives, self.false_positives + self.true_negatives
        )

    @property
    def false_reject_rate(self) -> float:
        """False negatives divided by (false negatives + true positives)."""
        return self._ratio(
            self.false_negatives, self.false_negatives + self.true_positives
        )


@dataclasses.dataclass
class HashTest:
    """A single comparison between two hashes together with its expected outcome.

    ``if_expected`` / ``if_unexpected`` are the result types reported when the
    observed match decision agrees / disagrees with ``expected_result``.
    """
    expected_result: bool
    threshold: float
    if_expected: TestResultType
    if_unexpected: TestResultType
    hashes: typing.Tuple[auth_biohash.bio_hash.BioHash, auth_biohash.bio_hash.BioHash]

    def run_test(self) -> TestResultType:
        """Compare the two hashes and classify the outcome.

        The pair counts as a match when the value returned by
        ``BioHash.compare`` is less than or equal to ``threshold``.
        """
        first_hash, second_hash = self.hashes[0], self.hashes[1]
        score = auth_biohash.bio_hash.BioHash.compare(first_hash, second_hash)
        matched = score <= self.threshold
        return self.if_expected if matched == self.expected_result else self.if_unexpected


@dataclasses.dataclass
class ThresholdTestSet:
    """Template hash plus the sample hashes to test against it at one threshold."""
    # iter_subject_test_sets stores the float taken from
    # AUTHENTICATION_THRESHOLDS here (not a string); make_hash_tests coerces
    # the value with float(), so a numeric string is accepted as well.
    threshold: typing.Union[float, str]
    # Hash of the subject's template data.
    template_hash: auth_biohash.bio_hash.BioHash
    # Sample hashes that belong to the same subject as the template.
    positive_cases: typing.List[auth_biohash.bio_hash.BioHash]
    # Sample hashes that belong to other subjects.
    negative_cases: typing.List[auth_biohash.bio_hash.BioHash]


@dataclasses.dataclass
class SubjectTestSet:
    """The threshold test sets assembled around one subject's template hash."""
    # Subject whose template hash the contained test sets are built around.
    subject_id: str
    # ThresholdTestSet entries produced by iter_subject_test_sets for this subject.
    threshold_tests: typing.List[ThresholdTestSet]

# Configuration

In [4]:
# Dataset access objects: downloader.retrieve() supplies the path that
# reader.format_data() later consumes.
downloader = data.AuditoryDataDownloader()
reader = data.AuditoryDataReader()
# Converter handed to every EEGBandpassFilterStep; configured with the
# dataset's channel names and sampling frequency.
converter = conversion.MNEDataFrameConverter(
    channels=DATA_CHANNEL_NAMES, 
    sample_frequency=DATASET_SAMPLE_FREQ_HZ
)

# Data Processing Setup

## Template Hash Processor

### Pre-Processing Steps

In [5]:
# Template pre-processing: band-pass filtering over FREQUENCIES only. Unlike
# the sample pipeline, no DataWindowStep is applied here.
template_pre_process_steps = pre_process.PreProcessingPipeline([
    pre_process.EEGBandpassFilterStep(
        FREQUENCIES,
        converter
    )
])

### Feature Extraction Steps

In [6]:
# Template feature extraction: minimum, maximum, mean and zero-crossing rate
# (the same statistical features the sample pipeline requests).
template_feature_extraction_steps = features.FeatureExtractPipeline([
    features.StatisticalFeatureExtractor([
        features.StatisticalFeature.MIN,
        features.StatisticalFeature.MAX,
        features.StatisticalFeature.MEAN,
        features.StatisticalFeature.ZERO_CROSSING_RATE
    ])
])

### Normalization Steps

In [7]:
# Template normalization: a rescale step followed by a histogram-equalization
# step, both configured with the RESCALE_LOWER..RESCALE_UPPER bounds.
template_normalization_steps = normalization.NormalizationPipeline([
    normalization.RescaleNormalizationStep(RESCALE_LOWER, RESCALE_UPPER),
    normalization.HistogramEqualizationStep(RESCALE_LOWER, RESCALE_UPPER)
])

### Processor

In [8]:
# Processor for template (training) data; combines the three template
# pipelines defined above.
template_data_processor = processor.DataProcessor(
    pre_process=template_pre_process_steps,
    feature_extraction=template_feature_extraction_steps,
    normalization=template_normalization_steps
)

## Sample Hash Processor

### Pre-Processing Steps

In [9]:
# Sample pre-processing: the same band-pass filter step as the template
# pipeline, followed by a DataWindowStep configured with WINDOW_SIZE and
# WINDOW_OVERLAP.
sample_pre_process_steps = pre_process.PreProcessingPipeline([
    pre_process.EEGBandpassFilterStep(
        FREQUENCIES,
        converter
    ),
    pre_process.DataWindowStep(WINDOW_SIZE, WINDOW_OVERLAP)
])

### Feature Extraction Steps

In [10]:
# Sample feature extraction: minimum, maximum, mean and zero-crossing rate
# (the same statistical features the template pipeline requests).
sample_feature_extraction_steps = features.FeatureExtractPipeline([
    features.StatisticalFeatureExtractor([
        features.StatisticalFeature.MIN,
        features.StatisticalFeature.MAX,
        features.StatisticalFeature.MEAN,
        features.StatisticalFeature.ZERO_CROSSING_RATE
    ])
])

In [11]:
# Normalization for the sample pipeline: the same rescale and
# histogram-equalization steps used for the template pipeline.
normalization_steps = normalization.NormalizationPipeline([
    normalization.RescaleNormalizationStep(RESCALE_LOWER, RESCALE_UPPER),
    normalization.HistogramEqualizationStep(RESCALE_LOWER, RESCALE_UPPER)
])

### Processor

In [12]:
# Processor for sample (test) data; combines the sample pre-processing,
# feature-extraction and normalization pipelines defined above.
sample_data_processor = processor.DataProcessor(
    pre_process=sample_pre_process_steps,
    feature_extraction=sample_feature_extraction_steps,
    normalization=normalization_steps
)

# Subject Data

In [13]:
# Maps a subject key to the list of raw data frames recorded for that subject.
RawDataMap = typing.Dict[str, typing.List[pd.DataFrame]]

In [14]:
def make_training_test_maps(data_map: RawDataMap) -> typing.Tuple[RawDataMap, RawDataMap]:
    """Split every frame of every subject into a training part and a test part.

    Returns a ``(training_map, test_map)`` pair. Both maps contain every
    subject key of ``data_map``, and their lists keep the order of the
    subject's original frames.
    """
    training_by_subject: RawDataMap = {}
    test_by_subject: RawDataMap = {}

    for subject, frames in data_map.items():
        # Each entry is the (training, test) pair for one source frame.
        split_pairs = [make_training_test_frames(frame_data) for frame_data in frames]
        training_by_subject[subject] = [training_part for training_part, _ in split_pairs]
        test_by_subject[subject] = [test_part for _, test_part in split_pairs]

    return training_by_subject, test_by_subject


def make_training_test_frames(frame: pd.DataFrame,
                              sample_rate: typing.Optional[int] = None) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Split ``frame`` into training and test rows by systematic sampling.

    Every ``sample_rate``-th row (starting with the first) goes to the
    training frame; all remaining rows go to the test frame. Rows are selected
    by position, so the split is also correct for frames whose index contains
    duplicate labels.

    Args:
        frame: Data to split.
        sample_rate: Row stride of the training selection. Defaults to the
            module-level ``SYSTEMIC_SAMPLE_RATE``.

    Returns:
        A ``(training, test)`` pair of data frames.

    Raises:
        ValueError: if ``sample_rate`` is smaller than 1.
    """
    if sample_rate is None:
        sample_rate = SYSTEMIC_SAMPLE_RATE
    if sample_rate < 1:
        raise ValueError(f'sample_rate must be at least 1, got {sample_rate}')
    # A positional mask avoids dropping rows by index label, which would
    # remove every row sharing a label with a training row.
    training_mask = np.zeros(len(frame), dtype=bool)
    training_mask[::sample_rate] = True
    return frame[training_mask], frame[~training_mask]

In [15]:
# Retrieve the dataset and read it into the per-subject data map.
data_path = downloader.retrieve()
subject_data_map = reader.format_data(data_path)
# Split each subject's frames: training rows feed the template hashes, test
# rows feed the sample hashes.
subject_data_train, subject_data_test = make_training_test_maps(subject_data_map)

## Token Setup

In [16]:
# One random token per subject (secrets.token_hex(64): 64 random bytes
# rendered as hex); the token is passed to BioHash.generate_hash.
subject_tokens_map = {subject: secrets.token_hex(64) for subject in subject_data_map}

## Template Hash Setup

### Processing

In [17]:
# Run every subject's training frames through the template processor.
processed_template_data_map = {
    subject: template_data_processor.process(training_frames)
    for subject, training_frames in subject_data_train.items()
}

Creating RawArray with float64 data, n_channels=4, n_times=8000
    Range : 0 ... 7999 =      0.000 ...    39.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=13372
    Range : 0 ... 13371 =      0.000 ...    66.855 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=8000
    Range : 0 ... 7999 =      0.000 ...    39.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=8000
    Range : 0 ... 7999 =      0.000 ...    39.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=8000
    Range : 0 ... 7999 =      0.000 ...    39.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=8000
    Range : 0 ... 7999 =      0.000 ...    39.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=8000
    Range : 0 ... 7999 =      0.000 ...    39.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=8000
    Range : 0 ... 7999 =      0.000 ...    39.995 se

### Hashing

In [18]:
# subject -> threshold key -> list of hashes (not referenced by the code visible here).
SubjectHashesMap = typing.Dict[str, typing.Dict[str, typing.List[auth_biohash.bio_hash.BioHash]]]
# subject -> threshold key -> single template hash (not referenced by the code visible here).
SubjectTemplateHashesMap = typing.Dict[str, typing.Dict[str, auth_biohash.bio_hash.BioHash]]
# threshold key (str(threshold)) -> list of hashes; built by make_map_of_threshold_hashes.
ThresholdHashesMap = typing.Dict[str, typing.List[auth_biohash.bio_hash.BioHash]]
# threshold key (str(threshold)) -> the single template hash; built by iter_template_hashes.
TemplateHashesMap = typing.Dict[str, auth_biohash.bio_hash.BioHash]

In [19]:
def make_map_of_threshold_hashes(vectors_to_hash: typing.List[np.ndarray], 
                                 token: str, 
                                 encoder: feature_encoding.base.BinaryEncoder,
                                 additional_norm: normalization.NormalizationPipeline = None) -> ThresholdHashesMap:
    """Hash every vector once per authentication threshold.

    Returns a dict keyed by ``str(threshold)`` for each entry of
    ``AUTHENTICATION_THRESHOLDS``. Each value is a freshly generated list of
    hashes, one per vector, in the order of ``vectors_to_hash``.
    """
    def hash_all_vectors() -> typing.List[auth_biohash.bio_hash.BioHash]:
        # Called once per threshold so every threshold owns its own hash list.
        return [
            auth_biohash.bio_hash.BioHash.generate_hash(vector, token, encoder, additional_norm)
            for vector in vectors_to_hash
        ]

    return {str(threshold): hash_all_vectors() for threshold in AUTHENTICATION_THRESHOLDS}


def iter_template_hashes(template_data_map: typing.Dict[str, typing.List[np.ndarray]],
                         tokens_map: typing.Dict[str, str],
                         encoder: feature_encoding.base.BinaryEncoder,
                         additional_norm: normalization.NormalizationPipeline = None) -> typing.Iterator[typing.Tuple[str, TemplateHashesMap]]:
    """Yield ``(subject, template_hashes)`` for every subject in ``template_data_map``.

    ``template_hashes`` maps each threshold key to the subject's single
    template hash. A threshold whose hash list does not contain exactly one
    hash is reported with a warning and left out of the map, so consumers
    must not assume that every threshold key is present.

    Raises:
        KeyError: if a subject has no entry in ``tokens_map``.
    """
    for subject, template_vectors in template_data_map.items():
        template_hashes = make_map_of_threshold_hashes(
            template_vectors, tokens_map[subject], encoder, additional_norm
        )
        normalized_hashes_map: TemplateHashesMap = {}
        for threshold, hashes_list in template_hashes.items():
            if len(hashes_list) != 1:
                # The count may be zero as well as greater than one, so the
                # message reports the actual number rather than "multiple".
                print(
                    f'[warning] Expected exactly 1 template hash for subject {subject} '
                    f'at threshold {threshold}, found {len(hashes_list)}; skipping.'
                )
                continue
            normalized_hashes_map[threshold] = hashes_list[0]
        yield subject, normalized_hashes_map

## Sample Hash Setup

### Processing

In [20]:
# Run every subject's test frames through the sample processor.
processed_data_map = {
    subject: sample_data_processor.process(test_frames)
    for subject, test_frames in subject_data_test.items()
}

Creating RawArray with float64 data, n_channels=4, n_times=16000
    Range : 0 ... 15999 =      0.000 ...    79.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=26742
    Range : 0 ... 26741 =      0.000 ...   133.705 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=16000
    Range : 0 ... 15999 =      0.000 ...    79.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=16000
    Range : 0 ... 15999 =      0.000 ...    79.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=16000
    Range : 0 ... 15999 =      0.000 ...    79.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=16000
    Range : 0 ... 15999 =      0.000 ...    79.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=16000
    Range : 0 ... 15999 =      0.000 ...    79.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=16000
    Range : 0 ... 15999 =      0.000 ..

### Hashing

In [21]:
# Re-declares, with an identical definition, the ThresholdHashesMap alias
# already defined in the template hashing section.
ThresholdHashesMap = typing.Dict[str, typing.List[auth_biohash.bio_hash.BioHash]]

In [22]:
def iter_sample_hashes(data_map: typing.Dict[str, typing.List[np.ndarray]], 
                       tokens_map: typing.Dict[str, str], 
                       encoder: feature_encoding.base.BinaryEncoder,
                       additional_norm: normalization.NormalizationPipeline = None) -> typing.Iterator[typing.Tuple[str, ThresholdHashesMap]]:
    """Yield ``(subject, hashes_by_threshold)`` for every subject in ``data_map``.

    Each subject's vectors are hashed with that subject's own token through
    ``make_map_of_threshold_hashes``.
    """
    for subject, sample_vectors in data_map.items():
        yield subject, make_map_of_threshold_hashes(
            sample_vectors, tokens_map[subject], encoder, additional_norm
        )

# Test Set Assembly

## Gathering Test Sets

In [23]:
def iter_subject_test_sets(template_data: typing.Dict[str, typing.List[np.ndarray]],
                           sample_data: typing.Dict[str, typing.List[np.ndarray]],
                           tokens_map: typing.Dict[str, str],
                           encoder: feature_encoding.base.BinaryEncoder,
                           additional_norm: normalization.NormalizationPipeline = None) -> typing.Iterator[SubjectTestSet]:
    """Yield one ``SubjectTestSet`` per subject found in ``template_data``.

    For each threshold that has a template hash, the set contains that
    template hash, the sample hashes of the same subject (positive cases) and
    the sample hashes of every other subject (negative cases).

    Thresholds that ``iter_template_hashes`` skipped (after printing its
    warning) are left out of the subject's test set instead of raising
    ``KeyError``.
    """
    # The sample hashes depend only on the sample data, tokens and encoder,
    # not on the template subject, so they are generated once and reused for
    # every subject rather than being regenerated inside the loop.
    # NOTE(review): assumes BioHash objects are not mutated by later
    # comparisons, so sharing them between test sets is safe -- confirm.
    all_sample_hashes = list(
        iter_sample_hashes(sample_data, tokens_map, encoder, additional_norm)
    )
    for subject, template_hashes in iter_template_hashes(template_data, tokens_map, encoder, additional_norm):
        threshold_test_sets: typing.Dict[str, ThresholdTestSet] = {}
        for threshold in AUTHENTICATION_THRESHOLDS:
            threshold_key = str(threshold)
            if threshold_key not in template_hashes:
                # No usable template hash was produced for this threshold.
                continue
            threshold_test_sets[threshold_key] = ThresholdTestSet(
                threshold=threshold, template_hash=template_hashes[threshold_key],
                positive_cases=[], negative_cases=[]
            )
        for sample_subject, sample_hashes in all_sample_hashes:
            for threshold_key, hashes in sample_hashes.items():
                test_set = threshold_test_sets.get(threshold_key)
                if test_set is None:
                    continue
                if subject == sample_subject:
                    test_set.positive_cases.extend(hashes)
                else:
                    test_set.negative_cases.extend(hashes)
        yield SubjectTestSet(
            subject_id=subject,
            threshold_tests=list(threshold_test_sets.values())
        )

## Generating Hash Tests

In [24]:
def make_hash_tests(test_set: ThresholdTestSet) -> typing.List[HashTest]:
    """Build a balanced list of match / no-match tests for one threshold test set.

    The same number of positive and negative cases is drawn at random with
    ``RANDOM_GENERATOR``. The returned list holds all should-match tests
    first, followed by all should-not-match tests.
    """
    positives = test_set.positive_cases
    negatives = test_set.negative_cases
    # Both populations are capped at the size of the smaller one (normally the
    # positives) so that each contributes the same number of tests.
    sample_size = min(len(positives), len(negatives))
    should_match_cases: typing.List[auth_biohash.bio_hash.BioHash] = RANDOM_GENERATOR.sample(positives, sample_size)
    should_not_match_cases: typing.List[auth_biohash.bio_hash.BioHash] = RANDOM_GENERATOR.sample(negatives, sample_size)

    match_tests = [
        HashTest(
            expected_result=True,
            hashes=(test_set.template_hash, case),
            threshold=float(test_set.threshold),
            if_expected=TestResultType.TRUE_POSITIVE,
            if_unexpected=TestResultType.FALSE_NEGATIVE
        )
        for case in should_match_cases
    ]
    mismatch_tests = [
        HashTest(
            expected_result=False,
            hashes=(test_set.template_hash, case),
            threshold=float(test_set.threshold),
            if_expected=TestResultType.TRUE_NEGATIVE,
            if_unexpected=TestResultType.FALSE_POSITIVE
        )
        for case in should_not_match_cases
    ]
    return match_tests + mismatch_tests


def iter_hash_tests(template_data: typing.Dict[str, typing.List[np.ndarray]],
                    sample_data: typing.Dict[str, typing.List[np.ndarray]],
                    tokens_map: typing.Dict[str, str],
                    encoder: feature_encoding.base.BinaryEncoder,
                    additional_norm: normalization.NormalizationPipeline = None) -> typing.Iterator[typing.Tuple[str, typing.List[HashTest]]]:
    """Yield ``(threshold, hash_tests)`` for every threshold test set of every subject.

    The first element is the ``threshold`` attribute stored on the
    ``ThresholdTestSet``; the second is the list built by ``make_hash_tests``.
    """
    subject_test_sets = iter_subject_test_sets(
        template_data, sample_data, tokens_map, encoder, additional_norm
    )
    for subject_test_set in subject_test_sets:
        yield from (
            (threshold_set.threshold, make_hash_tests(threshold_set))
            for threshold_set in subject_test_set.threshold_tests
        )

# Execute Tests

In [25]:
def run_hash_tests(hash_tests: typing.List[HashTest]) -> TestResultsSummary:
    """Run every test in order and tally the outcomes in a new summary."""
    tally = TestResultsSummary()
    for outcome in (hash_test.run_test() for hash_test in hash_tests):
        tally.increment_count(outcome)
    return tally

## Threshold-Based Encoding

In [26]:
# Evaluate the threshold-based binary encoder: one accumulated
# TestResultsSummary per authentication threshold, summed over all subjects.
threshold_results_map = {}
threshold_encoder = feature_encoding.threshold.ThresholdBinaryEncoder(BINARY_THRESHOLD)
threshold_test_args = (
    processed_template_data_map,
    processed_data_map,
    subject_tokens_map,
    threshold_encoder,
)
for auth_threshold, hash_test_data in iter_hash_tests(*threshold_test_args):
    # setdefault seeds an empty summary the first time a threshold is seen.
    threshold_results_map[auth_threshold] = TestResultsSummary.merge_summaries(
        threshold_results_map.setdefault(auth_threshold, TestResultsSummary()),
        run_hash_tests(hash_test_data)
    )

In [27]:
# Tabulate false-accept rate, false-reject rate and accuracy per threshold
# for the threshold-based encoder.
threshold_results = []
for auth_threshold, result_summary in threshold_results_map.items():
    threshold_results.append([
        auth_threshold, result_summary.false_accept_rate, 
        result_summary.false_reject_rate, result_summary.accuracy
    ])
threshold_df = pd.DataFrame(
    threshold_results, columns=['Threshold', 'FAR', 'FRR', 'Accuracy']
)
# Trailing expression so the notebook displays the table.
threshold_df

Unnamed: 0,Threshold,FAR,FRR,Accuracy
0,0.1,0.0,0.048327,0.975836
1,0.2,0.0,0.0,1.0
2,0.3,0.003717,0.0,0.998141
3,0.4,0.100372,0.0,0.949814
4,0.5,0.821561,0.0,0.589219
5,0.6,1.0,0.0,0.5
6,0.7,1.0,0.0,0.5
7,0.8,1.0,0.0,0.5
8,0.9,1.0,0.0,0.5


## Direct Encoding

In [28]:
# Evaluate the direct binary encoder: one accumulated TestResultsSummary per
# authentication threshold, summed over all subjects.
direct_results_map = {}
direct_encoder = feature_encoding.direct.DirectBinaryEncoder()
direct_test_args = (
    processed_template_data_map,
    processed_data_map,
    subject_tokens_map,
    direct_encoder,
)
for auth_threshold, hash_test_data in iter_hash_tests(*direct_test_args):
    # setdefault seeds an empty summary the first time a threshold is seen.
    direct_results_map[auth_threshold] = TestResultsSummary.merge_summaries(
        direct_results_map.setdefault(auth_threshold, TestResultsSummary()),
        run_hash_tests(hash_test_data)
    )

In [29]:
# Tabulate false-accept rate, false-reject rate and accuracy per threshold
# for the direct encoder.
direct_results = []
for auth_threshold, result_summary in direct_results_map.items():
    direct_results.append([
        auth_threshold, result_summary.false_accept_rate, 
        result_summary.false_reject_rate, result_summary.accuracy
    ])
direct_df = pd.DataFrame(
    direct_results, columns=['Threshold', 'FAR', 'FRR', 'Accuracy']
)
# Trailing expression so the notebook displays the table.
direct_df

Unnamed: 0,Threshold,FAR,FRR,Accuracy
0,0.1,0.0,1.0,0.5
1,0.2,0.0,1.0,0.5
2,0.3,0.0,1.0,0.5
3,0.4,0.052045,0.003717,0.972119
4,0.5,1.0,0.0,0.5
5,0.6,1.0,0.0,0.5
6,0.7,1.0,0.0,0.5
7,0.8,1.0,0.0,0.5
8,0.9,1.0,0.0,0.5


## Gray Encoding

In [30]:
# Evaluate the Gray-code binary encoder. Unlike the other two encoder runs,
# an extra rescale-only normalization pipeline is passed on to hash generation.
gray_results_map = {}
gray_encoder = feature_encoding.gray.GrayCodeBinaryEncoder(n=8, round_to_whole=True)
codebook_normalization = normalization.NormalizationPipeline([
    normalization.RescaleNormalizationStep(RESCALE_LOWER, RESCALE_UPPER),
])
gray_test_args = (
    processed_template_data_map,
    processed_data_map,
    subject_tokens_map,
    gray_encoder,
    codebook_normalization,
)
for auth_threshold, hash_test_data in iter_hash_tests(*gray_test_args):
    # setdefault seeds an empty summary the first time a threshold is seen.
    gray_results_map[auth_threshold] = TestResultsSummary.merge_summaries(
        gray_results_map.setdefault(auth_threshold, TestResultsSummary()),
        run_hash_tests(hash_test_data)
    )

In [31]:
# Tabulate false-accept rate, false-reject rate and accuracy per threshold
# for the Gray-code encoder.
gray_results = []
for auth_threshold, result_summary in gray_results_map.items():
    gray_results.append([
        auth_threshold, result_summary.false_accept_rate, 
        result_summary.false_reject_rate, result_summary.accuracy
    ])
gray_df = pd.DataFrame(
    gray_results, columns=['Threshold', 'FAR', 'FRR', 'Accuracy']
)
# Trailing expression so the notebook displays the table.
gray_df

Unnamed: 0,Threshold,FAR,FRR,Accuracy
0,0.1,0.0,1.0,0.5
1,0.2,0.0,0.405204,0.797398
2,0.3,0.0,0.159851,0.920074
3,0.4,0.0,0.0,1.0
4,0.5,0.855019,0.0,0.572491
5,0.6,1.0,0.0,0.5
6,0.7,1.0,0.0,0.5
7,0.8,1.0,0.0,0.5
8,0.9,1.0,0.0,0.5


# Simulated Execution

In [None]:
def simulate_model_executions(test_data_map: typing.Dict[str, typing.List[np.ndarray]], 
                              tokens: typing.Dict[str, str], 
                              encoder: feature_encoding.base.BinaryEncoder, 
                              additional_norm: normalization.NormalizationPipeline = None) -> typing.Tuple[int, float, float]:
    """Time hash generation and hash comparison over all test samples.

    Every sample of every subject is hashed with that subject's token. Then,
    per subject, each hash is compared against a hash taken from a shuffled
    copy of the same subject's hashes. Only the durations are kept; the
    comparison results are discarded.

    Returns:
        ``(hash_count, mean_hash_seconds, mean_compare_seconds)``. Both means
        are ``nan`` when there were no samples to time, instead of
        ``statistics.mean`` raising ``StatisticsError``.
    """
    hashes: typing.List[typing.List[auth_biohash.bio_hash.BioHash]] = []
    hash_timings: typing.List[float] = []
    compare_timings: typing.List[float] = []
    for subject, test_data in test_data_map.items():
        sample_hashes: typing.List[auth_biohash.bio_hash.BioHash] = []
        for test_sample in test_data:
            hash_start = time.perf_counter()
            test_hash = auth_biohash.bio_hash.BioHash.generate_hash(
                test_sample, tokens[subject], encoder, additional_norm
            )
            hash_end = time.perf_counter()
            sample_hashes.append(test_hash)
            hash_timings.append(hash_end - hash_start)
        hashes.append(sample_hashes)
    for hash_samples in hashes:
        # Shuffle a copy so the original order stays available for pairing.
        randomized_samples = list(hash_samples)
        random.shuffle(randomized_samples)
        for sample, tester in zip(hash_samples, randomized_samples):
            compare_start = time.perf_counter()
            auth_biohash.bio_hash.BioHash.compare(sample, tester)
            compare_end = time.perf_counter()
            compare_timings.append(compare_end - compare_start)
    # statistics.mean rejects empty data, so report nan for "nothing measured".
    mean_hash_time = statistics.mean(hash_timings) if hash_timings else float('nan')
    mean_compare_time = statistics.mean(compare_timings) if compare_timings else float('nan')
    return len(hash_timings), mean_hash_time, mean_compare_time

In [None]:
# Time hashing and comparison on the processed sample data using the
# threshold-based encoder, then report the averages.
hash_count, average_hash_time, average_compare_time = simulate_model_executions(processed_data_map, subject_tokens_map, threshold_encoder)
print(f"Hashes computed and compared: {hash_count}")
print(f"Average hash time: {average_hash_time} seconds")
print(f"Average compare time: {average_compare_time} seconds")