# Setup

In [1]:
import numpy as np
import pandas as pd
import dataclasses
import enum
import typing
import random
import secrets
import time
import statistics
import eeg_bloom_template
import eeg_bloom_template.backend

from eeg_bloom_template.backend import BaseBloomFilterHashBackend
from eeg_auth_models_framework import data, pre_process, features, processor, normalization
from eeg_auth_models_framework.utils import conversion
from eeg_bloom_template.utils.iteration import iter_ratio_slices
from eeg_bloom_template.utils.orthonormalization import TokenDataGenerator

# Constants

In [2]:
# Hit-ratio cut-offs swept when deciding whether a comparison counts as a match.
AUTHENTICATION_THRESHOLDS = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
# Values passed as `segment_ratio` when templates are built.
SEGMENTATION_RATIOS = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
# Values passed as `false_positive_ratio` when templates are built.
FALSE_POSITIVE_RATES = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
# Sample frequency handed to the MNE data-frame converter.
DATASET_SAMPLE_FREQ_HZ = 200
# Channel names handed to the MNE data-frame converter.
DATA_CHANNEL_NAMES = ['T7','F8','Cz','P4']
# Bands given to the band-pass filter step.
# NOTE(review): a bound of None presumably leaves that side unfiltered — confirm.
FREQUENCIES = [
    pre_process.FrequencyBand(lower=8.0, upper=12.0, label='Alpha'),
    pre_process.FrequencyBand(lower=12.0, upper=35.0, label='Beta'),
    pre_process.FrequencyBand(lower=4.0, upper=8.0, label='Theta'),
    pre_process.FrequencyBand(lower=35.0, upper=None, label='Gamma'),
    pre_process.FrequencyBand(lower=None, upper=None, label='Raw'),
]
# Arguments of the windowing step (presumably expressed in samples — TODO confirm).
WINDOW_SIZE = 1200
WINDOW_OVERLAP = 0
# Bounds passed to both normalization steps.
RESCALE_LOWER = 0
RESCALE_UPPER = 255
# Ratio handed to iter_ratio_slices when test samples are cut.
SAMPLE_PERCENTAGE = 0.1
# Fixed seed so that the shuffling of non-target subjects is repeatable.
RANDOM_SEED = 100000000000
RANDOM_GENERATOR = random.Random(RANDOM_SEED)
# Slice step used to pick training rows: every Nth row goes to training.
SYSTEMIC_SAMPLE_RATE = 3

# Utilities

In [3]:
class HashBackendWrapper(BaseBloomFilterHashBackend):
    """Hash backend that forwards hashing to another backend and carries a label.

    The label is what the experiment uses to tell backends apart when results
    are grouped.
    """

    def __init__(self, real_backend: BaseBloomFilterHashBackend, label: str):
        """Store the wrapped backend and its label, then run the base initializer.

        :param real_backend: Backend that performs the actual hashing.
        :param label: Human-readable name for this backend configuration.
        """
        # Attributes are assigned before the base initializer runs, as in the
        # original ordering.
        self.real_backend = real_backend
        self.label = label
        super().__init__()

    def run_hash_function(self, input_data: bytes) -> int:
        """Return the wrapped backend's hash of ``input_data``."""
        hashed_value = self.real_backend.run_hash_function(input_data)
        return hashed_value


class TestResultType(enum.Enum):
    """Confusion-matrix cell that a single template test falls into."""

    # Values are the ones enum.auto() would assign: consecutive integers from 1.
    TRUE_POSITIVE = 1
    FALSE_POSITIVE = 2
    FALSE_NEGATIVE = 3
    TRUE_NEGATIVE = 4


@dataclasses.dataclass
class TestResultsSummary:
    """Confusion-matrix counters for a batch of template tests.

    The derived rates return NaN instead of raising ``ZeroDivisionError`` when
    their denominator is zero (for example, a summary with no negative cases),
    so an empty or one-sided summary can still be tabulated.
    """
    true_positives: int = 0
    false_positives: int = 0
    false_negatives: int = 0
    true_negatives: int = 0

    @classmethod
    def merge_summaries(cls,
                        summary_a: 'TestResultsSummary',
                        summary_b: 'TestResultsSummary') -> 'TestResultsSummary':
        """Return a new summary whose counters are the element-wise sums.

        :param summary_a: First summary to combine.
        :param summary_b: Second summary to combine.
        :return: A new instance of ``cls``; neither input is modified.
        """
        return cls(
            true_positives=summary_a.true_positives + summary_b.true_positives,
            false_positives=summary_a.false_positives + summary_b.false_positives,
            false_negatives=summary_a.false_negatives + summary_b.false_negatives,
            true_negatives=summary_a.true_negatives + summary_b.true_negatives
        )

    def increment_count(self, result_type: 'TestResultType'):
        """Add one to the counter matching ``result_type``.

        :param result_type: Outcome category of a single test.
        :raises ValueError: If ``result_type`` is not one of the four known
            categories (previously such values were silently counted as
            true negatives).
        """
        if result_type == TestResultType.TRUE_POSITIVE:
            self.true_positives += 1
        elif result_type == TestResultType.FALSE_POSITIVE:
            self.false_positives += 1
        elif result_type == TestResultType.FALSE_NEGATIVE:
            self.false_negatives += 1
        elif result_type == TestResultType.TRUE_NEGATIVE:
            self.true_negatives += 1
        else:
            raise ValueError(f'Unsupported result type: {result_type!r}')

    @staticmethod
    def _safe_ratio(numerator: int, denominator: int) -> float:
        """Return ``numerator / denominator``, or NaN when the denominator is zero."""
        if not denominator:
            return float('nan')
        return numerator / denominator

    @property
    def accuracy(self) -> float:
        """Fraction of all recorded tests that were true positives or true negatives."""
        hits = self.true_positives + self.true_negatives
        total = (
            self.true_positives + self.true_negatives +
            self.false_positives + self.false_negatives
        )
        return self._safe_ratio(hits, total)

    @property
    def false_accept_rate(self) -> float:
        """False positives divided by all negative cases (FP + TN)."""
        return self._safe_ratio(
            self.false_positives,
            self.false_positives + self.true_negatives
        )

    @property
    def false_reject_rate(self) -> float:
        """False negatives divided by all positive cases (FN + TP)."""
        return self._safe_ratio(
            self.false_negatives,
            self.false_negatives + self.true_positives
        )

    @property
    def half_total_error_rate(self) -> float:
        """Mean of the false accept and false reject rates (NaN if either is NaN)."""
        return (self.false_accept_rate + self.false_reject_rate) / 2


@dataclasses.dataclass
class TemplateTest:
    """One comparison of sample data against a template, with its expected outcome.

    ``if_expected`` is reported when the match decision equals
    ``expected_result``; ``if_unexpected`` is reported otherwise.
    """
    expected_result: bool
    if_expected: TestResultType
    if_unexpected: TestResultType
    test_data: typing.List[np.ndarray]

    def run_test(self, test_template: eeg_bloom_template.EEGTemplate, test_threshold: float):
        """Compare ``test_data`` with ``test_template`` and classify the outcome.

        A comparison counts as a match when its hit ratio is at least
        ``test_threshold``.

        :param test_template: Template to compare the stored sample against.
        :param test_threshold: Minimum hit ratio that counts as a match.
        :return: ``if_expected`` or ``if_unexpected``.
        """
        outcome = test_template.compare(self.test_data)
        matched = outcome.hit_ratio >= test_threshold
        return self.if_expected if matched == self.expected_result else self.if_unexpected
    

@dataclasses.dataclass
class TestTemplateData:
    """A generated template together with the settings that produced it."""
    # Template built from one subject's training features.
    template: eeg_bloom_template.EEGTemplate
    # Value that was passed as `false_positive_ratio` when building the template.
    false_positive_rate: float
    # Backend that was passed as `hash_backend` when building the template.
    hash_backend: eeg_bloom_template.backend.BaseBloomFilterHashBackend
    # Key of the subject whose data the template was built from.
    original_subject: str
    
    
@dataclasses.dataclass
class TemplateTestSet:
    """All tests to run against one template at one authentication threshold."""
    # Hit-ratio cut-off handed to every test in this set.
    threshold: float
    # Template under test and the settings it was built with.
    template_data: TestTemplateData
    # Tests built from the template subject's own samples (a match is expected).
    positive_cases: typing.List[TemplateTest]
    # Tests built from other subjects' samples (no match is expected).
    negative_cases: typing.List[TemplateTest]

# Configuration

In [4]:
# Dataset access helpers: `downloader.retrieve()` later supplies the data path
# and `reader.format_data()` turns it into the per-subject data map.
downloader = data.AuditoryDataDownloader()
reader = data.AuditoryDataReader()
# Converter handed to the band-pass filter step, configured with the channel
# names and sample frequency of the dataset.
converter = conversion.MNEDataFrameConverter(
    channels=DATA_CHANNEL_NAMES, 
    sample_frequency=DATASET_SAMPLE_FREQ_HZ
)

# Data Processing Setup

## Sample Data Processor

### Pre-Processing Steps

In [5]:
# Pre-processing pipeline: band-pass filtering over the configured bands,
# followed by windowing with WINDOW_SIZE / WINDOW_OVERLAP.
# NOTE(review): step order is assumed to be execution order — confirm in the framework.
pre_process_steps = pre_process.PreProcessingPipeline([
    pre_process.EEGBandpassFilterStep(
        FREQUENCIES,
        converter
    ),
    pre_process.DataWindowStep(WINDOW_SIZE, WINDOW_OVERLAP)
])

### Feature Extraction Steps

In [6]:
# Feature extraction pipeline with a single statistical extractor that is
# configured for min, max, mean and zero-crossing-rate features.
feature_extraction_steps = features.FeatureExtractPipeline([
    features.StatisticalFeatureExtractor([
        features.StatisticalFeature.MIN,
        features.StatisticalFeature.MAX,
        features.StatisticalFeature.MEAN,
        features.StatisticalFeature.ZERO_CROSSING_RATE
    ])
])

### Normalization Steps

In [7]:
# Normalization pipeline: a rescale step and a histogram-equalization step,
# both configured with the RESCALE_LOWER..RESCALE_UPPER bounds.
normalization_steps = normalization.NormalizationPipeline([
    normalization.RescaleNormalizationStep(RESCALE_LOWER, RESCALE_UPPER),
    normalization.HistogramEqualizationStep(RESCALE_LOWER, RESCALE_UPPER)
])

### Data Processor Setup

In [8]:
# Full processor (pre-processing, feature extraction, normalization) used for
# both the training and the testing data.
data_processor = processor.DataProcessor(
    pre_process=pre_process_steps,
    feature_extraction=feature_extraction_steps,
    normalization=normalization_steps
)
# Same pipeline without the normalization stage.
# NOTE(review): not referenced anywhere else in the visible notebook.
token_data_processor = processor.DataProcessor(
    pre_process=pre_process_steps,
    feature_extraction=feature_extraction_steps,
)

# Subject Data

In [9]:
# Type alias: subject key -> list of that subject's raw data frames.
RawDataMap = typing.Dict[str, typing.List[pd.DataFrame]]

In [10]:
def make_training_test_maps(data_map: RawDataMap) -> typing.Tuple[RawDataMap, RawDataMap]:
    """Split every frame of every subject into a training part and a test part.

    :param data_map: Subject key mapped to that subject's list of frames.
    :return: ``(training_map, test_map)``; both have the same keys as
        ``data_map`` and hold, per subject, one split frame for each input
        frame in the original order.
    """
    training_map = {}
    testing_map = {}

    for subject, frames in data_map.items():
        # Split each frame once, then fan the pairs out into the two maps.
        frame_splits = [make_training_test_frames(frame) for frame in frames]
        training_map[subject] = [train_part for train_part, _ in frame_splits]
        testing_map[subject] = [test_part for _, test_part in frame_splits]

    return training_map, testing_map


def make_training_test_frames(frame: pd.DataFrame,
                              sample_rate: typing.Optional[int] = None
                              ) -> typing.Tuple[pd.DataFrame, pd.DataFrame]:
    """Split a frame by systematic sampling: every Nth row trains, the rest test.

    The split is positional. The earlier label-based ``drop`` removed every
    row sharing an index label with a training row, so frames with a
    non-unique index silently lost test rows. For a unique index the selected
    rows are the same as before.

    :param frame: Frame to split; it is not modified.
    :param sample_rate: Step between training rows. ``None`` (the default)
        uses the module-level ``SYSTEMIC_SAMPLE_RATE``.
    :return: ``(training, test)`` frames that together contain every row of
        ``frame`` exactly once, each in the original row order.
    :raises ValueError: If the effective step is smaller than 1.
    """
    # Resolve the default at call time so a changed constant is picked up.
    step = SYSTEMIC_SAMPLE_RATE if sample_rate is None else sample_rate
    if step < 1:
        raise ValueError(f'sample_rate must be a positive integer, got {step!r}')

    # Mark rows 0, step, 2*step, ... as training rows.
    training_mask = np.zeros(len(frame), dtype=bool)
    training_mask[::step] = True

    training = frame.iloc[training_mask]
    test = frame.iloc[~training_mask]
    return training, test

In [11]:
# Obtain the dataset location, read it into a per-subject map, then split each
# subject's frames into training and test portions.
data_path = downloader.retrieve()
subject_data_map = reader.format_data(data_path)
subject_data_train, subject_data_test = make_training_test_maps(subject_data_map)

## Token Setup

In [12]:
# Per-subject 32-bit seed for the seeded MMH3 backend.
# NOTE(review): `secrets` is not governed by RANDOM_SEED, so the seeds (and any
# result that depends on them) differ between runs.
subject_seeds_map = {subject: secrets.randbits(32) for subject in subject_data_map}
# Per-subject token for the token backend.
# NOTE(review): presumably also freshly random on every run — confirm.
subject_tokens_map = {subject: TokenDataGenerator.generate_random_token() for subject in subject_data_map}

# Processing

## Hash Backend Construction

In [13]:
def generate_hash_backends(seed: int, token: str) -> typing.List[HashBackendWrapper]:
    """Build the labelled hash backends that the experiment compares.

    :param seed: Seed for the seeded MMH3 backend.
    :param token: Token for the token backend.
    :return: Four wrapped backends, in order: FNV, unseeded MMH3, seeded MMH3
        and token-based.
    """
    backends = eeg_bloom_template.backend
    wrapped_backends = []

    wrapped_backends.append(
        HashBackendWrapper(backends.FNVBloomFilterBackend(), 'FNV Backend')
    )
    wrapped_backends.append(
        HashBackendWrapper(backends.MMH3BloomFilterBackend(), 'MMH3 Backend (No Seed)')
    )
    wrapped_backends.append(
        HashBackendWrapper(backends.MMH3BloomFilterBackend(seed=seed), 'MMH3 Backend (Seed)')
    )
    wrapped_backends.append(
        HashBackendWrapper(backends.TokenBackend(token), 'Token Backend')
    )

    return wrapped_backends

## Template Setup

### Template Generation

In [14]:
def iter_test_templates(subject_template_data: typing.Dict[str, typing.List[np.ndarray]], 
                        subject_seeds: typing.Dict[str, int],
                        subject_tokens: typing.Dict[str, str]) -> typing.Iterator[TestTemplateData]:
    """Lazily build one template per subject and parameter combination.

    For every subject the generator walks, from outermost to innermost: the
    row-wise flag (True, then False), the hash backends, the segmentation
    ratios and the false positive rates.

    :param subject_template_data: Subject key mapped to its feature arrays.
    :param subject_seeds: Subject key mapped to its MMH3 seed.
    :param subject_tokens: Subject key mapped to its token.
    :return: Iterator of templates bundled with the settings that built them.
    """
    for subject, feature_arrays in subject_template_data.items():
        subject_seed = subject_seeds[subject]
        subject_token = subject_tokens[subject]
        for use_rows in (True, False):
            # A fresh set of backends is created for each row-wise setting.
            for wrapped_backend in generate_hash_backends(subject_seed, subject_token):
                for segment_ratio in SEGMENTATION_RATIOS:
                    for fp_rate in FALSE_POSITIVE_RATES:
                        built_template = eeg_bloom_template.EEGTemplate.make_template(
                            feature_data=feature_arrays,
                            hash_backend=wrapped_backend,
                            segment_ratio=segment_ratio,
                            false_positive_ratio=fp_rate,
                            row_wise=use_rows
                        )
                        yield TestTemplateData(
                            template=built_template,
                            false_positive_rate=fp_rate,
                            hash_backend=wrapped_backend,
                            original_subject=subject
                        )

## Sample Data Setup

In [15]:
# Run the full processor over each subject's training frames.
processed_training_data = {
    subject: data_processor.process(subject_data_train[subject])
    for subject in subject_data_train
}
# Run the same processor over each subject's test frames.
processed_testing_data = {
    subject: data_processor.process(subject_data_test[subject])
    for subject in subject_data_test
}

Creating RawArray with float64 data, n_channels=4, n_times=8000
    Range : 0 ... 7999 =      0.000 ...    39.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=8000
    Range : 0 ... 7999 =      0.000 ...    39.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=8000
    Range : 0 ... 7999 =      0.000 ...    39.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=8000
    Range : 0 ... 7999 =      0.000 ...    39.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=8000
    Range : 0 ... 7999 =      0.000 ...    39.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=13372
    Range : 0 ... 13371 =      0.000 ...    66.855 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=8000
    Range : 0 ... 7999 =      0.000 ...    39.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=8000
    Range : 0 ... 7999 =      0.000 ...    39.995 se

# Test Set Assembly

## Gathering Test Sets

In [16]:
def get_experiment_samples(data_map: typing.Dict[str, typing.List[np.ndarray]], 
                           target: str) -> typing.Tuple[typing.List[np.ndarray], typing.List[np.ndarray]]:
    """Collect positive and negative sample slices for one target subject.

    Positive samples are the slices that ``iter_ratio_slices`` produces from
    the target's own data. Negative samples are taken from the other
    subjects, visited in an order shuffled with ``RANDOM_GENERATOR``, until
    there are at least as many negatives as positives or no subjects remain.
    Each visited subject contributes all of its slices, so the negative list
    may end up longer than the positive list.

    :param data_map: Subject key mapped to its processed data.
    :param target: Key of the subject the positives are drawn from.
    :return: ``(positive_samples, negative_samples)``.
    """
    genuine_slices = list(iter_ratio_slices(data_map[target], SAMPLE_PERCENTAGE))

    other_subjects = [subject for subject in data_map if subject != target]
    RANDOM_GENERATOR.shuffle(other_subjects)

    impostor_slices = []
    while other_subjects:
        if len(impostor_slices) >= len(genuine_slices):
            break
        # Subjects are consumed from the end of the shuffled list.
        donor = other_subjects.pop()
        impostor_slices.extend(iter_ratio_slices(data_map[donor], SAMPLE_PERCENTAGE))

    return genuine_slices, impostor_slices


def iter_template_test_sets(training_data_map: typing.Dict[str, typing.List[np.ndarray]],
                            testing_data_map: typing.Dict[str, typing.List[np.ndarray]],
                            subject_seeds: typing.Dict[str, int],
                            subject_tokens: typing.Dict[str, str]) -> typing.Iterator[TemplateTestSet]:
    """Lazily pair every generated template with test cases per threshold.

    Samples are drawn anew for each (template, threshold) pair, so the shared
    random generator advances once per yielded test set.

    :param training_data_map: Subject key mapped to the data templates are built from.
    :param testing_data_map: Subject key mapped to the data test samples are cut from.
    :param subject_seeds: Subject key mapped to its MMH3 seed.
    :param subject_tokens: Subject key mapped to its token.
    :return: Iterator of test sets, one per template and authentication threshold.
    """
    def build_cases(samples, expect_match, on_expected, on_unexpected):
        # Wrap each sample in a test that knows how to label its own outcome.
        return [
            TemplateTest(
                expected_result=expect_match,
                if_expected=on_expected,
                if_unexpected=on_unexpected,
                test_data=sample
            )
            for sample in samples
        ]

    template_stream = iter_test_templates(training_data_map, subject_seeds, subject_tokens)
    for template_data in template_stream:
        for auth_threshold in AUTHENTICATION_THRESHOLDS:
            genuine_samples, impostor_samples = get_experiment_samples(
                testing_data_map,
                template_data.original_subject
            )
            yield TemplateTestSet(
                threshold=auth_threshold,
                template_data=template_data,
                positive_cases=build_cases(
                    genuine_samples,
                    True,
                    TestResultType.TRUE_POSITIVE,
                    TestResultType.FALSE_NEGATIVE
                ),
                negative_cases=build_cases(
                    impostor_samples,
                    False,
                    TestResultType.TRUE_NEGATIVE,
                    TestResultType.FALSE_POSITIVE
                )
            )

# Execute Tests

In [17]:
def run_template_test_set(test_set: TemplateTestSet) -> TestResultsSummary:
    """Run every case of a test set and tally the outcomes.

    Positive cases are executed first, then negative cases; all of them use
    the template and threshold stored on ``test_set``.

    :param test_set: Template, threshold and cases to evaluate.
    :return: Summary holding one counted outcome per executed case.
    """
    tally = TestResultsSummary()
    for case_group in (test_set.positive_cases, test_set.negative_cases):
        for case in case_group:
            outcome = case.run_test(
                test_set.template_data.template,
                test_set.threshold
            )
            tally.increment_count(outcome)
    return tally

In [18]:
# Aggregated outcomes keyed by
# (false positive rate, threshold, backend label, segment ratio, row-wise flag).
# Subjects are not part of the key, so each entry pools the results of every
# subject's template built with that parameter combination.
results_data: typing.Dict[typing.Tuple[float, float, str, float, bool], TestResultsSummary] = {}
for template_test_set in iter_template_test_sets(processed_training_data, processed_testing_data, subject_seeds_map, subject_tokens_map):
    false_positive_rate = template_test_set.template_data.false_positive_rate
    threshold = template_test_set.threshold
    hash_backend_label = template_test_set.template_data.hash_backend.label
    segment_ratio = template_test_set.template_data.template.segment_ratio
    row_wise = template_test_set.template_data.template.row_wise
    result_key = (
        false_positive_rate,
        threshold,
        hash_backend_label,
        segment_ratio,
        row_wise
    )
    # Start from an empty summary the first time a combination is seen.
    if result_key not in results_data:
        results_data[result_key] = TestResultsSummary()
    # Fold this test set's outcomes into the running totals for the combination.
    results_data[result_key] = TestResultsSummary.merge_summaries(
        results_data[result_key],
        run_template_test_set(template_test_set)
    )

In [19]:
# Flatten the aggregated summaries into one table row per parameter combination.
results_data_rows = []
for data_key, results_summary in results_data.items():
    # Key layout matches the tuple built when the results were aggregated.
    fpr, threshold, backend_label, segment_ratio, row_wise = data_key
    row_wise_label = 'Rows' if row_wise else 'Columns'
    # Cell order must line up with the `columns` list given to the DataFrame below.
    results_data_rows.append([
        fpr, threshold, segment_ratio, row_wise_label,
        backend_label, results_summary.false_accept_rate, 
        results_summary.false_reject_rate, results_summary.half_total_error_rate, 
        results_summary.accuracy
    ])
results_dataframe = pd.DataFrame(
    results_data_rows, 
    columns=['FPR', 'Threshold', 'Segment Ratio', 'Processing', 'Backend', 'FAR', 'FRR', 'HTER', 'Accuracy']
)
# Show the ten configurations with the highest accuracy.
results_dataframe.sort_values(by=['Accuracy'], ascending=False).head(10)

Unnamed: 0,FPR,Threshold,Segment Ratio,Processing,Backend,FAR,FRR,HTER,Accuracy
2553,0.3,0.4,0.6,Rows,MMH3 Backend (Seed),0.01845,0.174419,0.096434,0.905482
2412,0.1,0.3,0.5,Rows,MMH3 Backend (Seed),0.026923,0.186047,0.106485,0.893822
522,0.15,0.3,0.6,Rows,FNV Backend,0.059041,0.186047,0.122544,0.879017
1433,0.2,0.4,0.5,Rows,MMH3 Backend (No Seed),0.011538,0.24031,0.125924,0.874517
1542,0.25,0.3,0.6,Rows,MMH3 Backend (No Seed),0.077491,0.182171,0.129831,0.871456
1353,0.3,0.4,0.4,Rows,MMH3 Backend (No Seed),0.112628,0.158915,0.135771,0.865699
503,0.05,0.4,0.6,Rows,FNV Backend,0.0369,0.24031,0.138605,0.863894
2434,0.2,0.5,0.5,Rows,MMH3 Backend (Seed),0.034615,0.24031,0.137463,0.862934
2533,0.2,0.4,0.6,Rows,MMH3 Backend (Seed),0.04428,0.236434,0.140357,0.862004
412,0.1,0.3,0.5,Rows,FNV Backend,0.095745,0.186047,0.140896,0.861111


In [20]:
# Show the ten configurations with the lowest half total error rate.
results_dataframe.sort_values(by=['HTER'], ascending=True).head(10)

Unnamed: 0,FPR,Threshold,Segment Ratio,Processing,Backend,FAR,FRR,HTER,Accuracy
2553,0.3,0.4,0.6,Rows,MMH3 Backend (Seed),0.01845,0.174419,0.096434,0.905482
2412,0.1,0.3,0.5,Rows,MMH3 Backend (Seed),0.026923,0.186047,0.106485,0.893822
522,0.15,0.3,0.6,Rows,FNV Backend,0.059041,0.186047,0.122544,0.879017
1433,0.2,0.4,0.5,Rows,MMH3 Backend (No Seed),0.011538,0.24031,0.125924,0.874517
1542,0.25,0.3,0.6,Rows,MMH3 Backend (No Seed),0.077491,0.182171,0.129831,0.871456
1353,0.3,0.4,0.4,Rows,MMH3 Backend (No Seed),0.112628,0.158915,0.135771,0.865699
2434,0.2,0.5,0.5,Rows,MMH3 Backend (Seed),0.034615,0.24031,0.137463,0.862934
503,0.05,0.4,0.6,Rows,FNV Backend,0.0369,0.24031,0.138605,0.863894
2533,0.2,0.4,0.6,Rows,MMH3 Backend (Seed),0.04428,0.236434,0.140357,0.862004
412,0.1,0.3,0.5,Rows,FNV Backend,0.095745,0.186047,0.140896,0.861111


In [21]:
# Number of distinct parameter combinations that were aggregated.
print(f'Total number of test results: {len(results_data)}')

Total number of test results: 8000


# Simulated Execution

In [22]:
def simulate_model_executions(training_data_map: typing.Dict[str, typing.List[np.ndarray]],
                              testing_data_map: typing.Dict[str, typing.List[np.ndarray]],
                              backend: eeg_bloom_template.backend.BaseBloomFilterHashBackend,
                              max_executions: int = 20,
                              segment_ratio: float = 0.1,
                              false_positive_ratio: float = 0.1
                              ) -> typing.Tuple[int, float, float]:
    """Time template generation and comparison for up to ``max_executions`` subjects.

    One template is built per subject from its training data, then each
    template is compared once against the same subject's testing data. Only
    the ``make_template`` and ``compare`` calls sit inside the timed regions.

    :param training_data_map: Subject key mapped to the data templates are built from.
    :param testing_data_map: Subject key mapped to the data each template is
        compared against; must contain every subject that gets a template.
    :param backend: Hash backend used for every template.
    :param max_executions: Upper bound on the number of subjects that are timed.
    :param segment_ratio: ``segment_ratio`` passed to ``make_template``.
    :param false_positive_ratio: ``false_positive_ratio`` passed to ``make_template``.
    :return: ``(templates built, mean template build time, mean compare time)``,
        with both times in seconds.
    :raises ValueError: If ``max_executions`` is smaller than 1.
    :raises statistics.StatisticsError: If ``training_data_map`` is empty, as
        there are no timings to average.
    """
    if max_executions < 1:
        raise ValueError(f'max_executions must be at least 1, got {max_executions!r}')

    templates: typing.Dict[str, eeg_bloom_template.EEGTemplate] = {}
    template_timings: typing.List[float] = []
    compare_timings: typing.List[float] = []

    for subject, training_data in training_data_map.items():
        if len(template_timings) >= max_executions:
            break
        template_start = time.perf_counter()
        # Keyword arguments keep the two ratios from being swapped silently.
        template = eeg_bloom_template.EEGTemplate.make_template(
            feature_data=training_data,
            hash_backend=backend,
            segment_ratio=segment_ratio,
            false_positive_ratio=false_positive_ratio
        )
        template_end = time.perf_counter()
        template_timings.append(template_end - template_start)
        templates[subject] = template

    # `templates` already holds at most `max_executions` entries, so no second
    # cap is needed here.
    for subject, template in templates.items():
        sample_data = testing_data_map[subject]
        compare_start = time.perf_counter()
        template.compare(sample_data)
        compare_end = time.perf_counter()
        compare_timings.append(compare_end - compare_start)

    return len(template_timings), statistics.mean(template_timings), statistics.mean(compare_timings)

In [23]:
# Unwrapped backends used for the timing runs (MMH3 is created without a seed).
fnv_backend = eeg_bloom_template.backend.FNVBloomFilterBackend()
mmh3_backend = eeg_bloom_template.backend.MMH3BloomFilterBackend()

In [24]:
# Time template generation and comparison with the FNV backend and report the averages.
fnv_executions, fnv_template_timing, fnv_compare_timing = simulate_model_executions(processed_training_data, processed_testing_data, fnv_backend)
print("FNV")
print(f"Executions: {fnv_executions}")
print(f"Average template generation time: {fnv_template_timing} seconds")
print(f"Average compare timing: {fnv_compare_timing} seconds")

FNV
Executions: 20
Average template generation time: 0.0011572210962185637 seconds
Average compare timing: 0.0017876223995699548 seconds


In [25]:
# Time template generation and comparison with the MMH3 backend and report the averages.
mmh3_executions, mmh3_template_timing, mmh3_compare_timing = simulate_model_executions(processed_training_data, processed_testing_data, mmh3_backend)
print("MMH3")
print(f"Executions: {mmh3_executions}")
print(f"Average template generation time: {mmh3_template_timing} seconds")
print(f"Average compare timing: {mmh3_compare_timing} seconds")

MMH3
Executions: 20
Average template generation time: 0.0006970085989451036 seconds
Average compare timing: 0.001056241150945425 seconds
