# Setup

In [1]:
import numpy as np
import pandas as pd
import dataclasses
import enum
import typing
import random
import fuzzy_hash_lib
import feature_encoding.base
import feature_encoding.threshold

from eeg_auth_models_framework import data, pre_process, features, processor, normalization
from eeg_auth_models_framework.utils import conversion
from auth_biohash import orthonormalization

# Constants

In [2]:
# Similarity cut-offs evaluated in every experiment: 10, 20, ..., 90.
AUTHENTICATION_THRESHOLDS = list(range(10, 100, 10))
DATASET_SAMPLE_FREQ_HZ = 200
DATA_CHANNEL_NAMES = ['T7', 'F8', 'Cz', 'P4']
# Bands handed to the band-pass filter step; `None` leaves that edge unset.
# The order is kept exactly as listed.
FREQUENCIES = [
    pre_process.FrequencyBand(lower=band_lower, upper=band_upper, label=band_label)
    for band_lower, band_upper, band_label in (
        (8.0, 12.0, 'Alpha'),
        (12.0, 35.0, 'Beta'),
        (4.0, 8.0, 'Theta'),
        (35.0, None, 'Gamma'),
        (None, None, 'Raw'),
    )
]
# Arguments for the sample pipeline's DataWindowStep.
# NOTE(review): presumably expressed in samples (1200 samples = 6 s at 200 Hz) - confirm.
WINDOW_SIZE = 1200
WINDOW_OVERLAP = 0
# Cut-off given to ThresholdBinaryEncoder.
BINARY_THRESHOLD = 127
# Bounds given to the rescale and histogram-equalization normalization steps.
RESCALE_LOWER, RESCALE_UPPER = 0, 255
# Dedicated generator used when sampling test cases.
RANDOM_SEED = 100_000_000_000
RANDOM_GENERATOR = random.Random(RANDOM_SEED)

# Utilities

In [3]:
@enum.unique
class TestResultType(enum.Enum):
    """Possible outcomes of comparing a sample hash against a template hash."""
    # Values are the same integers `enum.auto()` assigns (1, 2, 3, 4).
    TRUE_POSITIVE = 1
    FALSE_POSITIVE = 2
    FALSE_NEGATIVE = 3
    TRUE_NEGATIVE = 4


@dataclasses.dataclass
class TestResultsSummary:
    """Confusion-matrix counters for a batch of hash tests, plus derived rates.

    Every rate property returns ``float('nan')`` when its denominator is zero
    (for example on a freshly created summary) instead of raising
    ``ZeroDivisionError``.
    """
    true_positives: int = 0
    false_positives: int = 0
    false_negatives: int = 0
    true_negatives: int = 0

    @classmethod
    def merge_summaries(cls,
                        summary_a: 'TestResultsSummary',
                        summary_b: 'TestResultsSummary') -> 'TestResultsSummary':
        """Return a new summary whose counters are the element-wise sums of both inputs."""
        return cls(
            true_positives=summary_a.true_positives + summary_b.true_positives,
            false_positives=summary_a.false_positives + summary_b.false_positives,
            false_negatives=summary_a.false_negatives + summary_b.false_negatives,
            true_negatives=summary_a.true_negatives + summary_b.true_negatives
        )

    def increment_count(self, result_type: 'TestResultType') -> None:
        """Add one to the counter matching ``result_type``.

        Raises:
            ValueError: if ``result_type`` is not a known ``TestResultType`` member
                (previously such values were silently counted as true negatives).
        """
        if result_type == TestResultType.TRUE_POSITIVE:
            self.true_positives += 1
        elif result_type == TestResultType.FALSE_POSITIVE:
            self.false_positives += 1
        elif result_type == TestResultType.FALSE_NEGATIVE:
            self.false_negatives += 1
        elif result_type == TestResultType.TRUE_NEGATIVE:
            self.true_negatives += 1
        else:
            raise ValueError(f'Unknown test result type: {result_type!r}')

    @staticmethod
    def _ratio(numerator: int, denominator: int) -> float:
        """Divide, returning NaN when the rate is undefined (zero denominator)."""
        if denominator == 0:
            return float('nan')
        return numerator / denominator

    @property
    def accuracy(self) -> float:
        """Fraction of all recorded tests that were true positives or true negatives."""
        hits = self.true_positives + self.true_negatives
        total = (
            self.true_positives + self.true_negatives +
            self.false_positives + self.false_negatives
        )
        return self._ratio(hits, total)

    @property
    def false_accept_rate(self) -> float:
        """False positives divided by all tests that should not have matched."""
        return self._ratio(
            self.false_positives, self.false_positives + self.true_negatives
        )

    @property
    def false_reject_rate(self) -> float:
        """False negatives divided by all tests that should have matched."""
        return self._ratio(
            self.false_negatives, self.false_negatives + self.true_positives
        )

    @property
    def half_total_error_rate(self) -> float:
        """Mean of the false accept rate and the false reject rate."""
        return (self.false_accept_rate + self.false_reject_rate) / 2


@dataclasses.dataclass
class HashTest:
    """A single comparison between two fuzzy hashes with a known expected outcome.

    ``if_expected`` is reported when the match decision equals
    ``expected_result``; ``if_unexpected`` is reported otherwise.
    """
    expected_result: bool
    threshold: int
    if_expected: TestResultType
    if_unexpected: TestResultType
    hashes: typing.Tuple[fuzzy_hash_lib.FuzzyHash, fuzzy_hash_lib.FuzzyHash]

    def run_test(self) -> TestResultType:
        """Compare the two hashes and classify the outcome."""
        first_hash = self.hashes[0]
        second_hash = self.hashes[1]
        score = fuzzy_hash_lib.FuzzyHash.compare(first_hash, second_hash)
        # A score at or above the threshold counts as a match.
        decided_match = score >= self.threshold
        if decided_match == self.expected_result:
            return self.if_expected
        return self.if_unexpected
    
@dataclasses.dataclass
class ThresholdTestSet:
    """Template hash plus candidate hashes to be tested at one threshold."""
    # Similarity cut-off forwarded to every HashTest built from this set.
    threshold: int
    # Hash of the target subject's template.
    template_hash: fuzzy_hash_lib.FuzzyHash
    # Sample hashes that belong to the same subject as the template.
    positive_cases: typing.List[fuzzy_hash_lib.FuzzyHash]
    # Sample hashes that belong to every other subject.
    negative_cases: typing.List[fuzzy_hash_lib.FuzzyHash]
    
@dataclasses.dataclass
class SubjectTestSet:
    """All threshold test sets built for one target subject."""
    # Identifier of the subject whose template is being tested.
    subject_id: str
    # One ThresholdTestSet per entry of AUTHENTICATION_THRESHOLDS.
    threshold_tests: typing.List[ThresholdTestSet]

# Configuration

In [4]:
# Dataset helpers: `downloader.retrieve()` provides the data path and
# `reader.format_data(path)` turns it into the per-subject data map used below.
downloader = data.AuditoryDataDownloader()
reader = data.AuditoryDataReader()
# Converter shared by the band-pass filter steps of both processing pipelines.
converter = conversion.MNEDataFrameConverter(
    channels=DATA_CHANNEL_NAMES, 
    sample_frequency=DATASET_SAMPLE_FREQ_HZ
)

# Data Processing Setup

## Template Hash Processor

### Pre-Processing Steps

In [5]:
# Template pre-processing: band-pass filtering only. Unlike the sample
# pipeline there is no DataWindowStep here.
template_pre_process_steps = pre_process.PreProcessingPipeline([
    pre_process.EEGBandpassFilterStep(
        FREQUENCIES,
        converter
    )
])

### Feature Extraction Steps

In [6]:
# Template features: the same four statistical features as the sample pipeline.
template_feature_extraction_steps = features.FeatureExtractPipeline([
    features.StatisticalFeatureExtractor([
        features.StatisticalFeature.MIN,
        features.StatisticalFeature.MAX,
        features.StatisticalFeature.MEAN,
        features.StatisticalFeature.ZERO_CROSSING_RATE
    ])
])

### Normalization Steps

In [7]:
# Template normalization: rescale, then histogram equalization, both
# configured with the RESCALE_LOWER..RESCALE_UPPER bounds.
template_normalization_steps = normalization.NormalizationPipeline([
    normalization.RescaleNormalizationStep(RESCALE_LOWER, RESCALE_UPPER),
    normalization.HistogramEqualizationStep(RESCALE_LOWER, RESCALE_UPPER)
])

### Data Processor Setup

In [8]:
# Full template processor: pre-processing, feature extraction and normalization.
template_data_processor = processor.DataProcessor(
    pre_process=template_pre_process_steps,
    feature_extraction=template_feature_extraction_steps,
    normalization=template_normalization_steps
)
# Variant without the normalization pipeline; its output feeds the
# "Tokens without Histogram Normalization" experiment.
token_template_processor = processor.DataProcessor(
    pre_process=template_pre_process_steps,
    feature_extraction=template_feature_extraction_steps
)

## Sample Data Processor

### Pre-Processing Steps

In [9]:
# Sample pre-processing: the same band-pass filter as the template pipeline,
# followed by a windowing step configured with WINDOW_SIZE and WINDOW_OVERLAP.
pre_process_steps = pre_process.PreProcessingPipeline([
    pre_process.EEGBandpassFilterStep(
        FREQUENCIES,
        converter
    ),
    pre_process.DataWindowStep(WINDOW_SIZE, WINDOW_OVERLAP)
])

### Feature Extraction Steps

In [10]:
# Sample features: the same four statistical features as the template pipeline.
feature_extraction_steps = features.FeatureExtractPipeline([
    features.StatisticalFeatureExtractor([
        features.StatisticalFeature.MIN,
        features.StatisticalFeature.MAX,
        features.StatisticalFeature.MEAN,
        features.StatisticalFeature.ZERO_CROSSING_RATE
    ])
])

### Normalization Steps

In [11]:
# Sample normalization: rescale, then histogram equalization, both
# configured with the RESCALE_LOWER..RESCALE_UPPER bounds.
normalization_steps = normalization.NormalizationPipeline([
    normalization.RescaleNormalizationStep(RESCALE_LOWER, RESCALE_UPPER),
    normalization.HistogramEqualizationStep(RESCALE_LOWER, RESCALE_UPPER)
])

### Data Processor Setup

In [12]:
# Full sample processor: pre-processing, feature extraction and normalization.
data_processor = processor.DataProcessor(
    pre_process=pre_process_steps,
    feature_extraction=feature_extraction_steps,
    normalization=normalization_steps
)
# Variant without the normalization pipeline; its output feeds the
# "Tokens without Histogram Normalization" experiment.
token_processor = processor.DataProcessor(
    pre_process=pre_process_steps,
    feature_extraction=feature_extraction_steps
)

# Subject Data

In [13]:
# Obtain the dataset location, then load it into a map that is iterated and
# indexed by subject in the cells below.
data_path = downloader.retrieve()
subject_data_map = reader.format_data(data_path)

## Tokens

In [14]:
# One freshly generated token per subject.
# NOTE(review): token generation is not visibly tied to RANDOM_SEED - confirm
# whether the token experiments are reproducible across kernel restarts.
tokens_map = {
    subject: orthonormalization.TokenDataGenerator.generate_random_token()
    for subject in subject_data_map
}

# Data Processing

## Template Hash Setup

### Processing

In [15]:
# Template data per subject, processed with normalization.
template_data_map = {
    subject: template_data_processor.process(subject_data_map[subject]) 
    for subject in subject_data_map
}
# Template data per subject, processed without normalization (token-only experiment).
token_template_data_map = {
    subject: token_template_processor.process(subject_data_map[subject]) 
    for subject in subject_data_map
}

Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=40114
    Range : 0 ... 40113 =      0.000 ...   200.565 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ..

### Hashing

In [16]:
# (subject, template hash) pair, as yielded by iter_template_hashes.
SubjectTemplatePair = typing.Tuple[str, fuzzy_hash_lib.FuzzyHash]
# Processed vectors keyed by subject, as produced by the DataProcessor cells.
SubjectProcessedData = typing.Dict[str, typing.List[np.ndarray]]

In [17]:
def hash_vectors(vectors_to_hash: typing.List[np.ndarray], 
                 encoder: feature_encoding.base.BinaryEncoder,
                 token: typing.Optional[str] = None) -> typing.List[fuzzy_hash_lib.FuzzyHash]:
    """Turn feature vectors into fuzzy hashes, optionally mixing in a token.

    Args:
        vectors_to_hash: Feature vectors to hash, one hash per vector.
        encoder: Binary encoder applied to every (possibly normalized) vector.
        token: When given, every vector is first passed through a
            token-based matrix normalization. When ``None`` the vectors are
            encoded as-is.

    Returns:
        One fuzzy hash per input vector, in input order.
    """
    prepared_vectors = vectors_to_hash
    if token is not None:
        # Build the token machinery only when a token is supplied; without one
        # it was never used.
        token_data_generator = orthonormalization.TokenDataGenerator(token)
        normalizer = orthonormalization.TokenMatrixNormalization(token_data_generator)
        prepared_vectors = [normalizer.normalize(v) for v in vectors_to_hash]
    binary_vectors = [encoder.encode(v) for v in prepared_vectors]
    return [fuzzy_hash_lib.FuzzyHash.from_text(bv) for bv in binary_vectors]


def iter_template_hashes(template_data: SubjectProcessedData, 
                         encoder: feature_encoding.base.BinaryEncoder,
                         tokens: typing.Dict[str, str] = None) -> typing.Iterator[SubjectTemplatePair]:
    """Yield ``(subject, template_hash)`` for each subject with exactly one template hash.

    Subjects whose template data does not hash to exactly one value are
    reported with a printed warning and skipped. When ``tokens`` is given, the
    subject's token (if any) is passed on to ``hash_vectors``.
    """
    for subject, template_vectors in template_data.items():
        subject_token = None if tokens is None else tokens.get(subject)
        template_hashes = hash_vectors(template_vectors, encoder, subject_token)
        if len(template_hashes) == 1:
            yield subject, template_hashes[0]
        else:
            print(
                f'[warning] There should be only one template hash per subject, '
                f'but subject {subject} has {len(template_hashes)} template hashes'
            )

## Sample Hashes Setup

### Subject Data

In [18]:
# Sample data per subject, processed with normalization.
processed_data_map = {
    subject: data_processor.process(subject_data_map[subject])
    for subject in subject_data_map
}
# Sample data per subject, processed without normalization (token-only experiment).
token_processed_data_map = {
    subject: token_processor.process(subject_data_map[subject])
    for subject in subject_data_map
}

Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=40114
    Range : 0 ... 40113 =      0.000 ...   200.565 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ..

### Hashing

In [19]:
# (subject, sample hashes) pair, as yielded by iter_sample_hashes.
SubjectSamplesPair = typing.Tuple[str, typing.List[fuzzy_hash_lib.FuzzyHash]]

In [20]:
def iter_sample_hashes(data_map: SubjectProcessedData, 
                       encoder: feature_encoding.base.BinaryEncoder,
                       tokens: typing.Dict[str, str] = None) -> typing.Iterator[SubjectSamplesPair]:
    """Yield ``(subject, hashes)`` with the hashes of every sample vector of each subject.

    When ``tokens`` is given, the subject's token (if any) is passed on to
    ``hash_vectors``; otherwise the vectors are hashed without a token.
    """
    for subject, sample_vectors in data_map.items():
        subject_token = None if tokens is None else tokens.get(subject)
        yield subject, hash_vectors(sample_vectors, encoder, subject_token)

# Test Set Assembly

## Gathering Test Sets

In [21]:
def iter_subject_test_sets(template_data: SubjectProcessedData, 
                           sample_data: SubjectProcessedData, 
                           encoder: feature_encoding.base.BinaryEncoder,
                           tokens: typing.Optional[typing.Dict[str, str]] = None) -> typing.Iterator[SubjectTestSet]:
    """Yield one ``SubjectTestSet`` per subject that has a usable template hash.

    For each target subject, sample hashes of that subject become positive
    cases and sample hashes of every other subject become negative cases. The
    same cases are attached to one ``ThresholdTestSet`` per entry of
    ``AUTHENTICATION_THRESHOLDS``.

    Args:
        template_data: Processed template vectors keyed by subject.
        sample_data: Processed sample vectors keyed by subject.
        encoder: Binary encoder used for both template and sample hashes.
        tokens: Optional per-subject tokens forwarded to the hashing helpers.
    """
    # The sample hashes depend only on the sample's own subject (and token),
    # never on the target subject or the threshold, so hash them once instead
    # of once per (target subject, threshold) pair.
    # NOTE(review): relies on hashing being deterministic for a given input
    # and token - confirm against the hashing libraries.
    sample_hashes = list(iter_sample_hashes(sample_data, encoder, tokens))
    for target_subject, template_hash in iter_template_hashes(template_data, encoder, tokens):
        positive_cases = []
        negative_cases = []
        for subject, hashes in sample_hashes:
            if subject == target_subject:
                positive_cases.extend(hashes)
            else:
                negative_cases.extend(hashes)
        threshold_tests = [
            ThresholdTestSet(
                threshold=threshold,
                template_hash=template_hash,
                # Copy so that every threshold set owns its own lists.
                positive_cases=list(positive_cases),
                negative_cases=list(negative_cases)
            )
            for threshold in AUTHENTICATION_THRESHOLDS
        ]
        yield SubjectTestSet(
            subject_id=target_subject,
            threshold_tests=threshold_tests
        )

## Generating Hash Tests

In [22]:
# (threshold, hash tests) pair as yielded by iter_hash_tests. The threshold is
# an int taken from AUTHENTICATION_THRESHOLDS, not a string.
ThresholdTestsPair = typing.Tuple[int, typing.List[HashTest]]

In [23]:
def make_hash_tests(test_set: ThresholdTestSet) -> typing.List[HashTest]:
    """Build a balanced list of should-match and should-not-match hash tests.

    The same number of positive and negative cases is drawn at random (the
    size of the smaller group), using the module-level ``RANDOM_GENERATOR``.
    Should-match tests come first in the returned list.
    """
    balanced_size = min(len(test_set.positive_cases), len(test_set.negative_cases))
    # Draw positives before negatives so the generator is consumed in a fixed order.
    genuine_samples: typing.List[fuzzy_hash_lib.FuzzyHash] = RANDOM_GENERATOR.sample(
        test_set.positive_cases, balanced_size
    )
    impostor_samples: typing.List[fuzzy_hash_lib.FuzzyHash] = RANDOM_GENERATOR.sample(
        test_set.negative_cases, balanced_size
    )

    genuine_tests = [
        HashTest(
            expected_result=True,
            threshold=test_set.threshold,
            hashes=(test_set.template_hash, candidate),
            if_expected=TestResultType.TRUE_POSITIVE,
            if_unexpected=TestResultType.FALSE_NEGATIVE
        )
        for candidate in genuine_samples
    ]
    impostor_tests = [
        HashTest(
            expected_result=False,
            threshold=test_set.threshold,
            hashes=(test_set.template_hash, candidate),
            if_expected=TestResultType.TRUE_NEGATIVE,
            if_unexpected=TestResultType.FALSE_POSITIVE
        )
        for candidate in impostor_samples
    ]
    return genuine_tests + impostor_tests


def iter_hash_tests(template_data: SubjectProcessedData, 
                    sample_data: SubjectProcessedData, 
                    encoder: feature_encoding.base.BinaryEncoder,
                    tokens: typing.Optional[typing.Dict[str, str]] = None
                    ) -> typing.Iterator[typing.Tuple[int, typing.List[HashTest]]]:
    """Yield ``(threshold, hash_tests)`` for every subject and every threshold.

    This is a generator: one pair is produced per threshold test set of each
    subject test set, so the same threshold value appears once per subject.

    Args:
        template_data: Processed template vectors keyed by subject.
        sample_data: Processed sample vectors keyed by subject.
        encoder: Binary encoder used when hashing.
        tokens: Optional per-subject tokens forwarded to the hashing helpers.
    """
    for subject_test_set in iter_subject_test_sets(template_data, sample_data, encoder, tokens):
        for threshold_test_data in subject_test_set.threshold_tests:
            yield threshold_test_data.threshold, make_hash_tests(threshold_test_data)

# Execute Tests

In [24]:
def run_hash_tests(hash_tests: typing.List[HashTest]) -> TestResultsSummary:
    """Run every hash test in order and tally the outcomes in a new summary."""
    tally = TestResultsSummary()
    for outcome in (hash_test.run_test() for hash_test in hash_tests):
        tally.increment_count(outcome)
    return tally

## No Tokens with Histogram Normalization

In [25]:
# Experiment: normalized data, no tokens. Results are accumulated per
# threshold across all subjects.
threshold_encoder = feature_encoding.threshold.ThresholdBinaryEncoder(BINARY_THRESHOLD)
no_token_args = (template_data_map, processed_data_map, threshold_encoder)
no_token_results = {}
for threshold_type, hash_test_data in iter_hash_tests(*no_token_args):
    # setdefault seeds an empty summary the first time a threshold is seen.
    no_token_results[threshold_type] = TestResultsSummary.merge_summaries(
        no_token_results.setdefault(threshold_type, TestResultsSummary()),
        run_hash_tests(hash_test_data)
    )

In [26]:
# One row per threshold; the table is displayed sorted by HTER (lowest first).
no_token_data = [
    [
        threshold_type,
        result_summary.false_accept_rate,
        result_summary.false_reject_rate,
        result_summary.half_total_error_rate,
        result_summary.accuracy,
    ]
    for threshold_type, result_summary in no_token_results.items()
]
no_token_df = pd.DataFrame(
    no_token_data,
    columns=['Threshold', 'FAR', 'FRR', 'HTER', 'Accuracy']
)
no_token_df.sort_values('HTER')

Unnamed: 0,Threshold,FAR,FRR,HTER,Accuracy
0,10,0.05569,0.181598,0.118644,0.881356
1,20,0.05569,0.181598,0.118644,0.881356
6,70,0.0,0.285714,0.142857,0.857143
2,30,0.004843,0.283293,0.144068,0.855932
4,50,0.002421,0.285714,0.144068,0.855932
7,80,0.002421,0.285714,0.144068,0.855932
3,40,0.004843,0.285714,0.145278,0.854722
5,60,0.004843,0.285714,0.145278,0.854722
8,90,0.004843,0.285714,0.145278,0.854722


## Tokens without Histogram Normalization

In [27]:
# Experiment: un-normalized data combined with per-subject tokens. Results
# are accumulated per threshold across all subjects.
token_args = (
    token_template_data_map, token_processed_data_map, threshold_encoder, tokens_map
)
token_results = {}
for threshold_type, hash_test_data in iter_hash_tests(*token_args):
    # setdefault seeds an empty summary the first time a threshold is seen.
    token_results[threshold_type] = TestResultsSummary.merge_summaries(
        token_results.setdefault(threshold_type, TestResultsSummary()),
        run_hash_tests(hash_test_data)
    )

In [28]:
# One row per threshold; the table is displayed sorted by HTER (lowest first).
token_data = [
    [
        threshold_type,
        result_summary.false_accept_rate,
        result_summary.false_reject_rate,
        result_summary.half_total_error_rate,
        result_summary.accuracy,
    ]
    for threshold_type, result_summary in token_results.items()
]
token_df = pd.DataFrame(
    token_data,
    columns=['Threshold', 'FAR', 'FRR', 'HTER', 'Accuracy']
)
token_df.sort_values('HTER')

Unnamed: 0,Threshold,FAR,FRR,HTER,Accuracy
0,10,0.0,0.714286,0.357143,0.642857
1,20,0.0,0.762712,0.381356,0.618644
2,30,0.0,0.920097,0.460048,0.539952
3,40,0.0,0.922518,0.461259,0.538741
4,50,0.0,0.922518,0.461259,0.538741
5,60,0.0,0.922518,0.461259,0.538741
6,70,0.0,0.922518,0.461259,0.538741
7,80,0.0,0.922518,0.461259,0.538741
8,90,0.0,0.922518,0.461259,0.538741


## Tokens with Histogram Normalization

In [29]:
# Experiment: normalized data combined with per-subject tokens. Results are
# accumulated per threshold across all subjects.
token_norm_args = (
    template_data_map, processed_data_map, threshold_encoder, tokens_map
)
token_norm_results = {}
for threshold_type, hash_test_data in iter_hash_tests(*token_norm_args):
    # setdefault seeds an empty summary the first time a threshold is seen.
    token_norm_results[threshold_type] = TestResultsSummary.merge_summaries(
        token_norm_results.setdefault(threshold_type, TestResultsSummary()),
        run_hash_tests(hash_test_data)
    )

In [30]:
# One row per threshold; the table is displayed sorted by HTER (lowest first).
token_norm_data = [
    [
        threshold_type,
        result_summary.false_accept_rate,
        result_summary.false_reject_rate,
        result_summary.half_total_error_rate,
        result_summary.accuracy,
    ]
    for threshold_type, result_summary in token_norm_results.items()
]
token_norm_df = pd.DataFrame(
    token_norm_data,
    columns=['Threshold', 'FAR', 'FRR', 'HTER', 'Accuracy']
)
token_norm_df.sort_values('HTER')

Unnamed: 0,Threshold,FAR,FRR,HTER,Accuracy
0,10,0.0,0.450363,0.225182,0.774818
1,20,0.0,0.450363,0.225182,0.774818
2,30,0.0,0.530266,0.265133,0.734867
3,40,0.0,0.564165,0.282082,0.717918
4,50,0.0,0.564165,0.282082,0.717918
5,60,0.0,0.564165,0.282082,0.717918
6,70,0.0,0.564165,0.282082,0.717918
7,80,0.0,0.564165,0.282082,0.717918
8,90,0.0,0.564165,0.282082,0.717918
