# Setup

In [1]:
import numpy as np
import pandas as pd
import dataclasses
import enum
import typing
import random
import fuzzy_hash_lib
import feature_encoding.threshold

from eeg_auth_models_framework import data, pre_process, features, processor, normalization
from eeg_auth_models_framework.utils import conversion

# Constants

In [2]:
# Similarity cut-offs to evaluate; a comparison counts as a match when its
# similarity is greater than or equal to the threshold (see HashTest.run_test).
AUTHENTICATION_THRESHOLDS = [10, 20, 30, 40, 50, 60, 70, 80, 90]
# Sample frequency handed to the MNE data-frame converter.
DATASET_SAMPLE_FREQ_HZ = 200
# Channel names handed to the MNE data-frame converter.
DATA_CHANNEL_NAMES = ['T7','F8','Cz','P4']
# Bands handed to the band-pass filter step. A bound of None presumably leaves
# that side of the band open -- TODO confirm against pre_process.FrequencyBand.
FREQUENCIES = [
    pre_process.FrequencyBand(lower=8.0, upper=12.0, label='Alpha'),
    pre_process.FrequencyBand(lower=12.0, upper=35.0, label='Beta'),
    pre_process.FrequencyBand(lower=4.0, upper=8.0, label='Theta'),
    pre_process.FrequencyBand(lower=35.0, upper=None, label='Gamma'),
    pre_process.FrequencyBand(lower=None, upper=None, label='Raw'),
]
# Arguments of pre_process.DataWindowStep in the sample pipeline; presumably
# expressed in samples -- TODO confirm the unit.
WINDOW_SIZE = 1200
WINDOW_OVERLAP = 0
# Threshold handed to ThresholdBinaryEncoder when feature vectors are encoded.
BINARY_THRESHOLD = 127
# Bounds handed to both the rescale and the histogram-equalization steps.
RESCALE_LOWER = 0
RESCALE_UPPER = 255
# Fixed seed so the random choice of test cases is repeatable on a fresh kernel.
RANDOM_SEED = 100000000000
# Shared generator: its state advances with every draw, so re-running a cell
# that samples from it (without restarting) yields a different selection.
RANDOM_GENERATOR = random.Random(RANDOM_SEED)

# Utilities

In [3]:
class TestResultType(enum.Enum):
    """Confusion-matrix category assigned to a single hash comparison."""
    TRUE_POSITIVE = enum.auto()
    FALSE_POSITIVE = enum.auto()
    FALSE_NEGATIVE = enum.auto()
    TRUE_NEGATIVE = enum.auto()


@dataclasses.dataclass
class TestResultsSummary:
    """Running confusion-matrix counts for a batch of hash comparisons.

    The derived metrics (``accuracy``, ``false_accept_rate``,
    ``false_reject_rate``) return ``float('nan')`` when their denominator is
    zero, so an empty or one-sided batch yields an undefined rate instead of
    raising ``ZeroDivisionError``.
    """
    true_positives: int = 0
    false_positives: int = 0
    false_negatives: int = 0
    true_negatives: int = 0

    def increment_count(self, result_type: TestResultType) -> None:
        """Add one observation of ``result_type`` to the matching counter.

        Raises:
            ValueError: if ``result_type`` is not a ``TestResultType`` member.
                Unknown values are rejected rather than being silently counted
                as true negatives.
        """
        if result_type == TestResultType.TRUE_POSITIVE:
            self.true_positives += 1
        elif result_type == TestResultType.FALSE_POSITIVE:
            self.false_positives += 1
        elif result_type == TestResultType.FALSE_NEGATIVE:
            self.false_negatives += 1
        elif result_type == TestResultType.TRUE_NEGATIVE:
            self.true_negatives += 1
        else:
            raise ValueError(f'Unknown test result type: {result_type!r}')

    @staticmethod
    def _ratio(numerator: int, denominator: int) -> float:
        """Return ``numerator / denominator``, or NaN when the denominator is 0."""
        if denominator == 0:
            return float('nan')
        return numerator / denominator

    @property
    def accuracy(self) -> float:
        """Fraction of all recorded comparisons that were classified correctly."""
        hits = self.true_positives + self.true_negatives
        total = (
            self.true_positives + self.true_negatives +
            self.false_positives + self.false_negatives
        )
        return self._ratio(hits, total)

    @property
    def false_accept_rate(self) -> float:
        """False positives divided by all should-not-match comparisons (FP + TN)."""
        return self._ratio(
            self.false_positives,
            self.false_positives + self.true_negatives
        )

    @property
    def false_reject_rate(self) -> float:
        """False negatives divided by all should-match comparisons (FN + TP)."""
        return self._ratio(
            self.false_negatives,
            self.false_negatives + self.true_positives
        )


@dataclasses.dataclass
class HashTest:
    """A single comparison between two fuzzy hashes at a fixed threshold.

    ``expected_result`` says whether the two hashes are supposed to match.
    ``if_expected`` is reported when the comparison agrees with that
    expectation and ``if_unexpected`` when it does not.
    """
    expected_result: bool
    threshold: int
    if_expected: TestResultType
    if_unexpected: TestResultType
    hashes: typing.Tuple[fuzzy_hash_lib.FuzzyHash, fuzzy_hash_lib.FuzzyHash]

    def run_test(self):
        """Compare the two hashes and return the resulting ``TestResultType``.

        The pair counts as a match when the similarity reported by
        ``FuzzyHash.compare`` is greater than or equal to ``threshold``.
        """
        first_hash = self.hashes[0]
        second_hash = self.hashes[1]
        score = fuzzy_hash_lib.FuzzyHash.compare(first_hash, second_hash)
        matched = score >= self.threshold
        return self.if_expected if matched == self.expected_result else self.if_unexpected
    
@dataclasses.dataclass
class ThresholdTestSet:
    """Candidate hashes for evaluating one subject's template at one threshold."""
    # Similarity cut-off that the tests generated from this set will use.
    threshold: int
    # Template hash of the target subject.
    template_hash: fuzzy_hash_lib.FuzzyHash
    # Sample hashes belonging to the target subject.
    positive_cases: typing.List[fuzzy_hash_lib.FuzzyHash]
    # Sample hashes belonging to every other subject.
    negative_cases: typing.List[fuzzy_hash_lib.FuzzyHash]
    
@dataclasses.dataclass
class SubjectTestSet:
    """All threshold test sets built for a single subject."""
    # Key of the subject in the hashes maps.
    subject_id: str
    # One ThresholdTestSet per entry of AUTHENTICATION_THRESHOLDS.
    threshold_tests: typing.List[ThresholdTestSet]

# Configuration

In [4]:
# Helpers that fetch the auditory dataset and load it into per-subject data.
downloader = data.AuditoryDataDownloader()
reader = data.AuditoryDataReader()
# Converter shared by both band-pass filter steps; configured with the
# dataset's channel names and sample frequency.
converter = conversion.MNEDataFrameConverter(
    channels=DATA_CHANNEL_NAMES, 
    sample_frequency=DATASET_SAMPLE_FREQ_HZ
)

# Data Processing Setup

## Template Hash Processor

### Pre-Processing Steps

In [5]:
# Template pipeline: band-pass filtering only. Unlike the sample pipeline it
# has no DataWindowStep.
template_pre_process_steps = pre_process.PreProcessingPipeline([
    pre_process.EEGBandpassFilterStep(
        FREQUENCIES,
        converter
    )
])

### Feature Extraction Steps

In [6]:
# Statistical features extracted for the template: min, max, mean and
# zero-crossing rate (the same set the sample pipeline uses).
template_feature_extraction_steps = features.FeatureExtractPipeline([
    features.StatisticalFeatureExtractor([
        features.StatisticalFeature.MIN,
        features.StatisticalFeature.MAX,
        features.StatisticalFeature.MEAN,
        features.StatisticalFeature.ZERO_CROSSING_RATE
    ])
])

### Normalization Steps

In [7]:
# Rescale, then histogram-equalize; both steps receive the
# RESCALE_LOWER / RESCALE_UPPER bounds.
template_normalization_steps = normalization.NormalizationPipeline([
    normalization.RescaleNormalizationStep(RESCALE_LOWER, RESCALE_UPPER),
    normalization.HistogramEqualizationStep(RESCALE_LOWER, RESCALE_UPPER)
])

### Data Processor Setup

In [8]:
# Bundles the three template pipelines into one processor; its output is what
# the per-subject template hashes are built from.
template_data_processor = processor.DataProcessor(
    pre_process=template_pre_process_steps,
    feature_extraction=template_feature_extraction_steps,
    normalization=template_normalization_steps
)

## Sample Data Processor

### Pre-Processing Steps

In [9]:
# Sample pipeline: the same band-pass filter as the template pipeline followed
# by a windowing step (presumably splitting each recording into several
# samples -- TODO confirm against pre_process.DataWindowStep).
pre_process_steps = pre_process.PreProcessingPipeline([
    pre_process.EEGBandpassFilterStep(
        FREQUENCIES,
        converter
    ),
    pre_process.DataWindowStep(WINDOW_SIZE, WINDOW_OVERLAP)
])

### Feature Extraction Steps

In [10]:
# Statistical features extracted for the samples: min, max, mean and
# zero-crossing rate (the same set the template pipeline uses).
feature_extraction_steps = features.FeatureExtractPipeline([
    features.StatisticalFeatureExtractor([
        features.StatisticalFeature.MIN,
        features.StatisticalFeature.MAX,
        features.StatisticalFeature.MEAN,
        features.StatisticalFeature.ZERO_CROSSING_RATE
    ])
])

### Normalization Steps

In [11]:
# Rescale, then histogram-equalize; both steps receive the
# RESCALE_LOWER / RESCALE_UPPER bounds.
normalization_steps = normalization.NormalizationPipeline([
    normalization.RescaleNormalizationStep(RESCALE_LOWER, RESCALE_UPPER),
    normalization.HistogramEqualizationStep(RESCALE_LOWER, RESCALE_UPPER)
])

### Data Processor Setup

In [12]:
# Bundles the three sample pipelines into one processor. It is also the
# processor used to extract the per-subject metadata.
data_processor = processor.DataProcessor(
    pre_process=pre_process_steps,
    feature_extraction=feature_extraction_steps,
    normalization=normalization_steps
)

# Subject Data

In [13]:
# Retrieve the dataset location, then load it into a container keyed by
# subject (it is iterated and indexed by subject in the cells that follow).
data_path = downloader.retrieve()
subject_data_map = reader.format_data(data_path)

# Processing

## Metadata

In [14]:
# Extract metadata once per subject; the same metadata is passed to both the
# template processor and the sample processor below.
metadata_map = {subject: data_processor.extract_metadata(subject_data_map[subject]) for subject in subject_data_map}

Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=40114
    Range : 0 ... 40113 =      0.000 ...   200.565 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ..

## Template Hash Setup

### Subject Data

In [15]:
# Run every subject's data through the template (un-windowed) processor.
template_data_map = {
    subject: template_data_processor.process(subject_data_map[subject], metadata_map[subject]) 
    for subject in subject_data_map
}

Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=40114
    Range : 0 ... 40113 =      0.000 ...   200.565 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ..

### Encoding

In [16]:
def hash_template_vectors(vectors_to_hash: typing.List[np.ndarray]) -> typing.List[fuzzy_hash_lib.FuzzyHash]:
    """Encode each feature vector with the binary threshold encoder and hash it.

    Returns one fuzzy hash per input vector, in input order.
    """
    binarizer = feature_encoding.threshold.ThresholdBinaryEncoder(BINARY_THRESHOLD)
    # Two passes on purpose: every vector is encoded before any hash is built.
    encoded_vectors = list(map(binarizer.encode, vectors_to_hash))
    return list(map(fuzzy_hash_lib.FuzzyHash.from_text, encoded_vectors))


def make_template_hashes_map(template_data: typing.Dict[str, typing.List[np.ndarray]]) -> typing.Dict[str, fuzzy_hash_lib.FuzzyHash]:
    """Build the subject -> template hash mapping.

    Each subject's template vectors are hashed with ``hash_template_vectors``
    and the first resulting hash becomes that subject's template.

    Args:
        template_data: processed template vectors keyed by subject.

    Returns:
        A dict with one template hash per subject.

    Raises:
        ValueError: if a subject yields no hashes at all, since there is then
            nothing to use as its template.
    """
    template_hashes = {}
    for subject, template_vectors in template_data.items():
        hash_list = hash_template_vectors(template_vectors)
        if not hash_list:
            # Fail with a clear message instead of a bare IndexError below.
            raise ValueError(f'subject {subject} produced no template hashes')
        if len(hash_list) > 1:
            # More than one hash is unexpected but usable: warn, keep the first.
            print(
                f'Warning: there should be only one template hash per subject, '
                f'but subject {subject} has {len(hash_list)} template hashes'
            )
        template_hashes[subject] = hash_list[0]
    return template_hashes

In [17]:
# One template hash per subject, built from the template processor's output.
subject_template_hashes_map = make_template_hashes_map(template_data_map)

## Sample Hashes Setup

### Subject Data

In [18]:
# Run every subject's data through the sample (windowed) processor, reusing
# the metadata extracted earlier.
processed_data_map = {
    subject: data_processor.process(subject_data_map[subject], metadata_map[subject])
    for subject in subject_data_map
}

Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=40114
    Range : 0 ... 40113 =      0.000 ...   200.565 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ...   119.995 secs
Ready.
Creating RawArray with float64 data, n_channels=4, n_times=24000
    Range : 0 ... 23999 =      0.000 ..

### Encoding

In [19]:
def hash_vectors(vectors_to_hash: typing.List[np.ndarray]) -> typing.List[fuzzy_hash_lib.FuzzyHash]:
    """Encode and fuzzy-hash sample feature vectors, one hash per vector.

    Delegates to ``hash_template_vectors`` (defined in the template-encoding
    cell above) instead of repeating its body, so sample hashes are always
    produced with exactly the same encoder settings as the template hashes
    they are compared against.
    """
    return hash_template_vectors(vectors_to_hash)

In [20]:
# Hash every subject's processed sample vectors: subject -> list of sample hashes.
subject_hashes_map = {
    subject: hash_vectors(sample_vectors)
    for subject, sample_vectors in processed_data_map.items()
}

# Test Set Assembly

## Gathering Test Sets

In [21]:
def make_threshold_test_sets(sample_hashes_map: typing.Dict[str, typing.List[fuzzy_hash_lib.FuzzyHash]],
                             template_hashes_map: typing.Dict[str, fuzzy_hash_lib.FuzzyHash],
                             target_subject: str) -> typing.List[ThresholdTestSet]:
    """Build one ``ThresholdTestSet`` per authentication threshold for a subject.

    Args:
        sample_hashes_map: sample hashes keyed by subject.
        template_hashes_map: template hash keyed by subject.
        target_subject: subject whose template is being evaluated.

    Returns:
        A list with one test set per entry of ``AUTHENTICATION_THRESHOLDS``,
        in that order. Positive cases are the target subject's own samples;
        negative cases are every other subject's samples.

    Raises:
        KeyError: if ``target_subject`` has no entry in ``template_hashes_map``.
    """
    template_hash = template_hashes_map[target_subject]

    # The positive/negative split does not depend on the threshold, so it is
    # computed once rather than once per threshold.
    positive_cases = []
    negative_cases = []
    for subject, sample_hashes in sample_hashes_map.items():
        if subject == target_subject:
            positive_cases.extend(sample_hashes)
        else:
            negative_cases.extend(sample_hashes)

    # Each test set gets its own list copies so mutating one set's cases
    # cannot affect another threshold's set.
    return [
        ThresholdTestSet(
            threshold=threshold,
            template_hash=template_hash,
            positive_cases=list(positive_cases),
            negative_cases=list(negative_cases),
        )
        for threshold in AUTHENTICATION_THRESHOLDS
    ]

In [22]:
# One SubjectTestSet per subject, each holding that subject's test sets for
# every authentication threshold.
subject_test_sets = [
    SubjectTestSet(
        subject_id=subject,
        threshold_tests=make_threshold_test_sets(
            subject_hashes_map, subject_template_hashes_map, subject
        ),
    )
    for subject in subject_hashes_map
]

## Generating Hash Tests

In [23]:
def make_hash_tests(test_set: ThresholdTestSet) -> typing.List[HashTest]:
    """Turn a threshold test set into a balanced list of ``HashTest`` objects.

    The same number of positive and negative cases is drawn at random (the
    size of the smaller group), so both classes are equally represented.
    Should-match tests come first in the result, then should-not-match tests.

    NOTE(review): every call advances the shared ``RANDOM_GENERATOR``, so each
    threshold of a subject is evaluated on a different random draw -- confirm
    that this is intended.
    """
    positives = test_set.positive_cases
    negatives = test_set.negative_cases
    draw_count = min(len(positives), len(negatives))

    # Draw order matters for reproducibility: positives first, then negatives.
    genuine_samples = RANDOM_GENERATOR.sample(positives, draw_count)
    impostor_samples = RANDOM_GENERATOR.sample(negatives, draw_count)

    genuine_tests = [
        HashTest(
            expected_result=True,
            threshold=test_set.threshold,
            hashes=(test_set.template_hash, candidate),
            if_expected=TestResultType.TRUE_POSITIVE,
            if_unexpected=TestResultType.FALSE_NEGATIVE
        )
        for candidate in genuine_samples
    ]
    impostor_tests = [
        HashTest(
            expected_result=False,
            threshold=test_set.threshold,
            hashes=(test_set.template_hash, candidate),
            if_expected=TestResultType.TRUE_NEGATIVE,
            if_unexpected=TestResultType.FALSE_POSITIVE
        )
        for candidate in impostor_samples
    ]
    return genuine_tests + impostor_tests

def make_threshold_tests_map(subject_tests: typing.List[SubjectTestSet]) -> typing.Dict[str, typing.List[HashTest]]:
    """Pool the hash tests of all subjects, grouped by threshold.

    Keys are the string form of each entry in ``AUTHENTICATION_THRESHOLDS``;
    every configured threshold is present even when it received no tests.
    """
    grouped = {}
    for threshold in AUTHENTICATION_THRESHOLDS:
        grouped[str(threshold)] = []

    for subject_test in subject_tests:
        for threshold_set in subject_test.threshold_tests:
            # Look the bucket up before generating tests, as the original did.
            bucket = grouped[str(threshold_set.threshold)]
            bucket.extend(make_hash_tests(threshold_set))
    return grouped

In [24]:
# Hash tests of all subjects pooled per threshold. This draws from the shared
# RANDOM_GENERATOR, so re-running the cell without a restart changes the draw.
threshold_tests_map = make_threshold_tests_map(subject_test_sets)

# Execute Tests

In [25]:
def run_threshold_tests(test_data: typing.Dict[str, typing.List[HashTest]]) -> typing.Dict[str, TestResultsSummary]:
    """Run every hash test and tally the outcomes per threshold.

    Returns a dict with the same keys as ``test_data``, each mapped to the
    ``TestResultsSummary`` of that threshold's tests.
    """
    summaries = {}
    for threshold, hash_tests in test_data.items():
        tally = TestResultsSummary()
        for hash_test in hash_tests:
            tally.increment_count(hash_test.run_test())
        summaries[threshold] = tally
    return summaries

In [26]:
# Run all tests, then tabulate one row per threshold with its false accept
# rate, false reject rate and accuracy.
test_results_map = run_threshold_tests(threshold_tests_map)
data_results = []
for threshold_type, result_summary in test_results_map.items():
    # threshold_type is the string key produced by make_threshold_tests_map.
    data_results.append([
        threshold_type, result_summary.false_accept_rate, 
        result_summary.false_reject_rate, result_summary.accuracy
    ])
test_results = pd.DataFrame(data_results, columns=['Threshold', 'FAR', 'FRR', 'Accuracy'])
# Bare last expression so the notebook renders the table.
test_results

Unnamed: 0,Threshold,FAR,FRR,Accuracy
0,10,0.966102,0.021792,0.506053
1,20,0.98063,0.021792,0.498789
2,30,0.690073,0.142857,0.583535
3,40,0.048426,0.331719,0.809927
4,50,0.029056,0.331719,0.819613
5,60,0.029056,0.331719,0.819613
6,70,0.043584,0.331719,0.812349
7,80,0.016949,0.331719,0.825666
8,90,0.026634,0.331719,0.820823
