In [None]:
# Execute the methods notebook inside this kernel. Names used below that are not
# defined in this file (e.g. create_dataframe, plot_pp_data, DATA_COLS, np, pd)
# are presumably provided by it — TODO confirm.
%run "./2_Methods.ipynb"

In [None]:
# Echo the active configuration and the size of the positive-class data,
# one item per output line.
print(
    f'Configured user-id: {TARGET_CLASS_USER_ID}',
    f'Configured password: {DATA_PASSWORD}',
    f'Positive class data: {positive_class_data.shape[0]} samples with {feature_count} features',
    sep='\n',
)

# Data Analysis

### Min/Max Visualization

In [None]:
# Reduce every feature column of the multi-class data to its maximum, mean and
# minimum, and plot the three resulting rows as lines.
data_values = multi_class_data[DATA_COLS].values

max_dataframe, mean_dataframe, min_dataframe = (
    create_dataframe([reduce_columns(data_values, axis=0)], label, DATA_COLS)
    for label, reduce_columns in (('max', np.max), ('mean', np.mean), ('min', np.min))
)

combined_dataframe = pd.concat(
    [max_dataframe, mean_dataframe, min_dataframe],
    ignore_index=True,
)
for key_col in KEY_COLS:
    # Every row receives the same value, taken from DATA[key_col][0]
    # (presumably a scalar — TODO confirm).
    combined_dataframe[key_col] = pd.to_numeric(DATA[key_col][0], downcast='unsigned')

plot_pp_data(
    combined_dataframe,
    DATA_PASSWORD,
    target_category=None,
    plot_type='line',  # 'datapoints', 'violin', 'overlap', 'line'
    trim_outliers=True,
    display=True,
    save=False,
)

### Sample Data Visualization

In [None]:
# Plot a random handful of individual samples together with their column-wise mean.
# The requested sample size is clamped to the number of available rows because
# np.random.choice(..., replace=False) raises ValueError when asked for more
# samples than the population holds.
sample_count = min(20, len(multi_class_data))
sample_rows = np.random.choice(len(multi_class_data), sample_count, replace=False)
samples = multi_class_data[DATA_COLS].values[sample_rows]
sample_dataframes = [
    create_dataframe([sample], f'sample-{idx}', DATA_COLS)
    for idx, sample in enumerate(samples)
]

# Column-wise mean of the drawn samples only (not of the whole data set).
means = np.mean(samples, axis=0)
mean_dataframe = create_dataframe([means], 'mean', DATA_COLS)

combined_dataframe = pd.concat([*sample_dataframes, mean_dataframe], ignore_index=True)
for key_col in KEY_COLS:
    # Every row receives the same value, taken from DATA[key_col][0]
    # (presumably a scalar — TODO confirm).
    combined_dataframe[key_col] = pd.to_numeric(DATA[key_col][0], downcast='unsigned')

plot_pp_data(
    combined_dataframe,
    DATA_PASSWORD,
    target_category=None,
    plot_type='line',  # 'datapoints', 'violin', 'overlap', 'line'
    trim_outliers=True,
    display=True,
    save=False,
)

### Plot Input Data

In [None]:
%matplotlib inline

# for graph visualization purposes, comment out for computations:
binary_class_data.loc[binary_class_data[CLASS_COL] != positive_class, [CLASS_COL]] = 'negative'
binary_class_data.loc[binary_class_data[CLASS_COL] == positive_class, [CLASS_COL]] = 'positive'

plot_d_data(binary_class_data, DATA_PASSWORD, 
              plot_type=PLOT_TYPE, #'datapoints', 'violin', 'overlap', 'line'
              class_categories=['positive', 'negative'],
              target_category=None, 
              trim_outliers=True,
              display=True,
              width=12, height=6,
              save=False
         )
plot_pp_data(binary_class_data, DATA_PASSWORD, 
              plot_type=PLOT_TYPE, #'datapoints', 'violin', 'overlap', 'line'
              class_categories=['positive', 'negative'],
              target_category=None,
              trim_outliers=True,
              display=True,
              save=False
         )
plot_pr_data(binary_class_data, DATA_PASSWORD, 
              plot_type=PLOT_TYPE, #'datapoints', 'violin', 'overlap', 'line'
              class_categories=['positive', 'negative'],
              target_category=None, 
              trim_outliers=True,
              display=True,
              width=12, height=6,
              save=False
         )
plot_rp_data(binary_class_data, DATA_PASSWORD, 
              plot_type=PLOT_TYPE, #'datapoints', 'violin', 'overlap', 'line'
              class_categories=['positive', 'negative'],
              target_category=None,
              trim_outliers=True,
              display=True,
              width=12, height=6,
              save=False
         )
plot_rr_data(binary_class_data, DATA_PASSWORD, 
              plot_type=PLOT_TYPE, #'datapoints', 'violin', 'overlap', 'line'
              class_categories=['positive', 'negative'],
              target_category=None,
              trim_outliers=True,
              display=True,
              width=12, height=6,
              save=False
         )

# Data Augmentation

In [None]:
# Broken down into steps for visualization purposes, 
# actual generation for classification input data happens in the Autoencoder class
# Step-by-step walk-through of the augmentation pipeline so that every
# intermediate result can be plotted. Generation of the actual classification
# input data happens in the Autoencoder class.
positive = positive_class_data[DATA_COLS].values
normalized, maxima, minima = normalize(positive)
generated_similar = synthesize_normal(normalized, 500)
denormalized = denormalize(generated_similar, maxima, minima)  # this is the augmented data
generated_dissimilar = synthesize_dissimilar(positive, 5000)
true_negative = negative_class_data[DATA_COLS].values

# One labelled dataframe per stage; `denormalized` is wrapped twice, once per label.
(
    positive_dataframe,
    normalized_dataframe,
    generated_similar_dataframe,
    denormalized_dataframe,
    augmented_dataframe,
    generated_dissimilar_dataframe,
    true_negative_dataframe,
) = (
    create_dataframe(stage_values, stage_label, DATA_COLS)
    for stage_values, stage_label in (
        (positive, 'positive'),
        (normalized, 'normalized'),
        (generated_similar, 'similar (generated)'),
        (denormalized, 'denormalized (augmented)'),
        (denormalized, 'augmented'),
        (generated_dissimilar, 'dissimilar (generated)'),
        (true_negative, 'negative'),
    )
)

#### POSITIVE class vs NEGATIVE class: 

In [None]:
# Real positive-class samples next to real negative-class samples.
combined_dataframe = pd.concat(
    [positive_dataframe, true_negative_dataframe],
    ignore_index=True,
)
for key_col in KEY_COLS:
    # Every row receives the same value, taken from DATA[key_col][0].
    combined_dataframe[key_col] = pd.to_numeric(DATA[key_col][0], downcast='unsigned')

plot_pp_data(
    combined_dataframe,
    DATA_PASSWORD,
    class_categories=['positive', 'negative'],
    target_category=None,
    plot_type='line',  # 'datapoints', 'violin', 'overlap', 'line'
    trim_outliers=True,
    display=True,
    save=False,
)

#### NORMALIZATION step:

In [None]:
# Positive-class samples before and after normalization.
combined_dataframe = pd.concat(
    [positive_dataframe, normalized_dataframe],
    ignore_index=True,
)
for key_col in KEY_COLS:
    # Every row receives the same value, taken from DATA[key_col][0].
    combined_dataframe[key_col] = pd.to_numeric(DATA[key_col][0], downcast='unsigned')

plot_pp_data(
    combined_dataframe,
    DATA_PASSWORD,
    class_categories=['positive', 'normalized'],
    target_category=None,
    plot_type='line',  # 'datapoints', 'violin', 'overlap', 'line'
    trim_outliers=True,
    display=True,
    save=False,
)

#### AUGMENTATION step (similar data generation):

In [None]:
# Normalized samples next to the similar samples generated from them.
combined_dataframe = pd.concat(
    [normalized_dataframe, generated_similar_dataframe],
    ignore_index=True,
)
for key_col in KEY_COLS:
    # Every row receives the same value, taken from DATA[key_col][0].
    combined_dataframe[key_col] = pd.to_numeric(DATA[key_col][0], downcast='unsigned')

plot_pp_data(
    combined_dataframe,
    DATA_PASSWORD,
    class_categories=['normalized', 'similar (generated)'],
    target_category=None,
    plot_type='line',  # 'datapoints', 'violin', 'overlap', 'line'
    trim_outliers=True,
    display=True,
    save=False,
)

#### DENORMALIZATION step (transforming augmented data to look like positive class):

In [None]:
# Original positive samples next to the denormalized (augmented) samples.
combined_dataframe = pd.concat(
    [positive_dataframe, denormalized_dataframe],
    ignore_index=True,
)
for key_col in KEY_COLS:
    # Every row receives the same value, taken from DATA[key_col][0].
    combined_dataframe[key_col] = pd.to_numeric(DATA[key_col][0], downcast='unsigned')

plot_pp_data(
    combined_dataframe,
    DATA_PASSWORD,
    class_categories=['positive', 'denormalized (augmented)'],
    target_category=None,
    plot_type='line',  # 'datapoints', 'violin', 'overlap', 'line'
    trim_outliers=True,
    display=True,
    save=False,
)

#### GENERATING NEGATIVE data (dissimilar data generation):

In [None]:
# Augmented samples next to the generated dissimilar samples.
combined_dataframe = pd.concat(
    [augmented_dataframe, generated_dissimilar_dataframe],
    ignore_index=True,
)
for key_col in KEY_COLS:
    # Every row receives the same value, taken from DATA[key_col][0].
    combined_dataframe[key_col] = pd.to_numeric(DATA[key_col][0], downcast='unsigned')

plot_pp_data(
    combined_dataframe,
    DATA_PASSWORD,
    class_categories=['augmented', 'dissimilar (generated)'],
    target_category=None,
    plot_type='line',  # 'datapoints', 'violin', 'overlap', 'line'
    trim_outliers=True,
    display=True,
    save=False,
)

#### POSITIVE vs GENERATED DISSIMILAR vs NEGATIVE classes:

In [None]:
# Three-way comparison: real positive, generated dissimilar and real negative samples.
combined_dataframe = pd.concat(
    [positive_dataframe, generated_dissimilar_dataframe, true_negative_dataframe],
    ignore_index=True,
)
for key_col in KEY_COLS:
    # Every row receives the same value, taken from DATA[key_col][0].
    combined_dataframe[key_col] = pd.to_numeric(DATA[key_col][0], downcast='unsigned')

plot_pp_data(
    combined_dataframe,
    DATA_PASSWORD,
    class_categories=['positive', 'dissimilar (generated)', 'negative'],
    target_category=None,
    plot_type='line',  # 'datapoints', 'violin', 'overlap', 'line'
    trim_outliers=True,
    display=True,
    save=False,
)

# Autoencoder Evaluation

In [None]:
# Execute the autoencoder notebook inside this kernel. It presumably defines
# find_autoencoder_hyperparameters, which the cells below call — TODO confirm.
%run "./3_Autoencoder.ipynb"

In [None]:
# Feature matrices for the evaluation below: X holds the positive-class rows,
# N the negative-class rows (feature columns only).
X, N = (
    class_frame[DATA_COLS].values
    for class_frame in (positive_class_data, negative_class_data)
)

In [None]:
# Hyperparameter evaluation code moved to 3_Autoencoder.ipynb

# Evaluating autoencoders and writing to .csv file for analysis
# find_autoencoder_hyperparameters(X, DATA_PASSWORD, direct_to_file=True)

### Evaluating Best Performing Autoencoders:

In [None]:
def plot_autoencoder_training_results(autoencoders, n=3, prediction_samples=100, generated_samples=1000):
    """Print the evaluation of the first `n` autoencoders and plot their output.

    For each autoencoder this prints its evaluation, runs `predict` on random
    rows of the module-level `X` and `N` arrays, asks the autoencoder for
    generated positive and negative samples, and plots the four labelled groups
    with `plot_pp_data`.

    Args:
        autoencoders: Sequence of autoencoder objects; only the first `n` are used.
        n: Number of autoencoders to evaluate.
        prediction_samples: Number of rows drawn without replacement from each
            of `X` and `N`; clamped to the number of rows available.
        generated_samples: Count passed to `generate_positive` and
            `generate_negative`.

    Returns:
        None. All output is printed and plotted.
    """
    for ae in autoencoders[:n]:
        ae.print_evaluation(show_history=True, verbose=True)

        # np.random.choice(..., replace=False) raises ValueError when asked for
        # more rows than exist, so the request is clamped to the array length.
        positive_rows = np.random.choice(len(X), min(prediction_samples, len(X)), replace=False)
        pred_positive = ae.predict(X[positive_rows])
        negative_rows = np.random.choice(len(N), min(prediction_samples, len(N)), replace=False)
        pred_negative = ae.predict(N[negative_rows])

        ae_pos = ae.generate_positive(generated_samples)
        ae_neg = ae.generate_negative(generated_samples)

        # The concatenation order matches the class_categories passed to the plot.
        labelled_groups = (
            (ae_pos, 'ae positive'),
            (ae_neg, 'ae negative'),
            (pred_positive, 'pred positive'),
            (pred_negative, 'pred negative'),
        )
        combined_dataframe = pd.concat(
            [create_dataframe(values, label, DATA_COLS) for values, label in labelled_groups],
            ignore_index=True,
        )
        for key_col in KEY_COLS:
            # Every row receives the same value, taken from DATA[key_col][0].
            combined_dataframe[key_col] = pd.to_numeric(DATA[key_col][0], downcast='unsigned')

        plot_pp_data(
            combined_dataframe,
            DATA_PASSWORD,
            class_categories=['ae positive', 'ae negative', 'pred positive', 'pred negative'],
            target_category=None,
            plot_type='line',  # 'datapoints', 'violin', 'overlap', 'line'
            trim_outliers=True,
            display=True,
            save=False,
        )
        print('---'*20)

In [None]:
# Hold out 20 % of the positive samples as X_test and call
# find_autoencoder_hyperparameters once per optimizer on the remaining rows.
reset_random_state(RANDOM_STATE)

# NOTE(review): `random_state` (lower case) is not defined in this file —
# presumably set elsewhere (e.g. by reset_random_state); confirm that
# RANDOM_STATE was not intended.
X_train, X_test = train_test_split(X, test_size=0.2, shuffle=True, random_state=random_state)

adagrad_autoencoders, rms_autoencoders = (
    find_autoencoder_hyperparameters(X_train, DATA_PASSWORD, optimizers=[optimizer_name])
    for optimizer_name in ('Adagrad', 'RMSprop')
)

In [None]:
# For each search result, pair element [0] with element [1], keep the
# autoencoders (second item of each pair) whose euclidean_dist_delta is
# positive, and rank them by euclidean_dist_ratio, highest first.
adagrad_aes, rmsprop_aes = (
    sorted(
        (
            autoencoder
            for _, autoencoder in zip(search_result[0], search_result[1])
            if (autoencoder.euclidean_dist_delta) > 0
        ),
        key=lambda ae: ae.euclidean_dist_ratio,
        reverse=True,
    )
    for search_result in (adagrad_autoencoders, rms_autoencoders)
)

# Uncomment to print and plot the top-ranked autoencoders of each optimizer:
#plot_autoencoder_training_results(adagrad_aes)
#plot_autoencoder_training_results(rmsprop_aes)

In [None]:
# Take the top-ranked autoencoder of each optimizer and print its evaluation.
# The rankings only contain autoencoders with euclidean_dist_delta > 0 and may
# therefore be empty; fail with a descriptive IndexError instead of a bare
# "list index out of range".
if not adagrad_aes:
    raise IndexError('No Adagrad autoencoder with euclidean_dist_delta > 0 is available.')
adagrad_best_ae = adagrad_aes[0]
adagrad_best_ae.print_evaluation(show_history=True, verbose=True)

if not rmsprop_aes:
    raise IndexError('No RMSprop autoencoder with euclidean_dist_delta > 0 is available.')
rmsprop_best_ae = rmsprop_aes[0]
rmsprop_best_ae.print_evaluation(show_history=True, verbose=True)

### Selection of Promising Autoencoders:

# Classifier Evaluation

In [None]:
# Execute the classifier notebook inside this kernel. It presumably defines
# Classifier and evaluate_authentication, which the cells below use — TODO confirm.
%run "./4_Classifier.ipynb"

In [None]:
# Hyperparameter evaluation code moved to 4_Classifier.ipynb
# find_classifier_hyperparameters(autoencoders[:10], direct_to_file=True)
# find_classifier_hyperparameters([adagrad_best_ae, rmsprop_best_ae], direct_to_file=True)

### Best Classifier Found

In [None]:
# Build the classifier on top of the best RMSprop autoencoder, print its
# evaluation, then evaluate it on the held-out positive samples (X_test) and
# the negative-class data.
reset_random_state(RANDOM_STATE)

# NOTE(review): TANH_BETA is not read anywhere in this file; presumably it is
# consumed by code from the classifier notebook — confirm.
TANH_BETA = 0.0
classifier = Classifier(
    rmsprop_best_ae,
    'Adadelta',
    0.045,
    'Huber',
    epochs=800,
    early_stopping_patience=50,
    cross_validation_split=1,
    optimizer_kwargs={'weight_decay': 0.0005},
)
classifier.print_evaluation(show_history=True)

print('Evaluation with unseen, real typing data: ')
positive_test_data = create_dataframe(X_test, positive_class, DATA_COLS)
evaluate_authentication(classifier, positive_test_data, negative_class_data)
print('---'*20)

### Analyzing False Positive Data

In [None]:
from collections import Counter

# Classifier outputs for the held-out positive samples and for all negative samples.
pred_positive_probabilities = flatten(classifier.predict(X_test))
pred_negative_probabilities = flatten(classifier.predict(N))

# Hard decisions obtained with Python's round().
pred_positive = list(map(round, pred_positive_probabilities))
pred_negative = list(map(round, pred_negative_probabilities))

# Split the feature rows by decision. Only decisions equal to 1 or 0 are
# collected; any other rounded value is left out of both lists.
true_positive, false_negative = [], []
for feature, prediction in zip(X_test, pred_positive):
    if prediction == 1:
        true_positive.append(feature)
    elif prediction == 0:
        false_negative.append(feature)

true_negative, false_positive = [], []
for feature, prediction in zip(N, pred_negative):
    if prediction == 0:
        true_negative.append(feature)
    elif prediction == 1:
        false_positive.append(feature)

print(f'True Positives: {len(true_positive)}/{len(X_test)}')
print(f'False Negatives: {len(false_negative)}/{len(X_test)}')
print(f'True Negatives: {len(true_negative)}/{len(N)}')
print(f'False Positives: {len(false_positive)}/{len(N)}')

# The same split, applied to the raw outputs instead of the feature rows.
true_positive_proba, false_negative_proba = [], []
for probability, prediction in zip(pred_positive_probabilities, pred_positive):
    if prediction == 1:
        true_positive_proba.append(probability)
    elif prediction == 0:
        false_negative_proba.append(probability)

true_negative_proba, false_positive_proba = [], []
for probability, prediction in zip(pred_negative_probabilities, pred_negative):
    if prediction == 0:
        true_negative_proba.append(probability)
    elif prediction == 1:
        false_positive_proba.append(probability)

# np.mean of an empty list yields nan, so a group without members prints nan.
print(f'Average prediction probability of True Positives: {np.mean(true_positive_proba)}')
print(f'Average prediction probability of False Positives: {np.mean(false_positive_proba)}')
print(f'Average prediction probability of True Negatives: {np.mean(true_negative_proba)}')
print(f'Average prediction probability of False Negatives: {np.mean(false_negative_proba)}')

# NOTE(review): `positive_class_id` is not defined in this file (other cells use
# `positive_class`) — confirm it exists. The zip also assumes that these rows of
# multi_class_data are in the same order as the rows of N — verify.
false_positive_users = [
    user_id
    for user_id, prediction in zip(
        multi_class_data[multi_class_data[CLASS_COL] != positive_class_id][CLASS_COL].values,
        pred_negative,
    )
    if prediction == 1
]
false_positive_user_counts = Counter(false_positive_users)

print(f'False positive users: {len(set(false_positive_users))}')
print('User   |   Count')
print('----------------')
for userid, count in sorted(false_positive_user_counts.items()):
    # Only users with more than 10 false positives are listed.
    if count <= 10:
        continue
    print(f'{userid:7} {count:5}')


false_positive_dataframe = create_dataframe(false_positive, 'false positive', DATA_COLS)
# NOTE(review): the 'true positive' group is built from `positive` (all
# positive-class rows), not from the `true_positive` list above — confirm intent.
true_positive_dataframe = create_dataframe(positive, 'true positive', DATA_COLS)
combined_dataframe = pd.concat(
    [true_positive_dataframe, false_positive_dataframe],
    ignore_index=True,
)
for key_col in KEY_COLS:
    # Every row receives the same value, taken from DATA[key_col][0].
    combined_dataframe[key_col] = pd.to_numeric(DATA[key_col][0], downcast='unsigned')

plot_pp_data(
    combined_dataframe,
    DATA_PASSWORD,
    class_categories=['true positive', 'false positive'],
    target_category=None,
    plot_type='line',  # 'datapoints', 'violin', 'overlap', 'line'
    trim_outliers=True,
    display=True,
    save=False,
)