In [2]:
%run "./1_Config.ipynb"

Available CPUs: 12
Configured user-id: 999
Configured password: trustno1
Positive class data: 100 samples with 36 features


### Async Wrapper Method

In [5]:
def background(f):
    """Decorator that runs the blocking callable ``f`` in the event loop's default executor.

    Calling the decorated function does not block: it returns the awaitable
    future produced by ``loop.run_in_executor``.

    Args:
        f: Synchronous callable to offload.

    Returns:
        A wrapper accepting the same positional and keyword arguments as ``f``.
    """
    import functools  # local import keeps this cell self-contained

    @functools.wraps(f)
    def wrapped(*args, **kwargs):
        # run_in_executor() only forwards positional arguments; keyword
        # arguments have to be bound beforehand with functools.partial.
        bound_call = functools.partial(f, *args, **kwargs)
        return asyncio.get_event_loop().run_in_executor(None, bound_call)

    return wrapped

### Random Repeatability Reset Method

In [6]:
# IMPORTANT: Requires sequential computations on one CPU, multithreading can break repeatability!
def reset_random_state(seed_value=0):
    """Re-seed every random number source used by the notebook and reset the TF/Keras session.

    Seeds Python's ``random``, NumPy's legacy global generator and TensorFlow,
    clears the Keras session, resets the default graph and installs a
    single-threaded TF1-compat session.

    Args:
        seed_value: Seed applied to all generators.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # setting it here presumably only affects child processes - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    random.seed(seed_value)
    np.random.seed(seed_value)
    tf.random.set_seed(seed_value)
    # Previously hard-coded to 3, which silently overrode the seed set on the
    # line above and made TensorFlow ignore `seed_value`.
    tf.compat.v1.set_random_seed(seed_value)

    tf.keras.backend.clear_session()
    tf.compat.v1.reset_default_graph()
    # One intra-op and one inter-op thread: parallel execution can break repeatability.
    session_conf = tf.compat.v1.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
    sess = tf.compat.v1.Session(graph=tf.compat.v1.get_default_graph(), config=session_conf)
    tf.compat.v1.keras.backend.set_session(sess)

### Data Util Methods

In [7]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into one new list (one nesting level)."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def dict_mean(dictionaries):
    """Average a list of dictionaries key by key.

    The key set is taken from the first dictionary; every other dictionary
    must provide those keys as well.

    Returns:
        A new dict mapping each key to the mean of its values.
    """
    count = len(dictionaries)
    return {
        key: sum(entry[key] for entry in dictionaries) / count
        for key in dictionaries[0]
    }
    
def is_float(num):
    """Return True if ``float(num)`` succeeds, False otherwise.

    Values that ``float`` rejects because of their type (``None``, lists, ...)
    are reported as False instead of raising.
    """
    try:
        float(num)
    except (TypeError, ValueError):
        # ValueError: unparsable string; TypeError: unsupported type such as None.
        return False
    return True
        
def synthesize_normal(data, samples=None):
    """Draw synthetic rows from independent per-column normal distributions.

    Each column is modelled by the mean and standard deviation of that column
    in ``data`` (computed along axis 0).

    Args:
        data: Source rows.
        samples: Number of rows to generate; defaults to ``len(data)``.

    Returns:
        Array with ``samples`` rows and one value per column of ``data``.
    """
    row_count = len(data) if samples is None else samples

    column_means = np.mean(data, axis=0)
    column_stds = np.std(data, axis=0)

    generated = []
    for _ in range(row_count):
        # One scalar draw per column, row by row, so the global random stream
        # is consumed in a fixed order.
        generated.append([np.random.normal(mu, sigma) for mu, sigma in zip(column_means, column_stds)])

    return np.array(generated)

def synthesize_dissimilar(data, samples=None):
    """Generate uniformly distributed rows that lie sufficiently far from the mean of ``data``.

    Works in min-max normalized space: candidates are drawn uniformly from
    [0, 1) per feature and redrawn until their mean squared distance to the
    normalized feature means reaches the acceptance threshold. Accepted rows
    are mapped back to the original value range.

    Args:
        data: Source rows.
        samples: Number of rows to generate; defaults to ``len(data)``.

    Returns:
        Array of accepted rows in the original scale of ``data``.
    """
    if samples is None:
        samples = len(data)

    normalized, maxima, minima = normalize(data)
    means = np.mean(normalized, axis=0)
    average_mse = np.mean((normalized - means) ** 2, axis=1).mean()
    feature_count = len(means)

    # NOTE(review): a mean squared error is compared against a square root
    # here; kept as in the original heuristic - confirm the intended threshold.
    threshold = np.sqrt(average_mse / 4)

    def draw_candidate():
        # TODO: generate samples in the range of 0 to 2*max(feature)
        candidate = [np.random.uniform(0, 1) for _ in range(feature_count)]
        return candidate, np.mean((candidate - means) ** 2)

    accepted = []
    for _ in range(samples):
        candidate, error = draw_candidate()
        while error < threshold:
            candidate, error = draw_candidate()
        accepted.append(candidate)

    return denormalize(np.array(accepted), maxima, minima)

def sample_rows(data, n):
    """Return ``n`` distinct, randomly chosen rows of the 2-D array ``data``.

    Raises:
        ValueError: If ``n`` exceeds the number of rows.
    """
    row_count = len(data)
    if n > row_count:
        raise ValueError(f'Array has fewer rows ({row_count}) than requested number of samples {n}!')

    # Sampling without replacement keeps every selected row unique.
    picked = np.random.choice(row_count, n, replace=False)
    return data[picked, :]

def normalize(data):
    """Min-max scale every column of ``data`` to the range [0, 1].

    Columns whose values are all identical (maximum == minimum) are mapped to
    0 instead of producing NaN through a 0/0 division.

    Args:
        data: 2-D array, rows are samples and columns are features.

    Returns:
        Tuple ``(normalized, maxima, minima)`` where ``maxima`` and ``minima``
        are the per-column extrema, as expected by ``denormalize``.
    """
    maxima = np.max(data, axis=0)
    minima = np.min(data, axis=0)

    span = maxima - minima
    # Replace a zero range by 1 (adds True == 1 only where span is 0) so that
    # constant columns divide cleanly to 0.
    safe_span = span + (span == 0)

    normalized = (data - minima) / safe_span

    return (normalized, maxima, minima)

def denormalize(data, maxima, minima):
    """Map min-max normalized columns back to their original value range.

    Inverse of ``normalize``: column ``i`` becomes
    ``data[:, i] * (maxima[i] - minima[i]) + minima[i]``.
    """
    restored_columns = []
    for idx, (minimum, maximum) in enumerate(zip(minima, maxima)):
        value_range = maximum - minimum
        restored_columns.append(data[:, idx] * value_range + minimum)

    # Columns were collected one per entry, so transpose back to row-major samples.
    return np.array(np.transpose(restored_columns))

def augment(data, target_n):
    """Synthesize ``target_n`` rows resembling ``data``.

    The data is min-max normalized, rows are drawn with ``synthesize_normal``
    in normalized space, and the result is scaled back to the original range.
    """
    scaled, maxima, minima = normalize(data)
    return denormalize(synthesize_normal(scaled, target_n), maxima, minima)

def drop_outliers(data, target_col, outlier_count=6, outlier_ratio=None):
    """Remove the rows with the most extreme ``target_col`` values from ``data`` in place.

    The same number of rows is removed from the low and the high end. As a
    side effect ``data`` is sorted by ``target_col``.

    Args:
        data: DataFrame to trim (mutated in place).
        target_col: Column whose values decide which rows are outliers.
        outlier_count: Total number of rows to remove (half per side, odd
            values are rounded down). Pass ``None`` when using ``outlier_ratio``.
        outlier_ratio: Total fraction of rows to remove (half per side).
            Because ``outlier_count`` defaults to 6, it must be set to
            ``None`` explicitly when this parameter is used.

    Returns:
        None. The frame is modified in place.

    Raises:
        ValueError: If both ``outlier_count`` and ``outlier_ratio`` are given.
    """
    if outlier_count is not None and outlier_ratio is not None:
        raise ValueError(f'Both \'outlier_count\' and \'outlier_ratio\' specified! Pick only one.')

    if outlier_ratio is not None:
        # Was `len(data_col)`: an undefined name that raised NameError.
        drop_count = int(outlier_ratio * len(data) / 2)
    elif outlier_count is not None:
        if outlier_count % 2 != 0:
            warnings.warn('The parameter \'outlier_count\' is not even. Actually removed outliers will be rounded down to nearest even number (half on each side).')
        drop_count = int(outlier_count / 2.0)
    else:
        warnings.warn(f'Neither \'outlier_count\' nor \'outlier_ratio\' specified! No outliers will be removed.')
        return

    if drop_count <= 0:
        # The original reused the "neither specified" text here, which was misleading.
        warnings.warn('The resulting number of outliers to drop per side is zero! No outliers will be removed.')
        return

    data.sort_values(by=[target_col], inplace=True)
    data.drop(data.head(drop_count).index, inplace=True)
    data.drop(data.tail(drop_count).index, inplace=True)

def create_dataframe(target_data, target_class, data_cols):
    """Wrap ``target_data`` in a DataFrame with columns ``data_cols`` and add the class label column.

    The label ``target_class`` is stored in the column named by the
    module-level constant ``CLASS_COL``.
    """
    labelled = pd.DataFrame(data=target_data, columns=data_cols)
    labelled[CLASS_COL] = target_class
    return labelled

### Plotting & Data Visualization Methods

In [2]:
def plot_data(data, category_col, target_cols, col_title_gen, ylabel, 
            title, legend_title, file, path=f'img/data-visualisations', 
            plot_type='datapoints', keys=None,
            target_category=TARGET_CLASS_USER_ID, class_categories=CLASS_CATEGORIES,
            display=DISPLAY_PLOTS, save=SAVE_PLOTS, transparency=TRANSPARENT_BACKGROUND,
            width=None, height=8, cols_share_y=True, trim_outliers=False):
    if not display and not save:
        return
    
    if isinstance(target_cols, str):
        # target_cols is regex -> get real target cols
        target_col_regex = re.compile(target_cols)
        target_cols = [col for col in data.columns.values if target_col_regex.match(col)]
    
    if len(target_cols) == 0:
        raise ValueError(f'No target columns found to plot.')
        
    plot_columns = [category_col, *target_cols]    
    for col_name in plot_columns:
        if col_name not in data.columns:
            raise ValueError(f'Invalid column name: {col_name}.')
            
    if keys is None:
        keys = get_keys(data)

    target_data=data[plot_columns].copy(deep=True)
    if class_categories == [] and target_category != None:
        target_data = target_data[target_data[category_col] == target_category]
        target_category = None
    
    if target_category is not None:
        target_data.loc[target_data[category_col] != target_category, [category_col]] = class_categories[1]
        target_data.loc[target_data[category_col] == target_category, [category_col]] = class_categories[0]
    else:
        for category in sorted(target_data[category_col].unique()):
            target_data.loc[target_data[category_col] == category, [category_col]] = f'{category}'
    
    unique_categories = sorted(target_data[category_col].unique())    
    order = class_categories 
    if target_category is None and len(order) == 0:
        order = [str(i) for i in sorted([int(j) for j in unique_categories])] if all([is_float(category) for category in unique_categories]) else [str(i) for i in sorted(unique_categories)]
    
    plt.close('all')
    plt.ioff()
    fig = plt.figure()

    plot_data = target_data.copy(deep=True)
    legend_artists = None
    if plot_type == 'line':
        # ignoring trim outliers      
        col_titles = [col_title_gen(keys, col_idx) if callable(col_title_gen) else col_title_gen for col_idx, _ in enumerate(target_cols)]

        if width == None:
            width = len(col_titles)*2
        
        ax = plt.gca()
        legend_artists = plot_lines(ax, plot_data[[category_col, *target_cols]], category_col, target_cols, order, col_titles=col_titles, trim_outliers=drop_outliers)

        ax.get_legend().set(visible=False)

        ax.set_xlabel('Keys')
        ax.set_ylabel('Seconds')
        ax.xaxis.set_ticks(ax.get_xticks(), labels=col_titles)

    else:
        grid = gridspec.GridSpec(1, len(target_cols), wspace=0.0, hspace=0.0) # 1 row, no spacing between axes

        if width == None: 
            width = 2*len(target_cols)*len(plot_data[category_col].unique())
        
        for col_idx, target_column in enumerate(target_cols):
            sharey = None if (col_idx==0 or cols_share_y) else plt.subplot(grid[0, 0])
            ax = plt.subplot(grid[0, col_idx], sharey=sharey)
            col_title = col_title_gen(keys, col_idx) if callable(col_title_gen) else col_title_gen
            
            if trim_outliers:
                drop_outliers(plot_data, target_column)
            
            mean_var_vals = [(plot_data[plot_data[category_col] == category][target_column].mean(), plot_data[plot_data[category_col] == category][target_column].std()) for category in order]
            x_tick_labels = [f'μ: {mean:.3f}\nσ: {std:.3f}' for mean, std in mean_var_vals]
            
            match plot_type:
                case 'datapoints':
                    legend_artists = plot_column_datapoints(ax, plot_data[[category_col, target_column]], category_col, target_column, order, col_title=col_title, trim_outliers=drop_outliers)
                case 'violin':
                    legend_artists = plot_column_violins(ax, plot_data[[category_col, target_column]], category_col, target_column, order, col_title=col_title, trim_outliers=drop_outliers)
                case 'overlap':
                    legend_artists = plot_column_overlap(ax, plot_data[[category_col, target_column]], category_col, target_column, order, col_title=col_title, trim_outliers=drop_outliers)
                case _:
                    raise ValueError(f'Invalid \'plot_type\' parameter: {plot_type}.')    
                    
            ax.xaxis.set_ticks(ax.get_xticks(), labels=x_tick_labels)
            if col_idx!=0:
                ax.yaxis.set_ticks([])
    
            ax.get_legend().set(visible=False)
            ax.margins(x=1.0/len(unique_categories))
    
            ax.set_xlabel(None)
            if col_idx==0:
                ax.set_ylabel(ylabel, fontsize=14)
            else:
                ax.set_ylabel(None)
    
            ax.set_title(col_title)        
        
        
    plt.suptitle(title, y=0.975, fontsize=18)    
    handles, labels = legend_artists
    fig.legend(title=legend_title, handles=handles, labels=labels, loc="center left", bbox_to_anchor=(0.91, 0.5))

    if height == None: 
        height = 8
        
    fig.set_size_inches(width, height)
        
    if save:
        save_plot(f'{file}', path, transparency)
    if display:
        display_plot()

def plot_lines(ax, data, category_col, target_cols, order, trim_outliers=False, col_titles = None):
    """Draw one mean line per category across ``target_cols`` on ``ax``.

    For every category and column the mean and a band of one (absolute)
    standard deviation around it are computed. The band is only drawn when
    ``order`` holds at most two categories.

    Args:
        ax: Matplotlib axes to draw on.
        data: Frame holding ``category_col`` and all ``target_cols``.
        category_col: Column defining the line groups.
        target_cols: Columns plotted along the x-axis.
        order: Hue order of the categories.
        trim_outliers: Accepted for signature parity; not used.
        col_titles: Column titles; their count defines the x-range of the bands.

    Returns:
        The ``(handles, labels)`` tuple of the axes legend.
    """
    records = []
    for category in np.sort(data[category_col].unique()).astype(str):
        category_data = data[data[category_col] == category]
        for target_col in target_cols:
            center = np.mean(category_data[target_col])
            spread = np.abs(np.std(category_data[target_col]))
            records.append({
                category_col: category,
                'x': target_col,
                'y': center,
                'min': center - spread,
                'max': center + spread,
            })

    summary = pd.DataFrame(records, columns=[category_col, 'x', 'y', 'min', 'max'])
    sns.lineplot(ax=ax, data=summary, x='x', y='y', hue=category_col, hue_order=order, palette="Set2")

    if len(order) <= 2:
        x_positions = range(len(col_titles))
        for category in order:
            band = summary[summary[category_col] == category]
            ax.fill_between(x_positions, band['min'], band['max'], alpha=0.4)

    return ax.get_legend_handles_labels()
    
def plot_column_datapoints(ax, data, category_col, target_col, order, trim_outliers=False, col_title = None):
    """Draw the values of ``target_col`` as a jittered strip plot per category on ``ax``.

    ``trim_outliers`` and ``col_title`` are accepted for signature parity with
    the other column plotters and are not used.

    Returns:
        The ``(handles, labels)`` tuple of the axes legend.
    """
    sns.stripplot(
        ax=ax,
        data=data,
        x=category_col,
        y=target_col,
        hue=category_col,
        order=order,
        hue_order=order,
        marker='o',
        size=10,
        alpha=.35,
        jitter=True,
        palette="Set2",
    )

    legend_entries = ax.get_legend_handles_labels()
    return legend_entries
    
def plot_column_violins(ax, data, category_col, target_col, order, trim_outliers=False, col_title = None):
    """Draw one width-scaled violin per category for ``target_col`` on ``ax``.

    ``trim_outliers`` and ``col_title`` are accepted for signature parity with
    the other column plotters and are not used.

    Returns:
        The ``(handles, labels)`` tuple of the axes legend.
    """
    sns.violinplot(
        ax=ax,
        data=data,
        x=category_col,
        y=target_col,
        scale='width',
        dodge=False,
        hue=category_col,
        order=order,
        hue_order=order,
        scale_hue=False,
        palette='Set2',
    )

    legend_entries = ax.get_legend_handles_labels()
    return legend_entries
    
def plot_column_overlap(ax, data, category_col, target_col, order, trim_outliers=False, col_title = None):            
    """Draw a combined half-violin, strip and box plot per category for ``target_col`` on ``ax``.

    ``trim_outliers`` and ``col_title`` are accepted but not used in this function.

    Returns:
        The ``(handles, labels)`` tuple of the axes legend, captured right
        after the violin plot was drawn.
    """
    sns.violinplot(ax=ax, data=data, x=category_col, y=target_col, hue=category_col, order=order, hue_order=order, palette='Set2', scale='width', dodge=False, inner=None)
    # Capture the legend now, before the strip and box plots add further artists.
    ret = ax.get_legend_handles_labels()
    
    # Remember the limits of the violin plot; they are restored at the end.
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()
    # Clip every violin to the left half of its bounding box.
    for violin in ax.collections:
        bbox = violin.get_paths()[0].get_extents()
        x0, y0, width, height = bbox.bounds
        violin.set_clip_path(plt.Rectangle((x0, y0), width / 2, height, transform=ax.transData))
    
    # Collections added after this index belong to the strip plot drawn next.
    old_len_collections = len(ax.collections)
    sns.stripplot(ax=ax, data=data, x=category_col, y=target_col, hue=category_col, order=order, hue_order=order, palette='Set2', dodge=False)
    # Shift the points by 0.12 along x, away from the remaining violin half.
    for dots in ax.collections[old_len_collections:]:
        dots.set_offsets(dots.get_offsets() + np.array([0.12, 0]))
    
    # Unfilled box plot drawn with zorder 3.
    sns.boxplot(ax=ax, data=data, x=category_col, y=target_col, hue=category_col, order=order, hue_order=order, palette='Set2', saturation=1, width=0.3, boxprops={'zorder': 3, 'facecolor': 'none'}, dodge=False)
    
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)
    
    return ret

NameError: name 'TARGET_CLASS_USER_ID' is not defined

### Data Visualization Util Methods

In [9]:
def display_plot():
    """Show the current matplotlib figures, then close all of them."""
    plt.show()
    plt.close('all')

def save_plot(file, path, transparency):
    """Save the current matplotlib figure as ``{path}/{file}.png`` (100 dpi).

    Args:
        file: File name without extension.
        path: Target directory; created (including parents) when missing.
        transparency: Passed to ``savefig`` as ``transparent``.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists() test.
    os.makedirs(path, exist_ok=True)
    plt.savefig(f'{path}/{file}.png', format='png', dpi=100, transparent=transparency)
    
def concat_images(file, images, path, orientation='vertical'):
    """Concatenate PNG images into one image saved as ``{path}/{file}.png``.

    Args:
        file: Output file name without extension.
        images: Names (without extension) of the PNG files in ``path``.
            Nothing happens when fewer than two names are given.
        path: Directory containing the inputs and receiving the output.
        orientation: ``'vertical'`` stacks top to bottom, ``'horizontal'``
            places the images left to right.

    Raises:
        ValueError: If ``orientation`` is neither ``'horizontal'`` nor ``'vertical'``.
    """
    if len(images) < 2:
        return

    # Validate before touching the file system.
    if orientation not in ('horizontal', 'vertical'):
        raise ValueError(f'Orientation \'{orientation}\' is invalid. Use \'horizontal\' or \'vertical\'.')

    horizontal = orientation == 'horizontal'
    opened = [Image.open(f'{path}/{image}.png') for image in images]
    try:
        widths, heights = zip(*(im.size for im in opened))

        if horizontal:
            canvas_size = (sum(widths), max(heights))
        else:
            canvas_size = (max(widths), sum(heights))
        new_im = Image.new('RGBA', canvas_size)

        offset = 0
        for im in opened:
            if horizontal:
                new_im.paste(im, (offset, 0))
                offset += im.size[0]
            else:
                new_im.paste(im, (0, offset))
                offset += im.size[1]

        # The horizontal branch used to save to the undefined name `filename`.
        new_im.save(f'{path}/{file}.png')
    finally:
        # Release the file handles of the source images.
        for im in opened:
            im.close()

def get_keys(data):
    """Return the characters stored in the ``key<N>`` columns of one row of ``data``.

    The row labelled ``0`` is used when present (as before); otherwise the
    first row is used, so frames with a filtered or non-default index work too.
    Each cell is converted with ``chr``.

    Args:
        data: DataFrame with columns whose names start with ``key`` followed by digits.

    Returns:
        List with one character per matching column, in column order.
    """
    key_re = re.compile(r'key\d+')  # raw string: '\d' is an invalid escape otherwise
    key_cols = [col for col in data.columns.values if key_re.match(col)]

    row_label = 0 if 0 in data.index else data.index[0]
    return [chr(data.at[row_label, key_col]) for key_col in key_cols]

### Feature-Specific Data Visualization Methods

In [6]:
def col_title_gen_keys(keys, idx):
    """Title for a single key: the lower-cased key with spaces shown as '␣'."""
    return keys[idx].lower().replace(' ', '␣')


def col_title_gen_relations(keys, idx):
    """Title for the transition from key ``idx`` to key ``idx + 1`` (e.g. 'a → b')."""
    # Underscores are temporary separators: they survive the space
    # substitution and are turned into plain spaces last.
    transition = f'{keys[idx]}_→_{keys[idx+1]}'
    return transition.lower().replace(' ', '␣').replace('_', ' ')

def plot_d_data(data, password, legend_title = 'class', file = 'press-duration-times', path=f'img/data-visualisations', 
            plot_type='datapoints', keys=None,
            target_category=TARGET_CLASS_USER_ID, class_categories=CLASS_CATEGORIES,
            display=DISPLAY_PLOTS, save=SAVE_PLOTS, transparency=TRANSPARENT_BACKGROUND,
            width=None, height=8, cols_share_y=True, trim_outliers=False):
    """Plot the columns matching ``DURATION_COL_PATTERN`` grouped by ``'userId'``.

    Thin wrapper around ``plot_data``: fixes the column pattern, the title
    generator, the y-label and the figure title (which embeds ``password``)
    and forwards every other argument unchanged.
    """
    plot_data(
        data=data,
        category_col='userId',
        target_cols=DURATION_COL_PATTERN,
        col_title_gen=col_title_gen_keys,
        ylabel='seconds',
        title=f'Press-Duration Times: \'{password}\'',
        legend_title=legend_title,
        file=file,
        path=path,
        plot_type=plot_type,
        keys=keys,
        target_category=target_category,
        class_categories=class_categories,
        display=display,
        save=save,
        transparency=transparency,
        width=width,
        height=height,
        cols_share_y=cols_share_y,
        trim_outliers=trim_outliers,
    )
    
def plot_pp_data(data, password, legend_title = 'class', file = 'press-press-times', path=f'img/data-visualisations', 
            plot_type='datapoints', keys=None,
            target_category=TARGET_CLASS_USER_ID, class_categories=CLASS_CATEGORIES,
            display=DISPLAY_PLOTS, save=SAVE_PLOTS, transparency=TRANSPARENT_BACKGROUND,
            width=None, height=8, cols_share_y=True, trim_outliers=False):
    """Plot the columns matching ``PP_COL_PATTERN`` grouped by ``'userId'``.

    Thin wrapper around ``plot_data``: fixes the column pattern, the title
    generator, the y-label and the figure title (which embeds ``password``)
    and forwards every other argument unchanged.
    """
    plot_data(
        data=data,
        category_col='userId',
        target_cols=PP_COL_PATTERN,
        col_title_gen=col_title_gen_relations,
        ylabel='seconds',
        title=f'Press-Press Times: \'{password}\'',
        legend_title=legend_title,
        file=file,
        path=path,
        plot_type=plot_type,
        keys=keys,
        target_category=target_category,
        class_categories=class_categories,
        display=display,
        save=save,
        transparency=transparency,
        width=width,
        height=height,
        cols_share_y=cols_share_y,
        trim_outliers=trim_outliers,
    )

def plot_pr_data(data, password, legend_title = 'class', file = 'press-release-times', path=f'img/data-visualisations', 
            plot_type='datapoints', keys=None,
            target_category=TARGET_CLASS_USER_ID, class_categories=CLASS_CATEGORIES,
            display=DISPLAY_PLOTS, save=SAVE_PLOTS, transparency=TRANSPARENT_BACKGROUND,
            width=None, height=8, cols_share_y=True, trim_outliers=False):
    """Plot the columns matching ``PR_COL_PATTERN`` grouped by ``'userId'``.

    Thin wrapper around ``plot_data``: fixes the column pattern, the title
    generator, the y-label and the figure title (which embeds ``password``)
    and forwards every other argument unchanged.
    """
    plot_data(
        data=data,
        category_col='userId',
        target_cols=PR_COL_PATTERN,
        col_title_gen=col_title_gen_relations,
        ylabel='seconds',
        title=f'Press-Release Times: \'{password}\'',
        legend_title=legend_title,
        file=file,
        path=path,
        plot_type=plot_type,
        keys=keys,
        target_category=target_category,
        class_categories=class_categories,
        display=display,
        save=save,
        transparency=transparency,
        width=width,
        height=height,
        cols_share_y=cols_share_y,
        trim_outliers=trim_outliers,
    )

def plot_rp_data(data, password, legend_title = 'class', file = 'release-press-times', path=f'img/data-visualisations', 
            plot_type='datapoints', keys=None,
            target_category=TARGET_CLASS_USER_ID, class_categories=CLASS_CATEGORIES,
            display=DISPLAY_PLOTS, save=SAVE_PLOTS, transparency=TRANSPARENT_BACKGROUND,
            width=None, height=8, cols_share_y=True, trim_outliers=False):
    """Plot the columns matching ``RP_COL_PATTERN`` grouped by ``'userId'``.

    Thin wrapper around ``plot_data``: fixes the column pattern, the title
    generator, the y-label and the figure title (which embeds ``password``)
    and forwards every other argument unchanged.
    """
    plot_data(
        data=data,
        category_col='userId',
        target_cols=RP_COL_PATTERN,
        col_title_gen=col_title_gen_relations,
        ylabel='seconds',
        title=f'Release-Press Times: \'{password}\'',
        legend_title=legend_title,
        file=file,
        path=path,
        plot_type=plot_type,
        keys=keys,
        target_category=target_category,
        class_categories=class_categories,
        display=display,
        save=save,
        transparency=transparency,
        width=width,
        height=height,
        cols_share_y=cols_share_y,
        trim_outliers=trim_outliers,
    )

def plot_rr_data(data, password, legend_title = 'class', file = 'release-release-times', path=f'img/data-visualisations', 
            plot_type='datapoints', keys=None,
            target_category=TARGET_CLASS_USER_ID, class_categories=CLASS_CATEGORIES,
            display=DISPLAY_PLOTS, save=SAVE_PLOTS, transparency=TRANSPARENT_BACKGROUND,
            width=None, height=8, cols_share_y=True, trim_outliers=False):
    """Plot the columns matching ``RR_COL_PATTERN`` grouped by ``'userId'``.

    Thin wrapper around ``plot_data``: fixes the column pattern, the title
    generator, the y-label and the figure title (which embeds ``password``)
    and forwards every other argument unchanged.
    """
    plot_data(
        data=data,
        category_col='userId',
        target_cols=RR_COL_PATTERN,
        col_title_gen=col_title_gen_relations,
        ylabel='seconds',
        title=f'Release-Release Times: \'{password}\'',
        legend_title=legend_title,
        file=file,
        path=path,
        plot_type=plot_type,
        keys=keys,
        target_category=target_category,
        class_categories=class_categories,
        display=display,
        save=save,
        transparency=transparency,
        width=width,
        height=height,
        cols_share_y=cols_share_y,
        trim_outliers=trim_outliers,
    )

### Alternative Input Data Visualization Methods

In [5]:
def display_key_data_errorbars(title, filename, subplot_label_lambda, col_prefix, cols_count, 
                               group_data, group_labels, lgroup_labels, group_colors, drop_ratio = 0.025): 
    """Draw, per column, error bars comparing several data groups side by side.

    For each column ``col_prefix + str(col_idx)`` and each entry of
    ``group_data`` two error bars are drawn: one spanning the min/max of the
    outlier-trimmed values and one showing mean +/- standard deviation.

    NOTE(review): this function calls ``drop_outliers(col, drop=...)`` and
    ``display_plot(filename, 16, 6, path=...)``. Neither call matches the
    signatures defined in this notebook (``drop_outliers(data, target_col,
    outlier_count, outlier_ratio)`` returning None, ``display_plot()`` without
    parameters), so the function raises TypeError as written. It presumably
    targets older versions of those helpers - confirm the intended trimming
    semantics before fixing.

    Args:
        title: Figure title.
        filename: Passed to ``display_plot``.
        subplot_label_lambda: Callable ``col_idx -> str`` giving each sub-plot title.
        col_prefix: Prefix of the column names; the column index is appended.
        cols_count: Number of columns / sub-plots.
        group_data: Sequence of data sets indexable by column name.
        group_labels: One label per group (only used to key unused bookkeeping dicts).
        lgroup_labels: One legend label per group.
        group_colors: Per group a pair (marker color, error bar color); the
            entry at index ``len(group_data)`` colors the min/max range bar.
        drop_ratio: Passed to ``drop_outliers`` as ``drop``.
    """
    # One row of sub-plots without spacing between the axes.
    fig = plt.figure()    
    grid = gridspec.GridSpec(1, cols_count, wspace=0.0, hspace=0.0)
    data_dim = len(group_data)   
        
    dict_temp = {}
    for group_idx in range(data_dim):
        dict_temp[group_labels[group_idx]] = []
    
    # NOTE(review): the five dictionaries below are never read afterwards.
    cols = copy.deepcopy(dict_temp)
    mus = copy.deepcopy(dict_temp)
    sigmas = copy.deepcopy(dict_temp)
    mins = copy.deepcopy(dict_temp)
    maxs = copy.deepcopy(dict_temp)
    
    # Running minimum over all columns; computed but not used afterwards.
    ax_min = float("inf")  
    for col_idx in range(cols_count): 
        
        # Every sub-plot after the first shares its y-axis with the first one.
        sharey = None if col_idx==0 else plt.subplot(grid[0, 0])
        ax = plt.subplot(grid[0, col_idx], sharey=sharey)            
        
        for group_idx, gdata in enumerate(group_data):
            col = gdata[col_prefix+str(col_idx)]
        
            # Mean and std are taken before trimming, min/max after trimming.
            col_mu, col_sigma = col.agg([np.mean, np.std])
            # NOTE(review): incompatible with the drop_outliers defined in this notebook (see docstring).
            col = drop_outliers(col, drop=drop_ratio)
            col_min, col_max = np.min(col), np.max(col)

            col = np.array(col)
            col_mu = np.array(col_mu)
            col_sigma = np.array(col_sigma)
            col_min = np.array(col_min)
            col_max = np.array(col_max)
            col_mid = (col_min + col_max)/2
            
            ax_min = min(col_min, ax_min)
            
            # Range bar: centered between min and max, reaching both extremes.
            ax.errorbar(np.array([group_idx+1]), col_mid, yerr=col_mid-col_min, fmt='none', 
                capsize=4, ecolor=group_colors[data_dim], elinewidth=2, capthick=2);
            # Mean marker with a thick +/- one standard deviation bar.
            ax.errorbar(np.array([group_idx+1]), col_mu, yerr=col_sigma, color=group_colors[group_idx][0], fmt='.', 
                        capsize=0.0, ecolor=group_colors[group_idx][1], elinewidth=6, markersize=9);
            
        ax.set_xticks([])
        if(col_idx > 0): 
            # Hide the y tick labels and marks on all but the first sub-plot.
            plt.setp(ax.get_yticklabels(), visible=False)
            plt.tick_params(left = False)
        ax.set_title(subplot_label_lambda(col_idx), fontname='Fira Code', fontsize=12, y=1.01)
        ax.set_xlim([0, data_dim+1])
        ax.spines['top'].set_visible(False)
                
    plt.suptitle(title, y=1.0, fontsize=18)
    
    # Legend: first the range bar, then one mean/std entry per group.
    legend_markers = [
        Line2D([0], [0], color=group_colors[data_dim], lw=2)
    ]
    legend_labels = [
        '95% of datapoints'
    ]
    for group_idx in range(data_dim):
        legend_markers.append(Line2D([0], [0], 
             markerfacecolor=group_colors[group_idx][0], 
             color=group_colors[group_idx][1],
             markersize=6, marker='o', lw=6)
        )
        
        legend_labels.append(f'µ, σ ({lgroup_labels[group_idx]})')

    fig.legend(legend_markers, legend_labels, fontsize=12, loc='upper center', bbox_to_anchor=(0.5, 0.08), fancybox=True, ncol = 3)
    
    
    # NOTE(review): display_plot() in this notebook takes no arguments (see docstring).
    display_plot(filename, 16, 6, path='img/data-visualisations')
    
def display_data_density(title, img_name, cols_prefix, shared_metric, metric_label_lambda, label, max_samples, data, colors, drop_ratio = 0.025):
    """Plot kernel density estimates of 'user' versus 'actor' values, one facet per column.

    Columns of ``data`` starting with ``cols_prefix`` are reshaped into a long
    frame with the columns ``user``, ``value`` and ``shared_metric``. Rows
    whose ``userId`` equals the global ``MY_USER_ID`` are labelled 'user', all
    others 'actor'. Both groups are sampled down to the same size per column.

    NOTE(review): ``drop_outliers(..., by='value', drop=...)`` does not match
    the ``drop_outliers(data, target_col, outlier_count, outlier_ratio)``
    defined in this notebook (which also returns None), so this function
    raises TypeError as written - confirm the intended helper version.

    Args:
        title: Figure title.
        img_name: File name (without extension) used when saving.
        cols_prefix: Prefix selecting the value columns.
        shared_metric: Name of the facet column in the long frame.
        metric_label_lambda: Callable ``(data, col_idx) -> str`` for facet titles.
        label: X-axis label.
        max_samples: Upper bound of rows per group and column.
        data: Source frame containing ``userId`` and the value columns.
        colors: Colors handed to the FacetGrid hue mapping.
        drop_ratio: Passed to ``drop_outliers`` as ``drop``.
    """
    filtered = data.filter(regex='^'+cols_prefix + '|^userId', axis=1)   
    df = pd.DataFrame(columns=['user', 'value', shared_metric])
                
    for col_idx, col in enumerate(filtered.columns.values): #.iloc[:1].values[0]
        if col == 'userId':
            continue
        # The facet value is col_idx-1; assign() evaluates the callable immediately.
        # presumably 'userId' is the first filtered column, hence the -1 - verify.
        kwargs = {shared_metric : lambda idx : col_idx-1}
    
        series = filtered[['userId', col]].rename(columns = {'userId': 'user', col: 'value'}).assign(**kwargs)

        # Relabel everyone else first, then the configured user.
        series.loc[series['user'] != MY_USER_ID, ['user']] = 'actor'
        series.loc[series['user'] == MY_USER_ID, ['user']] = 'user' 
        
        user_series = series[series['user'] == 'user']
        # NOTE(review): incompatible with the drop_outliers defined in this notebook (see docstring).
        actor_series = drop_outliers(series[series['user'] == 'actor'], by='value', drop=drop_ratio)
                
        # Balance both groups: sample the same number of rows from each.
        samples = min(max_samples, len(user_series), len(actor_series))
        user_series = user_series.sample(samples, random_state=RANDOM_STATE)
        actor_series = actor_series.sample(samples, random_state=RANDOM_STATE)
                
        df = pd.concat([df, user_series], ignore_index = True)
        df = pd.concat([df, actor_series], ignore_index = True)  
            
    # Only referenced by the commented-out histogram variant below.
    bin_min = df['value'].min()
    bin_max = df['value'].max()
    
    if DISPLAY_PLOTS: 
        grid = sns.FacetGrid(df, col=shared_metric, palette="muted", hue='user', hue_kws={'color': colors})       
        #grid.map_dataframe(sns.histplot, x='value', kde=True, bins=15, binrange=(bin_min, bin_max))
        grid.map_dataframe(sns.kdeplot, x='value', fill=True)
        grid.set_axis_labels(label, "Density")
        grid.figure.subplots_adjust(wspace=0, hspace=0)    
        grid.fig.suptitle(title, fontsize=18, y=1.1)   
        grid.add_legend(title='', loc='lower center', fancybox=True, ncol = 2, bbox_to_anchor=(0.5, -0.1))
                
        for col_idx, col in enumerate(filtered.columns.values):
            if col == 'userId':
                continue
            ax = grid.facet_axis(0, col_idx-1)
            ax.set_title(metric_label_lambda(data, col_idx), fontname='Fira Code', fontsize=12, y=1.0)
            ax.spines['right'].set_visible(True)
        
        # Saving happens only inside the DISPLAY_PLOTS branch.
        if SAVE_PLOTS:
            path = 'img/data-visualisations'
            if not os.path.exists(path):
                os.makedirs(path)
            plt.savefig(f'{path}/{img_name}.png', format='png', dpi=100, transparent=TRANSPARENT_BACKGROUND)

### ML-training History Visualization Methods

In [None]:
def plot_training_loss(fit_history):
    """Plot every loss curve recorded in a training history.

    Args:
        fit_history: Object with an ``epoch`` list and a ``history`` dict
            mapping metric names to per-epoch values (presumably a Keras
            ``History`` - verify against callers). All entries whose name
            contains ``loss`` are plotted.
    """
    epochs = len(fit_history.epoch)
    hist_dataframe = pd.DataFrame(fit_history.history)

    # Compile once instead of once per column.
    loss_re = re.compile('.*loss.*')
    loss_cols = [col for col in hist_dataframe.columns.values if loss_re.match(col)]

    hist_dataframe[loss_cols].plot(
        figsize=(8, 5), xlim=[0, epochs-1], grid=True, xlabel="Epoch"
    )
    plt.legend(loss_cols)
    plt.show()


def plot_training_history(classifier_history):
    """Plot a training history as up to three figures: losses, count metrics, other metrics.

    Columns are grouped by name:
      * losses: names containing ``loss`` (validation losses included),
      * counts: names matching ``.*(t|f).*(p|n).*`` that are neither loss nor
        ``val``-prefixed columns,
      * metrics: everything else that is not ``val``-prefixed.
    Groups without any column are skipped instead of raising inside ``DataFrame.plot``.

    Args:
        classifier_history: Object with an ``epoch`` list and a ``history``
            dict mapping metric names to per-epoch values.
    """
    epochs = len(classifier_history.epoch)
    hist_dataframe = pd.DataFrame(classifier_history.history)
    columns = list(hist_dataframe.columns.values)

    loss_re = re.compile('.*loss.*')
    val_re = re.compile('val.*')
    count_re = re.compile('.*(t|f).*(p|n).*')

    loss_cols = [col for col in columns if loss_re.match(col)]
    val_cols = [col for col in columns if val_re.match(col)]
    count_cols = [col for col in columns if count_re.match(col) and col not in loss_cols and col not in val_cols]
    metric_cols = [col for col in columns if col not in loss_cols and col not in count_cols and col not in val_cols]

    for group_cols in (loss_cols, count_cols, metric_cols):
        if not group_cols:
            # Plotting an empty column selection raises; nothing to show for this group.
            continue
        hist_dataframe[group_cols].plot(
            figsize=(8, 5), xlim=[0, epochs-1], grid=True, xlabel="Epoch"
        )
        plt.legend(group_cols)
        plt.show()

### ML-Evaluation Methods

In [4]:
def evaluate_authentication(classifier, positive_dataset, negative_dataset):
    """Report how many positive and negative samples the classifier labels correctly.

    Predictions for the ``DATA_COLS`` features of both data sets are rounded
    to the nearest integer and compared with the globals ``positive_class``
    and ``negative_class``. Both rates are printed.

    Returns:
        Tuple ``(percent_pos, percent_neg)``: the fractions (0..1) of
        correctly predicted positive and negative samples.
    """
    positive_features = positive_dataset[DATA_COLS].values
    negative_features = negative_dataset[DATA_COLS].values

    pred_pos = [round(score) for score in flatten(classifier.predict(positive_features))]
    pred_neg = [round(score) for score in flatten(classifier.predict(negative_features))]

    absolute_pos = pred_pos.count(positive_class)
    absolute_neg = pred_neg.count(negative_class)

    total_pos = len(pred_pos)
    total_neg = len(pred_neg)
    percent_pos = float(absolute_pos) / total_pos
    percent_neg = float(absolute_neg) / total_neg

    print(f'Predicted positives: {absolute_pos}/{total_pos} ({percent_pos*100:.3f}%)')
    print(f'Predicted negatives: {absolute_neg}/{total_neg} ({percent_neg*100:.3f}%)')

    return (percent_pos, percent_neg)

### Experimental Methods

In [None]:
def augment2(data, target_n):
    """Experimental variant of ``augment`` that re-draws rows with low cosine similarity to the mean.

    NOTE(review): not runnable as written - ``random_generated`` and
    ``synthesize_normal2`` are not defined anywhere in the visible notebook.
    ``random_generated`` presumably refers to ``synthesized`` (the indices of
    the similarity list are used to index ``synthesized``) - confirm.
    ``cosine_similarity`` is presumably the scikit-learn function - confirm.
    """
    (normalized, maxima, minima) = normalize(data)
    synthesized = synthesize_normal(normalized, target_n)

    # Mean vector as a single-row matrix, compared against every generated row.
    reshaped_means = np.mean(normalized, axis=0).reshape(1, -1)
    cosine_similarities = cosine_similarity(reshaped_means, random_generated)[0]
    
    # Replace a generated sample with new data while its similarity to the mean is below 0.9
    for idx, similarity in enumerate(cosine_similarities):
        while similarity < 0.9:
            print("(SN) similarity too small: ", similarity)
            replacement = synthesize_normal2(random_generated, 1)
            synthesized[idx] = replacement
            similarity = cosine_similarity(reshaped_means, replacement.reshape(1, -1))[0][0]
    
    return denormalize(synthesized, maxima, minima)


def synthesize_dissimilar2(data, samples=None):
    """Experimental: generate rows from widened normal distributions, rescaled in PCA space.

    Rows are drawn per feature from a normal distribution with the normalized
    feature mean and twice the normalized standard deviation, transformed
    with a full-rank PCA, rescaled, transformed back and denormalized.

    Args:
        data: Source rows.
        samples: Number of rows to generate; defaults to ``len(data)``.

    Returns:
        Array of generated rows in the original scale of ``data``.
    """
    if samples is None:
        samples = len(data)

    (normalized, maxima, minima) = normalize(data)
    means = np.mean(normalized, axis=0)

    # The former cosine-similarity matrix was removed: its result was never
    # used, and it subtracted normalized means from the raw data.

    target_stds = np.std(normalized, axis=0) * 2 # ADJUST THIS AND CHECK

    dissimilar_data = np.array([[np.random.normal(mean, std) for mean, std in zip(means, target_stds)] for _ in range(samples)])

    pca = PCA(n_components=dissimilar_data.shape[1])
    pca.fit(dissimilar_data)
    dissimilar_pca = pca.transform(dissimilar_data)

    # NOTE(review): PCA-transformed data is presumably centered, so its mean
    # is close to zero and this factor can become extremely large - confirm
    # the intended scaling.
    dissimilar_pca *= (np.mean(means) / np.mean(dissimilar_pca))
    dissimilar_denormalized = denormalize(pca.inverse_transform(dissimilar_pca), maxima, minima)

    return dissimilar_denormalized