In [1]:
import os
from os.path import join
import pandas as pd
from pathlib import Path
import pdpipe as pdp
import sys
import numpy as np
from tqdm import tqdm

project_root = Path('..')
sys.path.append(os.path.abspath(project_root))
from notebooks.utils import init_data_dir, extract_author_texts  # noqa

from notebooks import pipes
from notebooks.profiles import EuclideanProfile
from notebooks.feature_extractors import HeuristicsExtractor
from notebooks.thresholders import SimpleAccuracyThresholder

init_data_dir(project_root)

preprocess_path = join(project_root, Path('data/preprocess'))
outputs_path = join(project_root, 'outputs')

train_df = pd.read_hdf(join(preprocess_path, 'bawe_train_sentences.hdf5'))
valid_df = pd.read_hdf(join(preprocess_path, 'bawe_valid_sentences.hdf5'))

train_df = train_df.rename(columns={"sentence": "text"})
valid_df = valid_df.rename(columns={"sentence": "text"})

In [2]:
feature_extractors = [(HeuristicsExtractor(), "heuristics_extractor")]

profiles = [(EuclideanProfile(), "euclidean_distance_profile")]

thresholders = [(SimpleAccuracyThresholder(), "accuracy_thresholder")]

In [3]:
preprocessed_dfs = []

for feature_extractor, display_name in feature_extractors:
    train_path = join(preprocess_path, f"bawe_train_preprocessed_{display_name}.hdf5")
    valid_path = join(preprocess_path, f"bawe_valid_preprocessed_{display_name}.hdf5")

    preprocessed_train_exists = os.path.exists(train_path)
    preprocessed_valid_exists = os.path.exists(valid_path)

    if not (preprocessed_train_exists and preprocessed_valid_exists):
        print(f"Preprocessing train dataset for {display_name}", flush=True)
        preprocessed_train_df = feature_extractor(train_df, show_loading=True)
        print(f"Preprocessing valid dataset for {display_name}", flush=True)
        preprocessed_valid_df = feature_extractor(valid_df, show_loading=True)

        preprocessed_train_df.to_hdf(train_path, key=f"bawe_train_preprocessed_{display_name}")
        preprocessed_valid_df.to_hdf(valid_path, key=f"bawe_valid_preprocessed_{display_name}")
    else:
        preprocessed_train_df = pd.read_hdf(train_path)
        preprocessed_valid_df = pd.read_hdf(valid_path)

    preprocessed_dfs.append((preprocessed_train_df, preprocessed_valid_df, display_name))

In [6]:
def train_threshold(profile, df, thresholder):
    author_set = set(df.index.get_level_values(0))

    print("Training...", flush=True)
    distance_sets = []
    true_flag_sets = []
    for author in tqdm(author_set):
        profile.reset()

        author_texts, rest_df = extract_author_texts(author, df)
        profile.feed(author_texts)
        distances = profile.distances(rest_df)

        true_flags = distances.index.get_level_values(0) != author

        distance_sets.append(distances[0].to_numpy())
        true_flag_sets.append(true_flags.to_numpy())

    distances = np.concatenate(distance_sets)
    true_flags = np.concatenate(true_flag_sets)

    return thresholder(distances, true_flags)


def test_profile(profile, threshold, df):
    author_set = set(df.index.get_level_values(0))

    print("Testing...", flush=True)
    flag_sets = []
    true_flag_sets = []
    for author in tqdm(author_set):
        profile.reset()

        author_texts, rest_df = extract_author_texts(author, df)
        profile.feed(author_texts)
        distances = profile.distances(rest_df)

        flags = distances[0] > threshold
        true_flags = distances.index.get_level_values(0) != author

        flag_sets.append(flags.to_numpy())
        true_flag_sets.append(true_flags.to_numpy())

    flags = np.concatenate(flag_sets)
    true_flags = np.concatenate(true_flag_sets)


score_data = []
model_names = []

for profile, profile_name in profiles:
    for thresholder, thresholder_name in thresholders:
        for preprocessed_train_df, preprocessed_valid_df, extractor_name in preprocessed_dfs:
            threshold = train_threshold(profile, preprocessed_train_df, thresholder)
            profile.reset()

            scores = test_profile(profile, threshold, preprocessed_valid_df)
            score_data.append(scores)
            model_names.append(f"{profile_name}-{thresholder_name}-{extractor_name}")

KeyError: 'author'

In [7]:
preprocessed_dfs[0][0].index.get_level_values(0)

Int64Index([   1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            ...
            6998, 6998, 6998, 6998, 6998, 6998, 6998, 6998, 6998, 6998],
           dtype='int64', name='author', length=147807)

In [3]:
# pospca_extractor = OldPOSPCAExtractor(25, 10)
# pospca_profile = MahalanobisProfile(pospca_extractor)

heuristics_extractor = HeuristicExtractor(4)
heuristics_profile = MahalanobisProfile(heuristics_extractor)

pos2gram_extractor = OldPOS2GramExtractor(paragraph_length=1, best=20)
pos2gram_profile = MahalanobisProfile(pos2gram_extractor)

combined_extractor = ConcatExtractor(heuristics_extractor, pos2gram_extractor)
combined_profile = MahalanobisProfile(combined_extractor)

profiles = [heuristics_profile, pos2gram_profile, combined_profile]
profile_names = ['Heuristics', 'POS Bigrams', 'Combined']

In [4]:
benchmark_results = benchmark_profiles(grouped_valid_df, profiles,
                                       show_loading=True, names=profile_names, samples=20, authors_per_sample=5)

benchmark_results

100%|##########| 20/20 [13:51<00:00, 41.56s/it]


Unnamed: 0,flag,Heuristics,POS Bigrams,Combined
0,True,0.134598,0.999421,0.998544
1,True,0.497050,0.966469,0.959002
2,True,0.192381,0.868973,0.706363
3,True,0.484490,0.831687,0.509005
4,True,0.622277,0.223367,0.004089
...,...,...,...,...
206,True,0.597925,1.000000,1.000000
207,True,0.653880,1.000000,1.000000
208,True,0.987519,1.000000,1.000000
209,True,0.699435,1.000000,1.000000


In [5]:
benchmark_flags = benchmark_results.copy()

benchmark_flags[profile_names] = benchmark_flags[profile_names] < 0.85

In [6]:
benchmark_flags

Unnamed: 0,flag,Heuristics,POS Bigrams,Combined
0,True,True,False,False
1,True,True,False,False
2,True,True,False,True
3,True,True,True,True
4,True,True,True,True
...,...,...,...,...
206,True,True,False,False
207,True,True,False,False
208,True,False,False,False
209,True,True,False,False


In [7]:
positives_selection = benchmark_flags['flag']
negatives_selection = np.logical_not(benchmark_flags['flag'])

all_positives = positives_selection.sum()
all_negatives = negatives_selection.sum()

true_negatives = np.logical_not(benchmark_flags[negatives_selection][profile_names]).sum()
true_positives = benchmark_flags[positives_selection][profile_names].sum()

false_positives = np.logical_not(benchmark_flags[negatives_selection][profile_names]).sum()

sensitivity = true_positives / all_positives
specificity = true_negatives / all_negatives

precision = true_positives / (true_positives + false_positives)

balanced_accuracy = (sensitivity + specificity) / 2

train_benchmarks = pd.DataFrame(
    data=[balanced_accuracy, specificity, sensitivity, precision],
    index=['balanced accuracy', 'specificity', 'sensitivity (recall)',
           'precision']).T

train_benchmarks

Unnamed: 0,balanced accuracy,specificity,sensitivity (recall),precision
Heuristics,0.676396,0.5,0.852792,0.96
POS Bigrams,0.545323,0.857143,0.233503,0.793103
Combined,0.516679,0.571429,0.461929,0.919192


In [14]:
train_benchmarks.to_hdf(join(outputs_path, 'bawe_train_benchmarks.hdf5'), key='bawe_train_benchmarks')

In [20]:
benchmark_flags[negatives_selection][profile_names].sum()

pospca        10
heuristics     3
dtype: int64

In [16]:
7 / (10)

0.7

In [5]:
benchmark_results.to_hdf(join(preprocess_path, 'benchmark_results.hdf5'), key='benchmark_results')

In [6]:
benchmark_results = pd.read_hdf(join(preprocess_path, 'benchmark_results.hdf5'))

benchmark_results

Unnamed: 0,flag,pospca,heuristics
0,False,0.000000,0.000000
1,True,0.000000,0.000000
2,True,0.000000,0.000000
3,True,0.000000,0.000000
4,True,0.000000,0.000000
...,...,...,...
978,True,0.000000,0.005036
979,True,0.002207,0.006970
980,True,0.000000,0.338741
981,True,0.000000,0.108420
