In [1]:
import os
from os.path import join
import pandas as pd
from pathlib import Path
import pdpipe as pdp
import sys
import numpy as np

project_root = Path('..')
sys.path.append(os.path.abspath(project_root))
from notebooks.utils import init_data_dir  # noqa

from notebooks.benchmarking import benchmark_profiles  # noqa
from notebooks import pipes
from notebooks.profiles import MahalanobisProfile
from notebooks.feature_extractors import HeuristicExtractor, OldPOS2GramExtractor, ConcatExtractor

init_data_dir(project_root)

preprocess_path = join(project_root, Path('data/preprocess'))
outputs_path = join(project_root, 'outputs')

train_df = pd.read_hdf(join(preprocess_path, 'bawe_train_sentences.hdf5'))
valid_df = pd.read_hdf(join(preprocess_path, 'bawe_valid_sentences.hdf5'))

In [2]:
pipeline = pipes.GroupSentences(n=50)

grouped_train_df = pipeline(train_df)

grouped_valid_df = pipeline(valid_df)

In [3]:
# pospca_extractor = OldPOSPCAExtractor(25, 10)
# pospca_profile = MahalanobisProfile(pospca_extractor)

heuristics_extractor = HeuristicExtractor(4)
heuristics_profile = MahalanobisProfile(heuristics_extractor)

pos2gram_extractor = OldPOS2GramExtractor(paragraph_length=1, best=20)
pos2gram_profile = MahalanobisProfile(pos2gram_extractor)

combined_extractor = ConcatExtractor(heuristics_extractor, pos2gram_extractor)
combined_profile = MahalanobisProfile(combined_extractor)

profiles = [heuristics_profile, pos2gram_profile, combined_profile]
profile_names = ['Heuristics', 'POS Bigrams', 'Combined']

In [4]:
benchmark_results = benchmark_profiles(grouped_valid_df, profiles,
                                       show_loading=True, names=profile_names, samples=20, authors_per_sample=5)

benchmark_results

100%|##########| 20/20 [13:51<00:00, 41.56s/it]


Unnamed: 0,flag,Heuristics,POS Bigrams,Combined
0,True,0.134598,0.999421,0.998544
1,True,0.497050,0.966469,0.959002
2,True,0.192381,0.868973,0.706363
3,True,0.484490,0.831687,0.509005
4,True,0.622277,0.223367,0.004089
...,...,...,...,...
206,True,0.597925,1.000000,1.000000
207,True,0.653880,1.000000,1.000000
208,True,0.987519,1.000000,1.000000
209,True,0.699435,1.000000,1.000000


In [5]:
benchmark_flags = benchmark_results.copy()

benchmark_flags[profile_names] = benchmark_flags[profile_names] < 0.85

In [6]:
benchmark_flags

Unnamed: 0,flag,Heuristics,POS Bigrams,Combined
0,True,True,False,False
1,True,True,False,False
2,True,True,False,True
3,True,True,True,True
4,True,True,True,True
...,...,...,...,...
206,True,True,False,False
207,True,True,False,False
208,True,False,False,False
209,True,True,False,False


In [7]:
positives_selection = benchmark_flags['flag']
negatives_selection = np.logical_not(benchmark_flags['flag'])

all_positives = positives_selection.sum()
all_negatives = negatives_selection.sum()

true_negatives = np.logical_not(benchmark_flags[negatives_selection][profile_names]).sum()
true_positives = benchmark_flags[positives_selection][profile_names].sum()

false_positives = np.logical_not(benchmark_flags[negatives_selection][profile_names]).sum()

sensitivity = true_positives / all_positives
specificity = true_negatives / all_negatives

precision = true_positives / (true_positives + false_positives)

balanced_accuracy = (sensitivity + specificity) / 2

train_benchmarks = pd.DataFrame(
    data=[balanced_accuracy, specificity, sensitivity, precision],
    index=['balanced accuracy', 'specificity', 'sensitivity (recall)',
           'precision']).T

train_benchmarks

Unnamed: 0,balanced accuracy,specificity,sensitivity (recall),precision
Heuristics,0.676396,0.5,0.852792,0.96
POS Bigrams,0.545323,0.857143,0.233503,0.793103
Combined,0.516679,0.571429,0.461929,0.919192


In [14]:
train_benchmarks.to_hdf(join(outputs_path, 'bawe_train_benchmarks.hdf5'), key='bawe_train_benchmarks')

In [20]:
benchmark_flags[negatives_selection][profile_names].sum()

pospca        10
heuristics     3
dtype: int64

In [16]:
7 / (10)

0.7

In [5]:
benchmark_results.to_hdf(join(preprocess_path, 'benchmark_results.hdf5'), key='benchmark_results')

In [6]:
benchmark_results = pd.read_hdf(join(preprocess_path, 'benchmark_results.hdf5'))

benchmark_results

Unnamed: 0,flag,pospca,heuristics
0,False,0.000000,0.000000
1,True,0.000000,0.000000
2,True,0.000000,0.000000
3,True,0.000000,0.000000
4,True,0.000000,0.000000
...,...,...,...
978,True,0.000000,0.005036
979,True,0.002207,0.006970
980,True,0.000000,0.338741
981,True,0.000000,0.108420
