In [1]:
import os
from os.path import join
import pandas as pd
from pathlib import Path
import pdpipe as pdp
import sys
import numpy as np

# The project root is the parent of the current working directory; putting its
# absolute path on sys.path lets the project-local `notebooks` package import.
project_root = Path('..')
sys.path.append(os.path.abspath(project_root))

# Project-local imports have to follow the sys.path tweak above (hence `noqa`).
from notebooks.utils import init_data_dir  # noqa
from notebooks.benchmarking import benchmark_profiles  # noqa
from notebooks import pipes  # noqa
from notebooks.profiles import MahalanobisProfile  # noqa
from notebooks.feature_extractors import (  # noqa
    HeuristicExtractor,
    POS2GramExtractor,
    ConcatExtractor,
)

# NOTE(review): presumably prepares the data directories under the project
# root -- confirm against notebooks.utils.init_data_dir.
init_data_dir(project_root)

# Directories shared by the cells below (plain strings built with os.path.join).
preprocess_path = join(project_root, Path('data/preprocess'))
outputs_path = join(project_root, 'outputs')

# Read the train and validation sentence tables from the preprocess directory.
train_df, valid_df = (
    pd.read_hdf(join(preprocess_path, sentences_file))
    for sentences_file in ('bawe_train_sentences.hdf5', 'bawe_valid_sentences.hdf5')
)

In [2]:
# One GroupSentences pipeline is applied to both splits, train first.
# NOTE(review): presumably n=50 is the number of sentences per group --
# confirm against notebooks.pipes.GroupSentences.
pipeline = pipes.GroupSentences(n=50)

grouped_train_df, grouped_valid_df = (
    pipeline(split_df) for split_df in (train_df, valid_df)
)

In [3]:
# A POSPCAExtractor(25, 10) profile is left disabled here; POSPCAExtractor is
# not among the names imported in the visible import cell.

# Every feature extractor is wrapped in its own MahalanobisProfile.
heuristics_extractor = HeuristicExtractor(1)
heuristics_profile = MahalanobisProfile(heuristics_extractor)

pos2gram_extractor = POS2GramExtractor(paragraph_length=1)
pos2gram_profile = MahalanobisProfile(pos2gram_extractor)

# The combined profile is built on a ConcatExtractor over the two extractors above.
combined_extractor = ConcatExtractor(heuristics_extractor, pos2gram_extractor)
combined_profile = MahalanobisProfile(combined_extractor)

# Each display label sits next to its profile so the two lists stay aligned.
labelled_profiles = [
    ('Heuristics', heuristics_profile),
    ('POS Bigrams', pos2gram_profile),
    ('Combined Heuristics/Bigrams', combined_profile),
]
profile_names = [label for label, _ in labelled_profiles]
profiles = [profile for _, profile in labelled_profiles]

In [4]:
# Run every profile against the grouped training split; result columns are
# labelled with `profile_names`.
# NOTE(review): the meaning of authors_per_sample / samples and the
# progress-bar effect of show_loading are assumed from their names -- confirm
# against notebooks.benchmarking.benchmark_profiles.
benchmark_results = benchmark_profiles(
    grouped_train_df,
    profiles,
    show_loading=True,
    names=profile_names,
    authors_per_sample=2,
    samples=2,
)

benchmark_results

100%|##########| 2/2 [00:12<00:00,  6.43s/it]


Unnamed: 0,flag,Heuristics,POS Bigrams,Combined Heuristics/Bigrams
0,False,0.330821,0.0,0.0
1,True,0.894906,0.0,0.0
2,True,0.897627,0.0,0.0
3,True,0.0,0.0,0.0
4,True,0.0,0.0,0.0
5,True,0.0,0.0,0.0
6,True,0.0,0.0,0.0
7,True,0.0,0.0,0.0


In [7]:
# A profile score strictly below this cut-off becomes a True flag.
FLAG_THRESHOLD = 0.85

# Threshold a copy so the raw scores in `benchmark_results` stay untouched;
# only the per-profile columns are converted, other columns are kept as-is.
benchmark_flags = benchmark_results.copy()
benchmark_flags[profile_names] = benchmark_flags[profile_names].lt(FLAG_THRESHOLD)

In [52]:
# Display the thresholded flag table (the bare last expression is rendered).
# NOTE(review): the saved output lists 'pospca'/'heuristics' columns, which do
# not match the current `profile_names`; it looks stale -- re-run to refresh.
benchmark_flags

Unnamed: 0,flag,pospca,heuristics
0,False,True,True
1,True,True,True
2,True,True,True
3,True,True,True
4,True,True,True
...,...,...,...
978,True,True,True
979,True,True,True
980,True,True,True
981,True,True,True


In [10]:
# Per-profile classification metrics on the training benchmark.
# The 'flag' column is the reference label (True = positive); each column in
# `profile_names` holds that profile's boolean prediction for the same row.
positives_selection = benchmark_flags['flag']
negatives_selection = np.logical_not(benchmark_flags['flag'])

all_positives = positives_selection.sum()
all_negatives = negatives_selection.sum()

# Predictions split by reference class; summing booleans counts the True cells.
positive_predictions = benchmark_flags.loc[positives_selection, profile_names]
negative_predictions = benchmark_flags.loc[negatives_selection, profile_names]

true_positives = positive_predictions.sum()
true_negatives = np.logical_not(negative_predictions).sum()

# False positives are reference negatives that a profile flagged anyway.
# (This used to repeat the true-negative expression, which turned precision
# into TP / (TP + TN).)
false_positives = negative_predictions.sum()

sensitivity = true_positives / all_positives
specificity = true_negatives / all_negatives

precision = true_positives / (true_positives + false_positives)

balanced_accuracy = (sensitivity + specificity) / 2

# One row per profile, one column per metric.
train_benchmarks = pd.DataFrame(
    data=[balanced_accuracy, specificity, sensitivity, precision],
    index=['balanced accuracy', 'specificity', 'sensitivity (recall)', 'precision'],
).T

In [14]:
# Persist the per-profile training metrics under the outputs directory.
train_benchmarks_path = join(outputs_path, 'bawe_train_benchmarks.hdf5')
train_benchmarks.to_hdf(train_benchmarks_path, key='bawe_train_benchmarks')

In [20]:
# Per profile, count the rows whose reference 'flag' is False but which the
# profile flagged True.
benchmark_flags.loc[negatives_selection, profile_names].sum()

pospca        10
heuristics     3
dtype: int64

In [16]:
# Scratch arithmetic on hard-coded operands.
# NOTE(review): presumably taken by hand from counts displayed elsewhere in
# the notebook -- confirm, or replace with an expression computed from the data.
7 / 10

0.7

In [5]:
# Store the raw benchmark scores in the preprocess directory for later reuse.
benchmark_results_path = join(preprocess_path, 'benchmark_results.hdf5')
benchmark_results.to_hdf(benchmark_results_path, key='benchmark_results')

In [6]:
# Reload the stored benchmark scores; this rebinds `benchmark_results`.
# No key is passed, so the file is expected to hold a single pandas object.
saved_results_path = join(preprocess_path, 'benchmark_results.hdf5')
benchmark_results = pd.read_hdf(saved_results_path)

benchmark_results

Unnamed: 0,flag,pospca,heuristics
0,False,0.000000,0.000000
1,True,0.000000,0.000000
2,True,0.000000,0.000000
3,True,0.000000,0.000000
4,True,0.000000,0.000000
...,...,...,...
978,True,0.000000,0.005036
979,True,0.002207,0.006970
980,True,0.000000,0.338741
981,True,0.000000,0.108420
