In [1]:
import os
from os.path import join
import pandas as pd
from pathlib import Path
import pdpipe as pdp
import sys
import numpy as np

project_root = Path('..')
sys.path.append(os.path.abspath(project_root))
from notebooks.utils import init_data_dir  # noqa

from notebooks.benchmarking import benchmark_profiles  # noqa
from notebooks import pipes
from notebooks.profiles import MahalanobisProfile
from notebooks.feature_extractors import HeuristicExtractor, OldPOS2GramTokenExtractor, ConcatExtractor
from notebooks.segmentation import POSSentenceTokenizer

init_data_dir(project_root)

preprocess_path = join(project_root, Path('data/preprocess'))
outputs_path = join(project_root, 'outputs')

train_df = pd.read_hdf(join(preprocess_path, 'bawe_train_sentences.hdf5'))
valid_df = pd.read_hdf(join(preprocess_path, 'bawe_valid_sentences.hdf5'))

In [2]:
pipeline = pipes.GroupSentences(n=50)

grouped_train_df = pipeline(train_df)

grouped_valid_df = pipeline(valid_df)

In [3]:
# grouped_train_df = grouped_train_df.sample(frac=0.05)

In [3]:
sentence_tokenizer = POSSentenceTokenizer()

tokenized_sentences = sentence_tokenizer.tokenize_list(grouped_train_df['sentence'].to_list(), 
                                                       show_loading=True)

grouped_train_df['sentence'] = tokenized_sentences

grouped_train_df

100%|##########| 116050/116050 [16:02<00:00, 120.53it/s]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sentence
author,text_id,group_position,sentence_position,Unnamed: 4_level_1
1,1,0,0,"[SCONJ, DET, ADJ, NOUN, DET, NOUN, ADP, NOUN, ..."
1,1,0,1,"[ADV, NOUN, AUX, VERB, ADP, DET, ADJ, NOUN, PU..."
1,1,0,2,"[DET, NOUN, ADP, NOUN, AUX, VERB, ADJ, NOUN, A..."
1,1,0,3,"[DET, ADV, ADJ, PUNCT, CCONJ, ADP, NOUN, ADP, ..."
1,1,0,4,"[ADV, PRON, AUX, ADV, VERB, DET, NOUN, DET, NO..."
...,...,...,...,...
6998,0,1,45,"[NUM, PUNCT, NOUN, PUNCT, PROPN, PROPN, PROPN,..."
6998,0,1,46,"[X, INTJ]"
6998,0,1,47,[X]
6998,0,1,48,"[DET, NOUN, VERB, SCONJ, DET, ADJ, NOUN, ADP, ..."


In [6]:
bigram_extractor = OldPOS2GramTokenExtractor()

bigram_matrix = bigram_extractor(grouped_train_df['sentence'].to_list())

In [4]:
# grouped_train_df['sentence'] = bigram_matrix

In [5]:
# grouped_train_df

In [17]:
bigram_means = np.mean(bigram_matrix, axis=0)

sorted_indices = np.flip(np.argsort(bigram_means))

In [21]:
np.reshape(np.arange(bigram_matrix.shape[1]), [18, 18])

array([[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17],
       [ 18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,
         31,  32,  33,  34,  35],
       [ 36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,
         49,  50,  51,  52,  53],
       [ 54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,
         67,  68,  69,  70,  71],
       [ 72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,
         85,  86,  87,  88,  89],
       [ 90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102,
        103, 104, 105, 106, 107],
       [108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
        121, 122, 123, 124, 125],
       [126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
        139, 140, 141, 142, 143],
       [144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
        157, 158, 159, 160, 161],
       [162, 163, 164, 165, 166, 167,

In [16]:
sorted_indices

array([ 97, 127, 138,   7,  23,  25, 133,  90, 289,  70, 293, 129, 210,
       142, 178, 130, 209, 295,  18, 227, 228, 223, 156,  29, 288,  52,
       221, 183, 300, 196,  79, 224, 101, 232,  36, 135,  12, 106,   0,
        56,   1,  48,  28, 290, 323, 220, 297,  34, 217, 187, 151, 298,
       131, 216, 239, 301, 128,  57,  88, 205,  72, 218, 202, 226,  77,
       199, 214,  26,  37,  83,  63,  93,  59,   4,  41,  54, 317,  38,
       169, 322, 244, 206, 207, 201,  74, 139, 305, 134, 126, 136,  92,
        30, 233,   9, 180,  19, 292, 219, 299, 259, 102, 140, 229, 241,
        46, 137, 313,  98, 318, 165, 304, 192,  82, 143,  13,  20, 278,
        39, 296, 145,  73, 162,  95,  91,  43,  11, 182, 164, 234, 161,
       181, 159, 215, 235,  75, 144,  49,  16, 245, 155,  55,  80, 100,
       148, 152, 225, 200, 291, 174,  44, 160,  66, 230,   2,  84, 246,
        61, 312,  45,  40, 250, 189,  47,  67,   8,  81, 185, 203, 167,
        85, 236,  62,  64,  22, 247, 141, 173,   5, 277, 190,   

In [27]:
bigram_means[sorted_indices]

bigram_means[sorted_indices][:50]

array([1.36292977, 1.34356743, 1.28274882, 1.19939681, 1.03973287,
       0.60799655, 0.58798794, 0.57928479, 0.57735459, 0.53457131,
       0.48220595, 0.46037053, 0.43155536, 0.40498923, 0.34790177,
       0.33040069, 0.31571736, 0.28701422, 0.28251616, 0.27965532,
       0.26696252, 0.26181818, 0.2496941 , 0.2214735 , 0.21445928,
       0.21146919, 0.21124515, 0.21121069, 0.19103835, 0.17193451,
       0.16771219, 0.16188712, 0.1617751 , 0.16021542, 0.15523481,
       0.15338216, 0.15090047, 0.14846187, 0.14788453, 0.14566135,
       0.14455838, 0.14413615, 0.14400689, 0.13901766, 0.13398535,
       0.13221887, 0.13171047, 0.13061611, 0.12849634, 0.1283757 ])

In [28]:
sorted_indices[:50]

array([ 97, 127, 138,   7,  23,  25, 133,  90, 289,  70, 293, 129, 210,
       142, 178, 130, 209, 295,  18, 227, 228, 223, 156,  29, 288,  52,
       221, 183, 300, 196,  79, 224, 101, 232,  36, 135,  12, 106,   0,
        56,   1,  48,  28, 290, 323, 220, 297,  34, 217, 187])

In [30]:
np.save('../notebooks/resources/best_bigrams.npy', sorted_indices)

In [3]:
# pospca_extractor = POSPCAExtractor(25, 10)
# pospca_profile = MahalanobisProfile(pospca_extractor)

heuristics_extractor = HeuristicExtractor(4)
heuristics_profile = MahalanobisProfile(heuristics_extractor)

# pos2gram_extractor = POS2GramExtractor(paragraph_length=1)
# pos2gram_profile = MahalanobisProfile(pos2gram_extractor)

# combined_extractor = ConcatExtractor(heuristics_extractor, pos2gram_extractor)
# combined_profile = MahalanobisProfile(combined_extractor)

profiles = [heuristics_profile]
profile_names = ['Heuristics']

In [4]:
benchmark_results = benchmark_profiles(grouped_valid_df, profiles,
                                       show_loading=True, names=profile_names)

benchmark_results

100%|##########| 10/10 [08:03<00:00, 48.31s/it]


Unnamed: 0,flag,Heuristics
0,False,0.960581
1,True,0.865407
2,True,0.822791
3,True,0.353465
4,True,0.090379
...,...,...
657,True,0.360285
658,True,0.235129
659,True,0.261533
660,True,0.269341


In [5]:
benchmark_flags = benchmark_results.copy()

benchmark_flags[profile_names] = benchmark_flags[profile_names] < 0.85

In [6]:
benchmark_flags

Unnamed: 0,flag,Heuristics
0,False,False
1,True,False
2,True,True
3,True,True
4,True,True
...,...,...
657,True,True
658,True,True
659,True,True
660,True,True


In [8]:
positives_selection = benchmark_flags['flag']
negatives_selection = np.logical_not(benchmark_flags['flag'])

all_positives = positives_selection.sum()
all_negatives = negatives_selection.sum()

true_negatives = np.logical_not(benchmark_flags[negatives_selection][profile_names]).sum()
true_positives = benchmark_flags[positives_selection][profile_names].sum()

false_positives = np.logical_not(benchmark_flags[negatives_selection][profile_names]).sum()

sensitivity = true_positives / all_positives
specificity = true_negatives / all_negatives

precision = true_positives / (true_positives + false_positives)

balanced_accuracy = (sensitivity + specificity) / 2

train_benchmarks = pd.DataFrame(
    data=[balanced_accuracy, specificity, sensitivity, precision],
    index=['balanced accuracy', 'specificity', 'sensitivity (recall)',
           'precision']).T

train_benchmarks

Unnamed: 0,balanced accuracy,specificity,sensitivity (recall),precision
Heuristics,0.784977,0.625,0.944954,0.991974


In [14]:
train_benchmarks.to_hdf(join(outputs_path, 'bawe_train_benchmarks.hdf5'), key='bawe_train_benchmarks')

In [20]:
benchmark_flags[negatives_selection][profile_names].sum()

pospca        10
heuristics     3
dtype: int64

In [16]:
7 / (10)

0.7

In [5]:
benchmark_results.to_hdf(join(preprocess_path, 'benchmark_results.hdf5'), key='benchmark_results')

In [6]:
benchmark_results = pd.read_hdf(join(preprocess_path, 'benchmark_results.hdf5'))

benchmark_results

Unnamed: 0,flag,pospca,heuristics
0,False,0.000000,0.000000
1,True,0.000000,0.000000
2,True,0.000000,0.000000
3,True,0.000000,0.000000
4,True,0.000000,0.000000
...,...,...,...
978,True,0.000000,0.005036
979,True,0.002207,0.006970
980,True,0.000000,0.338741
981,True,0.000000,0.108420
