In [1]:
import os
from os.path import join
import pandas as pd
from pathlib import Path
import pdpipe as pdp
import sys
import numpy as np
from tqdm import tqdm

project_root = Path('..')
sys.path.append(os.path.abspath(project_root))
from notebooks.utils import init_data_dir, extract_author_texts  # noqa

from notebooks import pipes
from notebooks.profiles import EuclideanProfile, NaiveBayesProfile
from notebooks import benchmarking as bench
from notebooks.feature_extractors import HeuristicsExtractor, FunctionWordCounter, POS2GramCounter
from notebooks.thresholders import SimpleAccuracyThresholder, SimpleThresholder

# Prepare the data directory for this project before touching any files in it.
init_data_dir(project_root)

preprocess_path = join(project_root, Path('data/preprocess'))
outputs_path = join(project_root, 'outputs')

# Load the train/validation sentence tables and expose the "sentence" column
# under the name "text".
train_df = pd.read_hdf(
    join(preprocess_path, 'bawe_train_sentences.hdf5')
).rename(columns={"sentence": "text"})
valid_df = pd.read_hdf(
    join(preprocess_path, 'bawe_valid_sentences.hdf5')
).rename(columns={"sentence": "text"})

In [2]:
# Each entry pairs an object with the display name used in cache file names
# and model labels.
# Alternative extractor set: [(HeuristicsExtractor(), "heuristics_extractor")]
feature_extractors = [
    (POS2GramCounter(), "pos2gram_counter"),
    (FunctionWordCounter(), "function_word_counter"),
]

# Alternative profile set: [(NaiveBayesProfile(), "naive_bayes_profile")]
profiles = [
    (EuclideanProfile(), "euclidean_distance_profile"),
]

thresholders = [
    (SimpleThresholder(bench.balanced_accuracies), "balanced_accuracy_thresholder"),
]

In [3]:
# One (train features, valid features, extractor name) triple per extractor.
preprocessed_dfs = []

for feature_extractor, display_name in feature_extractors:
    train_path = join(preprocess_path, f"bawe_train_preprocessed_{display_name}.hdf5")
    valid_path = join(preprocess_path, f"bawe_valid_preprocessed_{display_name}.hdf5")

    preprocessed_train_exists = os.path.exists(train_path)
    preprocessed_valid_exists = os.path.exists(valid_path)

    if preprocessed_train_exists and preprocessed_valid_exists:
        # Both cached files are present: reuse them instead of re-extracting.
        preprocessed_train_df = pd.read_hdf(train_path)
        preprocessed_valid_df = pd.read_hdf(valid_path)
    else:
        # At least one cached file is missing: recompute both splits and
        # write them back so the next run can skip this step.
        print(f"Preprocessing train dataset for {display_name}", flush=True)
        preprocessed_train_df = feature_extractor(train_df, show_loading=True)
        print(f"Preprocessing valid dataset for {display_name}", flush=True)
        preprocessed_valid_df = feature_extractor(valid_df, show_loading=True)

        preprocessed_train_df.to_hdf(train_path, key=f"bawe_train_preprocessed_{display_name}")
        preprocessed_valid_df.to_hdf(valid_path, key=f"bawe_valid_preprocessed_{display_name}")

    preprocessed_dfs.append((preprocessed_train_df, preprocessed_valid_df, display_name))

Preprocessing train dataset for pos2gram_counter


100%|##########| 238088/238088 [24:22<00:00, 162.85it/s]


Preprocessing valid dataset for pos2gram_counter


100%|##########| 58833/58833 [06:35<00:00, 148.57it/s]


Preprocessing train dataset for function_word_counter


100%|##########| 238088/238088 [00:46<00:00, 5104.86it/s]


Preprocessing valid dataset for function_word_counter


100%|##########| 58833/58833 [00:11<00:00, 5170.17it/s]


In [6]:
# Select the function-word features by extractor name instead of by list
# position, so the lookup keeps working if the extractor list is reordered.
# This also avoids assigning to `_`, which IPython uses for the previous
# cell's output.
function_train, function_valid = next(
    (train_features, valid_features)
    for train_features, valid_features, extractor_name in preprocessed_dfs
    if extractor_name == "function_word_counter"
)

In [47]:
# Variance of every feature column across all rows of the training features.
overall_var = function_train.var()

# Variance of every feature column within each author, then averaged over authors.
per_author_var = function_train.groupby(level="author").var()
average_author_var = per_author_var.mean()

In [48]:
# Relative drop from the overall variance to the average within-author variance.
relative_var_reduction = (overall_var - average_author_var) / overall_var
# Keep a feature only when that drop is strictly positive.
useful_features = relative_var_reduction > 0

# Column labels of the retained features.
actual_words = useful_features.index[useful_features.to_numpy()].tolist()

In [49]:
# Restrict both splits to the retained feature columns.
better_function_train = function_train[actual_words]
better_function_valid = function_valid[actual_words]

In [41]:
def train_threshold(profile, df, thresholder):
    """Collect per-author distances and hand them to ``thresholder``.

    For every distinct value in the first index level of ``df`` (the author),
    the profile is reset, fed the first item returned by
    ``extract_author_texts(author, df)`` and asked for distances to the second
    item. Each distance is labelled ``True`` when its row's author differs from
    the current one.

    Note: this notebook defines ``train_threshold`` in more than one cell; the
    most recently executed definition is the one in effect.

    Args:
        profile: Object providing ``reset()``, ``feed(texts)`` and
            ``distances(frame)``; the latter must return something with an
            ``index`` and a ``to_numpy()`` method.
        df: Feature frame whose first index level identifies the author.
        thresholder: Callable taking ``(distances, true_flags)`` as
            concatenated arrays.

    Returns:
        Whatever ``thresholder`` returns.
    """
    authors = set(df.index.get_level_values(0))

    print("Training...", flush=True)
    per_author_distances, per_author_labels = [], []
    for candidate in tqdm(authors):
        # Start from a clean profile so one author never leaks into the next.
        profile.reset()

        own_texts, remainder = extract_author_texts(candidate, df)
        profile.feed(own_texts)
        dists = profile.distances(remainder)

        # True marks a row attributed to somebody other than the candidate.
        per_author_labels.append(dists.index.get_level_values(0) != candidate)
        per_author_distances.append(dists.to_numpy())

    return thresholder(
        np.concatenate(per_author_distances),
        np.concatenate(per_author_labels),
    )


def test_profile(profile, threshold, df):
    author_set = set(df.index.get_level_values(0))

    print("Testing...", flush=True)
    flag_sets = []
    true_flag_sets = []
    for author in tqdm(author_set):
        profile.reset()

        author_texts, rest_df = extract_author_texts(author, df)
        profile.feed(author_texts)
        distances = profile.distances(rest_df)

        flags = distances > threshold
        true_flags = distances.index.get_level_values(0) != author

        flag_sets.append(flags.to_numpy())
        true_flag_sets.append(true_flags)

    flags = np.concatenate(flag_sets)
    true_flags = np.concatenate(true_flag_sets)

    return [bench.balanced_accuracy(flags, true_flags)]

In [42]:
# Fit the decision threshold on the reduced function-word training features.
threshold = train_threshold(
    EuclideanProfile(),
    better_function_train,
    SimpleThresholder(bench.balanced_accuracies),
)

Training...


100%|██████████| 527/527 [05:37<00:00,  1.56it/s]


In [45]:
# Apply the fitted threshold to the reduced function-word validation features.
scores = test_profile(
    EuclideanProfile(),
    threshold,
    better_function_valid,
)

Testing...


100%|██████████| 521/521 [05:33<00:00,  1.56it/s]


In [46]:
# Display the validation result: a one-element list returned by test_profile.
scores

[array([0.5210756])]

In [61]:
# Author inspected in the following cells.
chosen_author = 6212

better_function_train = function_train[actual_words]

# All rows of the chosen author.
chosen_author_texts = better_function_train.loc[(chosen_author,)]

# Mean feature vector per author, then the same table without the chosen author.
author_means = better_function_train.groupby(level="author").mean()
other_author_means = author_means.drop(index=chosen_author)

In [62]:
# Display the per-author mean feature vectors with author 6212 dropped.
other_author_means

Unnamed: 0_level_0,0,7,12,13,32,33,37,38,41,43,...,72,79,82,90,91,95,96,98,103,106
author,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.567308,0.971154,0.096154,0.163462,0.048077,0.163462,0.000000,0.000000,0.134615,0.692308,...,0.048077,1.903846,0.019231,0.000000,0.711538,0.288462,0.000000,0.480769,0.000000,0.201923
2,0.640777,1.242718,0.126214,0.194175,0.058252,0.087379,0.058252,0.000000,0.000000,0.893204,...,0.000000,2.359223,0.019417,0.106796,1.223301,0.592233,0.000000,0.417476,0.009709,0.310680
3,0.396896,0.494457,0.039911,0.108647,0.002217,0.053215,0.011086,0.037694,0.013304,0.472284,...,0.015521,1.492239,0.011086,0.026608,0.439024,0.133038,0.013304,0.066519,0.044346,0.095344
4,0.463415,0.990244,0.009756,0.297561,0.014634,0.014634,0.004878,0.039024,0.029268,0.658537,...,0.000000,2.531707,0.039024,0.004878,0.882927,0.219512,0.000000,0.112195,0.000000,0.141463
5,0.435897,0.474359,0.044872,0.064103,0.000000,0.012821,0.000000,0.019231,0.032051,0.448718,...,0.000000,1.371795,0.000000,0.025641,0.442308,0.051282,0.044872,0.012821,0.000000,0.102564
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6210,0.647727,0.528409,0.102273,0.125000,0.022727,0.005682,0.000000,0.005682,0.017045,0.312500,...,0.000000,1.659091,0.090909,0.000000,0.494318,0.409091,0.005682,0.017045,0.000000,0.215909
6211,0.292308,0.373077,0.050769,0.135385,0.003846,0.054615,0.030000,0.048462,0.097692,0.353077,...,0.010000,1.528462,0.025385,0.003846,0.423077,0.150769,0.082308,0.063846,0.013077,0.111538
6214,0.334963,0.594132,0.141809,0.110024,0.012225,0.034230,0.000000,0.000000,0.012225,0.760391,...,0.004890,1.767726,0.012225,0.019560,0.542787,0.239609,0.000000,0.207824,0.000000,0.134474
6215,0.541850,0.792952,0.125551,0.240088,0.017621,0.024229,0.000000,0.000000,0.063877,0.568282,...,0.000000,1.409692,0.026432,0.000000,0.482379,0.191630,0.033040,0.182819,0.002203,0.136564


In [63]:
# Mean feature vector of each text (grouped by "text_id") of the chosen author.
chosen_author_means = chosen_author_texts.groupby(level="text_id").mean()

# Euclidean distance between the first two of those per-text mean vectors.
# (A stray bare `chosen_author_means` expression used to sit here; it was not
# the last expression of the cell, so it produced no output, and was removed.)
np.linalg.norm(chosen_author_means.iloc[0] - chosen_author_means.iloc[1])

0.3417540749586829

In [64]:
# Average Euclidean distance from the chosen author's first per-text mean
# vector to the mean vector of every other author.
np.linalg.norm(
    chosen_author_means.iloc[0] - other_author_means,
    axis=1,
).mean()

1.046756212038611

In [4]:
# heuristics_train = preprocessed_dfs[0][0]
# heuristics_test = preprocessed_dfs[0][1]

# scaled_heuristics_train = (heuristics_train - heuristics_train.mean()) / heuristics_train.std()
# scaled_heuristics_test = (heuristics_test - heuristics_test.mean()) / heuristics_test.std()
# preprocessed_dfs.append((scaled_heuristics_train, scaled_heuristics_test, "scaled_heuristics"))

# function_words_train = preprocessed_dfs[0][0]
# function_words_test = preprocessed_dfs[0][1]
# pca_train = function_words_train

# pca_standardized = (pca_train - pca_train.mean()) / pca_train.std()

# pca_cov = pca_standardized.cov()

# pca_eigvals, pca_eigvecs = np.linalg.eig(pca_cov)
# sort_indices = np.flip(np.argsort(pca_eigvals))
# pca_eigvals, pca_eigvecs = pca_eigvals[sort_indices], pca_eigvecs[sort_indices]

# transformation_matrix0 = pca_eigvecs[:, :5]
# transformation_matrix1 = pca_eigvecs[:, :10]
# transformation_matrix2 = pca_eigvecs[:, :15]
# transformation_matrix3 = pca_eigvecs[:, :20]

# pca_function_words_train0 = function_words_train.dot(transformation_matrix0)
# pca_function_words_test0 = function_words_test.dot(transformation_matrix0)
# pca_function_words_train1 = function_words_train.dot(transformation_matrix1)
# pca_function_words_test1 = function_words_test.dot(transformation_matrix1)
# pca_function_words_train2 = function_words_train.dot(transformation_matrix2)
# pca_function_words_test2 = function_words_test.dot(transformation_matrix2)
# pca_function_words_train3 = function_words_train.dot(transformation_matrix3)
# pca_function_words_test3 = function_words_test.dot(transformation_matrix3)

# preprocessed_dfs.append((pca_function_words_train0, pca_function_words_test0, "pca_function_words0"))
# preprocessed_dfs.append((pca_function_words_train1, pca_function_words_test1, "pca_function_words1"))
# preprocessed_dfs.append((pca_function_words_train2, pca_function_words_test2, "pca_function_words2"))
# preprocessed_dfs.append((pca_function_words_train3, pca_function_words_test3, "pca_function_words3"))

# function_words_train = preprocessed_dfs[0][0]
# function_words_test = preprocessed_dfs[0][1]
# pca_train = function_words_train

# pca_standardized = (pca_train - pca_train.mean()) / pca_train.std()

# pca_cov = pca_standardized.cov()

# pca_eigvals, pca_eigvecs = np.linalg.eig(pca_cov)
# sort_indices = np.flip(np.argsort(pca_eigvals))
# pca_eigvals, pca_eigvecs = pca_eigvals[sort_indices], pca_eigvecs[sort_indices]

# transformation_matrix0 = pca_eigvecs[:, :5]
# transformation_matrix1 = pca_eigvecs[:, :10]
# transformation_matrix2 = pca_eigvecs[:, :15]
# transformation_matrix3 = pca_eigvecs[:, :20]

# pca_function_words_train0 = function_words_train.dot(transformation_matrix0)
# pca_function_words_test0 = function_words_test.dot(transformation_matrix0)
# pca_function_words_train1 = function_words_train.dot(transformation_matrix1)
# pca_function_words_test1 = function_words_test.dot(transformation_matrix1)
# pca_function_words_train2 = function_words_train.dot(transformation_matrix2)
# pca_function_words_test2 = function_words_test.dot(transformation_matrix2)
# pca_function_words_train3 = function_words_train.dot(transformation_matrix3)
# pca_function_words_test3 = function_words_test.dot(transformation_matrix3)

# preprocessed_dfs.append((pca_function_words_train0, pca_function_words_test0, "pca_function_words0"))
# preprocessed_dfs.append((pca_function_words_train1, pca_function_words_test1, "pca_function_words1"))
# preprocessed_dfs.append((pca_function_words_train2, pca_function_words_test2, "pca_function_words2"))
# preprocessed_dfs.append((pca_function_words_train3, pca_function_words_test3, "pca_function_words3"))

In [5]:
def train_threshold(profile, df, thresholder):
    """Collect per-author distances and hand them to ``thresholder``.

    For every distinct value in the first index level of ``df`` (the author),
    the profile is reset, fed the first item returned by
    ``extract_author_texts(author, df)`` and asked for distances to the second
    item. Each distance is labelled ``True`` when its row's author differs from
    the current one.

    Note: this notebook defines ``train_threshold`` in more than one cell; the
    most recently executed definition is the one in effect.

    Args:
        profile: Object providing ``reset()``, ``feed(texts)`` and
            ``distances(frame)``; the latter must return something with an
            ``index`` and a ``to_numpy()`` method.
        df: Feature frame whose first index level identifies the author.
        thresholder: Callable taking ``(distances, true_flags)`` as
            concatenated arrays.

    Returns:
        Whatever ``thresholder`` returns.
    """
    authors = set(df.index.get_level_values(0))

    print("Training...", flush=True)
    per_author_distances, per_author_labels = [], []
    for candidate in tqdm(authors):
        # Start from a clean profile so one author never leaks into the next.
        profile.reset()

        own_texts, remainder = extract_author_texts(candidate, df)
        profile.feed(own_texts)
        dists = profile.distances(remainder)

        # True marks a row attributed to somebody other than the candidate.
        per_author_labels.append(dists.index.get_level_values(0) != candidate)
        per_author_distances.append(dists.to_numpy())

    return thresholder(
        np.concatenate(per_author_distances),
        np.concatenate(per_author_labels),
    )


def test_profile(profile, threshold, df):
    author_set = set(df.index.get_level_values(0))

    print("Testing...", flush=True)
    flag_sets = []
    true_flag_sets = []
    for author in tqdm(author_set):
        profile.reset()

        author_texts, rest_df = extract_author_texts(author, df)
        profile.feed(author_texts)
        distances = profile.distances(rest_df)

        flags = distances > threshold
        true_flags = distances.index.get_level_values(0) != author

        flag_sets.append(flags.to_numpy())
        true_flag_sets.append(true_flags)

    flags = np.concatenate(flag_sets)
    true_flags = np.concatenate(true_flag_sets)

    return [bench.balanced_accuracy(flags, true_flags)]


# Parallel lists: entry i of each list describes the same evaluated model.
score_data = []
thresholds = []
model_names = []

# Evaluate every (profile, thresholder, feature set) combination: fit the
# threshold on the training features, then score it on the validation features.
for profile, profile_name in profiles:
    for thresholder, thresholder_name in thresholders:
        for preprocessed_train_df, preprocessed_valid_df, extractor_name in preprocessed_dfs:
            threshold = train_threshold(profile, preprocessed_train_df, thresholder)
            thresholds.append(threshold)
            profile.reset()

            # The identifier `profile` was previously split across two lines
            # ("pr" / "ofile"), which is a syntax error.
            scores = test_profile(profile, threshold, preprocessed_valid_df)
            score_data.append(scores)
            model_names.append(f"{profile_name}-{thresholder_name}-{extractor_name}")

Training...


100%|##########| 527/527 [07:08<00:00,  1.23it/s]


Testing...


100%|##########| 521/521 [07:03<00:00,  1.23it/s]


In [6]:
# Display the collected scores together with the matching model names.
score_data, model_names

([[array([0.49067909])]],
 ['euclidean_distance_profile-balanced_accuracy_thresholder-pos2gram_counter'])

In [7]:
# test_profile(EuclideanProfile(), thresholds[0], preprocessed_dfs[0][0])

In [8]:
# sentence_count = len(function_words_train)

# where_true = ((function_words_train.sum() / (sentence_count / 100)) > 1)

# chosen_word_indices = where_true[where_true].index.tolist()

In [9]:
# with open("../notebooks/resources/original_function_words.txt") as f:
#     words = f.readlines()

In [10]:
# chosen_words = [words[chosen_index] for chosen_index in chosen_word_indices]

In [11]:
# with open("../notebooks/resources/filtered_function_words.txt", "w") as f:
#     f.writelines(chosen_words)

The threshold may be overfitting to the texts of individual authors rather than generalising across the dataset.

In [12]:
# score_data = np.concatenate([results[None, :] for results in score_data])

# results_df = pd.DataFrame(np.array(score_data), index=model_names, columns=["balanced_accuracy"])

# results_df