In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from os.path import join
from tqdm import tqdm
import random
from matplotlib import pyplot as plt
import math

project_root = Path('..')

preprocess_path = join(project_root, Path('data/preprocess'))
random.seed(10)

Thoughts: thresholds need to be custom to each profile. This would be a good opportunity to try to make a profile that gets more confident the more data about the author it has. The profile could model distributions of the differences between its sentences to its mean and other people's sentences to its mean. Then it can use these two distributions to determine which is more likely for incoming sentences. 

Question: Can euclidean distances from the mean or cosine similarites be treated as normal random variables?

In [2]:
# Using function words for these experiments
function_words_train = pd.read_hdf(join(preprocess_path, "bawe_train_preprocessed_function_word_counter.hdf5"))
pos_bigrams_train = pd.read_hdf(join(preprocess_path, "bawe_train_preprocessed_pos2gram_counter.hdf5"))
# function_words_train = pd.concat([function_words_train, pos_bigrams_train], axis=1)
function_words_train = pos_bigrams_train

function_words_train

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0,1,2,3,4,5,6,7,8,9,...,314,315,316,317,318,319,320,321,322,323
author,text_id,sentence_position,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,4,0.0,0.0,0.0,0.0,2.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6998,2,175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6998,2,176,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6998,2,177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6998,2,178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
def select_good_features(df):
    overall_var = df.var()

    author_vars = df.groupby(level="author").var()

    mean_explained_var = (overall_var - author_vars).mean()

    # Features that reduce the variance within classes should hopefully be good
    # features.
    selections = mean_explained_var > 0

    # The index of selctions should be the columns of the dataframe given the
    # last few operations.
    chosen_columns = selections[selections].index.tolist()

    return df[chosen_columns]

In [4]:
filtered_train = select_good_features(function_words_train)

filtered_train

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0,6,7,11,14,17,18,23,28,29,...,309,311,312,315,316,317,318,319,322,323
author,text_id,sentence_position,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
1,0,0,1.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,1,0.0,0.0,4.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,3.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,4,0.0,0.0,4.0,0.0,0.0,0.0,1.0,3.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6998,2,175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6998,2,176,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6998,2,177,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6998,2,178,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
authors = filtered_train.index.get_level_values("author")
author_set = list(set(authors))

experiment_authors = random.sample(author_set, 5)

experiment_authors

[6123, 34, 3079, 6002, 6132]

In [6]:
chosen_author = experiment_authors[1]

chosen_author_sentences = filtered_train.loc[chosen_author]

chosen_author_sentences

Unnamed: 0_level_0,Unnamed: 1_level_0,0,6,7,11,14,17,18,23,28,29,...,309,311,312,315,316,317,318,319,322,323
text_id,sentence_position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,3,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,86,0.0,0.0,0.0,2.0,0.0,0.0,2.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,87,0.0,0.0,2.0,0.0,0.0,0.0,1.0,3.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,88,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,89,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Question: If I take the mean of each sentence distance and include it in the mean instead of excluding it, does this affect the mean/var distance?

In [7]:
included_sentences_distances = np.linalg.norm(chosen_author_sentences - chosen_author_sentences.mean(), axis=1)

included_sentences_distances_mean = included_sentences_distances.mean()
included_sentences_distances_var = included_sentences_distances.var()


def get_excluded_sentence_distances(array):
    def sentence_distance(i):
        selection = [True] * len(array)
        selection[i] = False
        return np.linalg.norm(array[i] - np.mean(array[selection]))

    return np.array([sentence_distance(index) for index in range(len(array))])


excluded_sentence_distances = get_excluded_sentence_distances(chosen_author_sentences.to_numpy())

excluded_sentence_distances_mean = excluded_sentence_distances.mean()
excluded_sentence_distances_var = excluded_sentence_distances.var()

included_sentences_distances_mean, excluded_sentence_distances_mean

(3.967883958830325, 4.676969135651933)

Answer: Yes :(, the mean is consistently much lower because the sentence is included. This is unfortunate because the other method was much more efficient to compute.

In [8]:
# plt.hist(excluded_sentence_distances)

# plt.show()

Answer to previous question: Euclidean distances are not normal (no doy why would they be? euclidean distances from same distribution mean should on average be close to 0 and can't be lower than that).

Question: What if I just pick a threshold that accounts for 95% of the author's sentences.

In [9]:
# chosen_text = chosen_author_sentences.loc[0]

# chosen_sentence_distances = get_excluded_sentence_distances(chosen_text.to_numpy())

# index_threshold = math.floor(len(chosen_sentence_distances) * 0.6)
# threshold = np.sort(chosen_sentence_distances)[index_threshold]

# threshold, index_threshold

In [10]:
chosen_author_sentences

Unnamed: 0_level_0,Unnamed: 1_level_0,0,6,7,11,14,17,18,23,28,29,...,309,311,312,315,316,317,318,319,322,323
text_id,sentence_position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,2,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,3,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,4,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,86,0.0,0.0,0.0,2.0,0.0,0.0,2.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,87,0.0,0.0,2.0,0.0,0.0,0.0,1.0,3.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,88,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,89,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
chosen_text = chosen_author_sentences.drop(index=(0,))

chosen_text

Unnamed: 0_level_0,Unnamed: 1_level_0,0,6,7,11,14,17,18,23,28,29,...,309,311,312,315,316,317,318,319,322,323
text_id,sentence_position,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,1.0,0.0,3.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,86,0.0,0.0,0.0,2.0,0.0,0.0,2.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,87,0.0,0.0,2.0,0.0,0.0,0.0,1.0,3.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,88,0.0,0.0,3.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,89,2.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
normalized_text = (chosen_text - chosen_text.mean()) / chosen_text.var()
# Come back here on error

chosen_cov = chosen_text.cov()

eig_values, eig_vectors = np.linalg.eig(chosen_cov)

eig_sum = np.sum(eig_values)

k = 15

phi_list = chosen_text - chosen_text.mean()

omega_list = np.sum((phi_list[:k] * eig_vectors[:, :k].T), axis=1).to_numpy()

phi_hat = np.sum((omega_list[:k] * eig_vectors[:, :k]), axis=1)
phi_hat

array([ 4.47920538e-02,  0.00000000e+00,  1.53224178e+00, -5.83697816e-02,
        0.00000000e+00,  0.00000000e+00,  4.02294904e-01,  7.29088295e-01,
        5.07357202e-01, -8.64291239e-01,  0.00000000e+00,  0.00000000e+00,
        5.25315395e-02,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        8.46131985e-01, -9.50091789e-03,  0.00000000e+00,  2.60812712e-01,
        0.00000000e+00,  8.96931489e-01, -2.98463533e-01,  2.76977652e-02,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  1.33804924e+00,  3.68772350e-01, -2.02845384e-01,
       -8.09682659e-03,  3.40950395e-01,  1.79054456e-02,  7.74634833e-01,
        3.94648118e-02, -8.62435692e-02,  0.00000000e+00,  0.00000000e+00,
       -8.09682659e-03, -2.83131320e-02, -4.37047634e-02,  0.00000000e+00,
       -9.40831849e-02,  

In [13]:
profile_mean = chosen_text.mean()

cutoff_texts = chosen_text

diffs = cutoff_texts - chosen_text.mean() - phi_hat

distances = np.linalg.norm(diffs, axis=1)

distances

array([ 4.30909881,  3.90129018,  3.86625715,  5.04261801,  3.3712273 ,
        3.66192225,  5.06568392,  4.76133649,  4.55022367,  3.96980319,
        5.03162199,  2.83967262,  3.66514367,  5.27022859,  3.00600394,
        4.22968855,  6.15763125,  7.84372377,  4.52777673,  5.18957802,
        5.07791596,  5.57726838,  3.33463897,  3.35520677,  4.23218185,
        4.54975521,  4.8711415 ,  4.74949844,  3.6370815 ,  3.96055303,
        4.02838983,  3.48791901,  4.14312726,  5.95506037, 16.90054984,
        3.86542881,  4.76030941,  5.59526493,  4.86466519,  2.9577449 ,
        3.90253871,  4.20538188,  4.90648844,  3.48049663,  3.85128782,
        4.00324898,  4.2801974 ,  4.62460439,  4.31129011,  5.23356298,
        3.44311664,  3.84637224,  3.71014078,  3.06425761,  3.32134882,
        3.93140969,  4.13547139,  4.14939764,  4.33059168,  6.43962412,
        4.23297427,  9.64627545,  3.63704886,  5.33862037,  4.50509317,
        4.73252506,  4.04926737,  5.62434613,  4.68161136,  3.99

In [14]:
cutoff = np.mean(distances) + (np.std(distances) / 3.5)

cutoff

5.757019085834935

In [15]:
same_distances = distances[distances > cutoff]

In [16]:
outlier_cutoff = np.mean(same_distances) + (np.std(same_distances) / 3.5)

np.sum(same_distances > outlier_cutoff) / distances.shape[0]

0.03787878787878788

In [17]:
suspect_texts = filtered_train.loc[experiment_authors[4], 0]
# suspect_texts = chosen_author_sentences.loc[1]

suspect_diffs = suspect_texts - profile_mean - phi_hat

suspect_distances = np.linalg.norm(suspect_diffs, axis=1)

first_suspect_distances = suspect_distances[suspect_distances > cutoff]

np.sum(first_suspect_distances > outlier_cutoff) / len(suspect_distances)
# np.sum(first_suspect_distances > outlier_cutoff) / len(first_suspect_distances)

0.018518518518518517

In [18]:
# def euclidean_distance(mean, df):
#     return np.linalg.norm(mean - df, axis=1)


# chosen_text_mean = chosen_text.mean()

# same_texts = chosen_author_sentences.drop(index=(0,))
# other_author_texts = filtered_train.drop(index=(chosen_author,))

# same_sentence_classifications = pd.DataFrame(
#     euclidean_distance(chosen_text_mean, same_texts) > threshold, index=same_texts.index
# )
# other_sentence_classifications = pd.DataFrame(
#     euclidean_distance(chosen_text_mean, other_author_texts) > threshold, index=other_author_texts.index
# )

# same_text_classifications = same_sentence_classifications.groupby(level=("text_id")).mean() > 0.5
# other_text_classifications = other_sentence_classifications.groupby(level=("author", "text_id")).mean() > 0.5

# same_flags = same_text_classifications.sum()
# same_length = len(same_text_classifications)
# other_flags = other_text_classifications.sum()
# other_length = len(other_text_classifications)

# tnr = (same_length - same_flags) / same_length
# tpr = other_flags / other_length

# tnr[0], tpr[0]