# Set Up
Set up the Google Colab environment and import dependent libraries.

In [None]:
# Mount Google Drive and switch to the project folder when running on Colab.
import os

try:
  # This import is expected to fail outside a Colab runtime (handled below).
  from google.colab import drive
  drive.mount('/content/drive')
  # Later cells use paths relative to this working directory.
  os.chdir("/content/drive/My Drive/PAN14_Code")
  RUNNING_COLAB = True
except ImportError:
  # Not on Colab: leave the working directory untouched and record the fact.
  print("I have a sneaking suspicion that I'm not running on Google Colab")
  RUNNING_COLAB = False


In [None]:
def ver(module):
  """Print a module's name and version as a pip-style ``name==version`` pin.

  Args:
      module: An imported module exposing ``__name__`` and ``__version__``.
  """
  print(f"{module.__name__}=={module.__version__}")

In [None]:
import sys
import json
import math
import csv
import numpy as np
import glob
import pickle
import itertools
from collections import Counter
ver(np)

#pip install nltk
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
#nltk.download(["punkt", "stopwords","wordnet"])
ver(nltk)

# Uncomment to suppress TensorFlow log output, if needed
#os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

#pip install tensorflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.callbacks import EarlyStopping
ver(tf)

#pip install gensim
import gensim
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import common_texts
ver(gensim)

#pip install scikit-learn
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from scipy.spatial.distance import cosine
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from collections import defaultdict

#pip install networkx
import networkx as nx
import random
from tqdm import tqdm
from urllib.request import urlretrieve
ver(nx)

#pip install matplotlib
import matplotlib.pyplot as plt
import string

In [None]:
def nltk_setup(path = None):
  """Download the NLTK resources used by this notebook.

  Args:
      path: Directory to download into and to use as the sole NLTK data
          search path. When ``None``, NLTK's default download location and
          search path are left untouched.
  """
  resources = ["punkt", "stopwords", "wordnet"]
  # `path is None` is the correct test; `type(path) is None` can never be true.
  if path is None:
    nltk.download(resources)
  else:
    # Replace the search path so lookups only use the chosen directory.
    nltk.data.path = [ path ]
    nltk.download(resources, download_dir=path)

nltk_setup(f"{os.path.curdir}/nltk_data")

In [None]:
def print_colab_stats():
  """Print GPU details and the RAM size of the current runtime.

  Uses an IPython shell escape (``!nvidia-smi``), so this only works inside
  a notebook/IPython session, not as plain Python.
  """
  # GPU info
  gpu_info = !nvidia-smi
  gpu_info = '\n'.join(gpu_info)
  # The command output is searched for 'failed' to detect a missing GPU.
  if gpu_info.find('failed') >= 0:
    print('Not connected to a GPU')
  else:
    print(gpu_info)

  # Memory Info
  from psutil import virtual_memory
  # NOTE(review): this is the *total* memory in decimal gigabytes (1e9 bytes),
  # although the message below calls it "available".
  ram_gb = virtual_memory().total / 1e9
  print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

  # 20 GB is the threshold used here to call a runtime "high-RAM".
  if ram_gb < 20:
    print('Not using a high-RAM runtime')
  else:
    print('You are using a high-RAM runtime!')

# Only report hardware stats when the Colab import succeeded earlier.
if RUNNING_COLAB: print_colab_stats()

# Data Process
Read the data in and store it in a dictionary keyed by problem index.

In [None]:
def get_data_directory_path(subdirectory):
    """Return the path of ``subdirectory`` inside the relative ``data`` folder."""
    data_root = 'data'
    return os.path.join(data_root, subdirectory)


def get_json_file_path(data_directory, file_name):
    """Return the path of ``file_name`` located inside ``data_directory``."""
    json_path = os.path.join(data_directory, file_name)
    return json_path


def read_json_file(file_path):
    """Parse a JSON file and return the decoded object.

    The file is decoded as UTF-8 regardless of the platform's default
    encoding. ``utf-8-sig`` also drops a leading byte-order mark, which
    ``json.load`` would otherwise reject.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load``.
    """
    with open(file_path, encoding='utf-8-sig') as file:
        return json.load(file)


def extract_text_from_files(file_paths):
    """Read text files and split them into known and unknown documents.

    A file counts as "unknown" when its *file name* contains ``unknown``.
    Parent directory names are ignored, so a folder that happens to contain
    that word cannot misclassify every file inside it.

    Args:
        file_paths: Iterable of paths to text files.

    Returns:
        tuple: ``(known_text, unknown_text)``. Each is a list with one entry
        per file, and each entry is that file's list of stripped lines.
    """
    known_text, unknown_text = [], []

    for file_path in file_paths:
        # utf-8-sig reads plain UTF-8 and drops a byte-order mark at file start.
        with open(file_path, 'r', encoding='utf-8-sig') as file:
            # The lstrip still removes a stray BOM at the start of any line.
            text_lines = [line.strip().lstrip("\ufeff") for line in file]

        if 'unknown' in os.path.basename(file_path):
            unknown_text.append(text_lines)
        else:
            known_text.append(text_lines)

    return known_text, unknown_text


def build_corpus(data_directory, content_data, label_data):
    """Load every problem folder into a dictionary keyed by problem index.

    Args:
        data_directory: Folder that contains one sub-folder per problem.
        content_data: Decoded ``contents.json``; ``content_data['problems']``
            lists the problem folder names.
        label_data: Decoded ``truth.json``; ``label_data['problems'][i]['answer']``
            is ``'Y'`` for a positive problem. Assumed to be index-aligned
            with ``content_data['problems']`` — verify against the data.

    Returns:
        dict: ``{index: {'known': [...], 'unknown': [...], 'label': 0 or 1}}``.
        Problems whose folder is missing or empty are skipped.
    """
    corpus = {}

    for index, problem_name in enumerate(tqdm(content_data['problems'])):
        # glob returns files in arbitrary order; sorting keeps the order of
        # the known texts (and what is built from it) reproducible.
        problem_file_paths = sorted(glob.glob(os.path.join(data_directory, problem_name, '*')))

        if not problem_file_paths:
            continue

        known_text, unknown_text = extract_text_from_files(problem_file_paths)
        label = 1 if label_data['problems'][index]['answer'] == 'Y' else 0

        corpus[index] = {
            'known': known_text,
            'unknown': unknown_text,
            'label': label
        }

    return corpus

In [None]:
# Resolve the per-split data folders (relative to the current working directory)
train_data_directory = get_data_directory_path('train_data')
validation_data_directory = get_data_directory_path('val_data')
test_data_directory = get_data_directory_path('test_data')

In [None]:
# Training split: problem listing (contents.json) and ground truth (truth.json)
train_content = read_json_file(get_json_file_path(train_data_directory, 'contents.json'))
train_labels = read_json_file(get_json_file_path(train_data_directory, 'truth.json'))
# Validation split: loading from disk is disabled; a validation set is carved
# out of the training corpus further down (split_data_into_train_and_val).
#validation_content = read_json_file(get_json_file_path(validation_data_directory, 'contents.json'))
#validation_labels = read_json_file(get_json_file_path(validation_data_directory, 'truth.json'))
# Test split: problem listing and ground truth
test_content = read_json_file(get_json_file_path(test_data_directory, 'contents.json'))
test_labels = read_json_file(get_json_file_path(test_data_directory, 'truth.json'))

In [None]:
# Build the training corpus: {index: {'known', 'unknown', 'label'}}
train_corpus = build_corpus(train_data_directory, train_content, train_labels)
# Validation corpus from disk is disabled (see the train/validation split below)
#val_corpus = build_corpus(validation_data_directory, validation_content, validation_labels)
# Build the test corpus with the same structure
test_corpus = build_corpus(test_data_directory, test_content, test_labels)

In [None]:
# Carve a validation set out of the training corpus
def split_data_into_train_and_val(data_dict, test_size=0.2, random_state=42):
    """Split a corpus dictionary into training and validation dictionaries.

    Args:
        data_dict: Mapping of document id to an entry that has a ``'label'`` key.
        test_size: Passed to ``train_test_split`` as the validation share.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        tuple: ``(train_data, validation_data)``, both holding the original
        entries of ``data_dict``.
    """
    id_label_pairs = [(doc_id, entry['label']) for doc_id, entry in data_dict.items()]
    document_ids, labels = zip(*id_label_pairs)

    # Only the id partitions are needed; the label partitions are discarded.
    train_ids, val_ids, _, _ = train_test_split(
        document_ids, labels, test_size=test_size, random_state=random_state)

    def subset(selected_ids):
        return {doc_id: data_dict[doc_id] for doc_id in selected_ids}

    return subset(train_ids), subset(val_ids)

train_corpus, val_corpus = split_data_into_train_and_val(data_dict=train_corpus)

# Train Word2Vec Model

In [None]:
# Shared preprocessing resources, built on first use (see _get_preprocess_resources).
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Build the translation table, stop-word set and lemmatizer once and reuse them."""
    if not _PREPROCESS_RESOURCES:
        # Build everything first so a failed NLTK lookup cannot leave a
        # half-filled cache behind.
        resources = {
            'table': str.maketrans('', '', string.punctuation + string.digits),
            'stop_words': frozenset(stopwords.words('english')),
            'lemmatizer': WordNetLemmatizer(),
        }
        _PREPROCESS_RESOURCES.update(resources)
    return _PREPROCESS_RESOURCES


def preprocess_text(text):
    """
    Preprocess a given text by tokenizing, removing punctuation and numbers,
    removing stop words, and lemmatizing.

    The stop-word list, translation table and lemmatizer are created once and
    cached, because this function is called for every line of the corpus.

    Args:
        text (str): The text to preprocess. Non-string values are converted
            with ``str()`` first.

    Returns:
        list: The preprocessed text as a list of tokens.
    """
    if not isinstance(text, str):
        text = str(text)

    resources = _get_preprocess_resources()
    table = resources['table']
    stop_words = resources['stop_words']
    lemmatizer = resources['lemmatizer']

    # Tokenize the lower-cased text, then strip punctuation and digits from each token
    stripped = (word.translate(table) for word in word_tokenize(text.lower()))

    # Drop tokens that became empty or are stop words, then lemmatize the rest
    return [lemmatizer.lemmatize(word) for word in stripped
            if word != '' and word not in stop_words]

def train_word2vec_model(data, vector_size):
    """
    Fit a Word2Vec model on every line of every known and unknown text.

    Args:
        data (dict): Corpus mapping; each value has ``'known'`` and
            ``'unknown'`` lists of articles, an article being a list of lines.
        vector_size (int): Dimensionality of the word vectors.

    Returns:
        gensim.models.Word2Vec: The trained model.
    """
    sentences = []

    # Each line becomes one training sentence (a list of preprocessed tokens)
    for problem in tqdm(data.values(), total=len(data)):
        for article in [*problem['known'], *problem['unknown']]:
            sentences.extend(preprocess_text(line.strip()) for line in article)

    # Build the vocabulary first, then train with the model's own epoch count
    model = gensim.models.Word2Vec(vector_size=vector_size, window=5, min_count=1, workers=4)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.epochs)

    return model

In [None]:
# Dimensionality of the word vectors; passed to train_word2vec_model below
w2v_vector_size = 300

In [None]:
# Train the Word2Vec model on the training portion of the corpus only
# (train_corpus was re-bound to the training part of the split above)
word2vec_model = train_word2vec_model(train_corpus, w2v_vector_size)

# Vectorize Text Data

In [None]:
def convert_text_to_vector(texts, model, vector_size):
    """
    Turn each text into the average Word2Vec vector of its in-vocabulary tokens.

    A text without any in-vocabulary token yields a zero vector of length
    ``vector_size``.

    Returns:
        list: One vector per input text, in input order.
    """
    embeddings = []
    for text in texts:
        tokens = preprocess_text(text)
        in_vocab = [token in model.wv for token in tokens]
        hit_count = np.sum(in_vocab)

        if hit_count == 0:
            embeddings.append(np.zeros(vector_size))
            continue

        # Sum the vectors of known tokens, then divide in place to get the mean
        averaged = np.sum(
            [model.wv[token] for token, known in zip(tokens, in_vocab) if known],
            axis=0)
        averaged /= hit_count
        embeddings.append(averaged)
    return embeddings

In [None]:
def count_punctuations(texts):
  """
  Count how often each tracked punctuation mark occurs in the texts.

  The counts come back in a fixed order:
  ``. , ; : ! ? - ( ) " ' ` /``. The order is held in a tuple rather than a
  set because set iteration order for strings can change between interpreter
  runs, which would shuffle the feature columns from one session to the next.

  Args:
      texts: Iterable of strings.

  Returns:
      list: 13 integer counts, one per punctuation mark, in the order above.
  """
  # Ordered on purpose: the position of each mark is its feature index
  punctuations = ('.', ',', ';', ':', '!', '?', '-', '(', ')', '\"', '\'', '`', '/')

  punctuations_count = {p: 0 for p in punctuations}

  for text in texts:
    for char in text:
      if char in punctuations_count:
        punctuations_count[char] += 1

  # Dicts keep insertion order, so this follows the tuple above
  return [punctuations_count[p] for p in punctuations]

In [None]:
def analyze_sentence_lengths(sentences):
  """
  Summarise sentence lengths, measured in whitespace-separated tokens.

  Returns:
      list: ``[longer than mean, shorter than mean, equal to mean, mean length]``.
  """
  token_counts = list(map(lambda sentence: len(sentence.split()), sentences))
  mean_length = np.mean(token_counts)

  above_mean = np.sum([count > mean_length for count in token_counts])
  below_mean = np.sum([count < mean_length for count in token_counts])
  # Whatever is neither above nor below sits exactly at the mean
  at_mean = len(token_counts) - above_mean - below_mean

  return [above_mean, below_mean, at_mean, mean_length]

In [None]:
def analyze_words(texts):
    """
    Compute word-level style features over all texts.

    Words are lower-cased tokens with English stop words removed, lemmatized.

    Returns:
        list: ``[rare_count, long_count, count_over_avg, count_under_avg,
        count_avg, ttr]`` where rare words occur at most twice, long words
        have more than 6 characters, the three ``count_*`` values compare word
        length with the mean word length, and ``ttr`` is the type/token ratio
        (0 when there are no words).
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    words = []
    for text in texts:
        words.extend(
            lemmatizer.lemmatize(token)
            for token in word_tokenize(text.lower())
            if token not in stop_words)

    frequencies = nltk.FreqDist(words)
    rare_count = np.sum([count <= 2 for count in frequencies.values()])

    word_lengths = [len(word) for word in words]
    long_count = np.sum([length > 6 for length in word_lengths])

    average_length = np.mean(word_lengths)
    count_over_avg = np.sum([length > average_length for length in word_lengths])
    count_under_avg = np.sum([length < average_length for length in word_lengths])
    count_avg = len(word_lengths) - count_over_avg - count_under_avg

    if words:
        ttr = len(set(words)) / len(words)
    else:
        ttr = 0

    return [rare_count, long_count, count_over_avg, count_under_avg, count_avg, ttr]

In [None]:
def calculate_style_vector(texts):
  """
  Build the style feature vector of a document.

  Concatenates the punctuation, sentence-length and word features, then
  divides every entry by the document's whitespace-token count (skipped when
  that count is zero).
  """
  feature_groups = (
      count_punctuations(texts),        # punctuation features
      analyze_sentence_lengths(texts),  # sentence-length features
      analyze_words(texts),             # word features
  )
  total_tokens = np.sum([len(text.split()) for text in texts])

  features = np.concatenate(feature_groups)

  if total_tokens:
    return features / total_tokens
  return features

In [None]:
def get_vectors(texts, w2v_model, vector_size):
  """
  Build one feature vector per document.

  Each vector is the mean Word2Vec vector of the document's lines followed
  by the document's style vector.
  """
  combined = []
  for document in texts:
    line_vectors = convert_text_to_vector(document, w2v_model, vector_size)
    semantic_part = np.mean(line_vectors, axis=0)
    style_part = calculate_style_vector(document)
    # Append semantic_part alone here to drop the style features
    combined.append(np.concatenate((semantic_part, style_part), axis=None))

  return combined

In [None]:
def vectorize_text_data(data, w2v_model, vector_size=None):
  """
  Convert every problem of a corpus into feature vectors.

  Args:
      data: Corpus mapping; each value has ``'known'``, ``'unknown'`` and
          ``'label'`` entries.
      w2v_model: Trained gensim Word2Vec model.
      vector_size: Word-vector dimensionality handed to ``get_vectors``.
          Defaults to the model's own ``wv.vector_size``.

  Returns:
      dict: Same keys as ``data`` (problems without unknown texts are
      skipped), with ``'known'`` and ``'unknown'`` replaced by vector lists.
  """
  if vector_size is None:
    # get_vectors requires the dimensionality; take it from the model itself
    vector_size = w2v_model.wv.vector_size

  res = {}
  for key,val in tqdm(data.items(), total=len(data)):
    if len(val['unknown']) == 0:
      continue
    res[key] = {
        'known': get_vectors(val['known'], w2v_model, vector_size),
        'unknown': get_vectors(val['unknown'], w2v_model, vector_size),
        'label': val['label']
    }

  return res

In [None]:
# Vectorize all three splits with the Word2Vec model trained on the training corpus
train_data = vectorize_text_data(train_corpus, word2vec_model)
val_data = vectorize_text_data(val_corpus, word2vec_model)
test_data = vectorize_text_data(test_corpus, word2vec_model)

# Build Triplet Samples

In [None]:
# Random triplet mining
def build_random_triplet_sample(data):
  """
  Create random (anchor, positive, negative) triplets from vectorized data.

  Every unordered pair of known vectors within one problem gives an anchor
  and a positive. The negative is a random known vector from a *different*
  problem.

  Args:
      data: Mapping of problem key to an entry with a ``'known'`` list of vectors.

  Returns:
      dict: ``{i: {'anchor': ..., 'positive': ..., 'negative': ...}}``.

  Raises:
      ValueError: If a pair exists but fewer than two problems have known
          vectors, so no negative can be drawn from another problem.
  """
  # Only problems that have known vectors can supply a negative
  negative_keys = [key for key, val in data.items() if len(val['known']) > 0]
  triplet_samples = {}

  for key, val in tqdm(data.items(), total=len(data)):
    known = val['known']
    n = len(known)
    if n < 2:
      continue  # no anchor/positive pair in this problem

    if len(negative_keys) < 2:
      raise ValueError("need at least two problems with known texts to draw a negative sample")

    for i in range(n):
      for j in range(i + 1, n):
        # random.choice returns a key, so it can be compared with `key`
        # directly; redraw until it names another problem
        negative_key = random.choice(negative_keys)
        while negative_key == key:
          negative_key = random.choice(negative_keys)

        triplet_samples[len(triplet_samples)] = {
            'anchor': known[i],
            'positive': known[j],
            'negative': random.choice(data[negative_key]['known'])
        }

  return triplet_samples

In [None]:
# Random triplets mined from the vectorized training data
random_triplet_samples = build_random_triplet_sample(train_data)

In [None]:
# Split the training triplets into three parallel arrays, one row per triplet
anchor_data = np.array([data['anchor'] for data in random_triplet_samples.values()])
positive_data = np.array([data['positive'] for data in random_triplet_samples.values()])
negative_data = np.array([data['negative'] for data in random_triplet_samples.values()])
# All-zero targets, one per triplet; customer_loss does not read y_true
labels_data = np.array([0 for _ in random_triplet_samples.values()])

In [None]:
# Random triplets mined from the vectorized validation data
val_random_triplet_samples = build_random_triplet_sample(val_data)

In [None]:
# Split the validation triplets into three parallel arrays, one row per triplet
val_anchor_data = np.array([data['anchor'] for data in val_random_triplet_samples.values()])
val_positive_data = np.array([data['positive'] for data in val_random_triplet_samples.values()])
val_negative_data = np.array([data['negative'] for data in val_random_triplet_samples.values()])
# All-zero targets, one per triplet; customer_loss does not read y_true
val_labels_data = np.array([0 for _ in val_random_triplet_samples.values()])

# Build SiameseNet Model

## Model Frame

In [None]:
class SiameseNet(tf.keras.Model):
    """Triplet model: a shared encoder followed by a pair classifier.

    The same ``base_network`` embeds anchor, positive and negative. The
    ``clf_network`` then scores the concatenated (anchor, positive) and
    (anchor, negative) embeddings.
    """

    def __init__(self, base_network, clf_network):
        super().__init__()
        self.base, self.clf = base_network, clf_network

    def call(self, inputs):
        """Return ``(anchor-positive score, anchor-negative score)``.

        ``inputs`` is indexed as ``[anchor, positive, negative]``.
        """
        # One shared encoder for all three branches, applied in input order
        embedded_anchor, embedded_positive, embedded_negative = (
            self.base(inputs[branch]) for branch in range(3))

        def score_against_anchor(other):
            pair = tf.concat([embedded_anchor, other], axis=-1)
            return self.clf(pair)

        return (score_against_anchor(embedded_positive),
                score_against_anchor(embedded_negative))

In [None]:
def create_dense_block(x, units, dropout_rate, l1_reg, l2_reg):
    """Apply Dense (L1/L2-regularized) -> BatchNormalization -> ReLU -> Dropout to ``x``."""
    keras_layers = tf.keras.layers
    regularizer = tf.keras.regularizers.l1_l2(l1=l1_reg, l2=l2_reg)
    block = (
        keras_layers.Dense(units, kernel_regularizer=regularizer),
        keras_layers.BatchNormalization(),
        keras_layers.Activation('relu'),
        keras_layers.Dropout(dropout_rate),
    )
    for layer in block:
        x = layer(x)
    return x

In [None]:
# Encoder shared by the three triplet branches
def create_base_network(embedding_dim, dropout_rate=0.4, l1_reg=0.001, l2_reg=0.001):
    """Build the encoder: BatchNorm, dense blocks of 256/128/64 units, then a linear 64-unit layer.

    Args:
        embedding_dim: Input shape, passed straight to ``Input(shape=...)``.
        dropout_rate, l1_reg, l2_reg: Forwarded to every dense block.

    Returns:
        tf.keras.Model mapping an input vector to a 64-dimensional embedding.
    """
    inputs = tf.keras.layers.Input(shape=embedding_dim)
    hidden = tf.keras.layers.BatchNormalization()(inputs)

    for units in (256, 128, 64):
        hidden = create_dense_block(hidden, units, dropout_rate, l1_reg, l2_reg)

    embedding = tf.keras.layers.Dense(64, activation='linear')(hidden)

    return tf.keras.Model(inputs=inputs, outputs=embedding)

In [None]:
def create_clf_network(input_shape, dropout_rate=0.5, l1_reg=0.003, l2_reg=0.003):
    """Build the pair classifier: BatchNorm, dense blocks of 128/64/32 units, then one sigmoid unit.

    Args:
        input_shape: Length of the input vector (an int, wrapped as ``(input_shape,)``).
        dropout_rate, l1_reg, l2_reg: Forwarded to every dense block.

    Returns:
        tf.keras.Model mapping an input vector to a single sigmoid output.
    """
    inputs = tf.keras.layers.Input(shape=(input_shape,))
    hidden = tf.keras.layers.BatchNormalization()(inputs)

    for units in (128, 64, 32):
        hidden = create_dense_block(hidden, units, dropout_rate, l1_reg, l2_reg)

    probability = tf.keras.layers.Dense(1, activation='sigmoid')(hidden)

    return tf.keras.Model(inputs=inputs, outputs=probability)


In [None]:
def customer_loss(y_true, y_pred):
    """Triplet-style loss ``1 - AP + AN``; ``y_true`` is ignored.

    ``y_pred[0]`` is read as the anchor-positive score and ``y_pred[1]`` as
    the anchor-negative score.

    NOTE(review): this assumes Keras passes both model outputs together as
    ``y_pred``. If the loss is instead applied to each output separately,
    indices 0 and 1 would select batch rows — confirm against the Keras
    version in use.
    """
    anchor_positive, anchor_negative = y_pred[0], y_pred[1]
    return 1.0 - anchor_positive + anchor_negative

## Construct the Model

In [None]:
# Input shape of the encoder: the shape tuple of a single anchor vector
embedding_dim = anchor_data[0].shape

# Encoder, plus a classifier whose input is twice the encoder's output width
# (SiameseNet concatenates two embeddings before classifying)
base_network = create_base_network(embedding_dim)
clf_network = create_clf_network(base_network.output_shape[1]*2)

# Wrap both sub-networks in the triplet model
siamese_model = SiameseNet(base_network, clf_network)

In [None]:
# Symbolic inputs for the three triplet branches.
# NOTE(review): none of these is referenced by the later cells visible in this
# notebook — possibly left over; confirm before removing.
input_anchor = tf.keras.layers.Input(shape=embedding_dim)
input_positive = tf.keras.layers.Input(shape=embedding_dim)
input_negative = tf.keras.layers.Input(shape=embedding_dim)

In [None]:
# Compile the SiameseNet with the Adam optimizer and the custom triplet loss
siamese_model.compile(optimizer='adam',
                      loss=customer_loss)

In [None]:
# Checkpoint file written during training, and the folder that holds it
checkpoint_path = "model_weights/cp_build.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that saves the model's weights (weights only, not the full model)
cp_save = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                             save_weights_only=True,
                                             verbose=1)

## Load SiameseNet Model Weights

In [None]:
#latest = tf.train.latest_checkpoint(checkpoint_dir)
#siamese_model.load_weights(latest)
# Disabled for now since model loading is implemented under 'Production Use' at the end

## Train SiameseNet Model

### Train on Random Triplet Samples

In [None]:
# Train the SiameseNet on the random triplets.
# Training stops early once val_loss has not improved for 100 epochs; the
# checkpoint callback saves the weights along the way.
early_stopping = EarlyStopping(monitor='val_loss', patience=100, verbose=1)
siamese_history = siamese_model.fit([anchor_data, positive_data, negative_data], labels_data,
                  epochs=1000,
                  validation_data=([val_anchor_data, val_positive_data, val_negative_data], val_labels_data),
                  callbacks=[early_stopping, cp_save])

In [None]:
# Per-epoch training and validation loss recorded by fit()
loss = siamese_history.history['loss']
val_loss = siamese_history.history['val_loss']

# Plot both curves against the epoch index
plt.plot(loss, label='Training Loss')
plt.plot(val_loss, label='Validation Loss')
plt.title('Loss and Validation Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.show()

### Train on Semi-Hard Triplet Samples

#### Semi-Hard Samples Construct

In [None]:
# Candidates for (semi-)hard triplet mining
def build_triplet_sample_candidates(data):
  """
  List every anchor together with its remaining positives.

  For each problem, every known vector except the last becomes an anchor,
  and the known vectors after it are its positives.

  Returns:
      dict: ``{i: {'key': problem key, 'anchor': vector, 'positives': [vectors]}}``.
  """
  candidates = {}

  for key, val in tqdm(data.items(), total=len(data)):
    known = val['known']
    for position in range(len(known) - 1):
      candidates[len(candidates)] = {
          'key': key,
          'anchor': known[position],
          'positives': known[position + 1:]
      }

  return candidates

In [None]:
# Anchor/positives candidates built from the vectorized training data
triplet_sample_candidates = build_triplet_sample_candidates(train_data)

In [None]:
def create_negative_vectors_dict(data):
    """Map each problem key to the known vectors of all *other* problems.

    Every problem gets its own list, so the result grows quadratically with
    the number of problems.
    """
    key_list = list(data.keys())
    negatives_by_key = {}

    for key in tqdm(key_list, total=len(key_list)):
        negatives_by_key[key] = [
            vec
            for other_key, entry in data.items()
            if other_key != key
            for vec in entry['known']
        ]
    return negatives_by_key

In [None]:
# Per-problem pools of negative vectors, precomputed once for the mining loop
negative_vectors_dict = create_negative_vectors_dict(train_data)

In [None]:
def select_random_from_list(input_list):
    """
    Return one uniformly chosen element of ``input_list`` (uses ``np.random``).
    """
    upper_bound = len(input_list)
    position = np.random.randint(0, upper_bound)
    return input_list[position]

# def select_negative_vectors(negatives, key):
#     """
#     Collects all negative vectors except for the one corresponding to the key.
#     """
#     return [vec for k,v in negatives.items() if k != key for vec in v['known']]

def get_random_triplet(sample, negatives):
    """
    Return ``(anchor, positive, negative)`` with randomly drawn positive and negative.

    The positive comes from ``sample['positives']``; the negative comes from
    the pool stored in ``negatives`` under ``sample['key']``.
    """
    # Draw order (positive first, then negative) is kept fixed
    random_positive = select_random_from_list(sample['positives'])
    negative_pool = negatives[sample['key']]
    random_negative = select_random_from_list(negative_pool)

    return sample['anchor'], random_positive, random_negative

def get_hard_triplet(sample, negatives, base_model, clf_model,):
    """
    Return ``(anchor, hardest positive, hardest negative)`` under the current models.

    Each candidate is embedded with ``base_model``, concatenated with the
    anchor embedding and scored by ``clf_model``. The hardest positive is the
    one with the lowest score; the hardest negative is the one with the
    highest score. (An earlier variant ranked candidates by squared distance
    between embeddings instead of classifier scores.)
    """
    anchor = sample['anchor']
    anchor_rep = base_model.predict(np.array([anchor]), verbose=0)[0]

    def score_candidates(candidates):
        # Embed the candidates, pair each with the anchor, then classify the pairs
        candidate_reps = base_model.predict(np.array(candidates), verbose=0)
        pairs = [np.concatenate((anchor_rep, rep), axis=None) for rep in candidate_reps]
        return clf_model.predict(np.array(pairs), verbose=0)

    # Positive: lowest score = the model is least convinced it matches
    positives = sample['positives']
    hardest_positive = positives[np.argmin(score_candidates(positives))]

    # Negative: highest score = the model most wrongly believes it matches
    negative_vectors = negatives[sample['key']]
    hardest_negative = negative_vectors[np.argmax(score_candidates(negative_vectors))]

    return anchor, hardest_positive, hardest_negative

def get_triplet(sample, negatives, base_model, clf_model, hard_triplet_probability):
    """
    Return a hard triplet with probability ``hard_triplet_probability``, else a random one.
    """
    use_hard_triplet = np.random.rand() < hard_triplet_probability
    if not use_hard_triplet:
        return get_random_triplet(sample, negatives)
    return get_hard_triplet(sample, negatives, base_model, clf_model)


#### Training on Semi-Hard Samples

In [None]:
# Number of outer mining rounds (each round re-samples triplets and calls fit)
num_epochs = 100
# Used twice: as the EarlyStopping patience inside each fit call, and as the
# interval (in rounds) at which the outer loop compares losses
patience = 10
# Loss seen at the previous outer check; starts at infinity so the first check passes
previous_loss = float('inf')

# Probability of mining a hard triplet, raised linearly from start towards end
hard_triplet_probability_start=0.5
hard_triplet_probability_end=0.8

# Stops an individual fit call once the training loss stalls for `patience` epochs
early_stopping_2 = EarlyStopping(monitor='loss', patience=patience, verbose=0)

In [None]:
# Initial probability of selecting a hard triplet
triplet_select_probability = hard_triplet_probability_start

# Outer loop: each round re-mines the triplets with the current networks
for epoch in tqdm(range(num_epochs)):
  # Initialize empty lists for anchor, positive, negative samples and labels
  anchor_samples = []
  positive_samples = []
  negative_samples = []
  labels = []

  # One triplet per candidate anchor
  for _, sample in triplet_sample_candidates.items():
    # Hard or random triplet, chosen according to the current probability
    anchor, positive, negative = get_triplet(sample, negative_vectors_dict, base_network, clf_network, triplet_select_probability)
    # Add samples to their respective lists
    anchor_samples.append(anchor)
    positive_samples.append(positive)
    negative_samples.append(negative)
    # Placeholder target; customer_loss ignores y_true
    labels.append(0)

  # Convert lists to numpy arrays
  anchor_samples = np.array(anchor_samples)
  positive_samples = np.array(positive_samples)
  negative_samples = np.array(negative_samples)
  labels = np.array(labels)

  # Train for up to 50 Keras epochs on this round's triplets (no validation data)
  siamese_model.fit([anchor_samples, positive_samples, negative_samples], labels,
                    epochs=50,
                    verbose=1,
                    callbacks=[early_stopping_2, cp_save])

  # Gradually increase the probability of choosing a hard triplet
  triplet_select_probability += (hard_triplet_probability_end - hard_triplet_probability_start) / num_epochs

  # Outer early stopping (active): every `patience` rounds, compare the last
  # training loss of the most recent fit call with the value from the previous check
  if epoch % patience == 0 and epoch != 0:
    current_loss = siamese_model.history.history['loss'][-1]
    # Check if loss is increasing or constant, if yes, then stop training
    if current_loss >= previous_loss:
      print("Early stopping triggered. Stopping training.")
      break
    else:
      # Update previous loss with current loss
      previous_loss = current_loss

# Inference and Validation

In [None]:
def generate_concatenated_vectors(data, base_network):
  """
  Build one classifier input per problem.

  The known and the unknown vectors of a problem are embedded with
  ``base_network`` and averaged separately. The two means are concatenated
  (known first).

  Returns:
      tuple: ``(vectors, labels)`` as numpy arrays, one row/entry per problem.
  """
  pair_vectors, pair_labels = [], []

  for entry in tqdm(data.values(), total=len(data)):
    known_embeddings = base_network.predict(np.array(entry['known']), verbose=0)
    unknown_embeddings = base_network.predict(np.array(entry['unknown']), verbose=0)

    # Mean embedding of the author's known texts vs. mean of the unknown texts
    author_mean = np.mean(known_embeddings, axis=0)
    unknown_mean = np.mean(unknown_embeddings, axis=0)

    pair_vectors.append(np.concatenate((author_mean, unknown_mean), axis=None))
    pair_labels.append(entry['label'])

  return np.array(pair_vectors), np.array(pair_labels)

In [None]:
# Classifier inputs and labels for the training split, using the trained encoder
train_siamese_vec, train_siamese_labels = generate_concatenated_vectors(train_data, base_network)

# Same for the validation split
val_siamese_vec, val_siamese_labels = generate_concatenated_vectors(val_data, base_network)

# Same for the test split
test_siamese_vec, test_siamese_labels = generate_concatenated_vectors(test_data, base_network)

In [None]:
# Compile the pair classifier on its own for supervised training on the
# problem labels (binary cross-entropy; reports accuracy and AUC)
clf_network.compile(optimizer='adam',
                    loss='binary_crossentropy',
                    metrics=['accuracy', tf.keras.metrics.AUC()])

In [None]:
# Train the classifier; stop once val_loss has not improved for 100 epochs
clf_early_stopping = EarlyStopping(monitor='val_loss', patience=100, verbose=1)
clf_history = clf_network.fit(train_siamese_vec, train_siamese_labels,
                              epochs=1000,
                              verbose=1,
                              validation_data = (val_siamese_vec, val_siamese_labels),
                              callbacks=[clf_early_stopping])

In [None]:
# Evaluate the classifier on the test split (loss plus the compiled metrics)
res = clf_network.evaluate(test_siamese_vec, test_siamese_labels)

In [None]:
# Same test-split evaluation as the previous cell, run again
res = clf_network.evaluate(test_siamese_vec, test_siamese_labels)

# Calculate Score

In [None]:
def calculate_score(y_predict, y_true):
    """Compute c@1, AUC and their product for a set of predictions.

    A prediction above 0.5 counts as class 1, below 0.5 as class 0, and
    exactly 0.5 as unanswered. c@1 is ``(n_correct + n_unknown * n_correct / n) / n``.

    Args:
        y_predict: Predicted scores, indexable by position.
        y_true: True labels, indexable by the same positions.

    Returns:
        tuple: ``(c_1, auc, score)`` where ``score = auc * c_1``.
    """
    n = len(y_predict)
    n_correct = 0
    n_unknown = 0

    for i, predicted in enumerate(y_predict):
        if predicted > 0.5:
            decision = 1
        elif predicted < 0.5:
            decision = 0
        else:
            # Exactly 0.5: left unanswered, rewarded through the c@1 formula
            n_unknown += 1
            continue

        if decision == y_true[i]:
            n_correct += 1

    c_1 = (n_correct + (n_unknown * n_correct / n)) / n
    auc_value = tf.keras.metrics.AUC()(y_true, y_predict).numpy()

    return c_1, auc_value, auc_value * c_1

In [None]:
# Predict on the test split and compute the evaluation measures
nn_pred = clf_network.predict(test_siamese_vec)
c_1, auc, score = calculate_score(nn_pred, test_siamese_labels)

# Report each measure rounded to three decimals
print("C@1:", round(c_1, 3))
print("AUC:", round(auc, 3))
print("Final Score:", round(score, 3))

# Production Use

### Saving/Loading Models

In [None]:
# Save the generated Word2Vec model (path is relative to the working directory)
default_word2vec_save = "word2vec.model"
word2vec_model.save(default_word2vec_save)

# Save weights for the generated SiameseNet model.
# This is a different file from the cp_build.ckpt checkpoint written during training.
default_checkpoints_file = "model_weights/cp.ckpt"
siamese_model.save_weights(default_checkpoints_file)

In [None]:
# Load a saved Word2Vec Model
default_word2vec_save = "word2vec.model"

def loadW2v(path):
    """Load and return a gensim Word2Vec model saved at ``path``."""
    return gensim.models.Word2Vec.load(path)

# Rebuild a SiameseNet model from saved weights
def buildSiameseNet(checkpoint_file: str, embedding_dim: tuple = (323,)) -> SiameseNet:
    """Rebuild a SiameseNet and restore its weights from ``checkpoint_file``.

    Args:
        checkpoint_file: Path of the saved weights (as written by ``save_weights``).
        embedding_dim: Input shape of the encoder; defaults to ``(323,)``.

    Returns:
        The SiameseNet with restored weights. The model is NOT compiled;
        compile it (and its classifier) before training it further.
    """
    # Sub-networks first; the classifier input is twice the encoder output width
    encoder = create_base_network(embedding_dim)
    classifier = create_clf_network(encoder.output_shape[1]*2)

    model = SiameseNet(encoder, classifier)

    # expect_partial() is called on the status object returned by load_weights
    restore_status = model.load_weights(checkpoint_file)
    restore_status.expect_partial()

    return model

### Proof-of-concept
General loading and testing functions to demonstrate the model running over single profiles

In [None]:
### Utility functions, not directly related to the model but used a lot later ###
def unwrap(var):
    """Extract a value nested inside one-element lists/tuples/sets/arrays.

    Peels off containers for as long as the current value is a list, tuple,
    set or ndarray holding exactly one element. Anything else (including
    empty or multi-element containers and strings) is returned unchanged.
    """
    containers = (list, tuple, set, np.ndarray)
    while isinstance(var, containers):
        # 0-d arrays have no len(); treat them as already unwrapped
        if isinstance(var, np.ndarray) and var.ndim == 0:
            break
        if len(var) != 1:
            break
        # next(iter(...)) also works for sets, which do not support var[0]
        var = next(iter(var))
    return var

def choose_random_folders(path, num):
    """Return ``num`` randomly chosen sub-folder paths of ``path``.

    Sampling is done with ``random.choices``, i.e. with replacement, so the
    same folder can appear more than once.
    """
    candidates = []
    for entry in os.listdir(path):
        full_path = os.path.join(path, entry)
        if os.path.isdir(full_path):
            candidates.append(full_path)
    return random.choices(candidates, k=num)

print(f"Floats will need to be within {sys.float_info.epsilon} to be considered equal!")
def float_cmp(f1, f2, epsilon=sys.float_info.epsilon) -> tuple[bool,float]:
    """Compare two floats with an absolute tolerance.

    Returns:
        tuple: ``(within_tolerance, difference)`` where ``difference`` is
        ``abs(f1 - f2)`` and ``within_tolerance`` is ``difference <= epsilon``.
    """
    difference = abs(f1 - f2)
    within_tolerance = difference <= epsilon
    return within_tolerance, difference

def dataset_compare(setA:list, setB:list, verbose = False) -> None:
    """Compare two lists of floats element-wise and print a summary.

    Prints the share of positions that ``float_cmp`` considers equal, plus
    the largest difference seen. Nothing is returned. When the lists differ
    in length, a warning is printed and only the common prefix is compared.

    Args:
        setA: First sequence of floats.
        setB: Second sequence of floats.
        verbose: If true, also print every pair that does not match.
    """
    if len(setA) != len(setB): print("WARNING: arrays are different lengths!")
    min_len = min(len(setA), len(setB))

    # Nothing to score; avoids dividing by zero below
    if min_len == 0:
        print("Nothing to compare: at least one array is empty")
        return

    matches = 0
    max_diff = 0.0
    # zip stops at the shorter input, i.e. after min_len pairs
    for a, b in zip(setA, setB):
        res, diff = float_cmp(a, b)
        if diff > max_diff: max_diff = diff

        if res:
            matches += 1
        elif verbose:
            # Optionally print every single value that doesn't pass
            print(f"{a} != {b} (diff = {diff})")

    score = matches / min_len
    print(f"{score:.1%} pass, with max devation of {max_diff:.9%}")

def deep_clean(path, inc_root = False):
	"""Clean up (remove) the contents of a folder and optionally the folder itself.

	Args:
		path: Folder to empty. If it is not an existing directory a message
			is printed and nothing is removed.
		inc_root: When true, `path` itself is removed after its contents.
	"""

	if not os.path.isdir(path):
		print(f"Invalid path: {path}")
		return

	if os.listdir(path):
		# Walk bottom-up so every directory is already empty when it is removed
		for root, dirs, files in os.walk(path, topdown=False):
			for file_name in files:
				os.remove(os.path.join(root, file_name))
			for dir_name in dirs:
				entry = os.path.join(root, dir_name)
				# os.walk lists symlinks to directories under `dirs` without
				# descending into them; rmdir fails on a link, so only the
				# link is removed and its target is left untouched.
				if os.path.islink(entry):
					os.unlink(entry)
				else:
					os.rmdir(entry)
	else:
		print(f"{path} is already empty!")

	if inc_root:
		os.rmdir(path)
		print(f"Removed {path} and contents")
	else:
		print(f"Removed contents of {path}")

In [None]:
def load_from_dir(path) -> dict:
	"""Read the raw texts of one profile folder.

	Every entry of `path` is handed to `extract_text_from_files`; the two
	values it returns are stored under the keys 'known' and 'unknown'.
	"""
	entries = [os.path.join(path, entry) for entry in os.listdir(path)]
	known, unknown = extract_text_from_files(entries)
	return {'known': known, 'unknown': unknown}

def vectorize_single(known,unknown,w2v_model):
	"""Vectorise the texts of a single profile rather than a whole corpus.

	Returns a dict with the `get_vectors` output for the known texts under
	'known' and for the unknown texts under 'unknown'. No 'label' entry is
	produced, as labels only matter during training.
	"""
	known_vectors = get_vectors(known, w2v_model)
	unknown_vectors = get_vectors(unknown, w2v_model)
	return {'known': known_vectors, 'unknown': unknown_vectors}

def concatenate_vectors_single(vectors, base_network):
	"""Build the flat classifier input for one profile.

	The 'known' and 'unknown' vectors are each passed through
	`base_network.predict`, averaged over axis 0, and the two averages are
	joined (known first) into a single flattened array.
	"""
	representations = []
	for role in ('known', 'unknown'):
		features = base_network.predict(np.array(vectors[role]), verbose=0)
		representations.append(np.mean(features, axis=0))

	return np.concatenate(representations, axis=None)

def predict_once(texts, w2v = word2vec_model, base = base_network, clf = clf_network) -> float:
	"""Score a single profile with the given (by default: the live) models.

	`texts` needs the keys 'known' and 'unknown'. When the unknown set is
	empty, 0.0 is returned without running any model.
	"""
	known_data, unknown_data = texts['known'], texts['unknown']

	# Nothing to score when no unknown text was supplied
	if len(unknown_data) == 0:
		return 0.0

	pair_vectors = vectorize_single(known_data, unknown_data, w2v)
	# The classifier is called on a batch, so add a leading axis of size one
	batch = np.expand_dims(concatenate_vectors_single(pair_vectors, base), axis=0)

	# Strip the single-element nesting from the classifier output
	return unwrap(clf.predict(batch, verbose=0))

In [None]:
### Run 'predict_once' using both the live and saved/loaded model on random data to compare results ###
print(f"{'Loaded':>20} vs {'Generated':<20} (Pass)  Source")

def load_and_test(path):
  """Score one profile folder with the reloaded and the live models and print both results."""
  w2v_from_disk = loadW2v(default_word2vec_save)
  siamese_from_disk = buildSiameseNet(default_checkpoints_file)

  # First score: models restored from the saved files
  loaded = predict_once(
    load_from_dir(path),
    w2v_from_disk,
    siamese_from_disk.base,
    siamese_from_disk.clf,
  )
  # Second score: the word2vec_model/base_network/clf_network variables from the generation
  generated = predict_once(load_from_dir(path), word2vec_model, base_network, clf_network)

  print(f"{loaded:>20} vs {generated:<20} ({loaded == generated})  {path}")

# If the saving/loading process works then values should ideally be *exactly* the same.
# The loop variables avoid the name `dir`: at module level it would replace the
# builtin of the same name for every cell that runs afterwards.
for data_root in ("./data/test_data", "./data/train_data"):
  for profile_folder in choose_random_folders(data_root, 10):
    load_and_test(profile_folder)

In [None]:
### Test the validity of 'predict_once' by comparing to
# the generated batch prediction results (from earlier) ###
 
def test_predict_function(corpus, concats, name=None):
  """Compare batch classifier output on `concats` with per-profile `predict_once` results.

  `name` is only used as the progress-bar label.
  """
  batch_output = clf_network.predict(concats, verbose=0)
  generated = [ unwrap(row) for row in batch_output ]

  loaded = []
  for text in tqdm(corpus.values(), desc=name):
    loaded.append(predict_once(text))

  dataset_compare(generated, loaded)

# Again values *should* be identical
test_predict_function(test_corpus, test_siamese_vec, "Test Data")
test_predict_function(val_corpus, val_siamese_vec, "Val Data")
#test_predict_function(train_corpus, train_siamese_vec, "Train Data")

In [None]:
# Test against the entire dataset
def test_against_dataset(dataset, concats, name = None, verbose = False):
    """Compare the live classifier's batch output with per-profile output of the reloaded models.

    Args:
        dataset: Mapping of profile key to a dict with 'known' and 'unknown' texts.
        concats: Pre-computed classifier inputs, scored in one batch by the live `clf_network`.
        name: Label for the progress bar.
        verbose: Forwarded to `dataset_compare`; when true every failing pair is printed.
    """
    loaded_siamese_model = buildSiameseNet(default_checkpoints_file)
    loaded_word2vec_model = loadW2v(default_word2vec_save)

    generated = [ unwrap(x) for x in clf_network.predict(concats, verbose=0) ]

    loaded = []
    for text in tqdm(dataset.values(), desc=name):
        try:
            vectors = vectorize_single(text['known'], text['unknown'], loaded_word2vec_model)
            profile_concat = concatenate_vectors_single(vectors, loaded_siamese_model.base)
            prediction = unwrap(loaded_siamese_model.clf(np.array([profile_concat])).numpy())
        except ValueError:
            # NOTE(review): presumably raised for profiles without usable text
            # (predict_once handles an empty unknown set explicitly) - confirm.
            prediction = 0.0
        loaded.append(prediction)

    dataset_compare(generated, loaded, verbose)

test_against_dataset(test_corpus, test_siamese_vec, "Test Data")
test_against_dataset(val_corpus, val_siamese_vec, "Val Data")
#test_against_dataset(train_corpus, train_siamese_vec, "Train Dataset")

### Self-Contained Classes
Handles both loading and running the 3 models within a single, self-contained class

In [None]:
### Additional utility functions for the class ###
def strip_text(data):
	"""Return the lines of `data` with surrounding whitespace and a leading BOM removed.

	Args:
		data: Either one string (split into lines first) or an iterable of
			lines. Items that are not strings are converted with `str`.

	Returns:
		A list with one cleaned string per line.
	"""
	if isinstance(data, str): data = data.splitlines()

	text = []
	for line in data:
		if not isinstance(line, str): line = str(line)
		# The byte-order mark does not count as whitespace, so whatever
		# whitespace follows it has to be stripped in a second pass.
		cleaned = line.strip().lstrip("\ufeff").strip()
		text.append(cleaned)

	return text

def setupNltk(path = f"{os.curdir}/nltk_data") -> None:
	"""Make `path` NLTK's only data location and download the required packs into it."""
	required_packs = ["punkt", "stopwords","wordnet"]

	# Replaces NLTK's search path list rather than extending it
	nltk.data.path = [ path ]
	nltk.download(required_packs, download_dir=path, quiet=True)

In [None]:
class StyloNet:
	"""Contains the whole stylometry model, functions score, score_multi, predict and predict_multi
		can be called to run the stylometry model on a text.

		Input format for predictions is the following:
			texts = {
				'known': list[str] (known text(s))
				'unknown': list[str] (unknown text(s))
			}
		All lists for texts can be multi-dimensional, as long as it only ends in strings
	"""

	def __init__(self, profile = None, profile_dir:str = "stylometry_models"):
		# Load the profile path (if no profile is specified, use the current directory)
		profile_path = os.path.join(profile_dir, profile) if profile else os.curdir

		# Try loading the manifest, else use default values
		try:
			with open(os.path.join(profile_path, "manifest.json"), "rb") as f:
				manifest = json.load(f)
		except FileNotFoundError:
			# If the manifest isn't there, set to an empty dictionary to use default values
			manifest = {}

		self.valid_threshold = manifest.get("valid_threshold", 0.5)
		embedding_dim = manifest.get("embedding_dim", (323,))
		self.vector_length = manifest.get("word_vector_size", 300)

		nltk_path = os.path.join(profile_path, manifest.get("nltk_data", "nltk_data"))
		w2v_save = os.path.join(profile_path, manifest.get("word2vec", "word2vec.model"))
		model_checkpoints = os.path.join(profile_path, manifest.get("ckpts", "model_weights/cp.ckpt"))

		if not os.path.isdir(nltk_path) or len(os.listdir(nltk_path)) <= 0:
			# If the nltk path isn't populated, assume it probably needs to be downloaded
			setupNltk(nltk_path)

		# Load Word2Vec model
		self.word2vec = loadW2v(w2v_save)

		# Redefine and load the SiameseNet model from checkpoints
		self.siamese_model = buildSiameseNet(model_checkpoints, embedding_dim)
		self.base_network = self.siamese_model.base
		self.clf_network = self.siamese_model.clf

	def _vectorize(self,text : dict):
		vectors = {
			'known': get_vectors(text['known'], self.word2vec, self.vector_length),
			'unknown': get_vectors(text['unknown'], self.word2vec, self.vector_length)
		}
		return vectors

	def _vectorize_multi(self, texts: list[dict]):
		vectors = []
		for text in texts:
			vectors.append(self._vectorize(text))
		return vectors

	def _concatenate(self, vectors: dict):
		known_feature_vectors = self.base_network.predict(np.array(vectors['known']), verbose=0)
		unknown_feature_vectors = self.base_network.predict(np.array(vectors['unknown']), verbose=0)

		author_representation = np.mean(known_feature_vectors, axis=0)
		unknown_representation = np.mean(unknown_feature_vectors, axis=0)

		concat_vec = np.concatenate((author_representation, unknown_representation), axis=None)
		return concat_vec

	def _concatenate_multi(self, vectors : list[dict]):
		concats = []
		for vec in vectors:
			concats.append(self._concatenate(vec))
		return np.array(concats)

	### Interface Functions ###
	def score(self, texts : dict) -> float:
		"""Run the model and return the similarity score as a decimal"""
		if len(texts['unknown']) == 0: return 0  # Incase of empty unknown set

		vectors = self._vectorize(texts)
		concats = self._concatenate(vectors)

		prediction = self.clf_network.predict(np.expand_dims(concats, axis=0), verbose=0)

		# Convert to numpy array and flatten, as output shape will be (1,1)
		return unwrap(prediction)

	def score_batch(self, texts: list|dict) -> list[float]|dict:
		"""Calculate score over a list or dictionary of texts and return a list/dict of the results"""

		# Return a key/value pair generator for both a list and existing dictionary
		unpack = lambda texts: texts.items() if type(texts) is dict else enumerate(data)

		# Check whether a value is valid input (i.e. it's unknown text is non-empty)
		good_input = lambda text: type(text['unknown']) is list and len(text['unknown']) != 0

		# Convert input to a dictionary, if not already.
		# Ensures bad input can be filtered out and given a 0.0 rating at the end
		data = { k: v for k, v in unpack(texts) if good_input(v) }
		# Important for list ordering and including all keys in dictionaries

		# Run the predictions
		vectors = self._vectorize_multi(data.values())
		concates = self._concatenate_multi(vectors)
		predictions = { k: unwrap(v) for k, v in zip(data.keys(), self.clf_network.predict(concates, verbose=0)) }

		# Format the output (unwrapping already done above)
		if type(texts) is list:
			results = [ predictions.get(x, 0.0) for x in range(len(texts)) ]
		else:
			results = { k: predictions.get(k, 0.0) for k in texts.keys() }

		return results

	def predict(self, texts: dict) -> bool:
		"""Calculate score and return a prediction based on the predetermined threshold, returns a boolean result"""
		return self.score(texts) >= self.valid_threshold

	def predict_batch(self, texts: list|dict) -> list[bool]|dict:
		"""Run predict over a list/dict of texts and return the result with a boolean result"""
		scores = self.score_batch(texts)
		pred = lambda s : s >= self.valid_threshold

		if type(texts) is dict:
			results = {}
			for key, val in scores.items():
				results[key] = pred(val)
		else:
			results = []
			for val in scores:
				results.append(pred(val))

		return results

### Testing and Verification

Note: tests on train_corpus are commented out for now, as the *train_siamese_vec* and *train_corpus* data are different sizes. This puts the data out of phase and makes the entire comparison worthless.

In [None]:
### Test the StyloNet class, defined above, to ensure it generates the same output on all datasets ###
def concat_predict(concats, model):
  """Run `model.predict` on `concats` and unwrap every row of the output."""
  return [ unwrap(x) for x in model.predict(concats) ]

# Scores from the freshly generated classifier
generated_test_results = concat_predict(test_siamese_vec, clf_network)
#generated_train_results = concat_predict(train_siamese_vec, clf_network)
generated_val_results = concat_predict(val_siamese_vec, clf_network)

# Scores for the same datasets from the model loaded through StyloNet
loaded_model = StyloNet()
loaded_test_results = loaded_model.score_batch(test_corpus)
#loaded_train_results = loaded_model.score_batch(train_corpus)
loaded_val_results = loaded_model.score_batch(val_corpus)

# Both sets of scores should agree
dataset_compare(generated_test_results, list(loaded_test_results.values()))
#dataset_compare(generated_train_results, list(loaded_train_results.values()))
dataset_compare(generated_val_results, list(loaded_val_results.values()))

In [None]:
### Verify 'score' and 'score_batch' return the same results ###
def test_function_accuracy(corpus, name=None):
  """Score `corpus` once as a batch and once profile by profile, then compare both result lists.

  `name` is only used as the progress-bar label.
  """
  model = StyloNet()

  batch_res = list(model.score_batch(corpus).values())
  individual_res = [ model.score(text) for text in tqdm(corpus.values(), desc=name) ]

  dataset_compare(batch_res, individual_res)

test_function_accuracy(test_corpus, "Test Data")
test_function_accuracy(val_corpus, "Val Data")
test_function_accuracy(train_corpus, "Train Data")

NOTE: Results from the test cases above show that the freshly-generated and loaded model have ALMOST identical output.

The variation in output only seems to occur when comparing predictions run as a batch over an entire dataset with predictions run individually on each profile.

As the maximum discrepancy between results is < 0.00001 in basically all cases, this is more than enough for the results to be considered the same in virtually every application.

Saving and loading the model doesn't change the analysis results at all (see 2 cells above); it's *only* running as a batch vs individually that affects results.

# Profile Handling
Code to save/load/remove profiles from the model. The StyloNet class above also contains the required code, for easy export.

In [None]:
def save_profile(name, profile_dir = "models") -> None:
  """Save the current models and settings as a new profile.

  Writes the SiameseNet weights, the Word2Vec model, the NLTK data packs and
  a manifest.json describing them into `<profile_dir>/<name>`. The values
  come from the module-level variables `embedding_dim`, `siamese_model`,
  `w2v_vector_size` and `word2vec_model`.

  Args:
    name: Folder name of the new profile; must not be empty.
    profile_dir: Directory that holds all profiles; created when missing.

  Raises:
    ValueError: `name` is empty.
    FileExistsError: A profile with this name already exists. It is left untouched.
    OSError: Writing failed; the partly written profile is removed first.
  """
  if not name:
    raise ValueError("Profile name must not be empty")

  profile_root = os.path.join(profile_dir, name)

  # Checked before entering the cleanup-on-failure block below: a failing
  # mkdir inside it would otherwise trigger deep_clean on the profile that
  # is already stored under this name.
  if os.path.exists(profile_root):
    raise FileExistsError(f"Profile already exists: {profile_root}")

  # Ensure destination exists
  if not os.path.isdir(profile_dir):
    os.makedirs(profile_dir)
    print(f"Created profile base directory: {profile_dir}")

  try:
    manifest = {}
    manifest['name'] = name
    os.mkdir(profile_root)
    print(f"Created profile dir: {profile_root}")

    # save embedding dim
    manifest['embedding_dim'] = unwrap(embedding_dim)
    print(f"Saved embedding_dim (unwrapped): {manifest['embedding_dim']}")

    # save siamese net model weights
    model_save = "model_weights/cp.ckpt"
    abs_path = os.path.join(profile_root, model_save)
    manifest['ckpts'] = model_save
    os.mkdir(os.path.dirname(abs_path))
    siamese_model.save_weights(abs_path)
    print(f"Saved siamese_net weights: {abs_path}")

    # save word vector size
    manifest['word_vector_size'] = w2v_vector_size
    print(f"Saved Word Vector Size: {manifest['word_vector_size']}")

    # save word2vec model weights
    word2vec_save = "word2vec.model"
    abs_path = os.path.join(profile_root, word2vec_save)
    manifest['word2vec'] = word2vec_save
    word2vec_model.save(abs_path)
    print(f"Saved word2vec model: {abs_path}")

    # save pre-downloaded nltk data
    nltk_data_save = "nltk_data"
    abs_path = os.path.join(profile_root, nltk_data_save)
    manifest['nltk_data'] = nltk_data_save
    nltk.download(["punkt", "stopwords","wordnet"], download_dir=abs_path)
    print(f"Saved nltk data: {abs_path}")

    manifest['valid_threshold'] = 0.5
    print(f"Saved validity threshold: {manifest['valid_threshold']}")

    # save the manifest
    manifest_save = "manifest.json"
    abs_path = os.path.join(profile_root, manifest_save)
    with open(abs_path, "w") as f:
      f.write(json.dumps(manifest, indent=4))
    print(f"Saved manifest: {abs_path}")

  except OSError as err:
    # some file IO failed: remove the partly written profile, then re-raise
    # with the original error attached as the cause
    deep_clean(profile_root, True)
    raise OSError("Profile failed to save!") from err

In [None]:
def load_profile(name, profile_dir="models") -> dict:
    """Load a saved profile and return its parts in a dictionary.

    The result has the keys 'word2vec', 'word_vector_size', 'embedding_dim',
    'siamese_net', 'nltk_data' and 'valid_threshold'. File locations and
    settings are read from the profile's manifest.json.
    """
    profile_root = os.path.join(profile_dir, name)

    with open(os.path.join(profile_root, "manifest.json"), "r") as manifest_file:
        manifest = json.load(manifest_file)

    loads = {}

    # Load word2vec model
    loads['word2vec'] = loadW2v(os.path.join(profile_root, manifest['word2vec']))
    loads['word_vector_size'] = manifest['word_vector_size']

    # Load siamese net: a bare int is wrapped into a 1-tuple, anything else is
    # converted with tuple()
    if type(manifest['embedding_dim']) is int:
        embedding_dim = (manifest['embedding_dim'],)
    else:
        embedding_dim = tuple(manifest['embedding_dim'])
    loads['embedding_dim'] = embedding_dim

    checkpoint_path = os.path.join(profile_root, manifest['ckpts'])
    loads['siamese_net'] = buildSiameseNet(checkpoint_path, embedding_dim)

    # Remaining settings are passed through as stored
    for key in ('nltk_data', 'valid_threshold'):
        loads[key] = manifest[key]

    return loads

In [None]:
def remove_profile(name, profile_dir="models"):
    """Delete the folder of profile `name` inside `profile_dir`, including everything in it."""
    profile_root = os.path.join(profile_dir, name)
    deep_clean(profile_root, True)

# Profile Management and Cleanup

In [None]:
# Save a profile with a custom name
print("Input profile name to save the weights (empty to skip saving)")
# Normalise before the emptiness check: a whitespace-only answer would
# otherwise pass the check and reach save_profile as an empty name.
name = input("profile name? ").lower().strip()
if name:
	save_profile(name)

In [None]:
# Test loading a model
print("Input a profile name to test if it loads correctly (empty to skip)")
# Profiles are saved under the lower-cased, stripped name (see the save cell),
# so the name is normalised the same way before looking it up.
name = input("profile name? ").lower().strip()
if name:
    load_test = load_profile(name)
    print(f"Loaded SiameseNet? {type(load_test['siamese_net']) is SiameseNet}")
    print(f"Loaded Word2Vec? {type(load_test['word2vec']) is gensim.models.Word2Vec}")
    print(f"Embedding Dim? {load_test['embedding_dim']}")
    print(f"Word Vector Size? {load_test['word_vector_size']}")
    print(f"Validity Threshold? {load_test['valid_threshold']}")

In [None]:
print("WARNING: This will remove all SiameseNet weights, Word2Vec saved models and downloaded nltk_data!")
if input("Are you sure? ").lower().strip() != "yes":
  print("Abort")
else:
  # Downloaded NLTK packs, if they live in the working directory
  deep_clean("nltk_data", True)

  # Checkpoint files; the folder itself is kept
  deep_clean("model_weights", False)

  # Saved Word2Vec model, if there is one
  try:
    os.remove(default_word2vec_save)
  except FileNotFoundError:
    print("Invalid file: Word2Vec.model")
  else:
    print("Removed Word2Vec.model")