# Run N-Gram Tracing Using Idiolect

This notebook runs the idiolect n-gram tracing methods to get scores ready to plug into the performance functions. These are to be used as a baseline for the logprobs method.

In [17]:
# library(devtools)

# installs from the local folder (your modified clone)
# devtools::install("/Users/user/Documents/GitHub/idiolect", upgrade = "never")

In [18]:
# remove.packages("idiolect")
# install.packages("idiolect")

In [19]:
suppressWarnings(
  suppressPackageStartupMessages(
    {
      library(writexl)
      library(idiolect)
    }
  )
)

In [20]:
source("../../src/R_functions.R")

In [21]:
run_ngram_tracing_save_progress <- function(
  metadata, known, unknown,
  tokens = "word",
  n = 1,
  coefficient = NULL,
  out_path,
  save_every = 10,
  quiet = TRUE
) {
  #' Run n-gram tracing for every problem in a metadata data frame and save
  #' the combined results to an Excel file, snapshotting progress on the way.
  #'
  #' Each metadata row describes one problem (corpus, known author, unknown
  #' author). The matching documents are selected from `known` and `unknown`,
  #' passed to `ngram_tracing()`, and the resulting target/score rows are
  #' joined back onto the metadata row. Problems that have no matching
  #' documents, that raise an error, or that yield no usable result are kept
  #' as a single row with `target = NA` and `score = NA`.
  #'
  #' Every output row carries the same columns (metadata columns, then
  #' `coefficient` when it is not NULL, then `n`, `target`, `score`) so that
  #' successful and failed problems can always be row-bound together.
  #'
  #' @param metadata Data frame with one row per problem. Must contain the
  #'   columns `known_author`, `unknown_author` and `corpus`.
  #' @param known Corpus accepted by `quanteda::corpus_subset()` with the
  #'   docvars `corpus` and `author`; source of the known documents.
  #' @param unknown Corpus with the same docvars; source of the questioned
  #'   documents.
  #' @param tokens Token type forwarded to `ngram_tracing()` (default "word").
  #' @param n N-gram size forwarded to `ngram_tracing()` (default 1).
  #' @param coefficient Coefficient forwarded to `ngram_tracing()`. When NULL
  #'   (the default) it is forwarded as NULL and no `coefficient` column is
  #'   added to the output.
  #' @param out_path Path of the .xlsx file to write.
  #' @param save_every Write a snapshot to `out_path` after every
  #'   `save_every` metadata rows (default 10). Must be a positive number.
  #' @param quiet If TRUE (default), suppress messages and warnings raised by
  #'   `ngram_tracing()`.
  #' @return The combined results data frame, invisibly.

  stopifnot(
    is.numeric(save_every), length(save_every) == 1L,
    !is.na(save_every), save_every > 0
  )

  message("Saving to: ", out_path)

  # Placeholder result used whenever a problem yields nothing usable.
  empty_result <- data.frame(target = NA, score = NA)

  # Attach the run parameters and the target/score columns to a problem row,
  # repeating the metadata once per result row. Used for successes AND
  # failures so that every element of results_list has identical columns.
  with_result_columns <- function(problem, result) {
    n_rows <- nrow(result)
    rows <- problem[rep(1L, n_rows), , drop = FALSE]
    if (!is.null(coefficient)) {
      rows$coefficient <- rep(coefficient, n_rows)
    }
    rows$n <- rep(n, n_rows)
    rows$target <- result$target
    rows$score <- result$score
    rows
  }

  # Reduce whatever ngram_tracing() returned to a target/score data frame
  # with at least one row.
  as_target_score <- function(raw) {
    wanted <- c("target", "score")
    res <- NULL
    if (is.data.frame(raw)) {
      if (all(wanted %in% names(raw))) {
        res <- raw[, wanted, drop = FALSE]
      } else if (ncol(raw) >= 2L) {
        # Unknown column names: fall back to the first two columns.
        res <- raw[, 1:2, drop = FALSE]
        names(res) <- wanted
      }
    } else if (is.list(raw) &&
               !is.null(raw[["target"]]) && !is.null(raw[["score"]])) {
      res <- data.frame(target = raw[["target"]], score = raw[["score"]])
    }
    if (is.null(res) || nrow(res) == 0L) empty_result else res
  }

  # Row-bind a list of per-problem data frames and reset the row names.
  combine <- function(parts) {
    combined <- do.call(rbind, parts)
    rownames(combined) <- NULL
    combined
  }

  n_problems <- nrow(metadata)
  results_list <- vector("list", n_problems)

  for (i in seq_len(n_problems)) {
    selected_problem <- metadata[i, , drop = FALSE]

    # Short local names are kept on purpose: corpus_subset() evaluates its
    # condition against the docvars first, so these must not collide with
    # docvar names such as `corpus` or `author`.
    k  <- as.character(selected_problem$known_author)
    u  <- as.character(selected_problem$unknown_author)
    c_ <- as.character(selected_problem$corpus)

    known_subset   <- quanteda::corpus_subset(known, corpus == c_ & author == k)
    unknown_subset <- quanteda::corpus_subset(unknown, corpus == c_ & author == u)

    if (quanteda::ndoc(known_subset) == 0 || quanteda::ndoc(unknown_subset) == 0) {
      # Nothing to compare: keep the problem as an NA row.
      results_list[[i]] <- with_result_columns(selected_problem, empty_result)
    } else {
      results_list[[i]] <- tryCatch({
        run_call <- function() ngram_tracing(
          unknown_subset, known_subset,
          tokens = tokens, n = n,
          remove_punct = FALSE, remove_symbols = FALSE, remove_numbers = FALSE,
          coefficient = coefficient
        )

        tracing_raw <- if (quiet) {
          suppressMessages(suppressWarnings(run_call()))
        } else {
          run_call()
        }

        with_result_columns(selected_problem, as_target_score(tracing_raw))
      }, error = function(e) {
        # One bad problem must not abort the whole run.
        message("Row ", i, " FAILED: ", conditionMessage(e))
        with_result_columns(selected_problem, empty_result)
      })
    }

    if (i %% save_every == 0) {
      snapshot <- combine(results_list[seq_len(i)])
      writexl::write_xlsx(list(results = snapshot), out_path)
      message("  (snapshot saved at row ", i, ")")
    }
  }

  final_df <- if (n_problems > 0L) {
    combine(results_list)
  } else {
    # No problems at all: still write a valid (empty) sheet.
    metadata[0, , drop = FALSE]
  }
  writexl::write_xlsx(list(results = final_df), out_path)
  message("Done. Final saved: ", out_path)

  invisible(final_df)
}


In [22]:
run_ngram_tracing_for_n_values <- function(
  metadata, known, unknown,
  n_values = 1:10,
  out_dir,
  file_prefix = "ngram_tracing_results_simpson_",
  save_every = 10,
  quiet = TRUE,
  tokens = "word",
  coefficient = "simpson",
  skip_if_exists = TRUE
) {
  #' Run n-gram tracing once per value of n, writing one Excel file per n.
  #'
  #' For each n the results go to `<out_dir>/<file_prefix><n>.xlsx`, with n
  #' zero-padded to the width of the largest value in `n_values`. While a run
  #' is in progress its snapshots are written to a sibling
  #' `<file_prefix><n>.partial.xlsx`; the file is renamed to its final name
  #' only after the run finishes. A failed or interrupted run therefore never
  #' leaves a file that `skip_if_exists` would mistake for a completed one.
  #'
  #' @param metadata,known,unknown Forwarded unchanged to
  #'   `run_ngram_tracing_save_progress()`.
  #' @param n_values Whole-number n-gram sizes to run (default 1:10).
  #' @param out_dir Output directory; created (recursively) if missing.
  #' @param file_prefix File name prefix placed before the padded n.
  #' @param save_every,quiet,tokens,coefficient Forwarded unchanged to
  #'   `run_ngram_tracing_save_progress()`.
  #' @param skip_if_exists If TRUE (default), skip any n whose final output
  #'   file already exists.
  #' @return `TRUE`, invisibly. Failures for individual n values are reported
  #'   with `message()` and do not stop the remaining runs.

  if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE)

  # Nothing to run; also avoids max() warning on an empty vector.
  if (length(n_values) == 0L) {
    return(invisible(TRUE))
  }

  pad_width <- nchar(as.character(max(n_values)))

  for (n in n_values) {
    n_str <- sprintf(paste0("%0", pad_width, "d"), n)
    out_path <- file.path(out_dir, paste0(file_prefix, n_str, ".xlsx"))
    # In-progress snapshots live here until the run completes.
    partial_path <- file.path(
      out_dir, paste0(file_prefix, n_str, ".partial.xlsx")
    )

    if (skip_if_exists && file.exists(out_path)) {
      message("Skipping n = ", n, " (exists): ", out_path)
      next
    }

    message("==== n = ", n, " -> ", out_path, " ====")

    tryCatch({
      run_ngram_tracing_save_progress(
        metadata = metadata,
        known = known,
        unknown = unknown,
        n = n,
        out_path = partial_path,
        save_every = save_every,
        quiet = quiet,
        tokens = tokens,
        coefficient = coefficient
      )

      # Publish under the final name only once the run has completed.
      if (!file.rename(partial_path, out_path)) {
        stop(
          "could not move ", partial_path, " to ", out_path,
          call. = FALSE
        )
      }
    }, error = function(e) {
      message("FAILED n = ", n, " : ", conditionMessage(e))
      invisible(NULL)
    })
  }

  invisible(TRUE)
}

In [None]:
# Driver: run n-gram tracing for every data split, corpus and coefficient,
# writing one results workbook per (split, corpus, coefficient, n).
base_read_loc <- "/Volumes/BCross/datasets/author_verification"
base_save_loc <- "/Volumes/BCross/av_datasets_experiments/Baseline Results"

data_types <- c("test", "training")

# Full corpus list, kept for reference.
all_corpora <- c("Wiki", "Enron", "Perverted Justice", "StackExchange",
                 "ACL", "TripAdvisor", "The Apricity", "Koppel's Blogs",
                 "The Telegraph", "Reddit")

# Corpora actually processed by this run.
corpora <- c("Wiki", "Enron", "Perverted Justice", "ACL", "StackExchange")

coefs <- c("simpson", "jaccard")

for (d_type in data_types) {

  # Load the objects themselves, not just their paths. `known` and `unknown`
  # are later passed to quanteda::corpus_subset(), so these .rds files are
  # presumably serialized quanteda corpora -- TODO confirm.
  known <- readRDS(file.path(base_read_loc, d_type, "known_raw.rds"))
  unknown <- readRDS(file.path(base_read_loc, d_type, "unknown_raw.rds"))
  metadata <- readRDS(file.path(base_read_loc, d_type, "metadata.rds"))

  for (corp in corpora) {

    # Direct call rather than %>%, so the script does not depend on the pipe
    # being attached by another package.
    metadata_corpus <- dplyr::filter(metadata, corpus == corp)

    out_dir <- file.path(base_save_loc, d_type, corp, "n_gram_tracing")

    dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)

    for (co in coefs) {
      print(paste0("Working on Corpus: ", corp, " with Metric: ", co))

      run_ngram_tracing_for_n_values(
        metadata_corpus, known, unknown,
        n_values = 1:10,
        out_dir = out_dir,
        file_prefix = paste0("ngram_tracing_results_", co, "_"),
        save_every = 10,
        quiet = TRUE,
        tokens = "word",
        coefficient = co,
        skip_if_exists = TRUE
      )
    }
  }
}

[1] "Working on Corpus: ACL with Metric: simpson"
Skipping n = 1 (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/n_gram_tracing_tests/ACL/ngram_tracing_results_simpson_01.xlsx
Skipping n = 2 (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/n_gram_tracing_tests/ACL/ngram_tracing_results_simpson_02.xlsx
Skipping n = 3 (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/n_gram_tracing_tests/ACL/ngram_tracing_results_simpson_03.xlsx
Skipping n = 4 (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/n_gram_tracing_tests/ACL/ngram_tracing_results_simpson_04.xlsx
Skipping n = 5 (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/n_gram_tracing_tests/ACL/ngram_tracing_results_simpson_05.xlsx
Skipping n = 6 (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/n_gram_tracing_tests/ACL/ngram_tracing_results_simpso

In [24]:
# run_ngram_tracing_for_n_values(
#   metadata_corpus, known, unknown,
#   n_values = 1:10,
#   out_dir=out_dir,
#   file_prefix = paste0("ngram_tracing_results_", coefficient, "_"),
#   save_every = 10,
#   quiet = TRUE,
#   tokens = "word",
#   coefficient = coefficient,
#   skip_if_exists = FALSE
# )