# Use Idiolect Package to Score Data

In [1]:
suppressWarnings(
  suppressPackageStartupMessages(
    {
      library(readxl)
      library(writexl)
      library(dplyr)
      library(purrr)
      library(tibble)
      library(devtools)
    }
  )
)

# Installs idiolect from the local folder (your modified clone).
# remove.packages() stops with an error when the package is not present in the
# library it targets (the first entry of .libPaths() when `lib` is omitted).
# That error would abort this cell before the install runs, so the old copy is
# only removed when one is actually found there.
if (length(find.package("idiolect", lib.loc = .libPaths()[1L], quiet = TRUE)) > 0) {
  remove.packages("idiolect")
}
devtools::install("/Users/user/Documents/GitHub/idiolect", upgrade = "never")

library(idiolect)

Removing package from ‘/Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library’
(as ‘lib’ is unspecified)
[36m──[39m [36mR CMD build[39m [36m─────────────────────────────────────────────────────────────────[39m
✔  checking for file ‘/Users/user/Documents/GitHub/idiolect/DESCRIPTION’ ...
─  preparing ‘idiolect’:
✔  checking DESCRIPTION meta-information ...
─  checking for LF line-endings in source and make files and shell scripts
─  checking for empty or unneeded directories
─  building ‘idiolect_1.1.1.9000.tar.gz’
   
Running /Library/Frameworks/R.framework/Resources/bin/R CMD INSTALL \
  /var/folders/xx/hy496x3x5sn4hy9gy1fk19lw0000gp/T//RtmpSh1dIb/idiolect_1.1.1.9000.tar.gz \
  --install-tests 
* installing to library ‘/Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library’
* installing *source* package ‘idiolect’ ...
** using staged installation
** R
** data
*** moving datasets to lazyload DB
** inst
** tests
** byte-compile and prepare package for la

1: package ‘quanteda’ was built under R version 4.2.3 
2: In .recacheSubclasses(def@className, def, env) :
  undefined subclass "ndiMatrix" of class "replValueSp"; definition not updated
3: In .recacheSubclasses(def@className, def, env) :
  undefined subclass "pcorMatrix" of class "replValueSp"; definition not updated


In [2]:
# Scoring models to process. Each name is used as one folder level in the
# paths built with file.path() in the cells below.
models <- c("gpt2")

# Corpora to process. Each name is also used as one folder level in those paths.
corpora <- c(
    "Wiki", "Enron", "Perverted Justice", "StackExchange", "ACL",
    "TripAdvisor", "The Apricity", "Koppel's Blogs", "The Telegraph",
    "Reddit"
)

# Dataset splits looped over by the uncalibrated scoring and combining cells.
data_types <- c("training", "test")

# Root folder. Per-combination files are addressed as
# <base_loc>/<data_type>/<corpus>/<model>/<file>.
base_loc <- "/Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs"

# Input workbook read with readxl::read_excel() for every combination.
file_name <- "raw_agg_scores.xlsx"

# Per-combination workbook holding the output of calculate_performance().
save_name <- "idiolect_results.xlsx"

# Per-combination workbook holding the output of
# calculate_calibrated_performance(); it is written under the "test" folder.
calibrated_save_name <- "idiolect_calibrated_results.xlsx"

# Combined workbook of all `save_name` files, written directly under base_loc.
combined_save_name <- "raw_idiolect_results_by_context.xlsx"

# Combined workbook of all `calibrated_save_name` files, written directly
# under base_loc.
combined_calibrated_save_name <- "raw_idiolect_calibrated_results_by_context.xlsx"

In [3]:
#' List the distinct problems present in a score table
#'
#' A problem is one unique combination of data_type, corpus, scoring_model,
#' max_context_tokens and min_token_size.
#'
#' @param df Data frame containing those five columns.
#' @return Those five columns of `df`, reduced to their distinct rows.
get_distinct_problems <- function(df) {
  problem_keys <- dplyr::select(
    df,
    data_type, corpus, scoring_model, max_context_tokens, min_token_size
  )

  dplyr::distinct(problem_keys)
}

In [4]:
#' Evaluate each distinct problem in a table of raw scores
#'
#' Splits `df` into the problems listed by get_distinct_problems(), runs
#' performance() on the rows of each problem and collects the `evaluation`
#' element of every successful call.
#'
#' @param df Data frame of raw scores. It must contain the five problem columns
#'   (data_type, corpus, scoring_model, max_context_tokens, min_token_size) and
#'   the column named by `score_col`.
#' @param score_col Name of the column to evaluate; it is renamed to `score`
#'   before the rows are passed to performance().
#' @return A tibble holding, for every problem evaluated without error, the
#'   problem columns bound to its evaluation rows. Problems whose evaluation
#'   fails are reported with message() and left out.
calculate_performance <- function(df, score_col = "unknown_sum_log_probs") {

  problem_df <- get_distinct_problems(df)

  raw_scores <- df |>
    dplyr::rename(score = !!rlang::sym(score_col))

  # One slot per problem. Slots that stay NULL (failed problems) are ignored
  # by bind_rows() at the end.
  out_list <- vector("list", nrow(problem_df))

  for (i in seq_len(nrow(problem_df))) {

    row <- problem_df[i, , drop = FALSE]

    # These locals share their names with the columns, so `!!` is used in the
    # filter to inject the local value instead of referring to the column.
    data_type          <- row$data_type
    corpus             <- row$corpus
    scoring_model      <- row$scoring_model
    max_context_tokens <- row$max_context_tokens
    min_token_size     <- row$min_token_size

    filtered_df <- raw_scores |>
      dplyr::filter(
        data_type == !!data_type,
        corpus == !!corpus,
        scoring_model == !!scoring_model,
        min_token_size == !!min_token_size,
        # max_context_tokens may be NA, and NA == NA is NA, so a missing
        # value is matched explicitly.
        (is.na(max_context_tokens) & is.na(!!max_context_tokens)) |
          (!is.na(max_context_tokens) & max_context_tokens == !!max_context_tokens)
      )

    perf_eval <- tryCatch(
      {
        out <- performance(filtered_df)
        out$evaluation
      },
      error = function(e) {
        message(sprintf("[%d] -> FAILED (%s)", i, conditionMessage(e)))
        NULL
      }
    )

    if (is.null(perf_eval)) {
      # Slot i is already NULL. Assigning NULL with `[[<-` would delete the
      # element and shift the rest of the preallocated list, so just move on.
      next
    }

    # Remove rows whose Cllr is infinite. filter() also drops rows for which
    # the comparison evaluates to NA.
    perf_eval <- perf_eval |>
      dplyr::filter(!(Cllr == "Inf"))

    out_list[[i]] <- dplyr::bind_cols(
      tibble::as_tibble(row),
      tibble::as_tibble(perf_eval)
    )
  }

  dplyr::bind_rows(out_list)
}


In [5]:
#' Evaluate each test problem using the matching training rows
#'
#' The problems are the distinct combinations found in `test_df` (see
#' get_distinct_problems()). For every problem the matching rows of `test_df`
#' and `training_df` are passed to performance() and the `evaluation` element
#' of the result is collected.
#'
#' @param test_df Data frame of raw test scores. It must contain the five
#'   problem columns and the column named by `score_col`.
#' @param training_df Data frame of raw training scores. It is filtered on
#'   corpus, scoring_model, min_token_size and max_context_tokens, and must
#'   contain the column named by `score_col`.
#' @param score_col Name of the score column in both data frames; it is renamed
#'   to `score` before evaluation.
#' @return A tibble holding, for every problem evaluated without error, the
#'   test problem columns (including data_type) bound to its evaluation rows.
#'   Problems without training rows, or whose evaluation fails, are reported
#'   with message() and left out.
calculate_calibrated_performance <- function(test_df, training_df, score_col = "unknown_sum_log_probs") {

  # distinct problems should come from TEST only
  problem_df <- get_distinct_problems(test_df)

  test_scores <- test_df |>
    dplyr::rename(score = !!rlang::sym(score_col))

  training_scores <- training_df |>
    dplyr::rename(score = !!rlang::sym(score_col))

  # Rows of `scores` that belong to `problem` (a one-row table of problem
  # columns). data_type is intentionally NOT matched, so a test problem can be
  # paired with its training rows.
  rows_for_problem <- function(scores, problem) {
    dplyr::filter(
      scores,
      corpus == !!problem$corpus,
      scoring_model == !!problem$scoring_model,
      min_token_size == !!problem$min_token_size,
      # max_context_tokens may be NA, and NA == NA is NA, so a missing value
      # is matched explicitly.
      (is.na(max_context_tokens) & is.na(!!problem$max_context_tokens)) |
        (!is.na(max_context_tokens) & max_context_tokens == !!problem$max_context_tokens)
    )
  }

  # One slot per problem. Slots that stay NULL (skipped or failed problems)
  # are ignored by bind_rows() at the end.
  out_list <- vector("list", nrow(problem_df))

  for (i in seq_len(nrow(problem_df))) {

    row <- problem_df[i, , drop = FALSE]

    filtered_test     <- rows_for_problem(test_scores, row)
    filtered_training <- rows_for_problem(training_scores, row)

    if (nrow(filtered_training) == 0) {
      message(sprintf("[%d] -> SKIP (no matching training rows)", i))
      # Slot i is already NULL. Assigning NULL with `[[<-` would delete the
      # element and shift the rest of the preallocated list.
      next
    }

    perf_eval <- tryCatch(
      {
        # NOTE(review): how the unnamed data frame is matched depends on the
        # signature of performance() in the installed idiolect clone. Confirm
        # that it is treated as the test set.
        out <- performance(filtered_test, training = filtered_training)
        out$evaluation
      },
      error = function(e) {
        message(sprintf("[%d] -> FAILED (%s)", i, conditionMessage(e)))
        NULL
      }
    )

    if (is.null(perf_eval)) {
      next
    }

    # Remove rows whose Cllr is infinite. filter() also drops rows for which
    # the comparison evaluates to NA.
    perf_eval <- perf_eval |>
      dplyr::filter(!(Cllr == "Inf"))

    # keep the TEST metadata row (including data_type="test")
    out_list[[i]] <- dplyr::bind_cols(
      tibble::as_tibble(row),
      tibble::as_tibble(perf_eval)
    )
  }

  dplyr::bind_rows(out_list)
}


In [6]:
# Score every data_type / corpus / model combination that has an input
# workbook but no saved result yet, writing one result workbook per
# combination.
for (data_type in data_types) {
  for (corpus in corpora) {
    for (model_name in models) {

      combo_dir <- file.path(base_loc, data_type, corpus, model_name)
      file_loc <- file.path(combo_dir, file_name)
      save_loc <- file.path(combo_dir, save_name)

      prefix <- sprintf("%s | %s | %s", data_type, corpus, model_name)

      if (file.exists(save_loc)) {
        # A result is already on disk: leave it alone.
        message(prefix, " -> SKIP (exists): ", save_loc)
      } else if (!file.exists(file_loc)) {
        # No input workbook for this combination.
        message(prefix, " -> MISSING input: ", file_loc)
      } else {
        # Any error raised while reading, scoring or writing is reported and
        # the loop carries on with the next combination.
        tryCatch(
          {
            message(prefix, " -> reading: ", file_loc)
            df <- readxl::read_excel(file_loc)

            message(prefix, " -> calculating performance")
            results <- calculate_performance(df)

            if (!is.null(results) && nrow(results) > 0) {
              dir.create(dirname(save_loc), recursive = TRUE, showWarnings = FALSE)

              message(prefix, " -> writing: ", save_loc)
              writexl::write_xlsx(results, save_loc)

              message(prefix, " -> OK")
            } else {
              message(prefix, " -> NO RESULTS (empty), not writing")
            }
          },
          error = function(err) {
            message(prefix, " -> FAILED: ", conditionMessage(err))
          }
        )
      }

    }
  }
}


training | Wiki | gpt2 -> SKIP (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Wiki/gpt2/idiolect_results.xlsx
training | Enron | gpt2 -> SKIP (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Enron/gpt2/idiolect_results.xlsx
training | Perverted Justice | gpt2 -> SKIP (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Perverted Justice/gpt2/idiolect_results.xlsx
training | StackExchange | gpt2 -> SKIP (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/StackExchange/gpt2/idiolect_results.xlsx
training | ACL | gpt2 -> SKIP (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/ACL/gpt2/idiolect_results.xlsx
training | TripAdvisor | gpt2 -> reading: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/TripAdvisor/gpt2/raw_agg_scores.xlsx
training | TripAdvisor | gpt2 -> calculating performance
training | TripAdvisor 

In [7]:
# Calibrated scoring: for every corpus / model combination, evaluate the test
# scores together with the training scores and write one workbook under the
# "test" folder.
for (corpus in corpora) {
  for (model_name in models) {

    # Input workbooks (both are required) and the output location.
    test_file_loc <- file.path(base_loc, "test", corpus, model_name, file_name)
    train_file_loc <- file.path(base_loc, "training", corpus, model_name, file_name)
    save_loc <- file.path(base_loc, "test", corpus, model_name, calibrated_save_name)

    prefix <- sprintf("%s | %s", corpus, model_name)

    if (file.exists(save_loc)) {
      # A result is already on disk: leave it alone.
      message(prefix, " -> SKIP (exists): ", save_loc)
    } else if (!(file.exists(test_file_loc) && file.exists(train_file_loc))) {
      # At least one of the two inputs is absent.
      message(prefix, " -> SKIP (missing inputs) test_exists=",
              file.exists(test_file_loc), " training_exists=", file.exists(train_file_loc))
    } else {
      # Any error raised while reading, scoring or writing is reported and
      # the loop carries on with the next combination.
      tryCatch(
        {
          message(prefix, " -> reading test: ", test_file_loc)
          test_df <- readxl::read_excel(test_file_loc)

          message(prefix, " -> reading training: ", train_file_loc)
          training_df <- readxl::read_excel(train_file_loc)

          message(prefix, " -> calculating performance (calibrated)")
          # Scored by calculate_calibrated_performance() with its default
          # score column.
          results <- calculate_calibrated_performance(test_df = test_df, training_df = training_df)

          if (!is.null(results) && nrow(results) > 0) {
            dir.create(dirname(save_loc), recursive = TRUE, showWarnings = FALSE)

            message(prefix, " -> writing: ", save_loc)
            writexl::write_xlsx(results, save_loc)

            message(prefix, " -> OK")
          } else {
            message(prefix, " -> NO RESULTS (empty), not writing")
          }
        },
        error = function(err) {
          message(prefix, " -> FAILED: ", conditionMessage(err))
        }
      )
    }
  }
}

Wiki | gpt2 -> SKIP (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/Wiki/gpt2/idiolect_calibrated_results.xlsx
Enron | gpt2 -> SKIP (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/Enron/gpt2/idiolect_calibrated_results.xlsx
Perverted Justice | gpt2 -> SKIP (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/Perverted Justice/gpt2/idiolect_calibrated_results.xlsx
StackExchange | gpt2 -> SKIP (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/StackExchange/gpt2/idiolect_calibrated_results.xlsx
ACL | gpt2 -> SKIP (exists): /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/ACL/gpt2/idiolect_calibrated_results.xlsx
TripAdvisor | gpt2 -> reading test: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/TripAdvisor/gpt2/raw_agg_scores.xlsx
TripAdvisor | gpt2 -> reading training: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/trainin

In [8]:
# Combine the per-combination result workbooks (save_name) of every
# data_type / corpus / model into a single workbook under base_loc.
combined_list <- list()

for (data_type in data_types) {
  for (corpus in corpora) {
    for (model_name in models) {

      # this should point at the *per-combo results file* you want to combine
      file_loc <- file.path(base_loc, data_type, corpus, model_name, save_name)

      prefix <- sprintf("%s | %s | %s", data_type, corpus, model_name)

      if (!file.exists(file_loc)) {
        message(prefix, " -> MISSING: ", file_loc)
        next
      }

      message(prefix, " -> reading: ", file_loc)

      # A file that cannot be read is reported and treated like an empty one.
      df_i <- tryCatch(
        readxl::read_excel(file_loc),
        error = function(e) {
          message(prefix, " -> FAILED reading (", conditionMessage(e), ")")
          NULL
        }
      )

      if (is.null(df_i) || nrow(df_i) == 0) {
        message(prefix, " -> EMPTY (skipping)")
        next
      }

      # if the file doesn't already include these, add them
      if (!"data_type" %in% names(df_i)) df_i$data_type <- data_type
      if (!"corpus" %in% names(df_i)) df_i$corpus <- corpus
      if (!("scoring_model" %in% names(df_i) || "model_name" %in% names(df_i))) {
        df_i$scoring_model <- model_name
      }

      # Record which workbook each row came from.
      df_i$source_file <- file_loc

      combined_list[[length(combined_list) + 1]] <- df_i
    }
  }
}

combined_df <- dplyr::bind_rows(combined_list)

combined_save_loc <- file.path(base_loc, combined_save_name)

if (nrow(combined_df) == 0) {
  # Nothing could be read. Writing now would replace any existing combined
  # workbook with an empty sheet, so the file is left untouched.
  message("No rows to combine; not writing: ", combined_save_loc)
} else {
  message("Writing combined file (OVERWRITE): ", combined_save_loc)
  writexl::write_xlsx(combined_df, combined_save_loc)
  message("DONE. Rows combined: ", nrow(combined_df))
}


training | Wiki | gpt2 -> reading: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Wiki/gpt2/idiolect_results.xlsx
training | Enron | gpt2 -> reading: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Enron/gpt2/idiolect_results.xlsx
training | Perverted Justice | gpt2 -> reading: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/Perverted Justice/gpt2/idiolect_results.xlsx
training | StackExchange | gpt2 -> reading: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/StackExchange/gpt2/idiolect_results.xlsx
training | ACL | gpt2 -> reading: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/ACL/gpt2/idiolect_results.xlsx
training | TripAdvisor | gpt2 -> MISSING: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/training/TripAdvisor/gpt2/idiolect_results.xlsx
training | The Apricity | gpt2 -> reading: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/tr

In [9]:
# Combine the per-combination calibrated result workbooks
# (calibrated_save_name, stored under "test") of every corpus / model into a
# single workbook under base_loc.
combined_list <- list()

for (corpus in corpora) {
  for (model_name in models) {

    file_loc <- file.path(base_loc, "test", corpus, model_name, calibrated_save_name)
    prefix <- sprintf("test | %s | %s", corpus, model_name)

    if (!file.exists(file_loc)) {
      message(prefix, " -> MISSING: ", file_loc)
      next
    }

    message(prefix, " -> reading: ", file_loc)

    # A file that cannot be read is reported and treated like an empty one.
    df_i <- tryCatch(
      readxl::read_excel(file_loc),
      error = function(e) {
        message(prefix, " -> FAILED reading (", conditionMessage(e), ")")
        NULL
      }
    )

    if (is.null(df_i) || nrow(df_i) == 0) {
      message(prefix, " -> EMPTY (skipping)")
      next
    }

    # ensure identifiers exist
    if (!"data_type" %in% names(df_i)) df_i$data_type <- "test"
    if (!"corpus" %in% names(df_i)) df_i$corpus <- corpus
    if (!("scoring_model" %in% names(df_i) || "model_name" %in% names(df_i))) {
      df_i$scoring_model <- model_name
    }

    # Record which workbook each row came from.
    df_i$source_file <- file_loc

    combined_list[[length(combined_list) + 1]] <- df_i
  }
}

combined_calibrated_df <- dplyr::bind_rows(combined_list)

combined_save_loc <- file.path(base_loc, combined_calibrated_save_name)

if (nrow(combined_calibrated_df) == 0) {
  # Nothing could be read. Writing now would replace any existing combined
  # workbook with an empty sheet, so the file is left untouched.
  message("No rows to combine; not writing: ", combined_save_loc)
} else {
  message("Writing combined calibrated file (OVERWRITE): ", combined_save_loc)
  writexl::write_xlsx(combined_calibrated_df, combined_save_loc)
  message("DONE. Rows combined: ", nrow(combined_calibrated_df))
}


test | Wiki | gpt2 -> reading: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/Wiki/gpt2/idiolect_calibrated_results.xlsx
test | Enron | gpt2 -> reading: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/Enron/gpt2/idiolect_calibrated_results.xlsx
test | Perverted Justice | gpt2 -> reading: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/Perverted Justice/gpt2/idiolect_calibrated_results.xlsx
test | StackExchange | gpt2 -> reading: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/StackExchange/gpt2/idiolect_calibrated_results.xlsx
test | ACL | gpt2 -> reading: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/ACL/gpt2/idiolect_calibrated_results.xlsx
test | TripAdvisor | gpt2 -> MISSING: /Volumes/BCross/av_datasets_experiments/ngram_masking_logrpobs/test/TripAdvisor/gpt2/idiolect_calibrated_results.xlsx
test | The Apricity | gpt2 -> reading: /Volumes/BCross/av_datasets_experiments/ngram_maski