# Run Lambda G

This notebook is used to run Andrea's LambdaG method in R.

In [59]:
suppressPackageStartupMessages({
  library(dplyr)
  library(tibble)
  library(idiolect)
  library(jsonlite)
})

In [60]:
source("./utils.R")

In [61]:
# Input variables
# NOTE: `corpus` shares its name with a column of the metadata table, so
# dplyr code that filters on it must refer to this variable as `.env$corpus`.
base_loc <- "/Volumes/BCross/datasets/author_verification"
data_type <- "test"
corpus <- "Enron"

# Locations
# file.path() joins the components with "/" so separators are never
# duplicated or forgotten.
data_loc <- file.path(base_loc, data_type, corpus)

known_loc <- file.path(data_loc, "known_raw.jsonl")
unknown_loc <- file.path(data_loc, "unknown_raw.jsonl")
# The metadata file sits one level above the corpus directory
metadata_loc <- file.path(base_loc, data_type, "metadata.rds")

# Directory the LambdaG results are written to (one .rds file per problem)
result_save_loc <- file.path(data_loc, "lambdag_results")

if (!dir.exists(result_save_loc)) {
  dir.create(result_save_loc, recursive = TRUE)
}

In [62]:
# Load the metadata table stored at `metadata_loc`
metadata <- readRDS(metadata_loc)

# Keep only the rows belonging to the selected corpus and expose the row
# names as an explicit `sample_id` column. `.env$corpus` is the variable set
# in the configuration cell; bare `corpus` is the metadata column.
filtered_metadata <- rownames_to_column(
  filter(metadata, corpus == .env$corpus),
  var = "sample_id"
)

# Get the sample_id values
sample_ids <- pull(filtered_metadata, sample_id)

In [63]:
filtered_metadata

[38;5;246m# A tibble: 96 × 5[39m
   sample_id problem                          corpus known_author unknown_author
   [3m[38;5;246m<chr>[39m[23m     [3m[38;5;246m<chr>[39m[23m                            [3m[38;5;246m<chr>[39m[23m  [3m[38;5;246m<chr>[39m[23m        [3m[38;5;246m<chr>[39m[23m         
[38;5;250m 1[39m 1         Kevin.hyatt vs Kevin.hyatt       Enron  Kevin.hyatt  Kevin.hyatt   
[38;5;250m 2[39m 2         Kevin.hyatt vs Kimberly.watson   Enron  Kevin.hyatt  Kimberly.wats…
[38;5;250m 3[39m 3         Kimberly.watson vs Kimberly.wat… Enron  Kimberly.wa… Kimberly.wats…
[38;5;250m 4[39m 4         Kimberly.watson vs Larry.campbe… Enron  Kimberly.wa… Larry.campbell
[38;5;250m 5[39m 5         Larry.campbell vs Larry.campbell Enron  Larry.campb… Larry.campbell
[38;5;250m 6[39m 6         Larry.campbell vs Lindy.donoho   Enron  Larry.campb… Lindy.donoho  
[38;5;250m 7[39m 7         Lindy.donoho vs Lindy.donoho     Enron  Lindy.donoho Lindy.donoho  

In [64]:
# Read a JSONL file (one JSON document per line) into a data frame.
#
# Args:
#   file_path: Path to the JSONL file.
#
# Returns:
#   A data frame made by converting every parsed line with as.data.frame()
#   and stacking the pieces with rbind(). A file with no non-blank lines
#   yields an empty data frame.
read_jsonl <- function(file_path) {
  # warn = FALSE: a missing final newline is harmless for JSONL input
  lines <- readLines(file_path, warn = FALSE)

  # Blank lines (typically a trailing one) are not JSON documents and would
  # make the parser abort the whole read, so drop them up front.
  lines <- lines[nzchar(trimws(lines))]
  if (length(lines) == 0) {
    return(data.frame())
  }

  records <- lapply(lines, function(line) {
    parsed <- jsonlite::fromJSON(line)
    # A list holding exactly one element is treated as a wrapper around the
    # actual record, so the inner value is used instead.
    if (is.list(parsed) && length(parsed) == 1) {
      parsed <- parsed[[1]]
    }
    as.data.frame(parsed, stringsAsFactors = FALSE)
  })

  do.call(rbind, records)
}

# Alternative JSONL reader built on jsonlite::stream_in(); intended for
# files whose records contain list columns.
#
# Args:
#   file_path: Path to the JSONL file.
#
# Returns:
#   Whatever jsonlite::stream_in() returns for the file.
read_jsonl_stream <- function(file_path) {
  handle <- file(file_path, open = "r", encoding = "UTF-8")
  # Release the connection however the function exits
  on.exit(close(handle))

  # stream_in consumes the connection one JSON object per line
  streamed <- jsonlite::stream_in(handle, verbose = FALSE)
  streamed
}

# Build a normalised temporary document id from a raw doc_id string.
#
# The text inside the first "[...]" pair is taken, every non-word character
# is replaced by "_", runs of underscores are collapsed to one, and the
# result is lower-cased.
#
# Args:
#   input_text: Raw document id. Only the first element is inspected, so
#     callers must apply this function element-wise to a vector.
#
# Returns:
#   A length-one character vector, or NULL when no "[...]" segment is found.
create_temp_doc_id <- function(input_text) {
  hits <- regmatches(
    input_text,
    regexec("\\[(.*?)\\]", input_text, perl = TRUE)
  )
  first_hit <- hits[[1]]

  # Element 1 is the whole match and element 2 the captured group, so fewer
  # than two elements means there was no bracketed segment.
  if (length(first_hit) <= 1) {
    return(NULL)
  }

  first_hit[2] |>
    gsub(pattern = "[^\\w]", replacement = "_", perl = TRUE) |>
    gsub(pattern = "_{2,}", replacement = "_", perl = TRUE) |>
    tolower()
}

In [65]:
# Run LambdaG for every verification problem (sample) of the selected corpus.
#
# For each sample, every document of the known author is scored against the
# unknown author's text, using the documents of all other authors in the
# unknown file as the reference (impostor) corpus. Each known document is
# scored `n_repetitions` times, and all scores of a problem are saved to
# "<result_save_loc>/<problem>.rds". Problems whose result file already
# exists are skipped, so the cell can be re-run to resume an interrupted run.

# Number of times lambdaG() is re-run for each known document
n_repetitions <- 5L

# Derive one temp_id per doc_id. create_temp_doc_id() only inspects a single
# id, so it is applied element-wise; ids without a "[...]" segment become NA
# (instead of NULL) so the result is always a plain character vector.
make_temp_ids <- function(doc_ids) {
  vapply(
    doc_ids,
    function(id) {
      temp_id <- create_temp_doc_id(id)
      if (is.null(temp_id)) NA_character_ else temp_id
    },
    character(1),
    USE.NAMES = FALSE
  )
}

# The two JSONL files are the same for every sample, so they are parsed at
# most once. Loading is lazy so that a run in which every problem is already
# cached never touches them.
all_known <- NULL
all_unknown <- NULL

for (s in sample_ids) {
  chosen_sample <- filtered_metadata |>
    filter(sample_id == .env$s)

  known_author <- chosen_sample |> pull(known_author)
  unknown_author <- chosen_sample |> pull(unknown_author)
  problem <- chosen_sample |> pull(problem)
  save_loc <- paste0(result_save_loc, "/", problem, ".rds")

  # Skip if file already exists
  if (file.exists(save_loc)) {
    print(paste0("Already compared: ", problem, " skipping..."))
    next
  }
  print(paste0("Comparing: ", problem))

  if (is.null(all_known)) {
    all_known <- read_jsonl(known_loc) |>
      mutate(temp_id = make_temp_ids(doc_id))
    all_unknown <- read_jsonl(unknown_loc) |>
      mutate(temp_id = make_temp_ids(doc_id))
  }

  # `.env$` pins the comparison values to the loop variables even if the
  # data ever gains columns with the same names.
  known_text <- all_known |>
    filter(author == .env$known_author)

  # Every unknown row carries its own temp_id (previously the first
  # document's id was assigned to all rows).
  unknown_text <- all_unknown |>
    filter(author == .env$unknown_author)

  # Only the first unknown document id is recorded in the results
  unknown_doc <- unknown_text |> pull(temp_id) |> head(1)

  # Impostors: every author in the unknown file except the two under test
  impostor_text <- all_unknown |>
    filter(!(author %in% c(.env$known_author, .env$unknown_author)))

  known_corpus <- corpus(known_text)
  unknown_corpus <- corpus(unknown_text)
  impostor_corpus <- corpus(impostor_text)

  unknown_masked <- contentmask(unknown_corpus)
  impostor_masked <- contentmask(impostor_corpus)

  unknown_sents <- tokenize_sents(unknown_masked)
  impostor_sents <- tokenize_sents(impostor_masked)

  # We want to pull the temp_id to loop through the documents
  known_docs <- docvars(known_corpus) |> pull(temp_id)

  # Collect one result frame per (known document, repetition) and bind them
  # once at the end instead of growing a data frame inside the loop.
  results <- vector("list", length(known_docs) * n_repetitions)
  slot <- 0L

  for (kd in known_docs) {
    print(paste0("Known Document: ", kd))
    known_subset <- quanteda::corpus_subset(known_corpus, temp_id == kd)

    # Do this only for LambdaG
    known_masked <- contentmask(known_subset)
    known_sents <- tokenize_sents(known_masked)

    for (i in seq_len(n_repetitions)) {
      print(paste0("Repetition: ", i))

      result <- idiolect::lambdaG(unknown_sents, known_sents, impostor_sents)

      result_df <- cbind(
        known_doc = kd,
        unknown_doc = unknown_doc,
        repetition = i,
        problem = problem,
        known_author = known_author,
        unknown_author = unknown_author,
        result
      ) |>
        select(
          problem, known_doc, unknown_doc, repetition,
          known_author, unknown_author, score
        )

      slot <- slot + 1L
      results[[slot]] <- result_df
    }
  }

  final_result <- bind_rows(results)
  saveRDS(final_result, save_loc)
}

[1] "Comparing: Kevin.hyatt vs Kevin.hyatt"
successfully initialized (spaCy Version: 3.8.2, language model: en_core_web_sm)
successfully initialized (spaCy Version: 3.8.2, language model: en_core_web_sm)
successfully initialized (spaCy Version: 3.8.2, language model: en_core_web_sm)
successfully initialized (spaCy Version: 3.8.2, language model: en_core_web_sm)
[1] "Known Document: kevin_hyatt_mail_1"
successfully initialized (spaCy Version: 3.8.2, language model: en_core_web_sm)
successfully initialized (spaCy Version: 3.8.2, language model: en_core_web_sm)
[1] "Repetition: 1"
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=00s  
[1] "Repetition: 2"
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=00s  
[1] "Repetition: 3"
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=00s  
[1] "Repetition: 4"
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=00s  
[1] "Repetition: 5"
  |++++++++++++++++++++++++++++++++++++