# Idiolect LambdaG Method

This notebook runs the LambdaG method from the `idiolect` package. It is used as a baseline for my methods.

In [21]:
# remove.packages("idiolect")
# install.packages("idiolect")

In [22]:
suppressWarnings(
  suppressPackageStartupMessages(
    {
      library(writexl)
      library(idiolect)
      library(dplyr)
    }
  )
)

In [23]:
# Read the serialized test-set objects from disk.
known <- readRDS("/Volumes/BCross/datasets/author_verification/test/known_raw.rds")
unknown <- readRDS("/Volumes/BCross/datasets/author_verification/test/unknown_raw.rds")
metadata <- readRDS("/Volumes/BCross/datasets/author_verification/test/metadata.rds")

# Distinct values of the `corpus` column of the problem table.
corpora <- unique(dplyr::pull(metadata, corpus))

In [24]:
#' Score a table of verification problems with idiolect's LambdaG
#'
#' For every row of `metadata` the known-author texts, the questioned texts
#' and a reference set (all other authors of the same corpus, taken from
#' `known`) are content-masked with POSnoise, tokenised into sentences and
#' passed to `lambdaG()`. The call is repeated `n_reps` times per problem and
#' the scores are averaged.
#'
#' @param metadata Data frame with one row per problem. Must contain the
#'   columns `corpus`, `known_author` and `unknown_author`.
#' @param known Corpus accepted by `quanteda::corpus_subset()` whose docvars
#'   include `corpus` and `author`; source of known and reference texts.
#' @param unknown Corpus with the same docvars; source of questioned texts.
#' @param n_reps Number of times `lambdaG()` is run per problem before
#'   averaging. Defaults to 5.
#' @return The columns of `metadata` plus a logical `target` (known and
#'   unknown author are the same) and the mean `score`, one row per distinct
#'   problem. A zero-row `metadata` is returned as-is with empty `target` and
#'   `score` columns added.
run_lambdag <- function(metadata, known, unknown, n_reps = 5L) {

  if (!is.data.frame(metadata)) {
    stop("`metadata` must be a data frame.", call. = FALSE)
  }
  required_cols <- c("corpus", "known_author", "unknown_author")
  missing_cols <- setdiff(required_cols, names(metadata))
  if (length(missing_cols) > 0L) {
    stop(
      "`metadata` is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }
  if (!is.numeric(n_reps) || length(n_reps) != 1L ||
      is.na(n_reps) || n_reps < 1) {
    stop("`n_reps` must be a single positive number.", call. = FALSE)
  }
  n_reps <- as.integer(n_reps)

  # Compare as character so the flag agrees with the as.character() values
  # used for subsetting below (and factor columns with different level sets
  # do not error).
  metadata$target <- as.character(metadata$known_author) ==
    as.character(metadata$unknown_author)

  # Nothing to score: return the empty table with the expected columns rather
  # than failing in the aggregation step.
  if (nrow(metadata) == 0L) {
    metadata$score <- numeric(0)
    return(metadata)
  }

  # Content-mask a corpus with POSnoise and split it into sentence tokens.
  mask_sentences <- function(x) {
    x |>
      idiolect::contentmask(algorithm = "POSnoise") |>
      quanteda::tokens(what = "sentence")
  }

  out <- vector("list", nrow(metadata) * n_reps)
  idx <- 1L

  for (i in seq_len(nrow(metadata))) {

    selected_problem <- metadata[i, , drop = FALSE]

    # NOTE: these locals are referenced inside the corpus_subset() filter
    # expressions below, where `corpus` and `author` refer to docvars.
    k  <- as.character(selected_problem$known_author)
    u  <- as.character(selected_problem$unknown_author)
    c_ <- as.character(selected_problem$corpus)

    known_docs <- quanteda::corpus_subset(known, corpus == c_ & author == k)
    unknown_docs <- quanteda::corpus_subset(unknown, corpus == c_ & author == u)
    # Reference population: same corpus, neither of the two authors involved.
    ref_docs <- quanteda::corpus_subset(
      known,
      corpus == c_ & author != k & author != u
    )

    known_subset <- mask_sentences(known_docs)
    unknown_subset <- mask_sentences(unknown_docs)
    ref_subset <- mask_sentences(ref_docs)

    # Repeated runs are averaged afterwards — presumably to smooth run-to-run
    # variation in lambdaG(); confirm against the idiolect documentation.
    for (j in seq_len(n_reps)) {
      test_results <- idiolect::lambdaG(unknown_subset, known_subset, ref_subset)

      out[[idx]] <- cbind(selected_problem, rep = j, score = test_results$score)
      idx <- idx + 1L
    }
  }

  # Collapse the repetitions to one mean score per distinct problem row.
  dplyr::bind_rows(out) |>
    dplyr::group_by(dplyr::across(dplyr::all_of(names(metadata)))) |>
    dplyr::summarise(score = mean(score, na.rm = TRUE), .groups = "drop")
}

In [25]:
# Preview the first two rows of the problem table.
metadata[seq_len(2L), ]

# A tibble: 2 × 4
  problem                                     corpus known_author unknown_author
  <chr>                                       <chr>  <chr>        <chr>         
1 Ibekwe-SanJuan, Fidelia vs Ibekwe-SanJuan,… ACL    Ibekwe-SanJ… Ibekwe-SanJua…
2 Ibekwe-SanJuan, Fidelia vs Ide, Nancy       ACL    Ibekwe-SanJ… Ide, Nancy    

In [26]:
# Run the baseline on the first two problems only.
test_results <- run_lambdag(
  metadata = metadata[seq_len(2L), ],
  known = known,
  unknown = unknown
)

successfully initialized (spaCy Version: 3.8.2, language model: en_core_web_sm)
successfully initialized (spaCy Version: 3.8.2, language model: en_core_web_sm)
successfully initialized (spaCy Version: 3.8.2, language model: en_core_web_sm)
  |                                                  | 0 % ~calculating   |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=01s  
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=01s  
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=01s  
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=01s  
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=01s  
successfully initialized (spaCy Version: 3.8.2, language model: en_core_web_sm)
successfully initialized (spaCy Version: 3.8.2, language model: en_core_web_sm)
successfully initialized (spaCy Version: 3.8.2, language model: en_core_web_sm)
  |++++++++++++++++++++++++++++++++++++++++++++++++++| 100% elapsed=01s  
  |+

In [27]:
# Print the per-problem scores returned by run_lambdag().
test_results

# A tibble: 2 × 6
  problem                        corpus known_author unknown_author target score
  <chr>                          <chr>  <chr>        <chr>          <lgl>  <dbl>
1 Ibekwe-SanJuan, Fidelia vs Ib… ACL    Ibekwe-SanJ… Ibekwe-SanJua… TRUE   -42.4
2 Ibekwe-SanJuan, Fidelia vs Id… ACL    Ibekwe-SanJ… Ide, Nancy     FALSE    0  