# Combine Results and Score Using Idiolect

### Load Libraries

In [174]:
source("./utils.R")

In [175]:
suppressPackageStartupMessages(
  {
    library(dplyr)
    library(idiolect)
    library(readr)
    library(readxl)
    library(writexl)
    library(purrr)
  }
)

### Load Data

In [176]:
base_location <- '/Volumes/BCross/paraphrase examples slurm'

# Token-size problems: the problems available for each min_token_size value
# in the dataset.
token_size_problems <- base_location %>%
  paste0('/token_size_problems.xlsx') %>%
  read_excel()

# Raw scores: LLR scores aggregated across problems, with the averaging across
# phrase occurrences done first.
raw_score_data <- base_location %>%
  paste0('/score_by_token_size_avg.xlsx') %>%
  read_excel()

# LambdaG results for the Wiki test dataset.
lambdag_raw <- base_location %>%
  paste0('/lambdaG_results.csv') %>%
  read.csv()

### Create Final Dataset

Here we join the raw data with the problem dataset to filter out incorrect token sizes.

In [177]:
# Keep only the score rows whose (problem, min_token_size, corpus, target)
# combination is listed in token_size_problems, then rename the llr_unknown
# column to `score` so the table can be passed to performance().
score_data <- inner_join(
  raw_score_data,
  token_size_problems,
  by = c('problem', 'min_token_size', 'corpus', 'target')
) %>%
  rename('score' = 'llr_unknown')

head(score_data)

[38;5;246m# A tibble: 6 × 10[39m
  model problem         corpus known_author unknown_author target llr_no_context
  [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<chr>[39m[23m           [3m[38;5;246m<chr>[39m[23m  [3m[38;5;246m<chr>[39m[23m        [3m[38;5;246m<chr>[39m[23m          [3m[38;5;246m<lgl>[39m[23m           [3m[38;5;246m<dbl>[39m[23m
[38;5;250m1[39m gemma HOOTmag vs HOO… Wiki   HOOTmag      HOOTmag        TRUE            11.5 
[38;5;250m2[39m gemma HOOTmag vs Iai… Wiki   HOOTmag      Iain99         FALSE           20.3 
[38;5;250m3[39m gemma Hodja_Nasreddi… Wiki   Hodja_Nasre… Hodja_Nasredd… TRUE            37.2 
[38;5;250m4[39m gemma Hodja_Nasreddi… Wiki   Hodja_Nasre… HonestopL      FALSE           20.9 
[38;5;250m5[39m gemma HonestopL vs H… Wiki   HonestopL    HOOTmag        FALSE           14.2 
[38;5;250m6[39m gemma HonestopL vs H… Wiki   HonestopL    HonestopL      TRUE             8.05
[38;5;246m# ℹ 3 more variables: llr_known <db

### Calculate Performance

In [178]:
# One row per (model, corpus, min_token_size) combination present in the
# scores, sorted so the evaluation grid is processed in a stable order.
distinct_model_sizes <- score_data %>%
  distinct(model, corpus, min_token_size) %>%
  arrange(model, corpus, min_token_size)

head(distinct_model_sizes)

[38;5;246m# A tibble: 6 × 3[39m
  model corpus min_token_size
  [3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<chr>[39m[23m           [3m[38;5;246m<dbl>[39m[23m
[38;5;250m1[39m gemma Wiki                2
[38;5;250m2[39m gemma Wiki                3
[38;5;250m3[39m gemma Wiki                4
[38;5;250m4[39m gemma Wiki                5
[38;5;250m5[39m gpt2  Wiki                2
[38;5;250m6[39m gpt2  Wiki                3

In [179]:
#' Evaluate one model / corpus / minimum-token-size combination.
#'
#' Reads `score_data` and `token_size_problems` from the enclosing environment.
#'
#' @param model Model name to select in `score_data`.
#' @param corpus Corpus name to select in `score_data`.
#' @param min_token_size Minimum token size to select in both tables.
#' @return A data frame: the identifying columns `model`, `corpus` and
#'   `min_token_size` bound column-wise to `performance(...)$evaluation`
#'   (assumed to be a one-row data frame).
process_group <- function(model, corpus, min_token_size) {
  # Problems listed for the requested minimum token size.
  size_problems <- filter(
    token_size_problems,
    min_token_size == !!min_token_size
  )

  # Scores for exactly this combination, restricted to the listed problems.
  group_scores <- score_data %>%
    filter(
      model == !!model,
      corpus == !!corpus,
      min_token_size == !!min_token_size
    )
  group_scores <- inner_join(
    group_scores,
    size_problems,
    by = c('corpus', 'problem', 'min_token_size', 'target')
  )

  # Only the evaluation table of the performance() result is kept.
  evaluation <- performance(group_scores)$evaluation

  # Prefix the metrics with the columns that identify this combination.
  id_cols <- data.frame(
    model = model,
    corpus = corpus,
    min_token_size = min_token_size
  )
  cbind(id_cols, evaluation)
}

In [180]:
# Evaluate every (model, corpus, min_token_size) row and stack the per-group
# evaluation rows into a single data frame.
results <- pmap_dfr(distinct_model_sizes, process_group)

head(arrange(results, corpus, min_token_size, model))

Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases
Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases
Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases
Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases
Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases
Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases
Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases
Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases
Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases
Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases
Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases
Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases
Setting levels: control = FA

  model corpus min_token_size      Cllr  Cllr_min      EER Mean TRUE LLR
1 gemma   Wiki              2 0.8619175 0.7444600 30.11696     0.3143858
2  gpt2   Wiki              2 0.8618176 0.7502645 29.64912     0.3041526
3 llama   Wiki              2 0.8770738 0.7874005 31.14035     0.2745068
4  qwen   Wiki              2 0.8776719 0.7682051 30.73684     0.2710594
5 gemma   Wiki              3 0.7607292 0.6629074 23.76564     1.0192055
6  gpt2   Wiki              3 0.7704295 0.6742033 25.77778     0.8408998
  Mean FALSE LLR TRUE trials FALSE trials       AUC Balanced Accuracy Precision
1     -0.2345160         114          114 0.7686543         0.6919643 0.7009346
2     -0.2375316         114          114 0.7683355         0.6919643 0.7009346
3     -0.2087909         114          114 0.7539063         0.6785714 0.6886792
4     -0.2081527         114          114 0.7507972         0.6741071 0.6788991
5     -0.3642962         113          112 0.8384111         0.7469697 0.7956989
6     -0.

### LambdaG Results

In [181]:
# Build the LambdaG grid: one row per (corpus, min_token_size) pair found in
# distinct_model_sizes, labelled with model = "LambdaG" as the first column.
distinct_corpus_sizes <- distinct_model_sizes %>%
  distinct(corpus, min_token_size) %>%
  arrange(corpus, min_token_size) %>%
  mutate(model = "LambdaG") %>%
  relocate(model)

In [182]:
# Display the LambdaG (corpus, min_token_size) grid.
distinct_corpus_sizes

[38;5;246m# A tibble: 4 × 3[39m
  model   corpus min_token_size
  [3m[38;5;246m<chr>[39m[23m   [3m[38;5;246m<chr>[39m[23m           [3m[38;5;246m<dbl>[39m[23m
[38;5;250m1[39m LambdaG Wiki                2
[38;5;250m2[39m LambdaG Wiki                3
[38;5;250m3[39m LambdaG Wiki                4
[38;5;250m4[39m LambdaG Wiki                5

In [183]:
#' Evaluate the LambdaG scores for one corpus / minimum-token-size combination.
#'
#' Reads `token_size_problems` and `lambdag_results` from the enclosing
#' environment.
#'
#' @param model Label written to the `model` column of the result; it is not
#'   used for filtering.
#' @param corpus Corpus name used to select rows of `token_size_problems`.
#' @param min_token_size Minimum token size used to select rows of
#'   `token_size_problems`.
#' @return A data frame: the identifying columns `model`, `corpus` and
#'   `min_token_size` bound column-wise to `performance(...)$evaluation`
#'   (assumed to be a one-row data frame).
process_group_lambdag <- function(model, corpus, min_token_size) {
  # Problems listed for this corpus and minimum token size.
  problems <- token_size_problems %>%
    filter(corpus == !!corpus,
           min_token_size == !!min_token_size)

  # NOTE(review): `lambdag_results` is not created anywhere in the visible
  # notebook (only `lambdag_raw` is loaded) - confirm where it is defined and
  # whether it should be derived from `lambdag_raw`.
  # Keep only the LambdaG rows that belong to the selected problems.
  filtered_lambdag <- lambdag_results %>%
    inner_join(problems, by = c('problem', 'target'))

  # Only the evaluation table of the performance() result is kept.
  perf <- performance(filtered_lambdag)$evaluation

  # Prefix the metrics with the columns that identify this combination.
  cbind(
    data.frame(model = model,
               corpus = corpus,
               min_token_size = min_token_size),
    perf
  )
}

In [184]:
# Evaluate LambdaG once per (corpus, min_token_size) row and stack the rows.
results_lambdag <- pmap_dfr(distinct_corpus_sizes, process_group_lambdag)

Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases
Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases
Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases
Setting levels: control = FALSE, case = TRUE
Setting direction: controls < cases


In [185]:
# Preview the LambdaG evaluation rows.
head(results_lambdag)

    model corpus min_token_size      Cllr  Cllr_min      EER Mean TRUE LLR
1 LambdaG   Wiki              2 0.6476871 0.5243020 16.66667      1.092086
2 LambdaG   Wiki              3 0.6374915 0.5171254 16.44444      1.139849
3 LambdaG   Wiki              4 0.6061458 0.4756306 14.59695      1.331042
4 LambdaG   Wiki              5 0.6872042 0.4197365 15.33742      1.709290
  Mean FALSE LLR TRUE trials FALSE trials       AUC Balanced Accuracy Precision
1     -0.7585601         114          114 0.9011480         0.8303571 0.8303571
2     -0.7887780         113          112 0.9049140         0.8325962 0.8363636
3     -0.8316408          92           55 0.9171908         0.8545073 0.9250000
4     -0.6700987          50           12 0.8895833         0.8666667 0.9756098
     Recall        F1 TP FN FP TN
1 0.8303571 0.8303571 93 19 19 93
2 0.8288288 0.8325792 92 19 18 92
3 0.8222222 0.8705882 74 16  6 47
4 0.8333333 0.8988764 40  8  1  9

### Combine Results with LambdaG Results

In [186]:
# Stack the language-model and LambdaG evaluations, then sort so that all
# models for the same corpus and token size sit next to each other.
results_combined <- arrange(
  rbind(results, results_lambdag),
  corpus, min_token_size, model
)

In [187]:
# Preview the first ten rows of the combined table.
head(results_combined, 10)

     model corpus min_token_size      Cllr  Cllr_min      EER Mean TRUE LLR
1  LambdaG   Wiki              2 0.6476871 0.5243020 16.66667     1.0920856
2    gemma   Wiki              2 0.8619175 0.7444600 30.11696     0.3143858
3     gpt2   Wiki              2 0.8618176 0.7502645 29.64912     0.3041526
4    llama   Wiki              2 0.8770738 0.7874005 31.14035     0.2745068
5     qwen   Wiki              2 0.8776719 0.7682051 30.73684     0.2710594
6  LambdaG   Wiki              3 0.6374915 0.5171254 16.44444     1.1398488
7    gemma   Wiki              3 0.7607292 0.6629074 23.76564     1.0192055
8     gpt2   Wiki              3 0.7704295 0.6742033 25.77778     0.8408998
9    llama   Wiki              3 0.7753341 0.6722298 25.00000     0.8893752
10    qwen   Wiki              3 0.7943285 0.7049314 26.18090     0.7520333
   Mean FALSE LLR TRUE trials FALSE trials       AUC Balanced Accuracy
1      -0.7585601         114          114 0.9011480         0.8303571
2      -0.2345160     

In [188]:
# Save the combined summary next to the input files.
write_xlsx(
  results_combined,
  paste0(base_location, "/idiolect_token_results_summary_avg.xlsx")
)