In [1]:
DATAPATH = "results"

# Installs and imports

In [2]:
#!pip install pingouin > /dev/null

In [3]:
import pingouin as pg
import pandas as pd
import numpy as np
import glob
import shutil
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve, auc, f1_score
from sklearn import metrics
import pickle
import os
from tqdm import tqdm
import gzip
import scipy.stats as stats
import itertools
from statsmodels.stats.anova import AnovaRM
from collections import Counter

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


# Data loading

In [4]:
label_names = ["human", "machine"] #0, 1
id2label = {idx:label for idx, label in enumerate(label_names)}
label2id = {v:k for k,v in id2label.items()}

In [5]:
multidomain = pd.read_csv(f"{DATAPATH.replace('results', 'dataset')}/multidomain.csv.gz")

In [6]:
temp = multidomain[(multidomain.split == 'test') & (multidomain.domain == 'social_media')]
temp['source'] = temp['source'].str.replace('multisocial_', '')
temp.groupby(['source'])['length'].describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['source'] = temp['source'].str.replace('multisocial_', '')


Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
discord,27652.0,16.213764,26.201387,3.0,5.0,9.0,15.0,197.0
gab,30740.0,25.979538,30.70871,3.0,9.0,16.0,30.0,200.0
telegram,44385.0,23.82039,32.57946,3.0,7.0,12.0,24.0,200.0
twitter,25172.0,25.470563,26.693076,3.0,11.0,18.0,30.0,200.0
whatsapp,10878.0,30.622909,33.921798,3.0,9.0,18.0,38.0,200.0


In [7]:
test_results = []

In [8]:

files = glob.glob(DATAPATH + "/pretrained/*.csv.gz")
for f in tqdm(files, total= len(files)):
   df = pd.read_csv(f)
   df = pd.concat([multidomain, df], axis=1)
   df['label'] = ['human' if 'human' in x else 'machine' for x in df['multi_label']]
   df['predictions'] = ['human' if (('human' in str(x)) or ('0' in str(x))) else 'machine' for x in df['predictions']]
   df['Category'] = 'P'

   if ('ruroberta' in f.lower()):
     df['prediction_probs'] = 1 - df['prediction_probs']
   test_results.append({f.split('/')[-1].replace('multidomain.csv_', '').replace('.csv.gz', '') : df})

100%|█████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00,  1.11it/s]


In [9]:

files = glob.glob(DATAPATH + "/statistical/*.csv.gz")
for f in tqdm(files, total= len(files)):
   df = pd.read_csv(f)
   df = pd.concat([multidomain, df], axis=1)
   df['label'] = ['human' if 'human' in x else 'machine' for x in df['multi_label']]
   df['predictions'] = ['human' if (('human' in str(x)) or ('0' in str(x))) else 'machine' for x in df['predictions']]
   df['Category'] = 'S'

   if ('llr' in f.lower()):
     continue
     df['prediction_probs'] = 1 - df['prediction_probs']
   if ('s5' in f.lower()):
     if ('gpt-j' not in f.lower()): continue #use only GPT-J based S5 metric
     df['ll'] = 1 - df['ll']
     temp = pd.DataFrame()
     temp[['ll', 'entropy', 'rank', 'log-rank', 'llm_deviation']] = 1 / df[['ll', 'entropy', 'rank', 'log-rank', 'llm_deviation']]
     temp = temp.fillna(0.0)
     df['prediction_probs'] = temp.prod(axis=1).fillna(0.0).replace(np.inf, 0.0)
     
     #llm_deviation
     temp = df.copy()
     temp['prediction_probs'] = 1 - temp['llm_deviation']
     test_results.append({f.split('/')[-1].replace('multidomain.csv_', '').replace('.csv.gz', '').replace('s5', 'llm_deviation') : temp})

     #lrr
     temp = df.copy()
     temp['prediction_probs'] = temp['ll'] / temp['log-rank']
     temp['prediction_probs'] = temp['prediction_probs'].fillna(0.0).replace(np.inf, 0.0)
     test_results.append({f.split('/')[-1].replace('multidomain.csv_', '').replace('.csv.gz', '').replace('s5', 'lrr') : temp})
   test_results.append({f.split('/')[-1].replace('multidomain.csv_', '').replace('.csv.gz', '') : df})

100%|█████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:05<00:00,  1.70s/it]


In [10]:

files = glob.glob(DATAPATH + "/finetuned/*.csv.gz")
for f in tqdm(files, total= len(files)):
   #print(f)
   df = pd.read_csv(f)
   if 'prediction_probs' not in df.columns:
    df.columns.values[-1] = 'prediction_probs'
   if 'predictions' not in df.columns: df['predictions'] = 0
   df = pd.concat([multidomain, df[['predictions', 'prediction_probs']]], axis=1)
   df['prediction_probs'] = df['prediction_probs'].fillna(0.0).replace(np.inf, 0.0)
   df['label'] = ['human' if 'human' in x else 'machine' for x in df['multi_label']]
   df['predictions'] = ['human' if (('human' in str(x)) or ('0' in str(x))) else 'machine' for x in df['predictions']]
   df['Category'] = 'F'
   test_results.append({f.split('/')[-1].replace('multidomain.csv_', '').replace('.csv.gz', '') : df})

100%|███████████████████████████████████████████████████████████████████████████████████| 73/73 [00:56<00:00,  1.29it/s]


# Results analysis

## Definitions

In [11]:
#https://github.com/scikit-learn/scikit-learn/issues/26808
def report_np(y_true, y_pred, n_classes):

    classes = np.arange(n_classes)[None, :]
    supp = classes == y_true[:, None]
    tmp = classes == y_pred[:, None]
    hits = (tmp & supp).sum(axis=0)
    pred = tmp.sum(axis=0)
    n = y_true.shape[0]

    supp = supp.sum(axis=0)
    #https://stackoverflow.com/questions/26248654/how-to-return-0-with-divide-by-zero
    #prec = hits / pred
    pred_inv = np.array([1/i if i!=0 else 0 for i in pred])
    prec = hits * pred_inv
    #prec = np.divide(hits, pred, out=np.zeros(hits.shape, dtype=float), where=pred!=0)
    #rec = hits / supp
    supp_inv = np.array([1/i if i!=0 else 0 for i in supp])
    rec = hits * supp_inv
    #rec = np.divide(hits, supp, out=np.zeros(hits.shape, dtype=float), where=supp!=0)
    balanced_acc = rec.mean()
    prec_rec = prec + rec
    prec_rec_mult = 2 * prec * rec
    #f1 = prec_rec_mult / prec_rec
    prec_rec_inv = np.array([1/i if i!=0 else 0 for i in prec_rec])
    f1 = prec_rec_mult * prec_rec_inv
    #f1 = np.divide(prec_rec_mult, prec_rec, out=np.zeros(prec_rec_mult.shape, dtype=float), where=prec_rec!=0)
    acc = hits.sum() / n
    stacked = np.vstack([prec, rec, f1])
    macro = stacked.mean(axis=1)
    weighted = stacked @ supp / n

    return hits, pred - hits, acc, balanced_acc, supp, prec, rec, f1 , macro, weighted

def report_todict(hits, miss, acc, balanced_acc, supp, prec, rec, f1 , macro, weighted):
  report = {}
  TN = hits[0]
  FN = miss[0]
  TP = hits[1]
  FP = miss[1]
  report['fpr'] = FP/(FP+TN) if (FP+TN) > 0 else 0
  report['fnr'] = FN/(TP+FN) if (TP+FN) > 0 else 0
  human = {}
  human['precision'] = prec[0]
  human['recall'] = rec[0]
  human['f1-score'] = f1[0]
  human['support'] = supp[0]
  report['human'] = human
  machine = {}
  machine['precision'] = prec[1]
  machine['recall'] = rec[1]
  machine['f1-score'] = f1[1]
  machine['support'] = supp[1]
  report['machine'] = machine
  report['accuracy'] = acc
  report['balanced accuracy'] = balanced_acc
  macro_avg = {}
  macro_avg['precision'] = macro[0]
  macro_avg['recall'] = macro[1]
  macro_avg['f1-score'] = macro[2]
  report['macro avg'] = macro_avg
  weighted_avg = {}
  weighted_avg['precision'] = weighted[0]
  weighted_avg['recall'] = weighted[1]
  weighted_avg['f1-score'] = weighted[2]
  report['weighted avg'] = weighted_avg
  return report

In [12]:
def rename_detector(detector_name):
  detector_name = (detector_name
    .lower()
    .replace('meta-', '')
    .replace('orzhan-', '')
    .replace('nealcly-', '')
    .replace('hello-simpleai-', '')
    .replace('andreas122001-', '')
    .replace('openai-community-', '')
    .replace('_threshold', '')
    .replace('metric', '')
    .replace('gpt-j-6b_', '')
    .replace('s5', 'S5')
    .replace('llm_deviation', 'LLM-Deviation')
    .replace('lrr', 'DetectLLM-LRR')
    .replace('llr', 'DetectLLM-LRR')
    .replace('-first-social-media', '-MultiSocial')
    .replace('-first-news', '-MULTITuDE')
    .replace('xlm', 'XLM')
    .replace('roberta', 'RoBERTa')
    .replace('mdeberta', 'mDeBERTa')
    .replace('bert', 'BERT')
    .replace('bloomz', 'BLOOMZ')
    .replace('falcon', 'Falcon')
    .replace('mistral', 'Mistral')
    .replace('llama', 'Llama')
    .replace('aya', 'Aya')
    .replace('opt-iml-max', 'OPT-IML-Max')
    .replace('openai', 'OpenAI')
    .replace('detection-longformer', 'Longformer Detector')
    .replace('detect', 'Detect')
    .replace('fast', 'Fast')
    .replace('chatgpt', 'ChatGPT')
    #.replace('gpt-j', 'GPT-J')
    .replace('gpt', 'GPT')
    .replace('chinese', 'Chinese')
    .replace('longformer', 'Longformer')
    .replace('binoculars', 'Binoculars')
    )

  return detector_name

In [13]:
rename_generators = {'opt-iml-max-30b': 'OPT-IML-Max-30b', 'Mistral-7B-Instruct-v0.2': 'Mistral-7B-Instruct-v0.2', 'vicuna-13b': 'Vicuna-13b',
       'gpt-3.5-turbo-0125': 'GPT-3.5-Turbo-0125', 'aya-101': 'Aya-101', 'v5-Eagle-7B-HF': 'v5-Eagle-7B-HF', 'gemini': 'Gemini'}

In [14]:
rename_platforms = {'gab': 'Gab', 'whatsapp': 'WhatsApp', 'twitter': 'Twitter', 'telegram': 'Telegram', 'discord': 'Discord', '{all}': '{all}'}

In [15]:
#extract_platforms = {'-cross-platform-1-cut-majority': '_all', '-cross-platform-4-cut-majority': '_Gab', '-cross-platform-6-cut-majority': '_WhatsApp', '-cross-platform-3-cut-majority': '_Twitter', '-cross-platform-2-cut-majority': '_Telegram', '-cross-platform-5-cut-majority': '_Discord'}
def extract_platform(s):
  s = (s.replace('-cross-platform-1-cut-majority-one-fifth-train-size', '_all')
        .replace('-cross-platform-1-cut-majority', '_all-x5')
        .replace('-cross-platform-4-cut-majority', '_Gab')
        .replace('-cross-platform-6-cut-majority', '_WhatsApp')
        .replace('-cross-platform-3-cut-majority', '_Twitter')
        .replace('-cross-platform-2-cut-majority', '_Telegram')
        .replace('-cross-platform-5-cut-majority', '_Discord')
  )
  return s

In [16]:
def extract_language(s):
  s = (s.replace('-cross-language-1-cut-majority-one-third-train-size', '_{en-es-ru}')
        .replace('-cross-language-1-cut-majority', '_{en-es-ru-x3}')
        .replace('-cross-language-2-cut-majority', '_en')
        .replace('-cross-language-3-cut-majority', '_es')
        .replace('-cross-language-4-cut-majority', '_ru')
  )
  return s

In [17]:
to_category = {}
for detector in tqdm(test_results, total=len(test_results)):
  for model,data in detector.items():
    to_category[model] = data.Category.unique()[0]

100%|███████████████████████████████████████████████████████████████████████████████████| 83/83 [00:01<00:00, 47.80it/s]


In [18]:
to_language_family = {
    'el': 'Greek',
    'ar': 'Semitic', 'zh': 'Sino-Tibetan',
    'ca': 'Romance', 'es': 'Romance', 'pt': 'Romance',
    'en': 'Germanic', 'ga': 'Celtic', 'gd': 'Celtic',
    'hu': 'Uralic', 'et': 'Uralic',
    'de': 'Germanic', 'nl': 'Germanic',
    'cs': 'Slavic-Latin', 'pl': 'Slavic-Latin', 'sk': 'Slavic-Latin',
    'hr': 'Slavic-Latin', 'sl': 'Slavic-Latin',
    'ro': 'Romance',
    'bg': 'Slavic-Cyrillic', 'uk': 'Slavic-Cyrillic',  'ru': 'Slavic-Cyrillic',
}

In [19]:
def highlight_categories(s):
    v = s['Category']
    color = 'background-color: #b6d7a8;' if v == 'F' else 'background-color: #f9cb9c;' if v == 'S' else 'background-color: #9fc5e8;'
    return [color for v in s]

In [20]:
def auc_roc_reliable(fpr, tpr, labels):
    res =  auc(fpr, tpr)
    counts = Counter(labels)
    return res if (len(counts) and counts[0] >= 10 and counts[1] >= 10) else pd.NA

## Benchmark Comparison

In [21]:

#social media test data - th calibration for macroF1 based on train data
auc_dict = {}
for detector in tqdm(test_results, total=len(test_results)):
  for model,data in detector.items():
    train = data[(data.split == 'train') & (data.domain == 'social_media') & (~data.multi_label.str.contains('gemini'))]
    temp = data[(data.split == 'test') & (data.domain == 'social_media')]
  if (to_category[model] == 'F') and ('social-media' not in model): continue
  #if (temp['prediction_probs'].min() >= 0.5) and (temp['prediction_probs'].max() <= 1.0):
    #temp.loc[temp.predictions == 'human', 'prediction_probs'] = 1 - temp['prediction_probs'] #adjust human prediction prob to be in range <0, 0.5)
  labels = [label2id[x] for x in temp['label']]
  fpr, tpr, thresholds = roc_curve(labels, temp['prediction_probs'])
  auc_dict[model] = {'auc':  auc_roc_reliable(fpr, tpr, labels), 'th_optim': thresholds[np.argmax(tpr - fpr)], 'tpr_1%fpr': tpr[fpr <= 0.01][-1], 'tpr_5%fpr': tpr[fpr <= 0.05][-1], 'th_1%fpr': thresholds[fpr <= 0.01][-1], 'th_3%fpr': thresholds[fpr <= 0.03][-1], 'th_5%fpr': thresholds[fpr <= 0.05][-1], 'th_10%fpr': thresholds[fpr <= 0.10][-1], 'th_15%fpr': thresholds[fpr <= 0.15][-1], 'th_20%fpr': thresholds[fpr <= 0.20][-1], 'th_25%fpr': thresholds[fpr <= 0.25][-1], 'th_30%fpr': thresholds[fpr <= 0.30][-1], 'th_40%fpr': thresholds[fpr <= 0.40][-1], 'th_50%fpr': thresholds[fpr <= 0.50][-1]}

  train_labels = [label2id[x] for x in train['label']]
  fpr, tpr, thresholds = roc_curve(train_labels, train['prediction_probs'])
  predictions = [1 if ((x >= thresholds[fpr <= 0.05][-1])) else 0 for x in temp['prediction_probs']]
  auc_dict[model]['cr_5'] = report_todict(*report_np(np.array(labels), np.array(predictions), 2))
  predictions = [1 if ((x >= thresholds[np.argmax(tpr - fpr)])) else 0 for x in temp['prediction_probs']]
  auc_dict[model]['cr_o'] = report_todict(*report_np(np.array(labels), np.array(predictions), 2))
results_all = pd.DataFrame()
for model, v in tqdm(auc_dict.items(), total=len(auc_dict)):
  temp = pd.DataFrame({'Detector': rename_detector(model), 'Category': to_category[model], 'AUC ROC': v['auc'], 'MacroF1@5%FPR': v['cr_5']['macro avg']['f1-score'], 'MacroF1@optim': v['cr_o']['macro avg']['f1-score']}, index=[0])
  results_all = pd.concat([results_all, temp])
results_all = results_all.sort_values(by=['AUC ROC'], ascending=False).reset_index(drop=True)
results_all.index = results_all.index + 1
results_all.style.format(na_rep=0, precision=4)

100%|███████████████████████████████████████████████████████████████████████████████████| 83/83 [02:05<00:00,  1.51s/it]
100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 465.59it/s]


Unnamed: 0,Detector,Category,AUC ROC,MacroF1@5%FPR,MacroF1@optim
1,Llama-3-8b-MultiSocial,F,0.9769,0.8696,0.8567
2,Mistral-7b-v0.1-MultiSocial,F,0.9768,0.8692,0.8619
3,Aya-101-MultiSocial,F,0.9731,0.8462,0.8457
4,Falcon-rw-1b-MultiSocial,F,0.9592,0.781,0.8082
5,BLOOMZ-3b-MultiSocial,F,0.9582,0.7843,0.8107
6,XLM-RoBERTa-large-MultiSocial,F,0.9553,0.784,0.8075
7,mDeBERTa-v3-base-MultiSocial,F,0.9544,0.7652,0.7947
8,BLOOMZ-3b-mixed-Detector,P,0.7553,0.3024,0.5882
9,DetectLLM-LRR,S,0.7464,0.2523,0.658
10,LLM-Deviation,S,0.7454,0.2497,0.657


In [22]:

#social media test data - th calibration for macroF1 based on train data
auc_dict = {}
for detector in tqdm(test_results, total=len(test_results)):
  for model,data in detector.items():
    train = data[(data.split == 'train') & (data.domain == 'social_media') & (~data.multi_label.str.contains('gemini'))]
    temp = data[(data.split == 'test') & (data.domain == 'social_media')]
  if (to_category[model] == 'F') and ('social-media' not in model): continue
  #if (temp['prediction_probs'].min() >= 0.5) and (temp['prediction_probs'].max() <= 1.0):
    #temp.loc[temp.predictions == 'human', 'prediction_probs'] = 1 - temp['prediction_probs'] #adjust human prediction prob to be in range <0, 0.5)
  labels = [label2id[x] for x in temp['label']]
  fpr, tpr, thresholds = roc_curve(labels, temp['prediction_probs'])
  auc_dict[model] = {'auc':  auc_roc_reliable(fpr, tpr, labels), 'th_optim': thresholds[np.argmax(tpr - fpr)], 'tpr_1%fpr': tpr[fpr <= 0.01][-1], 'tpr_5%fpr': tpr[fpr <= 0.05][-1], 'th_1%fpr': thresholds[fpr <= 0.01][-1], 'th_3%fpr': thresholds[fpr <= 0.03][-1], 'th_5%fpr': thresholds[fpr <= 0.05][-1], 'th_10%fpr': thresholds[fpr <= 0.10][-1], 'th_15%fpr': thresholds[fpr <= 0.15][-1], 'th_20%fpr': thresholds[fpr <= 0.20][-1], 'th_25%fpr': thresholds[fpr <= 0.25][-1], 'th_30%fpr': thresholds[fpr <= 0.30][-1], 'th_40%fpr': thresholds[fpr <= 0.40][-1], 'th_50%fpr': thresholds[fpr <= 0.50][-1]}

  train_labels = [label2id[x] for x in train['label']]
  fpr, tpr, thresholds = roc_curve(train_labels, train['prediction_probs'])
  predictions = [1 if ((x >= thresholds[fpr <= 0.05][-1])) else 0 for x in temp['prediction_probs']]
  auc_dict[model]['cr_5'] = report_todict(*report_np(np.array(labels), np.array(predictions), 2))
results_all = pd.DataFrame()
for model, v in tqdm(auc_dict.items(), total=len(auc_dict)):
  temp = pd.DataFrame({'Detector': rename_detector(model), 'Category': to_category[model], 'AUC ROC': v['auc'], 'MacroF1@5%FPR': v['cr_5']['macro avg']['f1-score']}, index=[0])
  results_all = pd.concat([results_all, temp])
results_all = results_all.sort_values(by=['AUC ROC'], ascending=False).reset_index(drop=True)
results_all.index = results_all.index + 1
results_all.style.format(na_rep=0, precision=4)

100%|███████████████████████████████████████████████████████████████████████████████████| 83/83 [01:15<00:00,  1.10it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 723.77it/s]


Unnamed: 0,Detector,Category,AUC ROC,MacroF1@5%FPR
1,Llama-3-8b-MultiSocial,F,0.9769,0.8696
2,Mistral-7b-v0.1-MultiSocial,F,0.9768,0.8692
3,Aya-101-MultiSocial,F,0.9731,0.8462
4,Falcon-rw-1b-MultiSocial,F,0.9592,0.781
5,BLOOMZ-3b-MultiSocial,F,0.9582,0.7843
6,XLM-RoBERTa-large-MultiSocial,F,0.9553,0.784
7,mDeBERTa-v3-base-MultiSocial,F,0.9544,0.7652
8,BLOOMZ-3b-mixed-Detector,P,0.7553,0.3024
9,DetectLLM-LRR,S,0.7464,0.2523
10,LLM-Deviation,S,0.7454,0.2497


In [23]:
results_all.style.apply(highlight_categories, axis=1).hide('Category', axis=1)

Unnamed: 0,Detector,AUC ROC,MacroF1@5%FPR
1,Llama-3-8b-MultiSocial,0.976945,0.869627
2,Mistral-7b-v0.1-MultiSocial,0.976791,0.869152
3,Aya-101-MultiSocial,0.973101,0.846166
4,Falcon-rw-1b-MultiSocial,0.959221,0.780969
5,BLOOMZ-3b-MultiSocial,0.958181,0.784282
6,XLM-RoBERTa-large-MultiSocial,0.955259,0.784015
7,mDeBERTa-v3-base-MultiSocial,0.954366,0.765163
8,BLOOMZ-3b-mixed-Detector,0.755251,0.302403
9,DetectLLM-LRR,0.746401,0.252282
10,LLM-Deviation,0.745394,0.249699


In [24]:
print(results_all.reset_index().style.apply(highlight_categories, axis=1).hide('Category', axis=1).format(na_rep=0, precision=4).applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).hide(
    axis=0).to_latex(convert_css=True))

\begin{tabular}{rlrr}
\bfseries index & \bfseries Detector & \bfseries AUC ROC & MacroF1@5%FPR \\
{\cellcolor[HTML]{B6D7A8}} 1 & {\cellcolor[HTML]{B6D7A8}} Llama-3-8b-MultiSocial & {\cellcolor[HTML]{B6D7A8}} 0.9769 & {\cellcolor[HTML]{B6D7A8}} 0.8696 \\
{\cellcolor[HTML]{B6D7A8}} 2 & {\cellcolor[HTML]{B6D7A8}} Mistral-7b-v0.1-MultiSocial & {\cellcolor[HTML]{B6D7A8}} 0.9768 & {\cellcolor[HTML]{B6D7A8}} 0.8692 \\
{\cellcolor[HTML]{B6D7A8}} 3 & {\cellcolor[HTML]{B6D7A8}} Aya-101-MultiSocial & {\cellcolor[HTML]{B6D7A8}} 0.9731 & {\cellcolor[HTML]{B6D7A8}} 0.8462 \\
{\cellcolor[HTML]{B6D7A8}} 4 & {\cellcolor[HTML]{B6D7A8}} Falcon-rw-1b-MultiSocial & {\cellcolor[HTML]{B6D7A8}} 0.9592 & {\cellcolor[HTML]{B6D7A8}} 0.7810 \\
{\cellcolor[HTML]{B6D7A8}} 5 & {\cellcolor[HTML]{B6D7A8}} BLOOMZ-3b-MultiSocial & {\cellcolor[HTML]{B6D7A8}} 0.9582 & {\cellcolor[HTML]{B6D7A8}} 0.7843 \\
{\cellcolor[HTML]{B6D7A8}} 6 & {\cellcolor[HTML]{B6D7A8}} XLM-RoBERTa-large-MultiSocial & {\cellcolor[HTML]{B6D7A8}} 0.

  print(results_all.reset_index().style.apply(highlight_categories, axis=1).hide('Category', axis=1).format(na_rep=0, precision=4).applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).hide(


In [25]:
#social media test data - per LLM

auc_dict = {}
for detector in tqdm(test_results, total=len(test_results)):
 for model,data in detector.items():
  temp_data = data[(data.split == 'test') & (data.domain == 'social_media')]
 if (to_category[model] == 'F') and ('social-media' not in model): continue
 for llm in tqdm(temp_data.multi_label.unique(), total=len(temp_data.multi_label.unique())):
  if llm == 'human': continue
  temp = temp_data[(temp_data.multi_label.str.contains(f'human|{llm}'))]
  #if (temp['prediction_probs'].min() >= 0.5) and (temp['prediction_probs'].max() <= 1.0):
    #temp.loc[temp.predictions == 'human', 'prediction_probs'] = 1 - temp['prediction_probs'] #adjust human prediction prob to be in range <0, 0.5)
  labels = [label2id[x] for x in temp['label']]
  fpr, tpr, thresholds = roc_curve(labels, temp['prediction_probs'])
  auc_dict[f"{model} {llm}"] = {'auc':  auc_roc_reliable(fpr, tpr, labels), 'th_optim': thresholds[np.argmax(tpr - fpr)], 'tpr_1%fpr': tpr[fpr <= 0.01][-1], 'tpr_5%fpr': tpr[fpr <= 0.05][-1], 'th_1%fpr': thresholds[fpr <= 0.01][-1], 'th_3%fpr': thresholds[fpr <= 0.03][-1], 'th_5%fpr': thresholds[fpr <= 0.05][-1], 'th_10%fpr': thresholds[fpr <= 0.10][-1], 'th_15%fpr': thresholds[fpr <= 0.15][-1], 'th_20%fpr': thresholds[fpr <= 0.20][-1], 'th_25%fpr': thresholds[fpr <= 0.25][-1], 'th_30%fpr': thresholds[fpr <= 0.30][-1], 'th_40%fpr': thresholds[fpr <= 0.40][-1], 'th_50%fpr': thresholds[fpr <= 0.50][-1]}
  for test_language in temp.language.unique():
    temp2 = temp[temp.language == test_language].reset_index(drop=True)
    labels = [label2id[x] for x in temp2['label']]
    fpr, tpr, thresholds = roc_curve(labels, temp2['prediction_probs'])
    auc_dict[f"{model} {llm}"][test_language] = {'auc':  auc_roc_reliable(fpr, tpr, labels), 'th_optim': thresholds[np.argmax(tpr - fpr)], 'tpr_1%fpr': tpr[fpr <= 0.01][-1], 'tpr_5%fpr': tpr[fpr <= 0.05][-1], 'th_1%fpr': thresholds[fpr <= 0.01][-1], 'th_3%fpr': thresholds[fpr <= 0.03][-1], 'th_5%fpr': thresholds[fpr <= 0.05][-1], 'th_10%fpr': thresholds[fpr <= 0.10][-1], 'th_15%fpr': thresholds[fpr <= 0.15][-1], 'th_20%fpr': thresholds[fpr <= 0.20][-1], 'th_25%fpr': thresholds[fpr <= 0.25][-1], 'th_30%fpr': thresholds[fpr <= 0.30][-1], 'th_40%fpr': thresholds[fpr <= 0.40][-1], 'th_50%fpr': thresholds[fpr <= 0.50][-1]}
results_all = pd.DataFrame()
for model, v in tqdm(auc_dict.items(), total=len(auc_dict)):
  temp = pd.DataFrame({'Detector': model.split(' ')[0], 'Generator': model.split(' ')[1], 'Language': '{all}', 'AUC ROC': v['auc']}, index=[0])
  results_all = pd.concat([results_all, temp])
  for test_language,val in v.items():
    if (test_language == 'auc') or ('_' in test_language): continue
    temp = pd.DataFrame({'Detector': model.split(' ')[0], 'Generator': model.split(' ')[1], 'Language': test_language, 'AUC ROC': val['auc']}, index=[0])
    results_all = pd.concat([results_all, temp])
#results_all.set_index(['Detector', 'Generator', 'Language']).unstack().style.format(precision=2).highlight_max(props='font-weight: bold;', axis=0).background_gradient(vmin=0.5, axis=None)
results_all['Category'] = results_all['Detector'].map(to_category)
results_all['Detector'] = results_all['Detector'].apply(rename_detector)
results_all['Generator'] = results_all['Generator'].map(rename_generators)
results_perllm = results_all.copy()

  0%|                                                                                            | 0/83 [00:00<?, ?it/s]
  0%|                                                                                             | 0/8 [00:00<?, ?it/s][A
 12%|██████████▋                                                                          | 1/8 [00:00<00:01,  6.48it/s][A
 25%|█████████████████████▎                                                               | 2/8 [00:00<00:00,  6.59it/s][A
 38%|███████████████████████████████▉                                                     | 3/8 [00:00<00:00,  6.66it/s][A
 50%|██████████████████████████████████████████▌                                          | 4/8 [00:00<00:00,  6.63it/s][A
 62%|█████████████████████████████████████████████████████▏                               | 5/8 [00:00<00:00,  6.61it/s][A
 75%|███████████████████████████████████████████████████████████████▊                     | 6/8 [00:00<00:00,  6.56it/s][A
100%|██████

In [26]:
#statistical
temp = results_all[(results_all.Category == 'S')].drop(columns=['Category']).set_index(['Detector', 'Generator', 'Language']).unstack().style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None)
#styles = [dict(selector="th.row_heading.level0", props=[("writing-mode", "vertical-rl"), ('transform', 'rotateZ(180deg)')])]
display(temp)#.set_table_styles(styles)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{7}{*}{', '\\multirow[c]{7}{*}{\\rotatebox{90}{').replace('} &', '}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Unnamed: 0_level_1,Language,ar,bg,ca,cs,de,el,en,es,et,ga,gd,hr,hu,nl,pl,pt,ro,ru,sk,sl,uk,zh,{all}
Detector,Generator,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
Binoculars,Aya-101,0.7,0.61,0.6,0.69,0.75,0.77,0.79,0.73,0.67,0.62,0.59,0.72,0.71,0.71,0.71,0.7,0.67,0.67,0.64,0.67,0.54,0.65,0.69
Binoculars,GPT-3.5-Turbo-0125,0.62,0.64,0.57,0.72,0.72,0.78,0.75,0.7,0.66,0.61,0.59,0.75,0.74,0.68,0.73,0.7,0.71,0.69,0.7,0.66,0.61,0.92,0.68
Binoculars,Gemini,0.64,0.73,0.71,0.85,0.86,0.8,0.92,0.9,0.87,0.95,,0.88,0.87,0.8,0.84,0.88,0.88,0.69,0.82,0.85,0.7,0.9,0.83
Binoculars,Mistral-7B-Instruct-v0.2,0.69,0.67,0.61,0.7,0.67,0.75,0.71,0.69,0.72,0.73,0.82,0.78,0.75,0.64,0.71,0.7,0.72,0.7,0.66,0.68,0.65,0.63,0.68
Binoculars,OPT-IML-Max-30b,0.77,0.65,0.58,0.58,0.64,0.75,0.74,0.64,0.68,0.64,0.63,0.71,0.73,0.63,0.62,0.66,0.63,0.65,0.58,0.65,0.64,0.43,0.64
Binoculars,Vicuna-13b,0.77,0.69,0.63,0.79,0.8,0.86,0.82,0.78,0.75,0.76,0.84,0.82,0.8,0.78,0.81,0.8,0.79,0.8,0.76,0.76,0.64,0.76,0.76
Binoculars,v5-Eagle-7B-HF,0.73,0.72,0.64,0.82,0.82,0.84,0.89,0.85,0.79,0.82,0.8,0.83,0.84,0.81,0.81,0.82,0.81,0.81,0.8,0.81,0.7,0.86,0.79
DetectLLM-LRR,Aya-101,0.75,0.82,0.67,0.89,0.75,0.84,0.77,0.76,0.82,0.73,0.69,0.79,0.91,0.76,0.82,0.76,0.81,0.74,0.79,0.7,0.66,0.66,0.7
DetectLLM-LRR,GPT-3.5-Turbo-0125,0.7,0.75,0.65,0.93,0.74,0.83,0.78,0.77,0.85,0.78,0.65,0.88,0.92,0.77,0.85,0.8,0.84,0.68,0.86,0.77,0.66,0.76,0.71
DetectLLM-LRR,Gemini,0.89,0.94,0.65,0.96,0.86,0.95,0.9,0.94,0.96,0.86,,0.97,0.97,0.84,0.92,0.93,0.96,0.84,0.94,0.93,0.86,0.92,0.83


  print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{7}{*}{', '\\multirow[c]{7}{*}{\\rotatebox{90}{').replace('} &', '}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))


\begin{tabular}{llrrrrrrrrrrrrrrrrrrrrrrr}
 &  & \multicolumn{23}{r}{\bfseries AUC ROC} \\
 & Language & \bfseries ar & \bfseries bg & \bfseries ca & \bfseries cs & \bfseries de & \bfseries el & \bfseries en & \bfseries es & \bfseries et & \bfseries ga & \bfseries gd & \bfseries hr & \bfseries hu & \bfseries nl & \bfseries pl & \bfseries pt & \bfseries ro & \bfseries ru & \bfseries sk & \bfseries sl & \bfseries uk & \bfseries zh & \bfseries {all} \\
Detector & Generator &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\hline
\multirow[c]{7}{*}{\rotatebox{90}{\bfseries Binoculars}} & \bfseries Aya-101 & {\cellcolor[HTML]{DBDAEB}} \color[HTML]{000000} 0.70 & {\cellcolor[HTML]{EEE9F3}} \color[HTML]{000000} 0.61 & {\cellcolor[HTML]{F0EAF4}} \color[HTML]{000000} 0.60 & {\cellcolor[HTML]{DEDCEC}} \color[HTML]{000000} 0.69 & {\cellcolor[HTML]{D2D2E7}} \color[HTML]{000000} 0.75 & {\cellcolor[HTML]{C9CEE4}} \color[HTML]{000000} 0.77 & {\cellcolor[HTML]{C1CAE2}} \color[HTM

In [27]:
#pretrained
temp = results_all[(results_all.Category == 'P')].drop(columns=['Category']).set_index(['Detector', 'Generator', 'Language']).unstack().style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None)
display(temp)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{7}{*}{', '\\multirow[c]{7}{*}{\\rotatebox{90}{\\parbox{3cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Unnamed: 0_level_1,Language,ar,bg,ca,cs,de,el,en,es,et,ga,gd,hr,hu,nl,pl,pt,ro,ru,sk,sl,uk,zh,{all}
Detector,Generator,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
BLOOMZ-3b-mixed-Detector,Aya-101,0.83,0.88,0.82,0.88,0.85,0.82,0.85,0.84,0.91,0.79,0.68,0.86,0.92,0.84,0.84,0.85,0.78,0.78,0.83,0.76,0.81,0.86,0.83
BLOOMZ-3b-mixed-Detector,GPT-3.5-Turbo-0125,0.85,0.8,0.81,0.86,0.82,0.82,0.87,0.83,0.87,0.81,0.67,0.83,0.87,0.8,0.83,0.83,0.74,0.73,0.8,0.71,0.82,0.19,0.8
BLOOMZ-3b-mixed-Detector,Gemini,0.64,0.55,0.63,0.69,0.65,0.76,0.5,0.58,0.78,0.65,,0.67,0.77,0.66,0.72,0.63,0.33,0.48,0.65,0.41,0.43,0.6,0.59
BLOOMZ-3b-mixed-Detector,Mistral-7B-Instruct-v0.2,0.76,0.71,0.74,0.74,0.71,0.71,0.85,0.78,0.8,0.75,0.57,0.75,0.78,0.68,0.73,0.78,0.72,0.68,0.68,0.59,0.71,0.76,0.74
BLOOMZ-3b-mixed-Detector,OPT-IML-Max-30b,0.69,0.66,0.76,0.8,0.71,0.72,0.78,0.79,0.77,0.71,0.6,0.76,0.77,0.71,0.75,0.77,0.7,0.63,0.79,0.69,0.65,0.75,0.73
BLOOMZ-3b-mixed-Detector,Vicuna-13b,0.88,0.77,0.84,0.79,0.81,0.74,0.88,0.83,0.81,0.78,0.77,0.77,0.85,0.78,0.81,0.83,0.66,0.75,0.78,0.67,0.73,0.83,0.79
BLOOMZ-3b-mixed-Detector,v5-Eagle-7B-HF,0.86,0.82,0.85,0.82,0.82,0.76,0.9,0.86,0.89,0.83,0.65,0.79,0.88,0.79,0.81,0.86,0.72,0.72,0.77,0.67,0.74,0.85,0.81
ChatGPT-Detector-RoBERTa-Chinese,Aya-101,0.75,0.79,0.73,0.63,0.75,0.7,0.82,0.75,0.78,0.61,0.52,0.61,0.72,0.67,0.61,0.61,0.67,0.73,0.6,0.6,0.71,0.76,0.67
ChatGPT-Detector-RoBERTa-Chinese,GPT-3.5-Turbo-0125,0.59,0.63,0.64,0.59,0.7,0.65,0.86,0.73,0.76,0.65,0.52,0.57,0.68,0.63,0.57,0.69,0.68,0.62,0.56,0.6,0.63,0.88,0.66
ChatGPT-Detector-RoBERTa-Chinese,Gemini,0.74,0.84,0.8,0.78,0.88,0.72,0.97,0.9,0.94,0.95,,0.76,0.89,0.78,0.71,0.86,0.9,0.81,0.73,0.78,0.87,0.78,0.8


\begin{tabular}{llrrrrrrrrrrrrrrrrrrrrrrr}
 &  & \multicolumn{23}{r}{\bfseries AUC ROC} \\
 & Language & \bfseries ar & \bfseries bg & \bfseries ca & \bfseries cs & \bfseries de & \bfseries el & \bfseries en & \bfseries es & \bfseries et & \bfseries ga & \bfseries gd & \bfseries hr & \bfseries hu & \bfseries nl & \bfseries pl & \bfseries pt & \bfseries ro & \bfseries ru & \bfseries sk & \bfseries sl & \bfseries uk & \bfseries zh & \bfseries {all} \\
Detector & Generator &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\hline
\multirow[c]{7}{*}{\rotatebox{90}{\parbox{3cm}{\bfseries BLOOMZ-3b-mixed-Detector}}} & \bfseries Aya-101 & {\cellcolor[HTML]{B4C4DF}} \color[HTML]{000000} 0.83 & {\cellcolor[HTML]{A5BDDB}} \color[HTML]{000000} 0.88 & {\cellcolor[HTML]{B9C6E0}} \color[HTML]{000000} 0.82 & {\cellcolor[HTML]{A5BDDB}} \color[HTML]{000000} 0.88 & {\cellcolor[HTML]{AFC1DD}} \color[HTML]{000000} 0.85 & {\cellcolor[HTML]{B9C6E0}} \color[HTML]{000000} 0.82 & {\cellcol

  print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{7}{*}{', '\\multirow[c]{7}{*}{\\rotatebox{90}{\\parbox{3cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))


In [28]:
#finetuned
temp = results_all[(results_all.Category == 'F')].drop(columns=['Category']).set_index(['Detector', 'Generator', 'Language']).unstack().style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None)
display(temp)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{7}{*}{', '\\multirow[c]{7}{*}{\\rotatebox{90}{\\parbox{3cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Unnamed: 0_level_1,Language,ar,bg,ca,cs,de,el,en,es,et,ga,gd,hr,hu,nl,pl,pt,ro,ru,sk,sl,uk,zh,{all}
Detector,Generator,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
Aya-101-MultiSocial,Aya-101,0.92,0.98,0.97,0.98,0.97,0.96,0.96,0.96,0.98,0.92,0.91,0.98,0.99,0.97,0.97,0.96,0.96,0.93,0.97,0.92,0.9,0.93,0.96
Aya-101-MultiSocial,GPT-3.5-Turbo-0125,0.99,1.0,1.0,0.99,0.99,0.99,1.0,0.99,0.99,0.97,0.96,0.99,1.0,0.99,0.99,0.99,0.99,0.99,0.99,0.97,0.98,0.99,0.99
Aya-101-MultiSocial,Gemini,0.9,0.96,0.88,0.95,0.92,0.92,0.96,0.96,0.99,0.85,,0.96,0.99,0.92,0.94,0.94,0.96,0.83,0.92,0.89,0.87,0.94,0.93
Aya-101-MultiSocial,Mistral-7B-Instruct-v0.2,0.99,1.0,0.99,1.0,0.99,0.99,1.0,0.99,0.99,0.99,0.99,0.99,1.0,0.98,0.99,0.99,1.0,0.99,1.0,0.98,0.98,0.98,0.99
Aya-101-MultiSocial,OPT-IML-Max-30b,0.98,0.99,0.96,0.97,0.94,0.96,0.92,0.95,0.95,0.84,0.77,0.95,0.98,0.95,0.97,0.95,0.96,0.98,0.96,0.91,0.95,0.99,0.95
Aya-101-MultiSocial,Vicuna-13b,1.0,1.0,0.99,1.0,0.99,0.99,1.0,0.99,0.99,0.98,0.95,0.99,1.0,0.99,0.99,0.99,1.0,0.99,0.99,0.98,0.97,0.99,0.99
Aya-101-MultiSocial,v5-Eagle-7B-HF,0.99,1.0,1.0,1.0,0.99,0.99,1.0,1.0,1.0,0.98,0.94,1.0,1.0,0.99,1.0,1.0,1.0,0.99,1.0,0.98,0.99,0.99,0.99
BLOOMZ-3b-MultiSocial,Aya-101,0.92,0.97,0.96,0.96,0.95,0.94,0.96,0.96,0.96,0.86,0.71,0.94,0.99,0.94,0.94,0.95,0.92,0.9,0.94,0.84,0.85,0.9,0.94
BLOOMZ-3b-MultiSocial,GPT-3.5-Turbo-0125,0.99,0.99,0.99,0.98,0.98,0.97,0.99,0.99,0.98,0.9,0.84,0.97,0.99,0.97,0.97,0.99,0.96,0.97,0.97,0.91,0.93,0.99,0.98
BLOOMZ-3b-MultiSocial,Gemini,0.91,0.93,0.88,0.96,0.91,0.93,0.97,0.97,0.99,0.91,,0.95,0.99,0.9,0.92,0.96,0.93,0.82,0.89,0.75,0.77,0.96,0.93


\begin{tabular}{llrrrrrrrrrrrrrrrrrrrrrrr}
 &  & \multicolumn{23}{r}{\bfseries AUC ROC} \\
 & Language & \bfseries ar & \bfseries bg & \bfseries ca & \bfseries cs & \bfseries de & \bfseries el & \bfseries en & \bfseries es & \bfseries et & \bfseries ga & \bfseries gd & \bfseries hr & \bfseries hu & \bfseries nl & \bfseries pl & \bfseries pt & \bfseries ro & \bfseries ru & \bfseries sk & \bfseries sl & \bfseries uk & \bfseries zh & \bfseries {all} \\
Detector & Generator &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\hline
\multirow[c]{7}{*}{\rotatebox{90}{\parbox{3cm}{\bfseries Aya-101-MultiSocial}}} & \bfseries Aya-101 & {\cellcolor[HTML]{93B5D6}} \color[HTML]{000000} 0.92 & {\cellcolor[HTML]{7BACD1}} \color[HTML]{000000} 0.98 & {\cellcolor[HTML]{81AED2}} \color[HTML]{000000} 0.97 & {\cellcolor[HTML]{7BACD1}} \color[HTML]{000000} 0.98 & {\cellcolor[HTML]{81AED2}} \color[HTML]{000000} 0.97 & {\cellcolor[HTML]{86B0D3}} \color[HTML]{000000} 0.96 & {\cellcolor[HT

  print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{7}{*}{', '\\multirow[c]{7}{*}{\\rotatebox{90}{\\parbox{3cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))


In [29]:
#social media test data - per platform

auc_dict = {}
for detector in tqdm(test_results, total=len(test_results)):
 for model,data in detector.items():
  temp_data = data[(data.split == 'test') & (data.domain == 'social_media')]
 if (to_category[model] == 'F') and ('social-media' not in model): continue
 temp_data['source'] = temp_data['source'].str.replace('multisocial_', '')
 for llm in tqdm(temp_data.source.unique(), total=len(temp_data.multi_label.unique())):
  temp = temp_data[(temp_data.source.str.contains(f'{llm}'))]
  #if (temp['prediction_probs'].min() >= 0.5) and (temp['prediction_probs'].max() <= 1.0):
    #temp.loc[temp.predictions == 'human', 'prediction_probs'] = 1 - temp['prediction_probs'] #adjust human prediction prob to be in range <0, 0.5)
  labels = [label2id[x] for x in temp['label']]
  fpr, tpr, thresholds = roc_curve(labels, temp['prediction_probs'])
  auc_dict[f"{model} {llm}"] = {'auc':  auc_roc_reliable(fpr, tpr, labels), 'th_optim': thresholds[np.argmax(tpr - fpr)], 'tpr_1%fpr': tpr[fpr <= 0.01][-1], 'tpr_5%fpr': tpr[fpr <= 0.05][-1], 'th_1%fpr': thresholds[fpr <= 0.01][-1], 'th_3%fpr': thresholds[fpr <= 0.03][-1], 'th_5%fpr': thresholds[fpr <= 0.05][-1], 'th_10%fpr': thresholds[fpr <= 0.10][-1], 'th_15%fpr': thresholds[fpr <= 0.15][-1], 'th_20%fpr': thresholds[fpr <= 0.20][-1], 'th_25%fpr': thresholds[fpr <= 0.25][-1], 'th_30%fpr': thresholds[fpr <= 0.30][-1], 'th_40%fpr': thresholds[fpr <= 0.40][-1], 'th_50%fpr': thresholds[fpr <= 0.50][-1]}
  for test_language in temp.language.unique():
    temp2 = temp[temp.language == test_language].reset_index(drop=True)
    labels = [label2id[x] for x in temp2['label']]
    fpr, tpr, thresholds = roc_curve(labels, temp2['prediction_probs'])
    auc_dict[f"{model} {llm}"][test_language] = {'auc':  auc_roc_reliable(fpr, tpr, labels), 'th_optim': thresholds[np.argmax(tpr - fpr)], 'tpr_1%fpr': tpr[fpr <= 0.01][-1], 'tpr_5%fpr': tpr[fpr <= 0.05][-1], 'th_1%fpr': thresholds[fpr <= 0.01][-1], 'th_3%fpr': thresholds[fpr <= 0.03][-1], 'th_5%fpr': thresholds[fpr <= 0.05][-1], 'th_10%fpr': thresholds[fpr <= 0.10][-1], 'th_15%fpr': thresholds[fpr <= 0.15][-1], 'th_20%fpr': thresholds[fpr <= 0.20][-1], 'th_25%fpr': thresholds[fpr <= 0.25][-1], 'th_30%fpr': thresholds[fpr <= 0.30][-1], 'th_40%fpr': thresholds[fpr <= 0.40][-1], 'th_50%fpr': thresholds[fpr <= 0.50][-1]}
results_all = pd.DataFrame()
for model, v in tqdm(auc_dict.items(), total=len(auc_dict)):
  temp = pd.DataFrame({'Detector': model.split(' ')[0], 'Platform': model.split(' ')[1], 'Language': '{all}', 'AUC ROC': v['auc']}, index=[0])
  results_all = pd.concat([results_all, temp])
  for test_language,val in v.items():
    if (test_language == 'auc') or ('_' in test_language): continue
    temp = pd.DataFrame({'Detector': model.split(' ')[0], 'Platform': model.split(' ')[1], 'Language': test_language, 'AUC ROC': val['auc']}, index=[0])
    results_all = pd.concat([results_all, temp])
#results_all.set_index(['Detector', 'Platform', 'Language']).unstack().style.format(precision=2).highlight_max(props='font-weight: bold;', axis=0).background_gradient(vmin=0.5, axis=None)
results_all['Category'] = results_all['Detector'].map(to_category)
results_all['Detector'] = results_all['Detector'].apply(rename_detector)
results_all['Platform'] = results_all['Platform'].map(rename_platforms)
results_perplatform = results_all.copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_data['source'] = temp_data['source'].str.replace('multisocial_', '')

  0%|                                                                                             | 0/8 [00:00<?, ?it/s][A
 12%|██████████▋                                                                          | 1/8 [00:00<00:00,  7.46it/s][A
 25%|█████████████████████▎                                                               | 2/8 [00:00<00:00,  6.56it/s][A
 38%|███████████████████████████████▉                                                     | 3/8 [00:00<00:00,  7.38it/s][A
 62%|█████████████████████████████████████████████████████▏                               | 5/8 [00:00<00:00,  8.18it/s][A
A value is trying to be set on a copy of

In [30]:
#statistical
temp = results_all[(results_all.Category == 'S')].drop(columns=['Category']).set_index(['Detector', 'Platform', 'Language']).unstack().style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None)
display(temp)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{5}{*}{', '\\multirow[c]{5}{*}{\\rotatebox{90}{\\parbox{2cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Unnamed: 0_level_1,Language,ar,bg,ca,cs,de,el,en,es,et,ga,gd,hr,hu,nl,pl,pt,ro,ru,sk,sl,uk,zh,{all}
Detector,Platform,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
Binoculars,Discord,,,0.78,0.76,0.8,,0.86,0.81,0.75,0.67,0.7,0.83,0.82,0.8,0.84,0.81,0.77,,0.72,,,,0.79
Binoculars,Gab,0.61,0.62,0.47,0.58,0.69,0.71,0.78,0.72,0.7,0.72,,0.72,0.68,0.71,0.67,0.72,0.67,0.69,0.62,0.7,0.63,0.75,0.68
Binoculars,Telegram,0.71,0.69,0.61,0.77,0.74,0.83,0.81,0.77,0.75,0.72,,0.79,0.82,0.72,0.78,0.76,0.79,0.65,0.72,0.77,0.63,0.73,0.73
Binoculars,Twitter,0.67,0.69,0.64,0.76,0.82,0.92,0.81,0.67,0.84,,,0.85,0.79,0.72,0.82,0.8,0.77,0.85,0.87,,,0.8,0.74
Binoculars,WhatsApp,0.82,,0.73,0.66,0.79,,0.75,0.81,0.52,,,,,0.74,,0.7,0.85,0.66,,,,,0.73
DetectLLM-LRR,Discord,,,0.95,0.98,0.91,,0.92,0.93,0.94,0.83,0.75,0.94,0.98,0.94,0.98,0.95,0.96,,0.9,,,,0.94
DetectLLM-LRR,Gab,0.74,0.8,0.58,0.8,0.74,0.79,0.78,0.75,0.71,0.75,,0.77,0.81,0.72,0.82,0.75,0.77,0.72,0.71,0.74,0.77,0.78,0.69
DetectLLM-LRR,Telegram,0.76,0.86,0.63,0.94,0.69,0.94,0.81,0.79,0.92,0.77,,0.94,0.97,0.74,0.88,0.86,0.92,0.79,0.89,0.96,0.74,0.76,0.75
DetectLLM-LRR,Twitter,0.81,0.87,0.75,0.91,0.91,0.96,0.85,0.78,0.87,,,0.94,0.94,0.85,0.95,0.91,0.9,0.93,0.95,,,0.92,0.75
DetectLLM-LRR,WhatsApp,0.87,,0.87,0.96,0.8,,0.66,0.89,0.91,,,,,0.86,,0.8,0.95,0.69,,,,,0.7


\begin{tabular}{llrrrrrrrrrrrrrrrrrrrrrrr}
 &  & \multicolumn{23}{r}{\bfseries AUC ROC} \\
 & Language & \bfseries ar & \bfseries bg & \bfseries ca & \bfseries cs & \bfseries de & \bfseries el & \bfseries en & \bfseries es & \bfseries et & \bfseries ga & \bfseries gd & \bfseries hr & \bfseries hu & \bfseries nl & \bfseries pl & \bfseries pt & \bfseries ro & \bfseries ru & \bfseries sk & \bfseries sl & \bfseries uk & \bfseries zh & \bfseries {all} \\
Detector & Platform &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\hline
\multirow[c]{5}{*}{\rotatebox{90}{\parbox{2cm}{\bfseries Binoculars}}} & \bfseries Discord & N/A & N/A & {\cellcolor[HTML]{C8CDE4}} \color[HTML]{000000} 0.78 & {\cellcolor[HTML]{CDD0E5}} \color[HTML]{000000} 0.76 & {\cellcolor[HTML]{C0C9E2}} \color[HTML]{000000} 0.80 & N/A & {\cellcolor[HTML]{ABBFDC}} \color[HTML]{000000} 0.86 & {\cellcolor[HTML]{BCC7E1}} \color[HTML]{000000} 0.81 & {\cellcolor[HTML]{D0D1E6}} \color[HTML]{000000} 0.75 & {\cell

  print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{5}{*}{', '\\multirow[c]{5}{*}{\\rotatebox{90}{\\parbox{2cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))


In [31]:
#pretrained
temp = results_all[(results_all.Category == 'P')].drop(columns=['Category']).set_index(['Detector', 'Platform', 'Language']).unstack().style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None)
display(temp)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{5}{*}{', '\\multirow[c]{5}{*}{\\rotatebox{90}{\\parbox{2cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Unnamed: 0_level_1,Language,ar,bg,ca,cs,de,el,en,es,et,ga,gd,hr,hu,nl,pl,pt,ro,ru,sk,sl,uk,zh,{all}
Detector,Platform,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
BLOOMZ-3b-mixed-Detector,Discord,,,0.96,0.89,0.87,,0.87,0.9,0.87,0.8,0.66,0.84,0.86,0.91,0.91,0.89,0.78,,0.71,,,,0.87
BLOOMZ-3b-mixed-Detector,Gab,0.69,0.71,0.77,0.69,0.67,0.62,0.77,0.69,0.73,0.75,,0.61,0.75,0.69,0.7,0.69,0.53,0.59,0.7,0.54,0.58,0.64,0.66
BLOOMZ-3b-mixed-Detector,Telegram,0.81,0.79,0.73,0.81,0.76,0.85,0.83,0.8,0.84,0.78,,0.8,0.88,0.74,0.77,0.8,0.75,0.75,0.76,0.84,0.71,0.71,0.78
BLOOMZ-3b-mixed-Detector,Twitter,0.82,0.68,0.7,0.71,0.78,0.68,0.81,0.68,0.77,,,0.81,0.71,0.7,0.7,0.8,0.59,0.73,0.9,,,0.68,0.72
BLOOMZ-3b-mixed-Detector,WhatsApp,0.82,,0.88,0.46,0.78,,0.76,0.87,0.81,,,,,0.67,,0.81,0.78,0.8,,,,,0.79
ChatGPT-Detector-RoBERTa-Chinese,Discord,,,0.88,0.72,0.86,,0.95,0.88,0.85,0.73,0.62,0.66,0.82,0.71,0.7,0.75,0.83,,0.68,,,,0.77
ChatGPT-Detector-RoBERTa-Chinese,Gab,0.57,0.81,0.65,0.55,0.74,0.68,0.88,0.74,0.74,0.72,,0.6,0.66,0.72,0.58,0.75,0.68,0.65,0.75,0.61,0.74,0.78,0.67
ChatGPT-Detector-RoBERTa-Chinese,Telegram,0.65,0.75,0.73,0.66,0.82,0.78,0.9,0.8,0.83,0.75,,0.61,0.83,0.63,0.62,0.75,0.8,0.78,0.6,0.74,0.74,0.81,0.7
ChatGPT-Detector-RoBERTa-Chinese,Twitter,0.86,0.9,0.73,0.67,0.77,0.78,0.9,0.79,0.72,,,0.61,0.68,0.76,0.58,0.71,0.7,0.88,0.77,,,0.83,0.74
ChatGPT-Detector-RoBERTa-Chinese,WhatsApp,0.82,,0.77,0.36,0.91,,0.89,0.85,0.83,,,,,0.58,,0.71,0.83,0.81,,,,,0.79


  print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{5}{*}{', '\\multirow[c]{5}{*}{\\rotatebox{90}{\\parbox{2cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))


\begin{tabular}{llrrrrrrrrrrrrrrrrrrrrrrr}
 &  & \multicolumn{23}{r}{\bfseries AUC ROC} \\
 & Language & \bfseries ar & \bfseries bg & \bfseries ca & \bfseries cs & \bfseries de & \bfseries el & \bfseries en & \bfseries es & \bfseries et & \bfseries ga & \bfseries gd & \bfseries hr & \bfseries hu & \bfseries nl & \bfseries pl & \bfseries pt & \bfseries ro & \bfseries ru & \bfseries sk & \bfseries sl & \bfseries uk & \bfseries zh & \bfseries {all} \\
Detector & Platform &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\hline
\multirow[c]{5}{*}{\rotatebox{90}{\parbox{2cm}{\bfseries BLOOMZ-3b-mixed-Detector}}} & \bfseries Discord & N/A & N/A & {\cellcolor[HTML]{84B0D3}} \color[HTML]{000000} 0.96 & {\cellcolor[HTML]{9FBAD9}} \color[HTML]{000000} 0.89 & {\cellcolor[HTML]{A9BFDC}} \color[HTML]{000000} 0.87 & N/A & {\cellcolor[HTML]{A7BDDB}} \color[HTML]{000000} 0.87 & {\cellcolor[HTML]{9CB9D9}} \color[HTML]{000000} 0.90 & {\cellcolor[HTML]{A9BFDC}} \color[HTML]{000000}

In [32]:
#finetuned
temp = results_all[(results_all.Category == 'F')].drop(columns=['Category']).set_index(['Detector', 'Platform', 'Language']).unstack().style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None)
display(temp)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{5}{*}{', '\\multirow[c]{5}{*}{\\rotatebox{90}{\\parbox{2cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Unnamed: 0_level_1,Language,ar,bg,ca,cs,de,el,en,es,et,ga,gd,hr,hu,nl,pl,pt,ro,ru,sk,sl,uk,zh,{all}
Detector,Platform,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
Aya-101-MultiSocial,Discord,,,0.99,1.0,0.97,,0.99,0.99,1.0,0.96,0.9,0.99,1.0,0.99,0.99,0.99,0.99,,0.99,,,,0.99
Aya-101-MultiSocial,Gab,0.91,0.96,0.97,0.96,0.96,0.95,0.96,0.96,0.94,0.94,,0.94,0.98,0.97,0.97,0.96,0.95,0.92,0.96,0.92,0.89,0.94,0.94
Aya-101-MultiSocial,Telegram,0.97,0.99,0.97,0.98,0.98,0.98,0.98,0.97,0.99,0.94,,0.99,1.0,0.96,0.98,0.97,0.99,0.97,0.98,1.0,0.96,0.99,0.98
Aya-101-MultiSocial,Twitter,0.99,0.99,0.98,0.97,0.97,0.99,0.97,0.99,0.98,,,1.0,0.98,0.97,0.99,0.98,0.98,0.98,1.0,,,0.97,0.98
Aya-101-MultiSocial,WhatsApp,0.99,,0.97,0.98,0.95,,0.98,0.98,1.0,,,,,0.97,,0.97,0.99,0.95,,,,,0.97
BLOOMZ-3b-MultiSocial,Discord,,,0.99,1.0,0.97,,0.99,0.99,0.99,0.97,0.84,0.98,1.0,0.98,0.99,0.99,0.99,,0.97,,,,0.99
BLOOMZ-3b-MultiSocial,Gab,0.87,0.96,0.96,0.93,0.92,0.93,0.96,0.95,0.94,0.88,,0.9,0.97,0.92,0.94,0.95,0.89,0.88,0.95,0.81,0.81,0.93,0.91
BLOOMZ-3b-MultiSocial,Telegram,0.95,0.98,0.94,0.97,0.96,0.98,0.98,0.97,0.98,0.86,,0.97,1.0,0.92,0.94,0.97,0.97,0.94,0.95,0.98,0.9,0.98,0.96
BLOOMZ-3b-MultiSocial,Twitter,0.99,0.98,0.98,0.94,0.96,0.99,0.98,0.99,0.92,,,0.99,0.97,0.94,0.97,0.97,0.96,0.99,0.98,,,0.99,0.97
BLOOMZ-3b-MultiSocial,WhatsApp,0.99,,0.97,0.98,0.97,,0.98,0.98,0.99,,,,,0.95,,0.98,0.99,0.97,,,,,0.98


  print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{5}{*}{', '\\multirow[c]{5}{*}{\\rotatebox{90}{\\parbox{2cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))


\begin{tabular}{llrrrrrrrrrrrrrrrrrrrrrrr}
 &  & \multicolumn{23}{r}{\bfseries AUC ROC} \\
 & Language & \bfseries ar & \bfseries bg & \bfseries ca & \bfseries cs & \bfseries de & \bfseries el & \bfseries en & \bfseries es & \bfseries et & \bfseries ga & \bfseries gd & \bfseries hr & \bfseries hu & \bfseries nl & \bfseries pl & \bfseries pt & \bfseries ro & \bfseries ru & \bfseries sk & \bfseries sl & \bfseries uk & \bfseries zh & \bfseries {all} \\
Detector & Platform &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\hline
\multirow[c]{5}{*}{\rotatebox{90}{\parbox{2cm}{\bfseries Aya-101-MultiSocial}}} & \bfseries Discord & N/A & N/A & {\cellcolor[HTML]{76AAD0}} \color[HTML]{000000} 0.99 & {\cellcolor[HTML]{75A9CF}} \color[HTML]{000000} 1.00 & {\cellcolor[HTML]{80AED2}} \color[HTML]{000000} 0.97 & N/A & {\cellcolor[HTML]{79ABD0}} \color[HTML]{000000} 0.99 & {\cellcolor[HTML]{78ABD0}} \color[HTML]{000000} 0.99 & {\cellcolor[HTML]{76AAD0}} \color[HTML]{000000} 1.00

In [33]:

#social media test data - per platform per llm
auc_dict = {}
for detector in tqdm(test_results, total=len(test_results)):
 for model,data in detector.items():
  temp_data = data[(data.split == 'test') & (data.domain == 'social_media')]
 if (to_category[model] == 'F') and ('social-media' not in model): continue
 temp_data['source'] = temp_data['source'].str.replace('multisocial_', '')
 for llm in tqdm(temp_data.multi_label.unique(), total=len(temp_data.multi_label.unique())):
  if llm == 'human': continue
  temp = temp_data[(temp_data.multi_label.str.contains(f'{llm}') | temp_data.multi_label.str.contains(f'human'))]
  #if (temp['prediction_probs'].min() >= 0.5) and (temp['prediction_probs'].max() <= 1.0):
    #temp.loc[temp.predictions == 'human', 'prediction_probs'] = 1 - temp['prediction_probs'] #adjust human prediction prob to be in range <0, 0.5)
  labels = [label2id[x] for x in temp['label']]
  fpr, tpr, thresholds = roc_curve(labels, temp['prediction_probs'])
  auc_dict[f"{model} {llm}"] = {'auc': auc_roc_reliable(fpr, tpr, labels), 'th_optim': thresholds[np.argmax(tpr - fpr)], 'tpr_1%fpr': tpr[fpr <= 0.01][-1], 'tpr_5%fpr': tpr[fpr <= 0.05][-1], 'th_1%fpr': thresholds[fpr <= 0.01][-1], 'th_3%fpr': thresholds[fpr <= 0.03][-1], 'th_5%fpr': thresholds[fpr <= 0.05][-1], 'th_10%fpr': thresholds[fpr <= 0.10][-1], 'th_15%fpr': thresholds[fpr <= 0.15][-1], 'th_20%fpr': thresholds[fpr <= 0.20][-1], 'th_25%fpr': thresholds[fpr <= 0.25][-1], 'th_30%fpr': thresholds[fpr <= 0.30][-1], 'th_40%fpr': thresholds[fpr <= 0.40][-1], 'th_50%fpr': thresholds[fpr <= 0.50][-1]}
  for test_language in temp.source.unique():
    temp2 = temp[temp.source.str.contains(test_language)].reset_index(drop=True)
    labels = [label2id[x] for x in temp2['label']]
    fpr, tpr, thresholds = roc_curve(labels, temp2['prediction_probs'])
    auc_dict[f"{model} {llm}"][test_language] = {'auc': auc_roc_reliable(fpr, tpr, labels), 'th_optim': thresholds[np.argmax(tpr - fpr)], 'tpr_1%fpr': tpr[fpr <= 0.01][-1], 'tpr_5%fpr': tpr[fpr <= 0.05][-1], 'th_1%fpr': thresholds[fpr <= 0.01][-1], 'th_3%fpr': thresholds[fpr <= 0.03][-1], 'th_5%fpr': thresholds[fpr <= 0.05][-1], 'th_10%fpr': thresholds[fpr <= 0.10][-1], 'th_15%fpr': thresholds[fpr <= 0.15][-1], 'th_20%fpr': thresholds[fpr <= 0.20][-1], 'th_25%fpr': thresholds[fpr <= 0.25][-1], 'th_30%fpr': thresholds[fpr <= 0.30][-1], 'th_40%fpr': thresholds[fpr <= 0.40][-1], 'th_50%fpr': thresholds[fpr <= 0.50][-1]}
results_all = pd.DataFrame()
for model, v in tqdm(auc_dict.items(), total=len(auc_dict)):
  temp = pd.DataFrame({'Detector': model.split(' ')[0], 'Generator': model.split(' ')[1], 'Platform': '{all}', 'AUC ROC': v['auc']}, index=[0])
  results_all = pd.concat([results_all, temp])
  for test_language,val in v.items():
    if (test_language == 'auc') or ('_' in test_language): continue
    temp = pd.DataFrame({'Detector': model.split(' ')[0], 'Generator': model.split(' ')[1], 'Platform': test_language, 'AUC ROC': val['auc']}, index=[0])
    results_all = pd.concat([results_all, temp])
#results_all.set_index(['Detector', 'Platform', 'Language']).unstack().style.format(precision=2).highlight_max(props='font-weight: bold;', axis=0).background_gradient(vmin=0.5, axis=None)
results_all['Category'] = results_all['Detector'].map(to_category)
results_all['Detector'] = results_all['Detector'].apply(rename_detector)
results_all['Platform'] = results_all['Platform'].map(rename_platforms)
results_perplatformperllm = results_all.copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_data['source'] = temp_data['source'].str.replace('multisocial_', '')

  0%|                                                                                             | 0/8 [00:00<?, ?it/s][A
 12%|██████████▋                                                                          | 1/8 [00:00<00:01,  6.46it/s][A
 25%|█████████████████████▎                                                               | 2/8 [00:00<00:00,  6.58it/s][A
 38%|███████████████████████████████▉                                                     | 3/8 [00:00<00:00,  6.64it/s][A
 50%|██████████████████████████████████████████▌                                          | 4/8 [00:00<00:00,  6.75it/s][A
 62%|███████████████████████████████████

In [34]:
#statistical
temp = results_all[(results_all.Category == 'S')].drop(columns=['Category']).set_index(['Detector', 'Generator', 'Platform']).unstack().style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None)
display(temp)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{7}{*}{', '\\multirow[c]{7}{*}{\\rotatebox{90}{\\parbox{2cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Unnamed: 0_level_1,Platform,Discord,Gab,Telegram,Twitter,WhatsApp,{all}
Detector,Generator,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Binoculars,Mistral-7B-Instruct-v0.2,0.77,0.62,0.68,0.68,0.67,0.68
Binoculars,aya-101,0.76,0.65,0.68,0.7,0.72,0.69
Binoculars,gemini,0.88,0.79,0.84,0.83,0.84,0.83
Binoculars,gpt-3.5-turbo-0125,0.75,0.64,0.71,0.68,0.69,0.68
Binoculars,opt-iml-max-30b,0.71,0.6,0.64,0.66,0.65,0.64
Binoculars,v5-Eagle-7B-HF,0.85,0.76,0.8,0.81,0.79,0.79
Binoculars,vicuna-13b,0.82,0.73,0.76,0.8,0.77,0.76
DetectLLM-LRR,Mistral-7B-Instruct-v0.2,0.94,0.63,0.71,0.71,0.68,0.71
DetectLLM-LRR,aya-101,0.91,0.64,0.69,0.7,0.66,0.7
DetectLLM-LRR,gemini,0.96,0.8,0.84,0.82,0.78,0.83


\begin{tabular}{llrrrrrr}
 &  & \multicolumn{6}{r}{\bfseries AUC ROC} \\
 & Platform & \bfseries Discord & \bfseries Gab & \bfseries Telegram & \bfseries Twitter & \bfseries WhatsApp & \bfseries {all} \\
Detector & Generator &  &  &  &  &  &  \\
\hline
\multirow[c]{7}{*}{\rotatebox{90}{\parbox{2cm}{\bfseries Binoculars}}} & \bfseries Mistral-7B-Instruct-v0.2 & {\cellcolor[HTML]{C9CEE4}} \color[HTML]{000000} 0.77 & {\cellcolor[HTML]{EDE8F3}} \color[HTML]{000000} 0.62 & {\cellcolor[HTML]{DFDDEC}} \color[HTML]{000000} 0.68 & {\cellcolor[HTML]{DFDDEC}} \color[HTML]{000000} 0.68 & {\cellcolor[HTML]{E3E0EE}} \color[HTML]{000000} 0.67 & {\cellcolor[HTML]{E1DFED}} \color[HTML]{000000} 0.68 \\
\bfseries  & \bfseries aya-101 & {\cellcolor[HTML]{CED0E6}} \color[HTML]{000000} 0.76 & {\cellcolor[HTML]{E7E3F0}} \color[HTML]{000000} 0.65 & {\cellcolor[HTML]{E0DDED}} \color[HTML]{000000} 0.68 & {\cellcolor[HTML]{DCDAEB}} \color[HTML]{000000} 0.70 & {\cellcolor[HTML]{D8D7E9}} \color[HTML]{000000} 0.72 

  print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{7}{*}{', '\\multirow[c]{7}{*}{\\rotatebox{90}{\\parbox{2cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))


In [35]:
#pretrained
temp = results_all[(results_all.Category == 'P')].drop(columns=['Category']).set_index(['Detector', 'Generator', 'Platform']).unstack().style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None)
display(temp)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{7}{*}{', '\\multirow[c]{7}{*}{\\rotatebox{90}{\\parbox{2cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Unnamed: 0_level_1,Platform,Discord,Gab,Telegram,Twitter,WhatsApp,{all}
Detector,Generator,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
BLOOMZ-3b-mixed-Detector,Mistral-7B-Instruct-v0.2,0.87,0.62,0.76,0.7,0.78,0.74
BLOOMZ-3b-mixed-Detector,aya-101,0.92,0.75,0.85,0.81,0.85,0.83
BLOOMZ-3b-mixed-Detector,gemini,0.76,0.47,0.63,0.52,0.58,0.59
BLOOMZ-3b-mixed-Detector,gpt-3.5-turbo-0125,0.91,0.7,0.8,0.75,0.86,0.8
BLOOMZ-3b-mixed-Detector,opt-iml-max-30b,0.81,0.67,0.76,0.69,0.79,0.73
BLOOMZ-3b-mixed-Detector,v5-Eagle-7B-HF,0.92,0.7,0.83,0.78,0.86,0.81
BLOOMZ-3b-mixed-Detector,vicuna-13b,0.89,0.68,0.81,0.78,0.83,0.79
ChatGPT-Detector-RoBERTa-Chinese,Mistral-7B-Instruct-v0.2,0.85,0.7,0.75,0.74,0.85,0.76
ChatGPT-Detector-RoBERTa-Chinese,aya-101,0.7,0.64,0.65,0.71,0.73,0.67
ChatGPT-Detector-RoBERTa-Chinese,gemini,0.88,0.73,0.78,0.81,0.86,0.8


  print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{7}{*}{', '\\multirow[c]{7}{*}{\\rotatebox{90}{\\parbox{2cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))


\begin{tabular}{llrrrrrr}
 &  & \multicolumn{6}{r}{\bfseries AUC ROC} \\
 & Platform & \bfseries Discord & \bfseries Gab & \bfseries Telegram & \bfseries Twitter & \bfseries WhatsApp & \bfseries {all} \\
Detector & Generator &  &  &  &  &  &  \\
\hline
\multirow[c]{7}{*}{\rotatebox{90}{\parbox{2cm}{\bfseries BLOOMZ-3b-mixed-Detector}}} & \bfseries Mistral-7B-Instruct-v0.2 & {\cellcolor[HTML]{A8BEDC}} \color[HTML]{000000} 0.87 & {\cellcolor[HTML]{EDE7F2}} \color[HTML]{000000} 0.62 & {\cellcolor[HTML]{CED0E6}} \color[HTML]{000000} 0.76 & {\cellcolor[HTML]{DBDAEB}} \color[HTML]{000000} 0.70 & {\cellcolor[HTML]{C5CCE3}} \color[HTML]{000000} 0.78 & {\cellcolor[HTML]{D3D4E7}} \color[HTML]{000000} 0.74 \\
\bfseries  & \bfseries aya-101 & {\cellcolor[HTML]{93B5D6}} \color[HTML]{000000} 0.92 & {\cellcolor[HTML]{CED0E6}} \color[HTML]{000000} 0.75 & {\cellcolor[HTML]{ADC1DD}} \color[HTML]{000000} 0.85 & {\cellcolor[HTML]{BBC7E0}} \color[HTML]{000000} 0.81 & {\cellcolor[HTML]{B0C2DE}} \color[HTML]

In [36]:
#finetuned
temp = results_all[(results_all.Category == 'F')].drop(columns=['Category']).set_index(['Detector', 'Generator', 'Platform']).unstack().style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None)
display(temp)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{7}{*}{', '\\multirow[c]{7}{*}{\\rotatebox{90}{\\parbox{2cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Unnamed: 0_level_1,Platform,Discord,Gab,Telegram,Twitter,WhatsApp,{all}
Detector,Generator,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Aya-101-MultiSocial,Mistral-7B-Instruct-v0.2,1.0,0.98,0.99,0.99,1.0,0.99
Aya-101-MultiSocial,aya-101,0.99,0.91,0.97,0.96,0.95,0.96
Aya-101-MultiSocial,gemini,0.98,0.86,0.95,0.93,0.93,0.93
Aya-101-MultiSocial,gpt-3.5-turbo-0125,1.0,0.98,0.99,1.0,1.0,0.99
Aya-101-MultiSocial,opt-iml-max-30b,0.97,0.92,0.97,0.97,0.96,0.95
Aya-101-MultiSocial,v5-Eagle-7B-HF,1.0,0.98,1.0,1.0,0.99,0.99
Aya-101-MultiSocial,vicuna-13b,1.0,0.98,0.99,1.0,1.0,0.99
BLOOMZ-3b-MultiSocial,Mistral-7B-Instruct-v0.2,1.0,0.94,0.97,0.98,0.99,0.97
BLOOMZ-3b-MultiSocial,aya-101,0.98,0.88,0.94,0.95,0.96,0.94
BLOOMZ-3b-MultiSocial,gemini,0.99,0.85,0.94,0.93,0.96,0.93


\begin{tabular}{llrrrrrr}
 &  & \multicolumn{6}{r}{\bfseries AUC ROC} \\
 & Platform & \bfseries Discord & \bfseries Gab & \bfseries Telegram & \bfseries Twitter & \bfseries WhatsApp & \bfseries {all} \\
Detector & Generator &  &  &  &  &  &  \\
\hline
\multirow[c]{7}{*}{\rotatebox{90}{\parbox{2cm}{\bfseries Aya-101-MultiSocial}}} & \bfseries Mistral-7B-Instruct-v0.2 & {\cellcolor[HTML]{75A9CF}} \color[HTML]{000000} 1.00 & {\cellcolor[HTML]{7EADD1}} \color[HTML]{000000} 0.98 & {\cellcolor[HTML]{76AAD0}} \color[HTML]{000000} 0.99 & {\cellcolor[HTML]{76AAD0}} \color[HTML]{000000} 0.99 & {\cellcolor[HTML]{75A9CF}} \color[HTML]{000000} 1.00 & {\cellcolor[HTML]{78ABD0}} \color[HTML]{000000} 0.99 \\
\bfseries  & \bfseries aya-101 & {\cellcolor[HTML]{79ABD0}} \color[HTML]{000000} 0.99 & {\cellcolor[HTML]{99B8D8}} \color[HTML]{000000} 0.91 & {\cellcolor[HTML]{81AED2}} \color[HTML]{000000} 0.97 & {\cellcolor[HTML]{83AFD3}} \color[HTML]{000000} 0.96 & {\cellcolor[HTML]{88B1D4}} \color[HTML]{0000

  print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{7}{*}{', '\\multirow[c]{7}{*}{\\rotatebox{90}{\\parbox{2cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))


## RQ1: Zero-shot MGTD

In [37]:
#due to low performance
to_exclude = ['ruRoBERTa-ruatd-binary', 'RoBERTa-large-OpenAI-Detector',
       'Longformer Detector']

In [38]:
temp_results_all = results_all.copy()
results_all = results_perplatform.copy()

In [39]:
results_all = results_all[~results_all.Detector.isin(to_exclude)]

In [40]:
#per platform mean per detector category
temp = pd.DataFrame(multidomain[multidomain.domain=='social_media'][['source', 'language']].value_counts())
temp = temp.reset_index()
temp['source'] = temp['source'].str.replace('multisocial_', '')
temp = temp.groupby(['source', 'language']).sum().unstack().fillna(0).astype(int) > 2000 #if there are enough samples (~ 250 of each generator)
enough_platform_language_samples = pd.concat([temp, temp])
enough_platform_language_samples[('count', '{all}')] = True

temp = results_all[(results_all.Category != 'F')].groupby(['Category', 'Platform', 'Language']).mean(numeric_only=True).unstack()
enough_platform_language_samples.index = temp.index
enough_platform_language_samples.columns = temp.columns
temp[~enough_platform_language_samples] = 'nan'
platforms = temp.copy()
#temp = temp.style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None, gmap=temp).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')
#display(temp)
#print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{5}{*}{', '\\multirow[c]{5}{*}{\\rotatebox{90}{\\parbox{2cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))


  temp[~enough_platform_language_samples] = 'nan'


In [41]:
#social media test data

auc_dict = {}
for detector in tqdm(test_results, total=len(test_results)):
  for model,data in detector.items():
    temp = data[(data.split == 'test') & (data.domain == 'social_media')]
  if (to_category[model] == 'F') and ('social-media' not in model): continue
  #if (temp['prediction_probs'].min() >= 0.5) and (temp['prediction_probs'].max() <= 1.0):
    #temp.loc[temp.predictions == 'human', 'prediction_probs'] = 1 - temp['prediction_probs'] #adjust human prediction prob to be in range <0, 0.5)
  labels = [label2id[x] for x in temp['label']]
  fpr, tpr, thresholds = roc_curve(labels, temp['prediction_probs'])
  auc_dict[model] = {'auc':  auc_roc_reliable(fpr, tpr, labels), 'th_optim': thresholds[np.argmax(tpr - fpr)], 'tpr_1%fpr': tpr[fpr <= 0.01][-1], 'tpr_5%fpr': tpr[fpr <= 0.05][-1], 'th_1%fpr': thresholds[fpr <= 0.01][-1], 'th_3%fpr': thresholds[fpr <= 0.03][-1], 'th_5%fpr': thresholds[fpr <= 0.05][-1], 'th_10%fpr': thresholds[fpr <= 0.10][-1], 'th_15%fpr': thresholds[fpr <= 0.15][-1], 'th_20%fpr': thresholds[fpr <= 0.20][-1], 'th_25%fpr': thresholds[fpr <= 0.25][-1], 'th_30%fpr': thresholds[fpr <= 0.30][-1], 'th_40%fpr': thresholds[fpr <= 0.40][-1], 'th_50%fpr': thresholds[fpr <= 0.50][-1]}
  for test_language in temp.language.unique():
    temp2 = temp[temp.language == test_language].reset_index(drop=True)
    labels = [label2id[x] for x in temp2['label']]
    fpr, tpr, thresholds = roc_curve(labels, temp2['prediction_probs'])
    auc_dict[model][test_language] = {'auc':  auc_roc_reliable(fpr, tpr, labels), 'th_optim': thresholds[np.argmax(tpr - fpr)], 'tpr_1%fpr': tpr[fpr <= 0.01][-1], 'tpr_5%fpr': tpr[fpr <= 0.05][-1], 'th_1%fpr': thresholds[fpr <= 0.01][-1], 'th_3%fpr': thresholds[fpr <= 0.03][-1], 'th_5%fpr': thresholds[fpr <= 0.05][-1], 'th_10%fpr': thresholds[fpr <= 0.10][-1], 'th_15%fpr': thresholds[fpr <= 0.15][-1], 'th_20%fpr': thresholds[fpr <= 0.20][-1], 'th_25%fpr': thresholds[fpr <= 0.25][-1], 'th_30%fpr': thresholds[fpr <= 0.30][-1], 'th_40%fpr': thresholds[fpr <= 0.40][-1], 'th_50%fpr': thresholds[fpr <= 0.50][-1]}
results_all = pd.DataFrame()
for model, v in tqdm(auc_dict.items(), total=len(auc_dict)):
  temp = pd.DataFrame({'Detector': model, 'Language': '{all}', 'AUC ROC': v['auc']}, index=[0])
  results_all = pd.concat([results_all, temp])
  for test_language,val in v.items():
    if (test_language == 'auc') or ('_' in test_language): continue
    temp = pd.DataFrame({'Detector': model, 'Language': test_language, 'AUC ROC': val['auc']}, index=[0])
    results_all = pd.concat([results_all, temp])
#results_all.set_index(['Detector', 'Language']).unstack().sort_values(by=('AUC ROC','{all}'), ascending=False).style.format(precision=2).apply(lambda x: ["background-color: lightyellow" if v >= 0.6 else "" for v in x], axis = 0, subset='AUC ROC').highlight_max(props='font-weight: bold;', axis=0)
results_all['Category'] = results_all['Detector'].map(to_category)
results_all['Detector'] = results_all['Detector'].apply(rename_detector)
results_perlanguage = results_all.copy()

100%|███████████████████████████████████████████████████████████████████████████████████| 83/83 [00:13<00:00,  6.14it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 120.02it/s]


In [42]:
temp = results_all[results_all.Category != 'F'].set_index(['Category', 'Detector', 'Language']).unstack().sort_values(by=('AUC ROC','{all}'), ascending=False).sort_index().style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')
display(temp)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('nan', 'N/A'))

  temp = results_all[results_all.Category != 'F'].set_index(['Category', 'Detector', 'Language']).unstack().sort_values(by=('AUC ROC','{all}'), ascending=False).sort_index().style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')


Unnamed: 0_level_0,Unnamed: 1_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Unnamed: 0_level_1,Language,ar,bg,ca,cs,de,el,en,es,et,ga,gd,hr,hu,nl,pl,pt,ro,ru,sk,sl,uk,zh,{all}
Category,Detector,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
P,BLOOMZ-3b-mixed-Detector,0.79,0.74,0.79,0.8,0.77,0.76,0.8,0.79,0.83,0.78,0.66,0.77,0.84,0.75,0.78,0.79,0.66,0.68,0.76,0.64,0.7,0.69,0.76
P,ChatGPT-Detector-RoBERTa-Chinese,0.72,0.8,0.76,0.66,0.8,0.75,0.9,0.81,0.82,0.73,0.63,0.63,0.78,0.7,0.62,0.73,0.75,0.76,0.63,0.66,0.74,0.81,0.72
P,Longformer Detector,0.34,0.48,0.32,0.47,0.43,0.54,0.65,0.43,0.48,0.53,0.49,0.51,0.5,0.41,0.46,0.42,0.41,0.46,0.45,0.46,0.61,0.47,0.46
P,RoBERTa-large-OpenAI-Detector,0.73,0.43,0.43,0.14,0.32,0.74,0.52,0.3,0.2,0.3,0.48,0.19,0.13,0.3,0.21,0.23,0.24,0.54,0.26,0.33,0.36,0.6,0.35
P,ruRoBERTa-ruatd-binary,0.4,0.63,0.56,0.43,0.43,0.35,0.56,0.47,0.43,0.49,0.47,0.48,0.46,0.5,0.44,0.43,0.34,0.7,0.45,0.47,0.59,0.44,0.48
S,Binoculars,0.7,0.68,0.62,0.74,0.75,0.79,0.8,0.76,0.74,0.71,0.71,0.79,0.78,0.72,0.75,0.75,0.74,0.72,0.71,0.73,0.64,0.74,0.72
S,DetectLLM-LRR,0.79,0.86,0.69,0.93,0.78,0.88,0.8,0.82,0.88,0.79,0.74,0.88,0.94,0.79,0.88,0.84,0.87,0.78,0.85,0.79,0.75,0.78,0.75
S,Fast-Detect-GPT,0.75,0.65,0.61,0.81,0.77,0.66,0.8,0.74,0.69,0.7,0.74,0.8,0.77,0.74,0.77,0.77,0.77,0.73,0.71,0.74,0.7,0.74,0.74
S,LLM-Deviation,0.82,0.86,0.68,0.93,0.79,0.89,0.8,0.82,0.9,0.81,0.79,0.89,0.94,0.79,0.89,0.84,0.88,0.78,0.86,0.81,0.75,0.79,0.75
S,S5,0.8,0.85,0.68,0.92,0.78,0.88,0.77,0.81,0.89,0.8,0.78,0.88,0.94,0.77,0.88,0.83,0.88,0.78,0.85,0.8,0.74,0.78,0.74


  print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('nan', 'N/A'))


\begin{tabular}{llrrrrrrrrrrrrrrrrrrrrrrr}
 &  & \multicolumn{23}{r}{\bfseries AUC ROC} \\
 & Language & \bfseries ar & \bfseries bg & \bfseries ca & \bfseries cs & \bfseries de & \bfseries el & \bfseries en & \bfseries es & \bfseries et & \bfseries ga & \bfseries gd & \bfseries hr & \bfseries hu & \bfseries nl & \bfseries pl & \bfseries pt & \bfseries ro & \bfseries ru & \bfseries sk & \bfseries sl & \bfseries uk & \bfseries zh & \bfseries {all} \\
Category & Detector &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\hline
\multirow[c]{5}{*}{\bfseries P} & \bfseries BLOOMZ-3b-mixed-Detector & {\cellcolor[HTML]{C4CBE3}} \color[HTML]{000000} 0.79 & {\cellcolor[HTML]{D2D3E7}} \color[HTML]{000000} 0.74 & {\cellcolor[HTML]{C2CBE2}} \color[HTML]{000000} 0.79 & {\cellcolor[HTML]{C1CAE2}} \color[HTML]{000000} 0.80 & {\cellcolor[HTML]{CACEE5}} \color[HTML]{000000} 0.77 & {\cellcolor[HTML]{CCCFE5}} \color[HTML]{000000} 0.76 & {\cellcolor[HTML]{BFC9E1}} \color[HTML]{000000

In [43]:
results_all = results_all[~results_all.Detector.isin(to_exclude)]

In [44]:
#mean per detector category
temp = pd.DataFrame(multidomain[multidomain.domain=='social_media'][['language']].value_counts())
temp = temp.reset_index()
temp = temp.groupby(['language']).sum().unstack().fillna(0).astype(int) > 2000 #if there are enough samples (~ 250 of each generator)
enough_platform_language_samples = pd.concat([temp, temp], axis=1)
enough_platform_language_samples = enough_platform_language_samples.T
enough_platform_language_samples['{all}'] = True

temp = results_all[(results_all.Category != 'F')].groupby(['Category', 'Language']).mean(numeric_only=True).unstack()
enough_platform_language_samples.index = temp.index
enough_platform_language_samples.columns = temp.columns
temp[~enough_platform_language_samples] = 'nan'
#temp = temp.style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None, gmap=temp).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')
#display(temp)
#print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{5}{*}{', '\\multirow[c]{5}{*}{\\rotatebox{90}{\\parbox{2cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))
temp['Platform'] = '{all}'


  temp[~enough_platform_language_samples] = 'nan'


In [45]:
temp = pd.concat([platforms.reset_index(), temp.reset_index()]).sort_values(['Category', 'Platform']).set_index(['Category', 'Platform'])
temp = temp.style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None, gmap=temp).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')
display(temp)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('nan', 'N/A'))

  temp = temp.style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None, gmap=temp).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')


Unnamed: 0_level_0,Unnamed: 1_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Unnamed: 0_level_1,Language,ar,bg,ca,cs,de,el,en,es,et,ga,gd,hr,hu,nl,pl,pt,ro,ru,sk,sl,uk,zh,{all}
Category,Platform,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
P,Discord,,,0.92,0.81,0.86,,0.91,0.89,0.86,,,0.75,0.84,0.81,0.81,0.82,0.81,,,,,,0.82
P,Gab,,,,,0.71,,0.82,0.71,,,,,,0.71,0.64,0.72,0.61,0.62,,,,,0.66
P,Telegram,0.73,0.77,0.73,0.73,0.79,0.81,0.86,0.8,0.84,,,0.71,0.86,0.68,0.7,0.77,0.77,0.76,,,0.73,0.76,0.74
P,Twitter,0.84,0.79,,,0.78,,0.85,0.73,,,,,,0.73,,0.75,0.65,0.8,,,,,0.73
P,WhatsApp,,,,,,,0.82,0.86,,,,,,,,0.76,,,,,,,0.79
P,{all},0.76,0.77,0.77,0.73,0.78,0.75,0.85,0.8,0.82,0.75,,0.7,0.81,0.72,0.7,0.76,0.71,0.72,0.7,0.65,0.72,0.75,0.74
S,Discord,,,0.85,0.9,0.87,,0.9,0.89,0.86,,,0.89,0.91,0.89,0.92,0.9,0.89,,,,,,0.88
S,Gab,,,,,0.73,,0.76,0.74,,,,,,0.72,0.77,0.73,0.74,0.71,,,,,0.69
S,Telegram,0.76,0.78,0.62,0.88,0.71,0.86,0.81,0.77,0.84,,,0.89,0.91,0.74,0.84,0.82,0.87,0.74,,,0.71,0.75,0.74
S,Twitter,0.76,0.79,,,0.87,,0.83,0.72,,,,,,0.78,,0.86,0.85,0.9,,,,,0.74


  print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('nan', 'N/A'))


\begin{tabular}{llllllllrrlllllllrllllllr}
 &  & \multicolumn{23}{r}{\bfseries AUC ROC} \\
 & Language & \bfseries ar & \bfseries bg & \bfseries ca & \bfseries cs & \bfseries de & \bfseries el & \bfseries en & \bfseries es & \bfseries et & \bfseries ga & \bfseries gd & \bfseries hr & \bfseries hu & \bfseries nl & \bfseries pl & \bfseries pt & \bfseries ro & \bfseries ru & \bfseries sk & \bfseries sl & \bfseries uk & \bfseries zh & \bfseries {all} \\
Category & Platform &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\hline
\multirow[c]{6}{*}{\bfseries P} & \bfseries Discord & {\cellcolor[HTML]{000000}} \color[HTML]{000000} {\cellcolor{white}} N/A & {\cellcolor[HTML]{000000}} \color[HTML]{000000} {\cellcolor{white}} N/A & {\cellcolor[HTML]{94B6D7}} \color[HTML]{000000} 0.92 & {\cellcolor[HTML]{BCC7E1}} \color[HTML]{000000} 0.81 & {\cellcolor[HTML]{ABBFDC}} \color[HTML]{000000} 0.86 & {\cellcolor[HTML]{000000}} \color[HTML]{000000} {\cellcolor{white}} N/A & {\cell

In [46]:
results_all[(results_all.Category == 'S')].drop(columns=['Category']).set_index(['Detector', 'Language']).unstack().corr().style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')

  results_all[(results_all.Category == 'S')].drop(columns=['Category']).set_index(['Detector', 'Language']).unstack().corr().style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')


Unnamed: 0_level_0,Unnamed: 1_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Unnamed: 0_level_1,Language,ar,bg,ca,cs,de,el,en,es,et,ga,gd,hr,hu,nl,pl,pt,ro,ru,sk,sl,uk,zh,{all}
Unnamed: 0_level_2,Language,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
AUC ROC,ar,1.0,0.88,0.88,0.98,0.99,0.63,-0.48,0.84,0.83,0.91,0.9,0.94,0.89,0.95,0.95,0.96,0.96,0.95,0.91,0.97,0.99,0.94,0.89
AUC ROC,bg,0.88,1.0,0.99,0.94,0.87,0.91,-0.49,0.99,0.99,0.99,0.74,0.99,1.0,0.95,0.98,0.97,0.98,0.97,1.0,0.97,0.88,0.98,0.66
AUC ROC,ca,0.88,0.99,1.0,0.95,0.86,0.88,-0.47,0.98,0.98,0.97,0.7,0.98,0.99,0.96,0.98,0.97,0.98,0.98,0.99,0.96,0.89,0.96,0.7
AUC ROC,cs,0.98,0.94,0.95,1.0,0.96,0.72,-0.48,0.91,0.9,0.94,0.82,0.98,0.95,0.98,0.99,0.99,0.99,0.99,0.96,0.98,0.99,0.96,0.86
AUC ROC,de,0.99,0.87,0.86,0.96,1.0,0.61,-0.36,0.84,0.81,0.9,0.88,0.93,0.87,0.95,0.94,0.95,0.94,0.94,0.89,0.95,0.96,0.94,0.89
AUC ROC,el,0.63,0.91,0.88,0.72,0.61,1.0,-0.46,0.92,0.95,0.89,0.54,0.84,0.9,0.75,0.81,0.78,0.8,0.79,0.89,0.81,0.6,0.85,0.29
AUC ROC,en,-0.48,-0.49,-0.47,-0.48,-0.36,-0.46,1.0,-0.38,-0.5,-0.54,-0.64,-0.5,-0.5,-0.34,-0.48,-0.45,-0.51,-0.46,-0.49,-0.51,-0.46,-0.47,-0.2
AUC ROC,es,0.84,0.99,0.98,0.91,0.84,0.92,-0.38,1.0,0.99,0.97,0.67,0.97,0.99,0.94,0.96,0.95,0.95,0.96,0.99,0.94,0.84,0.97,0.63
AUC ROC,et,0.83,0.99,0.98,0.9,0.81,0.95,-0.5,0.99,1.0,0.98,0.71,0.97,0.99,0.91,0.95,0.94,0.95,0.94,0.99,0.95,0.82,0.97,0.57
AUC ROC,ga,0.91,0.99,0.97,0.94,0.9,0.89,-0.54,0.97,0.98,1.0,0.83,0.99,0.99,0.94,0.98,0.97,0.98,0.97,0.99,0.99,0.89,0.99,0.66


### Statistics

In [47]:
#statistical significance of per-language per-platform results between statistical and pretrained detectors
temp = results_perplatform.copy()
temp = temp[temp['Category'] != 'F'].reset_index(drop=True)
temp = temp[~temp.Detector.isin(to_exclude)]
languages = temp.Language.unique()
temp = temp.pivot(index=['Category', 'Platform', 'Detector'], columns='Language', values='AUC ROC').reset_index()
#temp = pd.concat([temp, temp.loc[temp.Category == 'P', :]])

res_df = pd.DataFrame()
for (src, trg) in itertools.combinations_with_replacement(temp.Category.unique(), 2):
 if src != trg: continue
 for (psrc, ptrg) in itertools.combinations_with_replacement(temp.Platform.unique(), 2):
  if psrc == ptrg: continue
  for lang in languages[languages == '{all}']:
   try:
    print(f"\nTest language: {lang}, ({ptrg}, {psrc}), ({trg}, {src})")
    res = pg.ttest(temp[(temp["Category"] == trg) & (temp["Platform"] == ptrg)][lang], temp[(temp["Category"] == src) & (temp["Platform"] == psrc)][lang], paired=True)
    display(res.style.apply(lambda _: np.where(res['p-val'] >= 0.05, 'background-color: yellow', '')))
   except:
    pass


Test language: {all}, (Gab, Discord), (P, P)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.877336,1,two-sided,0.21294,[-0.85 0.54],3.133956,1.271,0.271963



Test language: {all}, (Telegram, Discord), (P, P)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-5.755065,1,two-sided,0.109525,[-0.26 0.1 ],1.296543,1.772,0.116036



Test language: {all}, (Twitter, Discord), (P, P)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.497792,1,two-sided,0.374767,[-0.85 0.67],1.757089,0.891,0.154842



Test language: {all}, (WhatsApp, Discord), (P, P)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.571369,1,two-sided,0.66953,[-0.66 0.6 ],0.5651,0.604,0.065096



Test language: {all}, (Telegram, Gab), (P, P)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.897459,1,two-sided,0.308779,[-0.44 0.6 ],2.126487,1.013,0.186576



Test language: {all}, (Twitter, Gab), (P, P)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,13.401825,1,two-sided,0.047415,[0. 0.13],6.26383,2.431,0.512958



Test language: {all}, (WhatsApp, Gab), (P, P)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,24.62022,1,two-sided,0.025843,[0.06 0.2 ],27.295356,2.914,0.997543



Test language: {all}, (Twitter, Telegram), (P, P)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.208231,1,two-sided,0.869304,[-0.59 0.57],0.255369,0.535,0.053212



Test language: {all}, (WhatsApp, Telegram), (P, P)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.454036,1,two-sided,0.383532,[-0.4 0.5],1.4319,0.877,0.127233



Test language: {all}, (WhatsApp, Twitter), (P, P)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,5.941812,1,two-sided,0.106148,[-0.07 0.19],6.264069,1.796,0.512975



Test language: {all}, (Gab, Discord), (S, S)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-5.310034,4,two-sided,0.006046,[-0.29 -0.09],3.307932,10.145,0.999481



Test language: {all}, (Telegram, Discord), (S, S)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-4.063327,4,two-sided,0.015307,[-0.24 -0.04],2.486409,5.184,0.980799



Test language: {all}, (Twitter, Discord), (S, S)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-3.869102,4,two-sided,0.018007,[-0.23 -0.04],2.39952,4.611,0.973825



Test language: {all}, (WhatsApp, Discord), (S, S)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-3.229739,4,two-sided,0.031983,[-0.29 -0.02],2.582885,3.052,0.986607



Test language: {all}, (Telegram, Gab), (S, S)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,21.01759,4,two-sided,3e-05,[0.04 0.05],8.459388,499.56,1.0



Test language: {all}, (Twitter, Gab), (S, S)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,30.839816,4,two-sided,7e-06,[0.05 0.06],10.314938,1559.308,1.0



Test language: {all}, (WhatsApp, Gab), (S, S)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.325739,4,two-sided,0.080628,[-0.01 0.07],1.411813,1.583,0.661114



Test language: {all}, (Twitter, Telegram), (S, S)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,6.616423,4,two-sided,0.002706,[0. 0.01],1.030571,18.195,0.420866



Test language: {all}, (WhatsApp, Telegram), (S, S)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.087172,4,two-sided,0.338086,[-0.06 0.02],0.70408,0.611,0.229811



Test language: {all}, (WhatsApp, Twitter), (S, S)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.491922,4,two-sided,0.209995,[-0.06 0.02],0.938104,0.824,0.362619


In [48]:
#statistical significance of per-language results between statistical and pretrained detectors
temp = results_perlanguage.copy()
temp = temp[temp['Category'] != 'F'].reset_index(drop=True)
temp = temp[~temp.Detector.isin(to_exclude)]
languages = temp.Language.unique()
temp = temp.pivot(index=['Category', 'Detector'], columns='Language', values='AUC ROC').reset_index()

res_df = pd.DataFrame()
for (src, trg) in itertools.combinations_with_replacement(temp.Category.unique(), 2):
  for lang in languages:
   if src == trg: continue
   print(f"\nTest language: {lang}, ({trg}, {src})")
   res = pg.ttest(temp[temp["Category"] == trg][lang], temp[temp["Category"] == src][lang], paired=True)
   display(res.style.apply(lambda _: np.where(res['p-val'] >= 0.05, 'background-color: yellow', '')))


Test language: {all}, (S, P)




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.178734,1.090455,two-sided,0.88551,[-0.2 0.2],0.240264,0.574,0.056449



Test language: ro, (S, P)




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.380732,2.037148,two-sided,0.137959,[-0.1 0.34],1.91635,1.684,0.457131



Test language: cs, (S, P)




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.762482,1.772303,two-sided,0.235861,[-0.24 0.51],1.524431,1.13,0.316438



Test language: pl, (S, P)




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.531669,1.287376,two-sided,0.326203,[-0.51 0.77],1.676588,0.981,0.369323



Test language: hr, (S, P)




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.949265,1.208677,two-sided,0.266867,[-0.51 0.81],2.282017,1.272,0.592646



Test language: ar, (S, P)




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.420753,1.910227,two-sided,0.716521,[-0.15 0.19],0.349586,0.598,0.063693



Test language: en, (S, P)




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.173821,1.024218,two-sided,0.445634,[-0.62 0.51],1.763835,0.803,0.400804



Test language: pt, (S, P)




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.341353,1.974474,two-sided,0.313278,[-0.11 0.2 ],1.096135,0.879,0.188354



Test language: de, (S, P)




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.487024,1.38869,two-sided,0.691968,[-0.11 0.1 ],0.497627,0.607,0.077893



Test language: es, (S, P)




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.347482,4.426025,two-sided,0.744133,[-0.06 0.05],0.209105,0.588,0.054882



Test language: nl, (S, P)




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.327031,1.474589,two-sided,0.353245,[-0.14 0.22],1.292926,0.872,0.242774



Test language: ca, (S, P)




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-5.03753,3.043229,two-sided,0.014569,[-0.2 -0.04],3.455464,8.79,0.905004



Test language: uk, (S, P)




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.160771,3.128454,two-sided,0.882098,[-0.1 0.09],0.109234,0.573,0.05133



Test language: el, (S, P)




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.508357,4.272075,two-sided,0.201503,[-0.05 0.19],0.766304,0.968,0.11689



Test language: sk, (S, P)




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.412788,1.7416,two-sided,0.309984,[-0.25 0.45],1.234197,0.915,0.225677



Test language: hu, (S, P)




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.339278,4.690428,two-sided,0.241707,[-0.06 0.19],0.785091,0.878,0.120265



Test language: ga, (S, P)




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.248727,3.54044,two-sided,0.817321,[-0.09 0.11],0.162065,0.579,0.05293



Test language: et, (S, P)




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.135479,4.277115,two-sided,0.898372,[-0.13 0.11],0.06885,0.572,0.050528



Test language: gd, (S, P)




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,5.374921,3.105213,two-sided,0.011535,[0.04 0.17],3.661288,10.595,0.932711



Test language: bg, (S, P)




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.181972,4.75916,two-sided,0.8631,[-0.14 0.16],0.105839,0.574,0.051249



Test language: ru, (S, P)




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.949568,1.279354,two-sided,0.486935,[-0.28 0.36],1.045888,0.719,0.17582



Test language: sl, (S, P)




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,6.862711,4.954537,two-sided,0.001041,[0.08 0.17],3.712047,22.583,0.938458



Test language: zh, (S, P)




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.264622,1.067859,two-sided,0.833141,[-0.63 0.66],0.368117,0.58,0.065193


In [49]:
#statistical significance of per-language results between EN and non-EN languages
temp = results_perlanguage.copy()
temp = temp[temp['Category'] != 'F'].reset_index(drop=True)
#temp = temp[~temp.Detector.isin(to_exclude)]
languages = temp.Language.unique()
display(temp.head())
#temp = temp.pivot(index=['Category', 'Detector'], columns='Language', values='AUC ROC').reset_index()
#display(temp.head())

res_df = pd.DataFrame()
for lang in languages:
   if lang == 'en': continue
   print(f"\nTest language: en vs {lang}")
   res = pg.ttest(temp[temp['Language'] == 'en']['AUC ROC'], temp[temp['Language'] == lang]['AUC ROC'], paired=True)
   display(res.style.apply(lambda _: np.where(res['p-val'] >= 0.05, 'background-color: yellow', '')))

print(f"\nTest language: en vs others")
res = pg.ttest(temp[temp['Language'] == 'en']['AUC ROC'], temp[(temp['Language'] != 'en') & (temp['Language'] != '{all}')]['AUC ROC'], paired=True)
display(res.style.apply(lambda _: np.where(res['p-val'] >= 0.05, 'background-color: yellow', '')))

print('*' * 10, 'pretrained')
res_df = pd.DataFrame()
for lang in languages:
   if lang == 'en': continue
   print(f"\nTest language: en vs {lang}")
   res = pg.ttest(temp[(temp.Category == 'P') & (temp['Language'] == 'en')]['AUC ROC'], temp[(temp.Category == 'P') & (temp['Language'] == lang)]['AUC ROC'], paired=True)
   display(res.style.apply(lambda _: np.where(res['p-val'] >= 0.05, 'background-color: yellow', '')))

print(f"\nTest language: en vs others")
res = pg.ttest(temp[(temp.Category == 'P') & (temp['Language'] == 'en')]['AUC ROC'], temp[(temp.Category == 'P') & (temp['Language'] != 'en') & (temp['Language'] != '{all}')]['AUC ROC'], paired=True)
display(res.style.apply(lambda _: np.where(res['p-val'] >= 0.05, 'background-color: yellow', '')))

print('*' * 10, 'statistical')
res_df = pd.DataFrame()
for lang in languages:
   if lang == 'en': continue
   print(f"\nTest language: en vs {lang}")
   res = pg.ttest(temp[(temp.Category == 'S') & (temp['Language'] == 'en')]['AUC ROC'], temp[(temp.Category == 'S') & (temp['Language'] == lang)]['AUC ROC'], paired=True)
   display(res.style.apply(lambda _: np.where(res['p-val'] >= 0.05, 'background-color: yellow', '')))

print(f"\nTest language: en vs others")
res = pg.ttest(temp[(temp.Category == 'S') & (temp['Language'] == 'en')]['AUC ROC'], temp[(temp.Category == 'S') & (temp['Language'] != 'en') & (temp['Language'] != '{all}')]['AUC ROC'], paired=True)
display(res.style.apply(lambda _: np.where(res['p-val'] >= 0.05, 'background-color: yellow', '')))

Unnamed: 0,Detector,Language,AUC ROC,Category
0,BLOOMZ-3b-mixed-Detector,{all},0.755251,P
1,BLOOMZ-3b-mixed-Detector,ro,0.662115,P
2,BLOOMZ-3b-mixed-Detector,cs,0.796334,P
3,BLOOMZ-3b-mixed-Detector,pl,0.782974,P
4,BLOOMZ-3b-mixed-Detector,hr,0.773144,P



Test language: en vs {all}


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,4.923321,9,two-sided,0.000821,[0.05 0.14],0.669497,48.181,0.472387



Test language: en vs ro


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.913002,9,two-sided,0.088037,[-0.02 0.18],0.444895,1.158,0.242505



Test language: en vs cs


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.027632,9,two-sided,0.330949,[-0.07 0.18],0.276258,0.475,0.12276



Test language: en vs pl


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.540991,9,two-sided,0.157708,[-0.03 0.18],0.38802,0.766,0.195824



Test language: en vs hr


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.189933,9,two-sided,0.264514,[-0.05 0.16],0.304998,0.544,0.139101



Test language: en vs ar


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.2072,9,two-sided,0.258124,[-0.05 0.15],0.362592,0.553,0.17697



Test language: en vs pt


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.96615,9,two-sided,0.080841,[-0.01 0.16],0.422061,1.232,0.223038



Test language: en vs de


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.16647,9,two-sided,0.011431,[0.02 0.14],0.498717,5.57,0.29194



Test language: en vs es


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.243733,9,two-sided,0.051528,[-0. 0.13],0.390507,1.723,0.197737



Test language: en vs nl


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.251827,9,two-sided,0.009968,[0.03 0.16],0.611802,6.215,0.408713



Test language: en vs ca


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,4.21324,9,two-sided,0.002262,[0.06 0.19],0.937518,20.769,0.751417



Test language: en vs uk


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,4.082358,9,two-sided,0.002748,[0.04 0.13],0.680448,17.693,0.484598



Test language: en vs el


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.341213,9,two-sided,0.740774,[-0.08 0.12],0.101038,0.325,0.059485



Test language: en vs sk


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.189165,9,two-sided,0.056327,[-0. 0.18],0.514225,1.611,0.307029



Test language: en vs hu


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.60004,9,two-sided,0.56328,[-0.09 0.15],0.153945,0.36,0.072158



Test language: en vs ga


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.03676,9,two-sided,0.014092,[0.02 0.13],0.51002,4.715,0.302904



Test language: en vs et


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.259396,9,two-sided,0.239567,[-0.04 0.15],0.286221,0.58,0.128231



Test language: en vs gd


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.600626,9,two-sided,0.005743,[0.03 0.15],0.724945,9.694,0.534217



Test language: en vs bg


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.327484,9,two-sided,0.217034,[-0.03 0.11],0.297998,0.618,0.134963



Test language: en vs ru


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.551269,9,two-sided,0.155251,[-0.02 0.11],0.399798,0.774,0.204989



Test language: en vs sl


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.367375,9,two-sided,0.008291,[0.03 0.16],0.666776,7.205,0.469356



Test language: en vs zh


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.425819,9,two-sided,0.038244,[0. 0.11],0.442645,2.162,0.240544



Test language: en vs others




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.673935,11.008202,two-sided,0.122288,[-0.02 0.16],0.37709,0.946,0.212747


********** pretrained

Test language: en vs {all}


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,4.57209,4,two-sided,0.010245,[0.05 0.21],0.776762,6.926,0.268121



Test language: en vs ro


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,8.065167,4,two-sided,0.001284,[0.13 0.27],1.061897,31.353,0.440982



Test language: en vs cs


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.980614,4,two-sided,0.040714,[0.01 0.35],0.867598,2.568,0.319946



Test language: en vs pl


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.510311,4,two-sided,0.024665,[0.04 0.32],0.949945,3.677,0.369954



Test language: en vs hr


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.972019,4,two-sided,0.041062,[0.01 0.32],0.877515,2.553,0.325834



Test language: en vs ar


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.961611,4,two-sided,0.3907,[-0.16 0.33],0.456098,0.562,0.125179



Test language: en vs pt


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.510634,4,two-sided,0.024658,[0.03 0.29],0.811631,3.678,0.287539



Test language: en vs de


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,4.173445,4,two-sided,0.013992,[0.05 0.23],0.712599,5.53,0.23414



Test language: en vs es


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.239539,4,two-sided,0.031688,[0.02 0.23],0.628502,3.073,0.193421



Test language: en vs nl


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.816556,4,two-sided,0.018834,[0.04 0.27],0.872066,4.464,0.322594



Test language: en vs ca


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.885813,4,two-sided,0.132389,[-0.05 0.28],0.610892,1.122,0.185485



Test language: en vs uk


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.409381,4,two-sided,0.073604,[-0.01 0.18],0.551681,1.687,0.160381



Test language: en vs el


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.741346,4,two-sided,0.499647,[-0.16 0.27],0.327583,0.493,0.088525



Test language: en vs sk


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,4.146451,4,two-sided,0.014301,[0.06 0.29],0.982819,5.444,0.390529



Test language: en vs hu


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.076706,4,two-sided,0.106419,[-0.05 0.33],0.614094,1.304,0.186913



Test language: en vs ga


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.551455,4,two-sided,0.023768,[0.03 0.21],0.662992,3.777,0.209568



Test language: en vs et


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.346638,4,two-sided,0.078804,[-0.02 0.29],0.592036,1.608,0.177224



Test language: en vs gd


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.695155,4,two-sided,0.020924,[0.03 0.24],1.047285,4.139,0.431582



Test language: en vs bg


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.702212,4,two-sided,0.163929,[-0.04 0.18],0.421015,0.971,0.113951



Test language: en vs ru


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.926628,4,two-sided,0.406567,[-0.11 0.23],0.39142,0.55,0.105192



Test language: en vs sl


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,6.8028,4,two-sided,0.002439,[0.1 0.24],1.145239,19.62,0.494876



Test language: en vs zh


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.878908,4,two-sided,0.13345,[-0.04 0.21],0.527093,1.116,0.150688



Test language: en vs others




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.743542,4.517272,two-sided,0.147931,[-0.07 0.33],0.703263,1.156,0.33122


********** statistical

Test language: en vs {all}


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,7.766914,4,two-sided,0.001481,[0.04 0.07],5.370806,28.237,1.0



Test language: en vs ro


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.059245,4,two-sided,0.349213,[-0.12 0.06],0.726037,0.6,0.241058



Test language: en vs cs


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.686411,4,two-sided,0.166994,[-0.19 0.05],1.131306,0.959,0.485855



Test language: en vs pl


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.186396,4,two-sided,0.301117,[-0.13 0.05],0.809836,0.655,0.286524



Test language: en vs hr


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.090475,4,two-sided,0.104774,[-0.13 0.02],1.454604,1.318,0.685633



Test language: en vs ar


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.946763,4,two-sided,0.397369,[-0.04 0.09],0.663512,0.557,0.209817



Test language: en vs pt


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.603612,4,two-sided,0.57866,[-0.07 0.05],0.422579,0.46,0.114431



Test language: en vs de


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.324558,4,two-sided,0.080732,[-0. 0.05],1.712358,1.581,0.813008



Test language: en vs es


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.198608,4,two-sided,0.852255,[-0.05 0.06],0.1381,0.404,0.056788



Test language: en vs nl


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.138039,4,two-sided,0.099306,[-0.01 0.08],1.503541,1.368,0.712615



Test language: en vs ca


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,7.20866,4,two-sided,0.001963,[0.09 0.19],5.135386,22.987,1.0



Test language: en vs uk


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.303033,4,two-sided,0.029848,[0.01 0.15],2.294057,3.207,0.962575



Test language: en vs el


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.572219,4,two-sided,0.597798,[-0.15 0.1 ],0.380643,0.454,0.102165



Test language: en vs sk


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.021877,4,two-sided,0.983594,[-0.11 0.11],0.014768,0.397,0.050077



Test language: en vs hu


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.819351,4,two-sided,0.142985,[-0.2 0.04],1.220687,1.065,0.54349



Test language: en vs ga


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.187866,4,two-sided,0.300599,[-0.04 0.11],0.829452,0.656,0.2977



Test language: en vs et


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.514952,4,two-sided,0.633736,[-0.15 0.1 ],0.344264,0.443,0.092587



Test language: en vs gd


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.469193,4,two-sided,0.069007,[-0.01 0.09],1.864327,1.766,0.869711



Test language: en vs bg


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.306905,4,two-sided,0.774229,[-0.12 0.15],0.204056,0.413,0.064857



Test language: en vs ru


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.129397,4,two-sided,0.100276,[-0.01 0.08],1.529536,1.359,0.726452



Test language: en vs sl


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.195402,4,two-sided,0.297954,[-0.03 0.08],0.859886,0.66,0.315397



Test language: en vs zh


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.253167,4,two-sided,0.087338,[-0.01 0.07],1.667215,1.496,0.793465



Test language: en vs others




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.760675,31.236026,two-sided,0.452555,[-0.01 0.02],0.092591,0.491,0.054617


In [50]:
#based on mean AUC across detectors calculated on EN and nonEN samples
non_en = [0.7442879083629403, 0.744780307053057, 0.7420986995803567, 0.7368291096667234, 0.7171516831066395, 0.7503867288357737, 0.7074464884477927]
en = [0.7961044832461726, 0.8008579759826806, 0.7748092906493541, 0.8014880529805979, 0.8015463632661355, 0.8034267459856592, 0.8970331400804356]

print(f"\nTest language: en vs others")
res = pg.ttest(en, non_en, paired=True)
display(res.style.apply(lambda _: np.where(res['p-val'] >= 0.05, 'background-color: yellow', '')))


Test language: en vs others


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.838321,6,two-sided,0.008576,[0.03 0.12],2.532048,7.562,0.999748


## RQ2: Fine-tuned MGTD

In [51]:
#social media test data

auc_dict = {}
for detector in tqdm(test_results, total=len(test_results)):
  for model,data in detector.items():
    temp = data[(data.split == 'test') & (data.domain == 'social_media')]
  if (to_category[model] == 'F') and ('social-media' not in model): continue
  #if (temp['prediction_probs'].min() >= 0.5) and (temp['prediction_probs'].max() <= 1.0):
    #temp.loc[temp.predictions == 'human', 'prediction_probs'] = 1 - temp['prediction_probs'] #adjust human prediction prob to be in range <0, 0.5)
  labels = [label2id[x] for x in temp['label']]
  fpr, tpr, thresholds = roc_curve(labels, temp['prediction_probs'])
  auc_dict[model] = {'auc':  auc_roc_reliable(fpr, tpr, labels), 'th_optim': thresholds[np.argmax(tpr - fpr)], 'tpr_1%fpr': tpr[fpr <= 0.01][-1], 'tpr_5%fpr': tpr[fpr <= 0.05][-1], 'th_1%fpr': thresholds[fpr <= 0.01][-1], 'th_3%fpr': thresholds[fpr <= 0.03][-1], 'th_5%fpr': thresholds[fpr <= 0.05][-1], 'th_10%fpr': thresholds[fpr <= 0.10][-1], 'th_15%fpr': thresholds[fpr <= 0.15][-1], 'th_20%fpr': thresholds[fpr <= 0.20][-1], 'th_25%fpr': thresholds[fpr <= 0.25][-1], 'th_30%fpr': thresholds[fpr <= 0.30][-1], 'th_40%fpr': thresholds[fpr <= 0.40][-1], 'th_50%fpr': thresholds[fpr <= 0.50][-1]}
  for test_language in temp.language.unique():
    temp2 = temp[temp.language == test_language].reset_index(drop=True)
    labels = [label2id[x] for x in temp2['label']]
    fpr, tpr, thresholds = roc_curve(labels, temp2['prediction_probs'])
    auc_dict[model][test_language] = {'auc':  auc_roc_reliable(fpr, tpr, labels), 'th_optim': thresholds[np.argmax(tpr - fpr)], 'tpr_1%fpr': tpr[fpr <= 0.01][-1], 'tpr_5%fpr': tpr[fpr <= 0.05][-1], 'th_1%fpr': thresholds[fpr <= 0.01][-1], 'th_3%fpr': thresholds[fpr <= 0.03][-1], 'th_5%fpr': thresholds[fpr <= 0.05][-1], 'th_10%fpr': thresholds[fpr <= 0.10][-1], 'th_15%fpr': thresholds[fpr <= 0.15][-1], 'th_20%fpr': thresholds[fpr <= 0.20][-1], 'th_25%fpr': thresholds[fpr <= 0.25][-1], 'th_30%fpr': thresholds[fpr <= 0.30][-1], 'th_40%fpr': thresholds[fpr <= 0.40][-1], 'th_50%fpr': thresholds[fpr <= 0.50][-1]}
results_all = pd.DataFrame()
for model, v in tqdm(auc_dict.items(), total=len(auc_dict)):
  temp = pd.DataFrame({'Detector': model, 'Language': '{all}', 'AUC ROC': v['auc']}, index=[0])
  results_all = pd.concat([results_all, temp])
  for test_language,val in v.items():
    if (test_language == 'auc') or ('_' in test_language): continue
    temp = pd.DataFrame({'Detector': model, 'Language': test_language, 'AUC ROC': val['auc']}, index=[0])
    results_all = pd.concat([results_all, temp])
#results_all.set_index(['Detector', 'Language']).unstack().sort_values(by=('AUC ROC','{all}'), ascending=False).style.format(precision=2).apply(lambda x: ["background-color: lightyellow" if v >= 0.6 else "" for v in x], axis = 0, subset='AUC ROC').highlight_max(props='font-weight: bold;', axis=0)
results_all['Category'] = results_all['Detector'].map(to_category)
results_all['Detector'] = results_all['Detector'].apply(rename_detector)
results_perlanguage = results_all.copy()

100%|███████████████████████████████████████████████████████████████████████████████████| 83/83 [00:13<00:00,  6.16it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 126.64it/s]


In [52]:
results_all = results_perlanguage
temp = results_all[results_all.Category == 'F'].drop(columns=['Category']).set_index(['Detector', 'Language']).unstack().sort_values(by=('AUC ROC','{all}'), ascending=False).sort_index().style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')
display(temp)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('nan', 'N/A'))

  temp = results_all[results_all.Category == 'F'].drop(columns=['Category']).set_index(['Detector', 'Language']).unstack().sort_values(by=('AUC ROC','{all}'), ascending=False).sort_index().style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')


Unnamed: 0_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Language,ar,bg,ca,cs,de,el,en,es,et,ga,gd,hr,hu,nl,pl,pt,ro,ru,sk,sl,uk,zh,{all}
Detector,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
Aya-101-MultiSocial,0.97,0.99,0.97,0.98,0.97,0.97,0.98,0.98,0.98,0.95,0.92,0.98,0.99,0.97,0.98,0.98,0.98,0.96,0.98,0.95,0.95,0.97,0.97
BLOOMZ-3b-MultiSocial,0.96,0.98,0.96,0.97,0.95,0.96,0.98,0.97,0.98,0.9,0.82,0.96,0.99,0.94,0.95,0.97,0.95,0.94,0.95,0.88,0.9,0.97,0.96
Falcon-rw-1b-MultiSocial,0.95,0.98,0.97,0.97,0.96,0.96,0.98,0.96,0.98,0.92,0.87,0.96,0.99,0.95,0.96,0.96,0.95,0.94,0.95,0.87,0.91,0.96,0.96
Llama-3-8b-MultiSocial,0.97,0.99,0.98,0.99,0.98,0.97,0.99,0.98,0.99,0.94,0.9,0.98,0.99,0.98,0.98,0.98,0.98,0.96,0.98,0.95,0.95,0.98,0.98
Mistral-7b-v0.1-MultiSocial,0.97,0.99,0.98,0.99,0.98,0.97,0.99,0.98,0.98,0.93,0.93,0.99,1.0,0.97,0.98,0.98,0.97,0.97,0.97,0.94,0.96,0.98,0.98
XLM-RoBERTa-large-MultiSocial,0.95,0.98,0.94,0.98,0.96,0.95,0.96,0.96,0.97,0.88,0.78,0.97,0.99,0.95,0.97,0.95,0.96,0.95,0.96,0.91,0.92,0.93,0.96
mDeBERTa-v3-base-MultiSocial,0.94,0.98,0.94,0.97,0.95,0.94,0.96,0.96,0.98,0.9,0.79,0.97,0.99,0.95,0.96,0.96,0.96,0.93,0.96,0.92,0.93,0.94,0.95


\begin{tabular}{lrrrrrrrrrrrrrrrrrrrrrrr}
 & \multicolumn{23}{r}{\bfseries AUC ROC} \\
Language & \bfseries ar & \bfseries bg & \bfseries ca & \bfseries cs & \bfseries de & \bfseries el & \bfseries en & \bfseries es & \bfseries et & \bfseries ga & \bfseries gd & \bfseries hr & \bfseries hu & \bfseries nl & \bfseries pl & \bfseries pt & \bfseries ro & \bfseries ru & \bfseries sk & \bfseries sl & \bfseries uk & \bfseries zh & \bfseries {all} \\
Detector &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\bfseries Aya-101-MultiSocial & {\cellcolor[HTML]{81AED2}} \color[HTML]{000000} 0.97 & {\cellcolor[HTML]{78ABD0}} \color[HTML]{000000} 0.99 & {\cellcolor[HTML]{7EADD1}} \color[HTML]{000000} 0.97 & {\cellcolor[HTML]{7BACD1}} \color[HTML]{000000} 0.98 & {\cellcolor[HTML]{80AED2}} \color[HTML]{000000} 0.97 & {\cellcolor[HTML]{80AED2}} \color[HTML]{000000} 0.97 & {\cellcolor[HTML]{7EADD1}} \color[HTML]{000000} 0.98 & {\cellcolor[HTML]{7DACD1}} \color[HTML]{000000} 0.98 & 

  print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('nan', 'N/A'))


In [53]:
results_all.Language.unique()

array(['{all}', 'ro', 'cs', 'pl', 'hr', 'ar', 'en', 'pt', 'de', 'es',
       'nl', 'ca', 'uk', 'el', 'sk', 'hu', 'ga', 'et', 'gd', 'bg', 'ru',
       'sl', 'zh'], dtype=object)

In [54]:
sorted_languages = [
                    'el',
                    'ar', 'zh',
                    'ca', 'es', 'pt',
                    'en', 'ga', 'gd',
                    'hu', 'et',
                    'de', 'nl',
                    'cs', 'pl', 'sk',
                    'hr', 'sl',
                    'ro',
                    'bg', 'uk',  'ru',
                    ]
temp = results_all[results_all.Category == 'F'].drop(columns=['Category']).set_index(['Detector', 'Language']).unstack()[[('AUC ROC', x) for x in sorted_languages]].corr(numeric_only=True).reset_index().drop(columns=['level_0']).set_index('Language').style.format(precision=2).background_gradient(cmap='Greens', vmin=-1, vmax=3, text_color_threshold=0, axis=None).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')
display(temp)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True))

  temp = results_all[results_all.Category == 'F'].drop(columns=['Category']).set_index(['Detector', 'Language']).unstack()[[('AUC ROC', x) for x in sorted_languages]].corr(numeric_only=True).reset_index().drop(columns=['level_0']).set_index('Language').style.format(precision=2).background_gradient(cmap='Greens', vmin=-1, vmax=3, text_color_threshold=0, axis=None).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')
  temp = results_all[results_all.Category == 'F'].drop(columns=['Category']).set_index(['Detector', 'Language']).unstack()[[('AUC ROC', x) for x in sorted_languages]].corr(numeric_only=True).reset_index().drop(columns=['level_0']).set_index('Language').style.format(precision=2).background_gradient(cmap='Greens', vmin=-1, vmax=3, text_color_threshold=0, axis=None).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')


Unnamed: 0_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Language,el,ar,zh,ca,es,pt,en,ga,gd,hu,et,de,nl,cs,pl,sk,hr,sl,ro,bg,uk,ru
Language,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
el,1.0,0.92,0.84,0.9,0.88,0.81,0.85,0.83,0.94,0.71,0.76,0.89,0.78,0.85,0.68,0.73,0.74,0.46,0.69,0.63,0.61,0.82
ar,0.92,1.0,0.86,0.83,0.94,0.91,0.75,0.75,0.79,0.7,0.7,0.89,0.76,0.85,0.72,0.82,0.76,0.6,0.75,0.65,0.66,0.87
zh,0.84,0.86,1.0,0.97,0.95,0.96,0.91,0.88,0.85,0.91,0.92,0.78,0.71,0.68,0.53,0.61,0.66,0.41,0.53,0.56,0.49,0.68
ca,0.9,0.83,0.97,1.0,0.9,0.88,0.95,0.93,0.93,0.9,0.94,0.81,0.76,0.72,0.56,0.61,0.67,0.37,0.55,0.57,0.5,0.68
es,0.88,0.94,0.95,0.9,1.0,0.98,0.85,0.78,0.81,0.85,0.8,0.88,0.77,0.82,0.69,0.75,0.78,0.57,0.67,0.68,0.64,0.85
pt,0.81,0.91,0.96,0.88,0.98,1.0,0.8,0.78,0.76,0.87,0.82,0.8,0.71,0.73,0.61,0.7,0.71,0.54,0.61,0.62,0.58,0.76
en,0.85,0.75,0.91,0.95,0.85,0.8,1.0,0.79,0.84,0.79,0.82,0.73,0.63,0.65,0.44,0.45,0.56,0.18,0.38,0.42,0.36,0.63
ga,0.83,0.75,0.88,0.93,0.78,0.78,0.79,1.0,0.94,0.89,0.97,0.79,0.83,0.71,0.62,0.69,0.71,0.49,0.66,0.65,0.6,0.62
gd,0.94,0.79,0.85,0.93,0.81,0.76,0.84,0.94,1.0,0.84,0.89,0.86,0.85,0.81,0.68,0.71,0.77,0.49,0.7,0.7,0.65,0.73
hu,0.71,0.7,0.91,0.9,0.85,0.87,0.79,0.89,0.84,1.0,0.96,0.77,0.81,0.69,0.63,0.65,0.76,0.57,0.61,0.73,0.64,0.65


\begin{tabular}{lrrrrrrrrrrrrrrrrrrrrrr}
 & \multicolumn{22}{r}{\bfseries AUC ROC} \\
Language & \bfseries el & \bfseries ar & \bfseries zh & \bfseries ca & \bfseries es & \bfseries pt & \bfseries en & \bfseries ga & \bfseries gd & \bfseries hu & \bfseries et & \bfseries de & \bfseries nl & \bfseries cs & \bfseries pl & \bfseries sk & \bfseries hr & \bfseries sl & \bfseries ro & \bfseries bg & \bfseries uk & \bfseries ru \\
Language &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\bfseries el & {\cellcolor[HTML]{73C476}} \color[HTML]{000000} 1.00 & {\cellcolor[HTML]{7CC87C}} \color[HTML]{000000} 0.92 & {\cellcolor[HTML]{81CA81}} \color[HTML]{000000} 0.84 & {\cellcolor[HTML]{7DC87E}} \color[HTML]{000000} 0.90 & {\cellcolor[HTML]{7FC97F}} \color[HTML]{000000} 0.88 & {\cellcolor[HTML]{86CC85}} \color[HTML]{000000} 0.81 & {\cellcolor[HTML]{81CA81}} \color[HTML]{000000} 0.85 & {\cellcolor[HTML]{83CB82}} \color[HTML]{000000} 0.83 & {\cellcolor[HTML]{7AC77B}} \color[HTML]

  print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True))


In [55]:
#social media telegram test data - per language of telegram trained mono and multilingually traned EN,ES,RU detectors

auc_dict = {}
for detector in tqdm(test_results, total=len(test_results)):
  for model,data in detector.items():
    temp = data[(data.split == 'test') & (data.domain == 'social_media')]
  if ('cross-language' not in model): continue
  temp['source'] = temp['source'].str.replace('multisocial_', '')
  temp = temp[temp.source == 'telegram']
  #if (temp['prediction_probs'].min() >= 0.5) and (temp['prediction_probs'].max() <= 1.0):
    #temp.loc[temp.predictions == 'human', 'prediction_probs'] = 1 - temp['prediction_probs'] #adjust human prediction prob to be in range <0, 0.5)
  temp['prediction_probs']= temp['prediction_probs'].astype(float)
  labels = [label2id[x] for x in temp['label']]
  fpr, tpr, thresholds = roc_curve(labels, temp['prediction_probs'].astype(float))
  auc_dict[model] = {'auc':  auc_roc_reliable(fpr, tpr, labels), 'th_optim': thresholds[np.argmax(tpr - fpr)], 'tpr_1%fpr': tpr[fpr <= 0.01][-1], 'tpr_5%fpr': tpr[fpr <= 0.05][-1], 'th_1%fpr': thresholds[fpr <= 0.01][-1], 'th_3%fpr': thresholds[fpr <= 0.03][-1], 'th_5%fpr': thresholds[fpr <= 0.05][-1], 'th_10%fpr': thresholds[fpr <= 0.10][-1], 'th_15%fpr': thresholds[fpr <= 0.15][-1], 'th_20%fpr': thresholds[fpr <= 0.20][-1], 'th_25%fpr': thresholds[fpr <= 0.25][-1], 'th_30%fpr': thresholds[fpr <= 0.30][-1], 'th_40%fpr': thresholds[fpr <= 0.40][-1], 'th_50%fpr': thresholds[fpr <= 0.50][-1]}
  for test_language in temp.language.unique():
    temp2 = temp[temp.language == test_language].reset_index(drop=True)
    labels = [label2id[x] for x in temp2['label']]
    fpr, tpr, thresholds = roc_curve(labels, temp2['prediction_probs'])
    auc_dict[model][test_language] = {'auc':  auc_roc_reliable(fpr, tpr, labels), 'th_optim': thresholds[np.argmax(tpr - fpr)], 'tpr_1%fpr': tpr[fpr <= 0.01][-1], 'tpr_5%fpr': tpr[fpr <= 0.05][-1], 'th_1%fpr': thresholds[fpr <= 0.01][-1], 'th_3%fpr': thresholds[fpr <= 0.03][-1], 'th_5%fpr': thresholds[fpr <= 0.05][-1], 'th_10%fpr': thresholds[fpr <= 0.10][-1], 'th_15%fpr': thresholds[fpr <= 0.15][-1], 'th_20%fpr': thresholds[fpr <= 0.20][-1], 'th_25%fpr': thresholds[fpr <= 0.25][-1], 'th_30%fpr': thresholds[fpr <= 0.30][-1], 'th_40%fpr': thresholds[fpr <= 0.40][-1], 'th_50%fpr': thresholds[fpr <= 0.50][-1]}
results_all = pd.DataFrame()
for model, v in tqdm(auc_dict.items(), total=len(auc_dict)):
  temp = pd.DataFrame({'Detector': extract_language(model).split('_')[0], 'Train Language': extract_language(model).split('_')[1], 'Language': '{all}', 'AUC ROC': v['auc']}, index=[0])
  results_all = pd.concat([results_all, temp])
  for test_language,val in v.items():
    if (test_language == 'auc') or ('_' in test_language): continue
    temp = pd.DataFrame({'Detector': extract_language(model).split('_')[0], 'Train Language': extract_language(model).split('_')[1], 'Language': test_language, 'AUC ROC': val['auc']}, index=[0])
    results_all = pd.concat([results_all, temp])
#results_all.set_index(['Detector', 'Language']).unstack().sort_values(by=('AUC ROC','{all}'), ascending=False).style.format(precision=2).apply(lambda x: ["background-color: lightyellow" if v >= 0.6 else "" for v in x], axis = 0, subset='AUC ROC').highlight_max(props='font-weight: bold;', axis=0)
results_all['Detector'] = results_all['Detector'].apply(rename_detector)
results_language = results_all.copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['source'] = temp['source'].str.replace('multisocial_', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['source'] = temp['source'].str.replace('multisocial_', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['source'] = temp['source'].str.replace('multisocial_', '')
A value is tr

In [56]:
results_all = results_language

In [57]:
temp = pd.DataFrame(multidomain[(multidomain.domain=='social_media') & multidomain.source.str.contains('telegram')][['language']].value_counts())
temp = temp.reset_index()
temp = temp.groupby(['language']).sum().unstack().fillna(0).astype(int) > 2000 #if there are enough samples (~ 250 of each generator)
temp.index = temp.index.droplevel(0)
temp['{all}'] = True
enough_platform_language_samples = temp

In [58]:
#temp = results_all.set_index(['Detector', 'Train Language', 'Language']).unstack()
temp = results_all[~results_all['Train Language'].str.contains('x3')].set_index(['Detector', 'Train Language', 'Language']).unstack()
temp[[('AUC ROC', x) for x in enough_platform_language_samples[~enough_platform_language_samples].index.tolist()]] = 'nan'
temp = temp.loc[[(x,y) for x,y in temp.index if ('llama' in x.lower()) or ('mistral' in x.lower()) or ('mdeberta' in x.lower())]]
temp = temp.style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None)
display(temp)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A').replace('nan', 'N/A').replace('\\multirow[c]{4}{*}{', '\\multirow[c]{4}{*}{\\rotatebox{90}{\\parbox{1.8cm}{').replace('} &', '}}} &').replace('{en-es-ru}}}', '{en-es-ru}').replace('mDeBERTa-v3-base', 'mDeBER\-Ta-v3-base'))

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Unnamed: 0_level_1,Language,ar,bg,ca,cs,de,el,en,es,et,ga,gd,hr,hu,nl,pl,pt,ro,ru,sk,sl,uk,zh,{all}
Detector,Train Language,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
Llama-3-8b,en,0.85,0.96,0.76,0.89,0.89,0.9,0.97,0.91,0.93,,,0.96,0.98,0.84,0.92,0.94,0.93,0.91,,,0.85,0.73,0.87
Llama-3-8b,es,0.8,0.94,0.85,0.79,0.9,0.88,0.91,0.95,0.86,,,0.9,0.95,0.84,0.92,0.92,0.92,0.9,,,0.86,0.64,0.83
Llama-3-8b,ru,0.76,0.95,0.67,0.79,0.84,0.88,0.86,0.81,0.76,,,0.83,0.89,0.8,0.9,0.83,0.82,0.95,,,0.9,0.59,0.78
Llama-3-8b,{en-es-ru},0.92,0.98,0.89,0.93,0.95,0.95,0.97,0.96,0.95,,,0.97,0.99,0.89,0.95,0.95,0.95,0.96,,,0.93,0.93,0.93
Mistral-7b-v0.1,en,0.77,0.83,0.81,0.86,0.89,0.85,0.96,0.86,0.89,,,0.92,0.92,0.82,0.81,0.88,0.92,0.8,,,0.75,0.46,0.82
Mistral-7b-v0.1,es,0.77,0.82,0.86,0.85,0.89,0.75,0.83,0.93,0.89,,,0.91,0.94,0.8,0.78,0.91,0.91,0.82,,,0.76,0.51,0.82
Mistral-7b-v0.1,ru,0.78,0.94,0.76,0.89,0.82,0.85,0.82,0.84,0.89,,,0.92,0.9,0.74,0.8,0.87,0.88,0.95,,,0.9,0.44,0.82
Mistral-7b-v0.1,{en-es-ru},0.9,0.94,0.87,0.93,0.91,0.92,0.95,0.93,0.93,,,0.95,0.97,0.85,0.86,0.93,0.94,0.96,,,0.91,0.68,0.9
mDeBERTa-v3-base,en,0.83,0.96,0.76,0.97,0.84,0.93,0.96,0.88,0.94,,,0.96,0.99,0.84,0.92,0.94,0.96,0.9,,,0.84,0.74,0.9
mDeBERTa-v3-base,es,0.86,0.95,0.82,0.96,0.87,0.87,0.94,0.92,0.94,,,0.95,0.98,0.85,0.93,0.94,0.95,0.91,,,0.87,0.82,0.9


\begin{tabular}{llrrrrrrrrrllrrrrrrrllrrr}
 &  & \multicolumn{23}{r}{\bfseries AUC ROC} \\
 & Language & \bfseries ar & \bfseries bg & \bfseries ca & \bfseries cs & \bfseries de & \bfseries el & \bfseries en & \bfseries es & \bfseries et & \bfseries ga & \bfseries gd & \bfseries hr & \bfseries hu & \bfseries nl & \bfseries pl & \bfseries pt & \bfseries ro & \bfseries ru & \bfseries sk & \bfseries sl & \bfseries uk & \bfseries zh & \bfseries {all} \\
Detector & Train Language &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\hline
\multirow[c]{4}{*}{\rotatebox{90}{\parbox{1.8cm}{\bfseries Llama-3-8b}}} & \bfseries en & {\cellcolor[HTML]{ADC1DD}} \color[HTML]{000000} 0.85 & {\cellcolor[HTML]{83AFD3}} \color[HTML]{000000} 0.96 & {\cellcolor[HTML]{CCCFE5}} \color[HTML]{000000} 0.76 & {\cellcolor[HTML]{9FBAD9}} \color[HTML]{000000} 0.89 & {\cellcolor[HTML]{9FBAD9}} \color[HTML]{000000} 0.89 & {\cellcolor[HTML]{9AB8D8}} \color[HTML]{000000} 0.90 & {\cellcolor[HTML]{80A

  print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A').replace('nan', 'N/A').replace('\\multirow[c]{4}{*}{', '\\multirow[c]{4}{*}{\\rotatebox{90}{\\parbox{1.8cm}{').replace('} &', '}}} &').replace('{en-es-ru}}}', '{en-es-ru}').replace('mDeBERTa-v3-base', 'mDeBER\-Ta-v3-base'))


In [59]:
#temp = results_all.set_index(['Detector', 'Train Language', 'Language']).unstack()
temp = results_all[~results_all['Train Language'].str.contains('x3')].set_index(['Detector', 'Train Language', 'Language']).unstack()
temp[[('AUC ROC', x) for x in enough_platform_language_samples[~enough_platform_language_samples].index.tolist()]] = 'nan'
temp = temp.style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None)
display(temp)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A').replace('nan', 'N/A').replace('\\multirow[c]{4}{*}{', '\\multirow[c]{4}{*}{\\rotatebox{90}{\\parbox{1.8cm}{').replace('} &', '}}} &').replace('{en-es-ru}}}', '{en-es-ru}').replace('mDeBERTa-v3-base', 'mDeBER\-Ta-v3-base'))

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Unnamed: 0_level_1,Language,ar,bg,ca,cs,de,el,en,es,et,ga,gd,hr,hu,nl,pl,pt,ro,ru,sk,sl,uk,zh,{all}
Detector,Train Language,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2
Aya-101,en,0.89,0.97,0.84,0.97,0.92,0.95,0.97,0.93,0.96,,,0.96,0.99,0.87,0.95,0.95,0.97,0.93,,,0.9,0.91,0.93
Aya-101,es,0.91,0.97,0.9,0.96,0.95,0.93,0.95,0.96,0.96,,,0.96,0.99,0.9,0.95,0.95,0.97,0.95,,,0.9,0.91,0.94
Aya-101,ru,0.92,0.98,0.89,0.95,0.92,0.94,0.95,0.93,0.96,,,0.96,0.98,0.89,0.95,0.93,0.97,0.97,,,0.94,0.91,0.94
Aya-101,{en-es-ru},0.93,0.98,0.9,0.96,0.94,0.94,0.96,0.95,0.96,,,0.96,0.99,0.89,0.95,0.95,0.97,0.96,,,0.91,0.92,0.94
BLOOMZ-3b,en,0.75,0.91,0.73,0.89,0.87,0.9,0.95,0.74,0.88,,,0.9,0.95,0.84,0.87,0.9,0.92,0.87,,,0.82,0.61,0.82
BLOOMZ-3b,es,0.78,0.85,0.84,0.84,0.85,0.82,0.9,0.93,0.83,,,0.82,0.88,0.8,0.8,0.9,0.88,0.85,,,0.79,0.61,0.81
BLOOMZ-3b,ru,0.69,0.86,0.57,0.8,0.8,0.76,0.81,0.56,0.8,,,0.78,0.83,0.73,0.75,0.74,0.74,0.9,,,0.84,0.64,0.72
BLOOMZ-3b,{en-es-ru},0.87,0.9,0.83,0.87,0.89,0.92,0.95,0.92,0.86,,,0.85,0.92,0.79,0.85,0.92,0.9,0.9,,,0.86,0.88,0.86
Falcon-rw-1b,en,0.74,0.74,0.78,0.85,0.86,0.87,0.95,0.87,0.88,,,0.91,0.95,0.79,0.85,0.89,0.92,0.81,,,0.74,0.8,0.83
Falcon-rw-1b,es,0.78,0.73,0.79,0.85,0.89,0.91,0.86,0.93,0.86,,,0.9,0.92,0.82,0.87,0.92,0.93,0.85,,,0.75,0.8,0.83


\begin{tabular}{llrrrrrrrrrllrrrrrrrllrrr}
 &  & \multicolumn{23}{r}{\bfseries AUC ROC} \\
 & Language & \bfseries ar & \bfseries bg & \bfseries ca & \bfseries cs & \bfseries de & \bfseries el & \bfseries en & \bfseries es & \bfseries et & \bfseries ga & \bfseries gd & \bfseries hr & \bfseries hu & \bfseries nl & \bfseries pl & \bfseries pt & \bfseries ro & \bfseries ru & \bfseries sk & \bfseries sl & \bfseries uk & \bfseries zh & \bfseries {all} \\
Detector & Train Language &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\hline
\multirow[c]{4}{*}{\rotatebox{90}{\parbox{1.8cm}{\bfseries Aya-101}}} & \bfseries en & {\cellcolor[HTML]{9FBAD9}} \color[HTML]{000000} 0.89 & {\cellcolor[HTML]{7EADD1}} \color[HTML]{000000} 0.97 & {\cellcolor[HTML]{B1C2DE}} \color[HTML]{000000} 0.84 & {\cellcolor[HTML]{80AED2}} \color[HTML]{000000} 0.97 & {\cellcolor[HTML]{93B5D6}} \color[HTML]{000000} 0.92 & {\cellcolor[HTML]{88B1D4}} \color[HTML]{000000} 0.95 & {\cellcolor[HTML]{80AED2

  print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A').replace('nan', 'N/A').replace('\\multirow[c]{4}{*}{', '\\multirow[c]{4}{*}{\\rotatebox{90}{\\parbox{1.8cm}{').replace('} &', '}}} &').replace('{en-es-ru}}}', '{en-es-ru}').replace('mDeBERTa-v3-base', 'mDeBER\-Ta-v3-base'))


In [60]:
temp = results_all[results_all['Train Language'] != '{en-es-ru-x3}']
temp['Train'] = 'monolingual'
temp.loc[temp['Train Language'] == '{en-es-ru}', 'Train'] = 'multilingual'
#temp = temp.set_index(['Train', 'Detector', 'Language']).unstack().groupby('Train').mean().style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None)
temp = temp.groupby(['Train', 'Language'])['AUC ROC'].mean().unstack()
temp[[x for x in enough_platform_language_samples[~enough_platform_language_samples].index.tolist()]] = 'nan'
temp = temp.style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None)
display(temp)
#print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{6}{*}{', '\\multirow[c]{6}{*}{\\rotatebox{90}{\\parbox{2cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['Train'] = 'monolingual'


Language,ar,bg,ca,cs,de,el,en,es,et,ga,gd,hr,hu,nl,pl,pt,ro,ru,sk,sl,uk,zh,{all}
Train,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
monolingual,0.81,0.91,0.79,0.89,0.87,0.89,0.91,0.88,0.9,,,0.92,0.94,0.82,0.88,0.9,0.92,0.9,,,0.84,0.73,0.86
multilingual,0.89,0.94,0.86,0.93,0.91,0.93,0.95,0.93,0.93,,,0.94,0.97,0.86,0.91,0.94,0.95,0.93,,,0.89,0.86,0.91


In [61]:
temp = results_all.set_index(['Train Language', 'Detector', 'Language']).unstack().groupby('Train Language').mean().drop(index=['{en-es-ru-x3}'])
temp[[('AUC ROC', x) for x in enough_platform_language_samples[~enough_platform_language_samples].index.tolist()]] = 'nan'
temp = temp.style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None)
display(temp)
#print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{6}{*}{', '\\multirow[c]{6}{*}{\\rotatebox{90}{\\parbox{2cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))

Unnamed: 0_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Language,ar,bg,ca,cs,de,el,en,es,et,ga,gd,hr,hu,nl,pl,pt,ro,ru,sk,sl,uk,zh,{all}
Train Language,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
en,0.81,0.9,0.78,0.91,0.88,0.9,0.96,0.87,0.92,,,0.94,0.97,0.84,0.89,0.92,0.94,0.87,,,0.81,0.73,0.87
es,0.82,0.89,0.85,0.89,0.89,0.87,0.9,0.94,0.9,,,0.92,0.95,0.84,0.88,0.93,0.93,0.88,,,0.82,0.73,0.86
ru,0.81,0.93,0.76,0.87,0.84,0.88,0.87,0.82,0.88,,,0.89,0.91,0.79,0.87,0.86,0.88,0.94,,,0.89,0.73,0.84
{en-es-ru},0.89,0.94,0.86,0.93,0.91,0.93,0.95,0.93,0.93,,,0.94,0.97,0.86,0.91,0.94,0.95,0.93,,,0.89,0.86,0.91


In [62]:
autoregressive = [x for x in results_all.Detector.unique() if ('aya' not in x.lower()) and ('mdeberta' not in x.lower()) and ('xlm' not in x.lower())]

In [63]:
#autoregressive decoder-only models
temp = results_all[results_all.Detector.isin(autoregressive)].set_index(['Train Language', 'Detector', 'Language']).unstack().groupby('Train Language').mean().drop(index=['{en-es-ru-x3}'])
temp[[('AUC ROC', x) for x in enough_platform_language_samples[~enough_platform_language_samples].index.tolist()]] = 'nan'
temp = temp.style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None)
display(temp)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A').replace('nan', 'N/A').replace('\\bfseries {en-es-ru}', '\hline\n\\bfseries {en-es-ru}'))

Unnamed: 0_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Language,ar,bg,ca,cs,de,el,en,es,et,ga,gd,hr,hu,nl,pl,pt,ro,ru,sk,sl,uk,zh,{all}
Train Language,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
en,0.78,0.86,0.77,0.87,0.88,0.88,0.96,0.85,0.89,,,0.92,0.95,0.82,0.86,0.9,0.92,0.85,,,0.79,0.65,0.84
es,0.78,0.84,0.83,0.83,0.88,0.84,0.87,0.94,0.86,,,0.88,0.92,0.81,0.84,0.91,0.91,0.86,,,0.79,0.64,0.82
ru,0.73,0.9,0.68,0.81,0.81,0.84,0.83,0.76,0.82,,,0.84,0.86,0.74,0.82,0.81,0.83,0.93,,,0.87,0.61,0.77
{en-es-ru},0.88,0.92,0.86,0.91,0.91,0.93,0.95,0.94,0.91,,,0.93,0.96,0.84,0.89,0.93,0.93,0.93,,,0.88,0.84,0.9


\begin{tabular}{lrrrrrrrrrllrrrrrrrllrrr}
 & \multicolumn{23}{r}{\bfseries AUC ROC} \\
Language & \bfseries ar & \bfseries bg & \bfseries ca & \bfseries cs & \bfseries de & \bfseries el & \bfseries en & \bfseries es & \bfseries et & \bfseries ga & \bfseries gd & \bfseries hr & \bfseries hu & \bfseries nl & \bfseries pl & \bfseries pt & \bfseries ro & \bfseries ru & \bfseries sk & \bfseries sl & \bfseries uk & \bfseries zh & \bfseries {all} \\
Train Language &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\bfseries en & {\cellcolor[HTML]{C6CCE3}} \color[HTML]{000000} 0.78 & {\cellcolor[HTML]{ABBFDC}} \color[HTML]{000000} 0.86 & {\cellcolor[HTML]{C9CEE4}} \color[HTML]{000000} 0.77 & {\cellcolor[HTML]{A7BDDB}} \color[HTML]{000000} 0.87 & {\cellcolor[HTML]{A5BDDB}} \color[HTML]{000000} 0.88 & {\cellcolor[HTML]{A4BCDA}} \color[HTML]{000000} 0.88 & {\cellcolor[HTML]{84B0D3}} \color[HTML]{000000} 0.96 & {\cellcolor[HTML]{B0C2DE}} \color[HTML]{000000} 0.85 & {\cellcolor

  print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A').replace('nan', 'N/A').replace('\\bfseries {en-es-ru}', '\hline\n\\bfseries {en-es-ru}'))


In [64]:
#correlation between models
temp = results_all[(~results_all['Train Language'].str.contains('{en-es-ru-x3}'))].set_index(['Train Language', 'Language', 'Detector']).unstack().groupby('Train Language').mean()
#temp[[('AUC ROC', x) for x in enough_platform_language_samples[~enough_platform_language_samples].index.tolist()]] = 'nan'
#temp = temp.style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None)
#display(temp)
#print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A').replace('nan', 'N/A').replace('\\bfseries {en-es-ru}', '\hline\n\\bfseries {en-es-ru}'))

In [65]:
temp

Unnamed: 0_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Detector,Aya-101,BLOOMZ-3b,Falcon-rw-1b,Llama-3-8b,Mistral-7b-v0.1,XLM-RoBERTa-large,mDeBERTa-v3-base
Train Language,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
en,0.932335,0.843346,0.84171,0.891368,0.826079,0.897961,0.899458
es,0.937753,0.820825,0.846767,0.870279,0.822502,0.913116,0.906526
ru,0.935936,0.754461,0.799556,0.825385,0.827751,0.930225,0.91644
{en-es-ru},0.939851,0.871641,0.892501,0.940352,0.903409,0.937756,0.913727


In [66]:
sorted_detectors = [
                    'BLOOMZ-3b', 'Falcon-rw-1b', 'Llama-3-8b', 'Mistral-7b-v0.1',
                    'Aya-101', 'XLM-RoBERTa-large', 'mDeBERTa-v3-base'
                    ]
temp[[('AUC ROC', x) for x in sorted_detectors]].corr().style.format(precision=2).background_gradient(cmap='Greens', vmin=-1, vmax=3, text_color_threshold=0, axis=None).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')

  temp[[('AUC ROC', x) for x in sorted_detectors]].corr().style.format(precision=2).background_gradient(cmap='Greens', vmin=-1, vmax=3, text_color_threshold=0, axis=None).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')


Unnamed: 0_level_0,Unnamed: 1_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Unnamed: 0_level_1,Detector,BLOOMZ-3b,Falcon-rw-1b,Llama-3-8b,Mistral-7b-v0.1,Aya-101,XLM-RoBERTa-large,mDeBERTa-v3-base
Unnamed: 0_level_2,Detector,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
AUC ROC,BLOOMZ-3b,1.0,0.94,0.97,0.63,0.24,-0.1,-0.41
AUC ROC,Falcon-rw-1b,0.94,1.0,0.97,0.8,0.55,0.22,-0.1
AUC ROC,Llama-3-8b,0.97,0.97,1.0,0.8,0.38,0.13,-0.19
AUC ROC,Mistral-7b-v0.1,0.63,0.8,0.8,1.0,0.69,0.69,0.43
AUC ROC,Aya-101,0.24,0.55,0.38,0.69,1.0,0.8,0.66
AUC ROC,XLM-RoBERTa-large,-0.1,0.22,0.13,0.69,0.8,1.0,0.95
AUC ROC,mDeBERTa-v3-base,-0.41,-0.1,-0.19,0.43,0.66,0.95,1.0


In [67]:
#not autoregressive models (seq2seq or masked)
temp = results_all[~results_all.Detector.isin(autoregressive)].set_index(['Train Language', 'Detector', 'Language']).unstack().groupby('Train Language').mean().drop(index=['{en-es-ru-x3}'])
temp[[('AUC ROC', x) for x in enough_platform_language_samples[~enough_platform_language_samples].index.tolist()]] = 'nan'
temp = temp.style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None)
display(temp)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A').replace('nan', 'N/A').replace('\\bfseries {en-es-ru}', '\hline\n\\bfseries {en-es-ru}'))

Unnamed: 0_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Language,ar,bg,ca,cs,de,el,en,es,et,ga,gd,hr,hu,nl,pl,pt,ro,ru,sk,sl,uk,zh,{all}
Train Language,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
en,0.85,0.95,0.79,0.96,0.88,0.93,0.96,0.9,0.95,,,0.96,0.99,0.86,0.93,0.94,0.96,0.9,,,0.85,0.83,0.91
es,0.88,0.96,0.86,0.96,0.91,0.92,0.94,0.94,0.95,,,0.96,0.99,0.87,0.93,0.94,0.96,0.92,,,0.87,0.86,0.92
ru,0.91,0.97,0.86,0.96,0.88,0.94,0.94,0.9,0.96,,,0.96,0.98,0.86,0.94,0.93,0.95,0.96,,,0.92,0.9,0.93
{en-es-ru},0.91,0.97,0.87,0.96,0.91,0.93,0.96,0.93,0.96,,,0.96,0.99,0.88,0.94,0.95,0.97,0.94,,,0.9,0.89,0.93


\begin{tabular}{lrrrrrrrrrllrrrrrrrllrrr}
 & \multicolumn{23}{r}{\bfseries AUC ROC} \\
Language & \bfseries ar & \bfseries bg & \bfseries ca & \bfseries cs & \bfseries de & \bfseries el & \bfseries en & \bfseries es & \bfseries et & \bfseries ga & \bfseries gd & \bfseries hr & \bfseries hu & \bfseries nl & \bfseries pl & \bfseries pt & \bfseries ro & \bfseries ru & \bfseries sk & \bfseries sl & \bfseries uk & \bfseries zh & \bfseries {all} \\
Train Language &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\bfseries en & {\cellcolor[HTML]{AFC1DD}} \color[HTML]{000000} 0.85 & {\cellcolor[HTML]{88B1D4}} \color[HTML]{000000} 0.95 & {\cellcolor[HTML]{C2CBE2}} \color[HTML]{000000} 0.79 & {\cellcolor[HTML]{84B0D3}} \color[HTML]{000000} 0.96 & {\cellcolor[HTML]{A4BCDA}} \color[HTML]{000000} 0.88 & {\cellcolor[HTML]{8EB3D5}} \color[HTML]{000000} 0.93 & {\cellcolor[HTML]{83AFD3}} \color[HTML]{000000} 0.96 & {\cellcolor[HTML]{9AB8D8}} \color[HTML]{000000} 0.90 & {\cellcolor

  print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A').replace('nan', 'N/A').replace('\\bfseries {en-es-ru}', '\hline\n\\bfseries {en-es-ru}'))


In [68]:
sorted_languages = [
                    'el',
                    'ar', 'zh',
                    'ca', 'es', 'pt',
                    'en', 'ga', 'gd',
                    'hu', 'et',
                    'de', 'nl',
                    'cs', 'pl', 'sk',
                    'hr', 'sl',
                    'ro',
                    'bg', 'uk',  'ru',
                    ]
temp = results_all.set_index(['Detector', 'Train Language', 'Language']).unstack()[[('AUC ROC', x) for x in sorted_languages]].corr(numeric_only=True).reset_index().drop(columns=['level_0']).set_index('Language').style.format(precision=2).background_gradient(cmap='Greens', vmin=-1, vmax=3, text_color_threshold=0, axis=None).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')
#temp = results_all.set_index(['Detector', 'Train Language', 'Language']).unstack().corr(numeric_only=True).reset_index().drop(columns=['level_0']).set_index('Language').style.format(precision=2).background_gradient(cmap='Greens', vmin=-1, vmax=3, text_color_threshold=0, axis=None).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')
display(temp)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True))

  temp = results_all.set_index(['Detector', 'Train Language', 'Language']).unstack()[[('AUC ROC', x) for x in sorted_languages]].corr(numeric_only=True).reset_index().drop(columns=['level_0']).set_index('Language').style.format(precision=2).background_gradient(cmap='Greens', vmin=-1, vmax=3, text_color_threshold=0, axis=None).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')
  temp = results_all.set_index(['Detector', 'Train Language', 'Language']).unstack()[[('AUC ROC', x) for x in sorted_languages]].corr(numeric_only=True).reset_index().drop(columns=['level_0']).set_index('Language').style.format(precision=2).background_gradient(cmap='Greens', vmin=-1, vmax=3, text_color_threshold=0, axis=None).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')


Unnamed: 0_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Language,el,ar,zh,ca,es,pt,en,ga,gd,hu,et,de,nl,cs,pl,sk,hr,sl,ro,bg,uk,ru
Language,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
el,1.0,0.75,0.76,0.53,0.53,0.69,0.72,0.7,,0.69,0.66,0.58,0.73,0.69,0.85,0.8,0.7,0.72,0.74,0.57,0.55,0.57
ar,0.75,1.0,0.7,0.79,0.66,0.83,0.71,0.85,,0.84,0.84,0.75,0.85,0.86,0.81,0.81,0.78,0.72,0.75,0.73,0.71,0.67
zh,0.76,0.7,1.0,0.49,0.44,0.56,0.57,0.69,,0.57,0.62,0.49,0.63,0.63,0.75,0.78,0.52,0.58,0.61,0.41,0.47,0.49
ca,0.53,0.79,0.49,1.0,0.89,0.84,0.57,0.71,,0.69,0.71,0.82,0.74,0.62,0.55,0.47,0.69,0.5,0.81,0.37,0.37,0.3
es,0.53,0.66,0.44,0.89,1.0,0.84,0.5,0.59,,0.62,0.58,0.71,0.62,0.48,0.55,0.44,0.64,0.5,0.79,0.27,0.22,0.2
pt,0.69,0.83,0.56,0.84,0.84,1.0,0.78,0.75,,0.91,0.82,0.78,0.85,0.78,0.74,0.66,0.84,0.65,0.92,0.46,0.32,0.27
en,0.72,0.71,0.57,0.57,0.5,0.78,1.0,0.69,,0.81,0.7,0.66,0.82,0.72,0.73,0.65,0.69,0.58,0.75,0.46,0.3,0.22
ga,0.7,0.85,0.69,0.71,0.59,0.75,0.69,1.0,,0.73,0.63,0.7,0.83,0.68,0.82,0.77,0.58,0.55,0.65,0.62,0.63,0.57
gd,,,,,,,,,,,,,,,,,,,,,,
hu,0.69,0.84,0.57,0.69,0.62,0.91,0.81,0.73,,1.0,0.88,0.72,0.9,0.88,0.81,0.79,0.9,0.79,0.86,0.59,0.42,0.37


  print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True))


\begin{tabular}{lrrrrrrrrrrrrrrrrrrrrrr}
 & \multicolumn{22}{r}{\bfseries AUC ROC} \\
Language & \bfseries el & \bfseries ar & \bfseries zh & \bfseries ca & \bfseries es & \bfseries pt & \bfseries en & \bfseries ga & \bfseries gd & \bfseries hu & \bfseries et & \bfseries de & \bfseries nl & \bfseries cs & \bfseries pl & \bfseries sk & \bfseries hr & \bfseries sl & \bfseries ro & \bfseries bg & \bfseries uk & \bfseries ru \\
Language &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  \\
\bfseries el & {\cellcolor[HTML]{73C476}} \color[HTML]{000000} 1.00 & {\cellcolor[HTML]{8BCF89}} \color[HTML]{000000} 0.75 & {\cellcolor[HTML]{8ACE88}} \color[HTML]{000000} 0.76 & {\cellcolor[HTML]{9FD899}} \color[HTML]{000000} 0.53 & {\cellcolor[HTML]{9FD899}} \color[HTML]{000000} 0.53 & {\cellcolor[HTML]{90D18D}} \color[HTML]{000000} 0.69 & {\cellcolor[HTML]{8DD08A}} \color[HTML]{000000} 0.72 & {\cellcolor[HTML]{90D18D}} \color[HTML]{000000} 0.70 & {\cellcolor[HTML]{000000}} \color[HTML]

In [69]:

#social media telegram test data - per language-family of telegram trained mono and multilingually traned EN,ES,RU detectors
auc_dict = {}
for detector in tqdm(test_results, total=len(test_results)):
  for model,data in detector.items():
    temp = data[(data.split == 'test') & (data.domain == 'social_media')]
  if ('cross-language' not in model): continue
  temp['source'] = temp['source'].str.replace('multisocial_', '')
  temp = temp[temp.source == 'telegram']
  #if (temp['prediction_probs'].min() >= 0.5) and (temp['prediction_probs'].max() <= 1.0):
    #temp.loc[temp.predictions == 'human', 'prediction_probs'] = 1 - temp['prediction_probs'] #adjust human prediction prob to be in range <0, 0.5)
  temp['language'] = temp['language'].map(to_language_family)
  temp['prediction_probs']= temp['prediction_probs'].astype(float)
  labels = [label2id[x] for x in temp['label']]
  fpr, tpr, thresholds = roc_curve(labels, temp['prediction_probs'].astype(float))
  auc_dict[model] = {'auc': auc_roc_reliable(fpr, tpr, labels), 'th_optim': thresholds[np.argmax(tpr - fpr)], 'tpr_1%fpr': tpr[fpr <= 0.01][-1], 'tpr_5%fpr': tpr[fpr <= 0.05][-1], 'th_1%fpr': thresholds[fpr <= 0.01][-1], 'th_3%fpr': thresholds[fpr <= 0.03][-1], 'th_5%fpr': thresholds[fpr <= 0.05][-1], 'th_10%fpr': thresholds[fpr <= 0.10][-1], 'th_15%fpr': thresholds[fpr <= 0.15][-1], 'th_20%fpr': thresholds[fpr <= 0.20][-1], 'th_25%fpr': thresholds[fpr <= 0.25][-1], 'th_30%fpr': thresholds[fpr <= 0.30][-1], 'th_40%fpr': thresholds[fpr <= 0.40][-1], 'th_50%fpr': thresholds[fpr <= 0.50][-1]}
  for test_language in temp.language.unique():
    temp2 = temp[temp.language == test_language].reset_index(drop=True)
    labels = [label2id[x] for x in temp2['label']]
    fpr, tpr, thresholds = roc_curve(labels, temp2['prediction_probs'])
    auc_dict[model][test_language] = {'auc': auc_roc_reliable(fpr, tpr, labels), 'th_optim': thresholds[np.argmax(tpr - fpr)], 'tpr_1%fpr': tpr[fpr <= 0.01][-1], 'tpr_5%fpr': tpr[fpr <= 0.05][-1], 'th_1%fpr': thresholds[fpr <= 0.01][-1], 'th_3%fpr': thresholds[fpr <= 0.03][-1], 'th_5%fpr': thresholds[fpr <= 0.05][-1], 'th_10%fpr': thresholds[fpr <= 0.10][-1], 'th_15%fpr': thresholds[fpr <= 0.15][-1], 'th_20%fpr': thresholds[fpr <= 0.20][-1], 'th_25%fpr': thresholds[fpr <= 0.25][-1], 'th_30%fpr': thresholds[fpr <= 0.30][-1], 'th_40%fpr': thresholds[fpr <= 0.40][-1], 'th_50%fpr': thresholds[fpr <= 0.50][-1]}
results_all = pd.DataFrame()
for model, v in tqdm(auc_dict.items(), total=len(auc_dict)):
  temp = pd.DataFrame({'Detector': extract_language(model).split('_')[0], 'Train Language': extract_language(model).split('_')[1], 'Language': '{all}', 'AUC ROC': v['auc']}, index=[0])
  results_all = pd.concat([results_all, temp])
  for test_language,val in v.items():
    if (test_language == 'auc') or ('_' in test_language): continue
    temp = pd.DataFrame({'Detector': extract_language(model).split('_')[0], 'Train Language': extract_language(model).split('_')[1], 'Language': test_language, 'AUC ROC': val['auc']}, index=[0])
    results_all = pd.concat([results_all, temp])
#results_all.set_index(['Detector', 'Language']).unstack().sort_values(by=('AUC ROC','{all}'), ascending=False).style.format(precision=2).apply(lambda x: ["background-color: lightyellow" if v >= 0.6 else "" for v in x], axis = 0, subset='AUC ROC').highlight_max(props='font-weight: bold;', axis=0)
results_all['Detector'] = results_all['Detector'].apply(rename_detector)
results_language_family = results_all.copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['source'] = temp['source'].str.replace('multisocial_', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['source'] = temp['source'].str.replace('multisocial_', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['source'] = temp['source'].str.replace('multisocial_', '')
A value is tr

In [70]:
results_all = results_language_family

In [71]:
temp = pd.DataFrame(multidomain[(multidomain.domain=='social_media') & multidomain.source.str.contains('telegram')][['language']].value_counts())
temp = temp.reset_index()
temp['language'] = temp['language'].map(to_language_family)
temp = temp.groupby(['language']).sum().unstack().fillna(0).astype(int) > 2000 #if there are enough samples (~ 250 of each generator)
temp.index = temp.index.droplevel(0)
temp['{all}'] = True
enough_platform_language_samples = temp

#temp = results_all.set_index(['Detector', 'Train Language', 'Language']).unstack()
temp = results_all[~results_all['Train Language'].str.contains('x3')].set_index(['Detector', 'Train Language', 'Language']).unstack()
temp[[('AUC ROC', x) for x in enough_platform_language_samples[~enough_platform_language_samples].index.tolist()]] = 'nan'
temp = temp.style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None)
display(temp)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A').replace('nan', 'N/A').replace('\\multirow[c]{4}{*}{', '\\multirow[c]{4}{*}{\\rotatebox{90}{\\parbox{1.8cm}{').replace('} &', '}}} &').replace('{en-es-ru}}}', '{en-es-ru}').replace('mDeBERTa-v3-base', 'mDeBER\-Ta-v3-base'))

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Unnamed: 0_level_1,Language,Celtic,Germanic,Greek,Romance,Semitic,Sino-Tibetan,Slavic-Cyrillic,Slavic-Latin,Uralic,{all}
Detector,Train Language,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
Aya-101,en,,0.92,0.95,0.93,0.89,0.91,0.94,0.95,0.98,0.93
Aya-101,es,,0.93,0.93,0.94,0.91,0.91,0.94,0.95,0.97,0.94
Aya-101,ru,,0.92,0.94,0.93,0.92,0.91,0.96,0.95,0.97,0.94
Aya-101,{en-es-ru},,0.93,0.94,0.95,0.93,0.92,0.95,0.95,0.98,0.94
BLOOMZ-3b,en,,0.89,0.9,0.79,0.75,0.61,0.85,0.86,0.92,0.82
BLOOMZ-3b,es,,0.84,0.82,0.88,0.78,0.61,0.82,0.8,0.86,0.81
BLOOMZ-3b,ru,,0.78,0.76,0.62,0.69,0.64,0.86,0.77,0.81,0.72
BLOOMZ-3b,{en-es-ru},,0.88,0.92,0.88,0.87,0.88,0.88,0.85,0.89,0.86
Falcon-rw-1b,en,,0.87,0.87,0.87,0.74,0.8,0.76,0.86,0.91,0.83
Falcon-rw-1b,es,,0.85,0.91,0.9,0.78,0.8,0.77,0.85,0.89,0.83


\begin{tabular}{lllrrrrrrrrr}
 &  & \multicolumn{10}{r}{\bfseries AUC ROC} \\
 & Language & \bfseries Celtic & \bfseries Germanic & \bfseries Greek & \bfseries Romance & \bfseries Semitic & \bfseries Sino-Tibetan & \bfseries Slavic-Cyrillic & \bfseries Slavic-Latin & \bfseries Uralic & \bfseries {all} \\
Detector & Train Language &  &  &  &  &  &  &  &  &  &  \\
\hline
\multirow[c]{4}{*}{\rotatebox{90}{\parbox{1.8cm}{\bfseries Aya-101}}} & \bfseries en & N/A & {\cellcolor[HTML]{93B5D6}} \color[HTML]{000000} 0.92 & {\cellcolor[HTML]{88B1D4}} \color[HTML]{000000} 0.95 & {\cellcolor[HTML]{91B5D6}} \color[HTML]{000000} 0.93 & {\cellcolor[HTML]{9FBAD9}} \color[HTML]{000000} 0.89 & {\cellcolor[HTML]{97B7D7}} \color[HTML]{000000} 0.91 & {\cellcolor[HTML]{8EB3D5}} \color[HTML]{000000} 0.94 & {\cellcolor[HTML]{88B1D4}} \color[HTML]{000000} 0.95 & {\cellcolor[HTML]{7DACD1}} \color[HTML]{000000} 0.98 & {\cellcolor[HTML]{8EB3D5}} \color[HTML]{000000} 0.93 \\
\bfseries  & \bfseries es & N/A & {\cel

  print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A').replace('nan', 'N/A').replace('\\multirow[c]{4}{*}{', '\\multirow[c]{4}{*}{\\rotatebox{90}{\\parbox{1.8cm}{').replace('} &', '}}} &').replace('{en-es-ru}}}', '{en-es-ru}').replace('mDeBERTa-v3-base', 'mDeBER\-Ta-v3-base'))


In [72]:
temp = results_all.set_index(['Train Language', 'Detector', 'Language']).unstack().groupby('Train Language').mean().drop(index=['{en-es-ru-x3}'])
temp[[('AUC ROC', x) for x in enough_platform_language_samples[~enough_platform_language_samples].index.tolist()]] = 'nan'
temp = temp.style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None)
display(temp)
#print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{6}{*}{', '\\multirow[c]{6}{*}{\\rotatebox{90}{\\parbox{2cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))

Unnamed: 0_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Language,Celtic,Germanic,Greek,Romance,Semitic,Sino-Tibetan,Slavic-Cyrillic,Slavic-Latin,Uralic,{all}
Train Language,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
en,,0.89,0.9,0.87,0.81,0.73,0.86,0.9,0.94,0.87
es,,0.87,0.87,0.91,0.82,0.73,0.86,0.88,0.92,0.86
ru,,0.83,0.88,0.83,0.81,0.73,0.92,0.88,0.89,0.84
{en-es-ru},,0.91,0.93,0.92,0.89,0.86,0.92,0.92,0.95,0.91


In [73]:
#autoregressive decoder-only models
temp = results_all[results_all.Detector.isin(autoregressive)].set_index(['Train Language', 'Detector', 'Language']).unstack().groupby('Train Language').mean().drop(index=['{en-es-ru-x3}'])
temp[[('AUC ROC', x) for x in enough_platform_language_samples[~enough_platform_language_samples].index.tolist()]] = 'nan'
temp = temp.style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None)
display(temp)
#print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{6}{*}{', '\\multirow[c]{6}{*}{\\rotatebox{90}{\\parbox{2cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))

Unnamed: 0_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Language,Celtic,Germanic,Greek,Romance,Semitic,Sino-Tibetan,Slavic-Cyrillic,Slavic-Latin,Uralic,{all}
Train Language,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
en,,0.89,0.88,0.85,0.78,0.65,0.83,0.87,0.92,0.84
es,,0.85,0.84,0.9,0.78,0.64,0.82,0.83,0.89,0.82
ru,,0.79,0.84,0.76,0.73,0.61,0.89,0.82,0.84,0.77
{en-es-ru},,0.9,0.93,0.91,0.88,0.84,0.91,0.9,0.94,0.9


In [74]:
#not autoregressive models (seq2seq or masked)
temp = results_all[~results_all.Detector.isin(autoregressive)].set_index(['Train Language', 'Detector', 'Language']).unstack().groupby('Train Language').mean().drop(index=['{en-es-ru-x3}'])
temp[[('AUC ROC', x) for x in enough_platform_language_samples[~enough_platform_language_samples].index.tolist()]] = 'nan'
temp = temp.style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None)
display(temp)
#print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{6}{*}{', '\\multirow[c]{6}{*}{\\rotatebox{90}{\\parbox{2cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))

Unnamed: 0_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Language,Celtic,Germanic,Greek,Romance,Semitic,Sino-Tibetan,Slavic-Cyrillic,Slavic-Latin,Uralic,{all}
Train Language,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
en,,0.9,0.93,0.91,0.85,0.83,0.9,0.94,0.97,0.91
es,,0.9,0.92,0.93,0.88,0.86,0.91,0.94,0.97,0.92
ru,,0.89,0.94,0.91,0.91,0.9,0.95,0.95,0.97,0.93
{en-es-ru},,0.92,0.93,0.93,0.91,0.89,0.94,0.95,0.98,0.93


In [75]:
temp = results_all[~results_all['Train Language'].str.contains('x3')]
temp = temp[~temp.Language.isin(enough_platform_language_samples[~enough_platform_language_samples].index.tolist())]
temp = temp.set_index(['Detector', 'Train Language', 'Language']).unstack()
temp = temp.replace('nan', pd.NA).corr().reset_index().drop(columns=['level_0']).set_index('Language')
#temp[[('AUC ROC', x) for x in enough_platform_language_samples[~enough_platform_language_samples].index.tolist()]] = 'nan'
temp = temp.style.format(precision=2).background_gradient(cmap='Greens', vmin=-1, vmax=3, text_color_threshold=0, axis=None).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')
#temp = results_all.set_index(['Detector', 'Train Language', 'Language']).unstack().corr(numeric_only=True).reset_index().drop(columns=['level_0']).set_index('Language').style.format(precision=2).background_gradient(cmap='Greens', vmin=-1, vmax=3, text_color_threshold=0, axis=None).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')
display(temp)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True))

  temp = temp.replace('nan', pd.NA).corr().reset_index().drop(columns=['level_0']).set_index('Language')
  temp = temp.style.format(precision=2).background_gradient(cmap='Greens', vmin=-1, vmax=3, text_color_threshold=0, axis=None).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')


Unnamed: 0_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Language,Germanic,Greek,Romance,Semitic,Sino-Tibetan,Slavic-Cyrillic,Slavic-Latin,Uralic,{all}
Language,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
Germanic,1.0,0.73,0.76,0.81,0.62,0.39,0.79,0.86,0.86
Greek,0.73,1.0,0.62,0.73,0.75,0.56,0.81,0.7,0.81
Romance,0.76,0.62,1.0,0.77,0.54,0.29,0.72,0.79,0.85
Semitic,0.81,0.73,0.77,1.0,0.69,0.71,0.87,0.86,0.94
Sino-Tibetan,0.62,0.75,0.54,0.69,1.0,0.45,0.72,0.61,0.77
Slavic-Cyrillic,0.39,0.56,0.29,0.71,0.45,1.0,0.66,0.49,0.61
Slavic-Latin,0.79,0.81,0.72,0.87,0.72,0.66,1.0,0.94,0.95
Uralic,0.86,0.7,0.79,0.86,0.61,0.49,0.94,1.0,0.94
{all},0.86,0.81,0.85,0.94,0.77,0.61,0.95,0.94,1.0


\begin{tabular}{lrrrrrrrrr}
 & \multicolumn{9}{r}{\bfseries AUC ROC} \\
Language & \bfseries Germanic & \bfseries Greek & \bfseries Romance & \bfseries Semitic & \bfseries Sino-Tibetan & \bfseries Slavic-Cyrillic & \bfseries Slavic-Latin & \bfseries Uralic & \bfseries {all} \\
Language &  &  &  &  &  &  &  &  &  \\
\bfseries Germanic & {\cellcolor[HTML]{73C476}} \color[HTML]{000000} 1.00 & {\cellcolor[HTML]{8DD08A}} \color[HTML]{000000} 0.73 & {\cellcolor[HTML]{8ACE88}} \color[HTML]{000000} 0.76 & {\cellcolor[HTML]{86CC85}} \color[HTML]{000000} 0.81 & {\cellcolor[HTML]{97D492}} \color[HTML]{000000} 0.62 & {\cellcolor[HTML]{A9DCA3}} \color[HTML]{000000} 0.39 & {\cellcolor[HTML]{87CD86}} \color[HTML]{000000} 0.79 & {\cellcolor[HTML]{81CA81}} \color[HTML]{000000} 0.86 & {\cellcolor[HTML]{81CA81}} \color[HTML]{000000} 0.86 \\
\bfseries Greek & {\cellcolor[HTML]{8DD08A}} \color[HTML]{000000} 0.73 & {\cellcolor[HTML]{73C476}} \color[HTML]{000000} 1.00 & {\cellcolor[HTML]{97D492}} \color[HTML

  print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True))


In [76]:
#social media EN+ES test data - per platform

auc_dict = {}
for detector in tqdm(test_results, total=len(test_results)):
 for model,data in detector.items():
  temp_data = data[(data.split == 'test') & (data.domain == 'social_media') & (data.language.isin(['en', 'es']))]
 if ('cross-platform' not in model): continue
 temp_data['source'] = temp_data['source'].str.replace('multisocial_', '')
 temp_data['prediction_probs']= temp_data['prediction_probs'].astype(float)
 temp = temp_data
 llm = '{all}'
 labels = [label2id[x] for x in temp['label']]
 fpr, tpr, thresholds = roc_curve(labels, temp['prediction_probs'])
 auc_dict[f"{model} {llm}"] = {'auc':  auc_roc_reliable(fpr, tpr, labels), 'th_optim': thresholds[np.argmax(tpr - fpr)], 'tpr_1%fpr': tpr[fpr <= 0.01][-1], 'tpr_5%fpr': tpr[fpr <= 0.05][-1], 'th_1%fpr': thresholds[fpr <= 0.01][-1], 'th_3%fpr': thresholds[fpr <= 0.03][-1], 'th_5%fpr': thresholds[fpr <= 0.05][-1], 'th_10%fpr': thresholds[fpr <= 0.10][-1], 'th_15%fpr': thresholds[fpr <= 0.15][-1], 'th_20%fpr': thresholds[fpr <= 0.20][-1], 'th_25%fpr': thresholds[fpr <= 0.25][-1], 'th_30%fpr': thresholds[fpr <= 0.30][-1], 'th_40%fpr': thresholds[fpr <= 0.40][-1], 'th_50%fpr': thresholds[fpr <= 0.50][-1]}
 for llm in tqdm(temp_data.source.unique(), total=len(temp_data.multi_label.unique())):
  temp = temp_data[(temp_data.source.str.contains(f'{llm}'))]
  #if (temp['prediction_probs'].min() >= 0.5) and (temp['prediction_probs'].max() <= 1.0):
    #temp.loc[temp.predictions == 'human', 'prediction_probs'] = 1 - temp['prediction_probs'] #adjust human prediction prob to be in range <0, 0.5)
  labels = [label2id[x] for x in temp['label']]
  fpr, tpr, thresholds = roc_curve(labels, temp['prediction_probs'])
  auc_dict[f"{model} {llm}"] = {'auc':  auc_roc_reliable(fpr, tpr, labels), 'th_optim': thresholds[np.argmax(tpr - fpr)], 'tpr_1%fpr': tpr[fpr <= 0.01][-1], 'tpr_5%fpr': tpr[fpr <= 0.05][-1], 'th_1%fpr': thresholds[fpr <= 0.01][-1], 'th_3%fpr': thresholds[fpr <= 0.03][-1], 'th_5%fpr': thresholds[fpr <= 0.05][-1], 'th_10%fpr': thresholds[fpr <= 0.10][-1], 'th_15%fpr': thresholds[fpr <= 0.15][-1], 'th_20%fpr': thresholds[fpr <= 0.20][-1], 'th_25%fpr': thresholds[fpr <= 0.25][-1], 'th_30%fpr': thresholds[fpr <= 0.30][-1], 'th_40%fpr': thresholds[fpr <= 0.40][-1], 'th_50%fpr': thresholds[fpr <= 0.50][-1]}
results_all = pd.DataFrame()
for model, v in tqdm(auc_dict.items(), total=len(auc_dict)):
  temp = pd.DataFrame({'Detector': extract_platform(model.split(' ')[0]).split('_')[0], 'Train Platform': extract_platform(model.split(' ')[0]).split('_')[1], 'Platform': model.split(' ')[1], 'AUC ROC': v['auc']}, index=[0])
  results_all = pd.concat([results_all, temp])
#results_all.set_index(['Detector', 'Platform', 'Language']).unstack().style.format(precision=2).highlight_max(props='font-weight: bold;', axis=0).background_gradient(vmin=0.5, axis=None)
#results_all['Category'] = results_all['Detector'].map(to_category)
results_all['Detector'] = results_all['Detector'].apply(rename_detector)
results_all['Platform'] = results_all['Platform'].map(rename_platforms)
results_platform = results_all.copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_data['source'] = temp_data['source'].str.replace('multisocial_', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_data['prediction_probs']= temp_data['prediction_probs'].astype(float)

 62%|████████████████████████████████████████████████████▌                               | 5/8 [00:00<00:00, 121.58it/s][A
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-d

In [77]:
#temp = results_all.set_index(['Detector', 'Train Platform', 'Platform']).unstack().style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None)
temp = results_all[~results_all['Train Platform'].str.contains('x5')].set_index(['Detector', 'Train Platform', 'Platform']).unstack().style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None)
display(temp)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{6}{*}{', '\\multirow[c]{6}{*}{\\rotatebox{90}{\\parbox{2cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))

Unnamed: 0_level_0,Unnamed: 1_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Unnamed: 0_level_1,Platform,Discord,Gab,Telegram,Twitter,WhatsApp,{all}
Detector,Train Platform,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Llama-3-8b,Discord,0.99,0.83,0.82,0.72,0.86,0.83
Llama-3-8b,Gab,0.95,0.96,0.93,0.97,0.9,0.94
Llama-3-8b,Telegram,0.98,0.94,0.97,0.97,0.96,0.97
Llama-3-8b,Twitter,0.96,0.9,0.92,0.98,0.91,0.93
Llama-3-8b,WhatsApp,0.97,0.89,0.93,0.93,0.98,0.94
Llama-3-8b,all,0.98,0.94,0.95,0.97,0.95,0.96
Mistral-7b-v0.1,Discord,0.98,0.82,0.88,0.86,0.88,0.88
Mistral-7b-v0.1,Gab,0.95,0.95,0.93,0.96,0.92,0.94
Mistral-7b-v0.1,Telegram,0.98,0.94,0.97,0.97,0.95,0.96
Mistral-7b-v0.1,Twitter,0.96,0.9,0.9,0.98,0.91,0.93


\begin{tabular}{llrrrrrr}
 &  & \multicolumn{6}{r}{\bfseries AUC ROC} \\
 & Platform & \bfseries Discord & \bfseries Gab & \bfseries Telegram & \bfseries Twitter & \bfseries WhatsApp & \bfseries {all} \\
Detector & Train Platform &  &  &  &  &  &  \\
\hline
\multirow[c]{6}{*}{\rotatebox{90}{\parbox{2cm}{\bfseries Llama-3-8b}}} & \bfseries Discord & {\cellcolor[HTML]{79ABD0}} \color[HTML]{000000} 0.99 & {\cellcolor[HTML]{B7C5DF}} \color[HTML]{000000} 0.83 & {\cellcolor[HTML]{B7C5DF}} \color[HTML]{000000} 0.82 & {\cellcolor[HTML]{D8D7E9}} \color[HTML]{000000} 0.72 & {\cellcolor[HTML]{ACC0DD}} \color[HTML]{000000} 0.86 & {\cellcolor[HTML]{B4C4DF}} \color[HTML]{000000} 0.83 \\
\bfseries  & \bfseries Gab & {\cellcolor[HTML]{86B0D3}} \color[HTML]{000000} 0.95 & {\cellcolor[HTML]{86B0D3}} \color[HTML]{000000} 0.96 & {\cellcolor[HTML]{8EB3D5}} \color[HTML]{000000} 0.93 & {\cellcolor[HTML]{81AED2}} \color[HTML]{000000} 0.97 & {\cellcolor[HTML]{9CB9D9}} \color[HTML]{000000} 0.90 & {\cellcolor[HT

  print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('\\multirow[c]{6}{*}{', '\\multirow[c]{6}{*}{\\rotatebox{90}{\\parbox{2cm}{').replace('} &', '}}} &').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A'))


In [78]:
temp = results_all[results_all['Train Platform'] != 'all'].set_index(['Detector', 'Train Platform', 'Platform']).unstack()[[('AUC ROC', x) for x in ['Discord', 'WhatsApp', 'Telegram', 'Gab', 'Twitter']]].corr().reset_index().drop(columns=['level_0']).set_index('Platform').style.format(precision=2).background_gradient(cmap='Greens', vmin=-1, vmax=3, text_color_threshold=0, axis=None).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')
display(temp)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True))

  temp = results_all[results_all['Train Platform'] != 'all'].set_index(['Detector', 'Train Platform', 'Platform']).unstack()[[('AUC ROC', x) for x in ['Discord', 'WhatsApp', 'Telegram', 'Gab', 'Twitter']]].corr().reset_index().drop(columns=['level_0']).set_index('Platform').style.format(precision=2).background_gradient(cmap='Greens', vmin=-1, vmax=3, text_color_threshold=0, axis=None).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')
  temp = results_all[results_all['Train Platform'] != 'all'].set_index(['Detector', 'Train Platform', 'Platform']).unstack()[[('AUC ROC', x) for x in ['Discord', 'WhatsApp', 'Telegram', 'Gab', 'Twitter']]].corr().reset_index().drop(columns=['level_0']).set_index('Platform').style.format(precision=2).background_gradient(cmap='Greens', vmin=-1, vmax=3, text_color_threshold=0, axis=None).applymap(lambda x: 'background-color: white;' if str(x)=='nan' else '')


Unnamed: 0_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Platform,Discord,WhatsApp,Telegram,Gab,Twitter
Platform,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Discord,1.0,0.19,0.07,-0.14,-0.34
WhatsApp,0.19,1.0,0.8,0.64,0.58
Telegram,0.07,0.8,1.0,0.87,0.81
Gab,-0.14,0.64,0.87,1.0,0.81
Twitter,-0.34,0.58,0.81,0.81,1.0


\begin{tabular}{lrrrrr}
 & \multicolumn{5}{r}{\bfseries AUC ROC} \\
Platform & \bfseries Discord & \bfseries WhatsApp & \bfseries Telegram & \bfseries Gab & \bfseries Twitter \\
Platform &  &  &  &  &  \\
\bfseries Discord & {\cellcolor[HTML]{73C476}} \color[HTML]{000000} 1.00 & {\cellcolor[HTML]{B8E3B2}} \color[HTML]{000000} 0.19 & {\cellcolor[HTML]{C2E7BB}} \color[HTML]{000000} 0.07 & {\cellcolor[HTML]{D0EDCA}} \color[HTML]{000000} -0.14 & {\cellcolor[HTML]{DCF2D7}} \color[HTML]{000000} -0.34 \\
\bfseries WhatsApp & {\cellcolor[HTML]{B8E3B2}} \color[HTML]{000000} 0.19 & {\cellcolor[HTML]{73C476}} \color[HTML]{000000} 1.00 & {\cellcolor[HTML]{86CC85}} \color[HTML]{000000} 0.80 & {\cellcolor[HTML]{95D391}} \color[HTML]{000000} 0.64 & {\cellcolor[HTML]{99D595}} \color[HTML]{000000} 0.58 \\
\bfseries Telegram & {\cellcolor[HTML]{C2E7BB}} \color[HTML]{000000} 0.07 & {\cellcolor[HTML]{86CC85}} \color[HTML]{000000} 0.80 & {\cellcolor[HTML]{73C476}} \color[HTML]{000000} 1.00 & {\cellcolor[HT

  print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True))


### Statistics

In [79]:
#T-test for all combinations of train languages
temp = results_language.copy()
temp = pd.DataFrame(multidomain[(multidomain.domain=='social_media') & multidomain.source.str.contains('telegram')][['language']].value_counts())
temp = temp.reset_index()
temp = temp.groupby(['language']).sum().unstack().fillna(0).astype(int) > 2000 #if there are enough samples (~ 250 of each generator)
temp.index = temp.index.droplevel(0)
temp['{all}'] = True
enough_platform_language_samples = temp

temp = results_language.copy()
temp = temp[~temp.Language.isin(['ga', 'gd', 'sk', 'sl'])]#exclude languages with not enough samples
temp = temp[~temp['Train Language'].isin(['{en-es-ru-x3}'])]#exclude training with 3x more samples
languages = temp.Language.unique()
temp = temp.pivot(index=['Train Language', 'Detector'], columns='Language', values='AUC ROC').reset_index()

res_df = pd.DataFrame()
for lang in languages:#[languages != '{all}']:
 for (src, trg) in itertools.combinations_with_replacement(temp['Train Language'].unique(), 2):
   if src == trg: continue
   try:
    res = pg.ttest(temp[(temp["Train Language"] == trg)][lang], temp[(temp["Train Language"] == src)][lang], paired=True)
    #if (res['p-val'][0] >= 0.05): continue
    print(f"\nTest language: {lang}, ({trg}, {src})")
    display(res.style.apply(lambda _: np.where(res['p-val'] >= 0.05, 'background-color: yellow', '')))

    res = stats.ttest_rel(temp[temp["Train Language"] == trg][lang], temp[temp["Train Language"] == src][lang])
    res_df = pd.concat([res_df, pd.DataFrame({'Test Language':lang, 'Train Language':trg, '0.95 Confidence Intrval Min':stats.bayes_mvs(temp[temp["Train Language"] == trg][lang], alpha=0.95)[0].minmax[0], '0.95 Confidence Intrval Max':stats.bayes_mvs(temp[temp["Train Language"] == trg][lang], alpha=0.95)[0].minmax[1]}, index=[0])], ignore_index=True)
    res_df = pd.concat([res_df, pd.DataFrame({'Test Language':lang, 'Train Language':src, '0.95 Confidence Intrval Min':stats.bayes_mvs(temp[temp["Train Language"] == src][lang], alpha=0.95)[0].minmax[0], '0.95 Confidence Intrval Max':stats.bayes_mvs(temp[temp["Train Language"] == src][lang], alpha=0.95)[0].minmax[1]}, index=[0])], ignore_index=True)
   except:
    pass
res_df = res_df.drop_duplicates().sort_values(by=['Test Language', 'Train Language']).reset_index(drop=True)

temp_mean = temp.groupby('Train Language').mean(numeric_only=True)
#sort_key = {'en': 0, 'es': 1, 'ru': 2, 'all': 3, 'en3': 4}
#temp_mean = temp_mean.sort_index(key=lambda x: x.map(sort_key)).loc[:'all',:]
res_df['ci'] = (res_df['0.95 Confidence Intrval Max'] - res_df['0.95 Confidence Intrval Min']) / 2
res_df.drop(columns=['0.95 Confidence Intrval Max', '0.95 Confidence Intrval Min'], inplace=True)
temp_ci = res_df.set_index(['Train Language', 'Test Language']).stack().unstack(level=1).reset_index().set_index('Train Language').drop(columns=['level_1'])#.sort_index(key=lambda x: x.map(sort_key))
temp_mean_bak = temp_mean.copy()

#bg_text_color_threshold = 0.1
temp_mean = temp_mean_bak.copy()
temp = temp_mean.copy()
for col in temp_mean.columns:
  temp[col] = [f"{str('%.2f' % (x))} (±{str('%.2f' % (y))})" if y!='n.s.' else f"{str('%.2f' % (x))} (n.s.)" for x,y in zip(temp_mean[col], temp_ci[col])]

#temp = pd.concat([temp, pd.DataFrame({'Language Average': temp_mean.mean(axis=1)})], axis=1)
#temp_mean = pd.concat([temp_mean, pd.DataFrame({'Language Average': temp_mean.mean(axis=1)})], axis=1)

#temp_mean.rename(index=rename_obfuscators, inplace=True)
#temp.rename(index=rename_obfuscators, inplace=True)

temp[[x for x in enough_platform_language_samples[~enough_platform_language_samples].index.tolist()]] = 'nan'
temp_mean[[x for x in enough_platform_language_samples[~enough_platform_language_samples].index.tolist()]] = 'nan'
#temp = temp.style.format(precision=2).background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, axis=None)
#print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A').replace('nan', 'N/A').replace('\\multirow[c]{4}{*}{', '\\multirow[c]{4}{*}{\\rotatebox{90}{\\parbox{1.8cm}{').replace('} &', '}}} &').replace('{en-es-ru}}}', '{en-es-ru}').replace('mDeBERTa-v3-base', 'mDeBER\-Ta-v3-base'))
temp = temp.sort_values(by=['Language'], axis=1)
temp = temp.style.background_gradient(vmin=0.5, vmax=1.5, text_color_threshold=0, gmap=temp_mean, axis=None).format({'Language Average': '{:.2%}'.format}, precision=2)
display(temp)
print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A').replace('nan', 'N/A').replace('\\bfseries {en-es-ru}', '\hline\n\\bfseries {en-es-ru}').replace('\\color[HTML]{000000} ', '\\textcolor{black}{').replace(') & {\\cellcolor', ')} & {\\cellcolor').replace(') & N/A', ')} & N/A').replace(' \\', '} \\').replace('&}', '&').replace('}}}', '}}'))


Test language: {all}, (es, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.638733,6,two-sided,0.54659,[-0.02 0.01],0.086229,0.417,0.054333



Test language: {all}, (ru, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.325776,6,two-sided,0.233153,[-0.08 0.02],0.393308,0.675,0.142849



Test language: {all}, ({en-es-ru}, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,4.697472,6,two-sided,0.003335,[0.02 0.07],1.172777,15.796,0.734828



Test language: {all}, (ru, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.446031,6,two-sided,0.198307,[-0.06 0.02],0.319832,0.75,0.110912



Test language: {all}, ({en-es-ru}, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.48865,6,two-sided,0.013005,[0.01 0.08],1.131194,5.485,0.704708



Test language: {all}, ({en-es-ru}, ru)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.798478,6,two-sided,0.031227,[0.01 0.13],1.090314,2.818,0.673595



Test language: cs, (es, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.634649,6,two-sided,0.15324,[-0.06 0.01],0.403772,0.892,0.147957



Test language: cs, (ru, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.748646,6,two-sided,0.130936,[-0.09 0.01],0.524619,0.994,0.216777



Test language: cs, ({en-es-ru}, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.737926,6,two-sided,0.132891,[-0.01 0.06],0.567008,0.983,0.244983



Test language: cs, (ru, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.867736,6,two-sided,0.418887,[-0.05 0.02],0.163046,0.477,0.065573



Test language: cs, ({en-es-ru}, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.551696,6,two-sided,0.043391,[0. 0.09],0.884182,2.204,0.501279



Test language: cs, ({en-es-ru}, ru)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.575619,6,two-sided,0.042016,[0. 0.12],0.92457,2.257,0.536261



Test language: pl, (es, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.133004,6,two-sided,0.300448,[-0.04 0.01],0.216914,0.577,0.077703



Test language: pl, (ru, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.961578,6,two-sided,0.373395,[-0.06 0.03],0.267915,0.508,0.092492



Test language: pl, ({en-es-ru}, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.950668,6,two-sided,0.098967,[-0.01 0.04],0.439209,1.209,0.16628



Test language: pl, (ru, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.433376,6,two-sided,0.679889,[-0.04 0.02],0.072476,0.382,0.053059



Test language: pl, ({en-es-ru}, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.86221,6,two-sided,0.028718,[0. 0.06],0.59047,3.001,0.261426



Test language: pl, ({en-es-ru}, ru)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.249436,6,two-sided,0.065492,[-0. 0.08],0.587794,1.628,0.259522



Test language: ar, (es, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.124668,6,two-sided,0.303699,[-0.02 0.05],0.265506,0.573,0.091721



Test language: ar, (ru, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.148864,6,two-sided,0.886538,[-0.07 0.06],0.048266,0.357,0.051356



Test language: ar, ({en-es-ru}, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,5.967275,6,two-sided,0.000993,[0.05 0.11],1.678484,41.204,0.955817



Test language: ar, (ru, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.843288,6,two-sided,0.431396,[-0.07 0.04],0.225316,0.469,0.079916



Test language: ar, ({en-es-ru}, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.611917,6,two-sided,0.011205,[0.02 0.11],1.377878,6.151,0.856942



Test language: ar, ({en-es-ru}, ru)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.587645,6,two-sided,0.041342,[0. 0.16],1.068175,2.285,0.656184



Test language: es, (es, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.194778,6,two-sided,0.018723,[0.02 0.12],1.477617,4.15,0.899547



Test language: es, (ru, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.898094,6,two-sided,0.106451,[-0.11 0.01],0.501829,1.148,0.202451



Test language: es, ({en-es-ru}, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.803569,6,two-sided,0.031018,[0.01 0.11],1.304457,2.832,0.818555



Test language: es, (ru, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.60575,6,two-sided,0.04035,[-0.22 -0.01],1.33363,2.326,0.834533



Test language: es, ({en-es-ru}, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.378571,6,two-sided,0.217221,[-0.02 0. ],0.326524,0.707,0.113536



Test language: es, ({en-es-ru}, ru)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.385373,6,two-sided,0.054368,[-0. 0.22],1.253982,1.866,0.788668



Test language: uk, (es, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.270602,6,two-sided,0.250912,[-0.01 0.03],0.150998,0.644,0.063344



Test language: uk, (ru, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,4.417097,6,two-sided,0.004485,[0.03 0.12],1.574611,12.523,0.931181



Test language: uk, ({en-es-ru}, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,4.302046,6,two-sided,0.005081,[0.03 0.12],1.654814,11.359,0.950952



Test language: uk, (ru, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,4.73656,6,two-sided,0.003203,[0.03 0.1 ],1.362947,16.305,0.849625



Test language: uk, ({en-es-ru}, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.990462,6,two-sided,0.007197,[0.03 0.11],1.430763,8.663,0.88087



Test language: uk, ({en-es-ru}, ru)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.066785,6,two-sided,0.948923,[-0.02 0.02],0.015806,0.354,0.050145



Test language: nl, (es, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.306496,6,two-sided,0.769584,[-0.03 0.02],0.087386,0.367,0.054451



Test language: nl, (ru, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.590386,6,two-sided,0.041191,[-0.09 -0. ],0.867628,2.291,0.486906



Test language: nl, ({en-es-ru}, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.650181,6,two-sided,0.149998,[-0.01 0.05],0.612285,0.905,0.277212



Test language: nl, (ru, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-3.15916,6,two-sided,0.019586,[-0.08 -0.01],0.766069,4.011,0.399597



Test language: nl, ({en-es-ru}, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.218295,6,two-sided,0.068358,[-0. 0.05],0.606972,1.578,0.273325



Test language: nl, ({en-es-ru}, ru)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.420671,6,two-sided,0.014132,[0.02 0.11],1.177693,5.146,0.73828



Test language: el, (es, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.66985,6,two-sided,0.145988,[-0.08 0.02],0.599938,0.922,0.26822



Test language: el, (ru, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.123603,6,two-sided,0.304117,[-0.07 0.03],0.448516,0.572,0.171353



Test language: el, ({en-es-ru}, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.915022,6,two-sided,0.103981,[-0.01 0.06],0.931575,1.168,0.542302



Test language: el, (ru, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.465618,6,two-sided,0.65791,[-0.04 0.06],0.146491,0.387,0.062555



Test language: el, ({en-es-ru}, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.643051,6,two-sided,0.038384,[0. 0.11],1.163117,2.415,0.727977



Test language: el, ({en-es-ru}, ru)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.251856,6,two-sided,0.065274,[-0. 0.1],1.044268,1.632,0.636988



Test language: ca, (es, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,5.490707,6,two-sided,0.001528,[0.04 0.09],1.829579,29.243,0.978402



Test language: ca, (ru, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.676078,6,two-sided,0.524181,[-0.11 0.06],0.285897,0.426,0.098486



Test language: ca, ({en-es-ru}, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,7.193571,6,two-sided,0.000365,[0.05 0.11],2.347866,91.774,0.999031



Test language: ca, (ru, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.304246,6,two-sided,0.060746,[-0.18 0.01],1.067782,1.72,0.655872



Test language: ca, ({en-es-ru}, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.235723,6,two-sided,0.066738,[-0. 0.04],0.495674,1.606,0.198686



Test language: ca, ({en-es-ru}, ru)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.679957,6,two-sided,0.036538,[0.01 0.2 ],1.272657,2.505,0.800054



Test language: ro, (es, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.514698,6,two-sided,0.180624,[-0.03 0.01],0.343083,0.798,0.120274



Test language: ro, (ru, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.422676,6,two-sided,0.051675,[-0.12 0. ],0.989443,1.937,0.59164



Test language: ro, ({en-es-ru}, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.192617,6,two-sided,0.27804,[-0.01 0.02],0.310511,0.604,0.107353



Test language: ro, (ru, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.611057,6,two-sided,0.040064,[-0.09 -0. ],0.785325,2.339,0.41593



Test language: ro, ({en-es-ru}, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.02001,6,two-sided,0.023397,[0. 0.03],0.57749,3.504,0.252258



Test language: ro, ({en-es-ru}, ru)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.845412,6,two-sided,0.029358,[0.01 0.12],1.103709,2.952,0.683943



Test language: pt, (es, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.833934,6,two-sided,0.436253,[-0.01 0.02],0.21948,0.466,0.07837



Test language: pt, (ru, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.596372,6,two-sided,0.040861,[-0.11 -0. ],1.066586,2.305,0.654921



Test language: pt, ({en-es-ru}, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.888329,6,two-sided,0.027754,[0. 0.03],0.85145,3.08,0.472867



Test language: pt, (ru, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.926758,6,two-sided,0.026398,[-0.12 -0.01],1.191524,3.198,0.747863



Test language: pt, ({en-es-ru}, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.723572,6,two-sided,0.034479,[0. 0.02],0.793335,2.616,0.422765



Test language: pt, ({en-es-ru}, ru)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.16211,6,two-sided,0.019513,[0.02 0.14],1.450872,4.022,0.88917



Test language: et, (es, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.930665,6,two-sided,0.10175,[-0.04 0.01],0.463802,1.186,0.179917



Test language: et, (ru, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.613717,6,two-sided,0.157716,[-0.1 0.02],0.625399,0.874,0.286923



Test language: et, ({en-es-ru}, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.942527,6,two-sided,0.100091,[-0. 0.03],0.405125,1.2,0.148628



Test language: et, (ru, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.342782,6,two-sided,0.22791,[-0.06 0.02],0.301361,0.685,0.103967



Test language: et, ({en-es-ru}, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.112408,6,two-sided,0.020785,[0.01 0.06],0.779129,3.833,0.410658



Test language: et, ({en-es-ru}, ru)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.1228,6,two-sided,0.077981,[-0.01 0.12],0.835453,1.434,0.459009



Test language: de, (es, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.316992,6,two-sided,0.059694,[-0. 0.03],0.510425,1.742,0.207784



Test language: de, (ru, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.576962,6,two-sided,0.04194,[-0.07 -0. ],0.975584,2.26,0.579936



Test language: de, ({en-es-ru}, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,5.30698,6,two-sided,0.001818,[0.02 0.04],1.063759,25.487,0.652669



Test language: de, (ru, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-4.26815,6,two-sided,0.005274,[-0.08 -0.02],1.281825,11.034,0.805503



Test language: de, ({en-es-ru}, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.040436,6,two-sided,0.087392,[-0. 0.03],0.502872,1.321,0.203094



Test language: de, ({en-es-ru}, ru)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,4.059291,6,two-sided,0.006656,[0.03 0.11],1.668534,9.205,0.953822



Test language: ru, (es, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.670081,6,two-sided,0.145941,[-0.01 0.03],0.257387,0.922,0.089173



Test language: ru, (ru, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,4.08481,6,two-sided,0.006467,[0.03 0.11],1.67438,9.413,0.955003



Test language: ru, ({en-es-ru}, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.567262,6,two-sided,0.011823,[0.02 0.1 ],1.483724,5.902,0.901811



Test language: ru, (ru, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,4.147974,6,two-sided,0.006025,[0.02 0.09],1.513362,9.947,0.912268



Test language: ru, ({en-es-ru}, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.156975,6,two-sided,0.01964,[0.01 0.09],1.30476,4.002,0.818726



Test language: ru, ({en-es-ru}, ru)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.218786,6,two-sided,0.268666,[-0.02 0.01],0.192282,0.617,0.071716



Test language: bg, (es, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.107658,6,two-sided,0.310427,[-0.03 0.01],0.1204,0.565,0.058465



Test language: bg, (ru, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.361211,6,two-sided,0.222348,[-0.02 0.09],0.436322,0.696,0.164728



Test language: bg, ({en-es-ru}, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.097586,6,two-sided,0.080747,[-0.01 0.09],0.643749,1.399,0.300775



Test language: bg, (ru, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.220355,6,two-sided,0.068165,[-0. 0.09],0.566714,1.581,0.244781



Test language: bg, ({en-es-ru}, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.770331,6,two-sided,0.032408,[0.01 0.1 ],0.775228,2.741,0.407346



Test language: bg, ({en-es-ru}, ru)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.672547,6,two-sided,0.145446,[-0.01 0.03],0.243982,0.924,0.085148



Test language: zh, (es, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.34536,6,two-sided,0.74161,[-0.04 0.06],0.045843,0.371,0.051223



Test language: zh, (ru, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.126687,6,two-sided,0.903327,[-0.07 0.08],0.024603,0.356,0.050352



Test language: zh, ({en-es-ru}, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.782827,6,two-sided,0.00915,[0.05 0.22],1.110192,7.192,0.6889



Test language: zh, (ru, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.144227,6,two-sided,0.890043,[-0.05 0.04],0.016495,0.356,0.050158



Test language: zh, ({en-es-ru}, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.891516,6,two-sided,0.027639,[0.02 0.24],1.054612,3.089,0.645342



Test language: zh, ({en-es-ru}, ru)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.382155,6,two-sided,0.054607,[-0. 0.26],0.920158,1.86,0.532452



Test language: hu, (es, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.53092,6,two-sided,0.176668,[-0.04 0.01],0.478442,0.81,0.188386



Test language: hu, (ru, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.31947,6,two-sided,0.059492,[-0.11 0. ],0.944092,1.746,0.553066



Test language: hu, ({en-es-ru}, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.829921,6,two-sided,0.43835,[-0.02 0.03],0.310358,0.465,0.107296



Test language: hu, (ru, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.386895,6,two-sided,0.054255,[-0.08 0. ],0.622252,1.868,0.284578



Test language: hu, ({en-es-ru}, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.250023,6,two-sided,0.017466,[0.01 0.04],0.731369,4.376,0.370598



Test language: hu, ({en-es-ru}, ru)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.715377,6,two-sided,0.034856,[0.01 0.12],1.093902,2.595,0.676381



Test language: hr, (es, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.769812,6,two-sided,0.127157,[-0.05 0.01],0.54146,1.014,0.227745



Test language: hr, (ru, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.059619,6,two-sided,0.085101,[-0.1 0.01],0.812773,1.347,0.439437



Test language: hr, ({en-es-ru}, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.306849,6,two-sided,0.769328,[-0.02 0.03],0.087331,0.367,0.054445



Test language: hr, (ru, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-1.968519,6,two-sided,0.096548,[-0.05 0.01],0.36009,1.231,0.127558



Test language: hr, ({en-es-ru}, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.590828,6,two-sided,0.041166,[0. 0.05],0.540801,2.292,0.22731



Test language: hr, ({en-es-ru}, ru)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.470639,6,two-sided,0.048416,[0. 0.1],0.804265,2.032,0.432126



Test language: en, (es, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-3.901944,6,two-sided,0.007966,[-0.1 -0.02],1.861041,8.006,0.981591



Test language: en, (ru, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-4.070857,6,two-sided,0.006569,[-0.14 -0.03],1.998118,9.299,0.991209



Test language: en, ({en-es-ru}, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.394541,6,two-sided,0.053693,[-0.01 0. ],0.69106,1.883,0.337786



Test language: en, (ru, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.191157,6,two-sided,0.070962,[-0.06 0. ],0.515779,1.536,0.211149



Test language: en, ({en-es-ru}, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.401772,6,two-sided,0.014464,[0.01 0.09],1.674619,5.055,0.955051



Test language: en, ({en-es-ru}, ru)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.637307,6,two-sided,0.01087,[0.03 0.13],1.862422,6.297,0.981721


Language,ar,bg,ca,cs,de,el,en,es,et,ga,gd,hr,hu,nl,pl,pt,ro,ru,sk,sl,uk,zh,{all}
Train Language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
en,0.81 (±0.05),0.90 (±0.08),0.78 (±0.03),0.91 (±0.05),0.88 (±0.02),0.90 (±0.03),0.96 (±0.01),0.87 (±0.06),0.92 (±0.03),,,0.94 (±0.03),0.97 (±0.03),0.84 (±0.02),0.89 (±0.05),0.92 (±0.02),0.94 (±0.02),0.87 (±0.05),,,0.81 (±0.05),0.73 (±0.14),0.87 (±0.04)
es,0.82 (±0.05),0.89 (±0.08),0.85 (±0.03),0.89 (±0.06),0.89 (±0.03),0.87 (±0.06),0.90 (±0.04),0.94 (±0.01),0.90 (±0.05),,,0.92 (±0.05),0.95 (±0.04),0.84 (±0.03),0.88 (±0.06),0.93 (±0.02),0.93 (±0.03),0.88 (±0.04),,,0.82 (±0.05),0.73 (±0.14),0.86 (±0.05)
ru,0.81 (±0.10),0.93 (±0.05),0.76 (±0.10),0.87 (±0.08),0.84 (±0.04),0.88 (±0.06),0.87 (±0.06),0.82 (±0.11),0.88 (±0.08),,,0.89 (±0.07),0.91 (±0.07),0.79 (±0.06),0.87 (±0.07),0.86 (±0.07),0.88 (±0.07),0.94 (±0.02),,,0.89 (±0.04),0.73 (±0.17),0.84 (±0.08)
{en-es-ru},0.89 (±0.03),0.94 (±0.04),0.86 (±0.03),0.93 (±0.03),0.91 (±0.03),0.93 (±0.02),0.95 (±0.01),0.93 (±0.02),0.93 (±0.03),,,0.94 (±0.04),0.97 (±0.02),0.86 (±0.03),0.91 (±0.04),0.94 (±0.01),0.95 (±0.02),0.93 (±0.03),,,0.89 (±0.03),0.86 (±0.08),0.91 (±0.03)


\begin{tabular}{llllllllllllllllllllllll}
Language & \bfseries ar & \bfseries bg & \bfseries ca & \bfseries cs & \bfseries de & \bfseries el & \bfseries en & \bfseries es & \bfseries et & \bfseries ga & \bfseries gd & \bfseries hr & \bfseries hu & \bfseries nl & \bfseries pl & \bfseries pt & \bfseries ro & \bfseries ru & \bfseries sk & \bfseries sl & \bfseries uk & \bfseries zh & \bfseries {all}} \\
Train Language &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  &  & } \\
\bfseries en & {\cellcolor[HTML]{BCC7E1}} \textcolor{black}{0.81 (±0.05)} & {\cellcolor[HTML]{9CB9D9}} \textcolor{black}{0.90 (±0.08)} & {\cellcolor[HTML]{C6CCE3}} \textcolor{black}{0.78 (±0.03)} & {\cellcolor[HTML]{99B8D8}} \textcolor{black}{0.91 (±0.05)} & {\cellcolor[HTML]{A5BDDB}} \textcolor{black}{0.88 (±0.02)} & {\cellcolor[HTML]{9AB8D8}} \textcolor{black}{0.90 (±0.03)} & {\cellcolor[HTML]{84B0D3}} \textcolor{black}{0.96 (±0.01)} & {\cellcolor[HTML]{A8BEDC}} \textcolor{black}{0.87 (±0.06)} & {\cell

  print(temp.applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).to_latex(convert_css=True).replace('\\multirow', '\\hline\n\\multirow').replace('{\\cellcolor[HTML]{000000}} \\color[HTML]{000000} nan', 'N/A').replace('nan', 'N/A').replace('\\bfseries {en-es-ru}', '\hline\n\\bfseries {en-es-ru}').replace('\\color[HTML]{000000} ', '\\textcolor{black}{').replace(') & {\\cellcolor', ')} & {\\cellcolor').replace(') & N/A', ')} & N/A').replace(' \\', '} \\').replace('&}', '&').replace('}}}', '}}'))


In [80]:
temp_mean.astype(float).style.highlight_max(props='font-weight: bold;', axis=0)

Language,ar,bg,ca,cs,de,el,en,es,et,hr,hu,nl,pl,pt,ro,ru,uk,zh,{all},ga,gd,sk,sl
Train Language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
en,0.809295,0.899167,0.779832,0.909984,0.87879,0.904746,0.960253,0.870765,0.917832,0.938125,0.966266,0.839966,0.890541,0.920229,0.94101,0.872232,0.814642,0.727139,0.867748,,,,
es,0.824158,0.888744,0.845237,0.885706,0.893328,0.8719,0.901816,0.938009,0.898208,0.915434,0.949759,0.837107,0.878015,0.925194,0.931187,0.884376,0.823268,0.733953,0.863598,,,,
ru,0.80523,0.930442,0.75573,0.872877,0.841072,0.881664,0.874709,0.821996,0.87718,0.892345,0.911441,0.794452,0.872687,0.861349,0.882712,0.939087,0.890963,0.731219,0.840632,,,,
{en-es-ru},0.889312,0.942544,0.862008,0.934025,0.909034,0.931246,0.954641,0.932067,0.93223,0.941341,0.974758,0.859795,0.910343,0.938163,0.948052,0.933666,0.891547,0.861325,0.911614,,,,


In [81]:
#statistical significance between language-differently finetuned detectors
temp = results_language.copy()
temp = temp[~temp.Language.isin(['ga', 'gd', 'sk', 'sl'])]#exclude languages with not enough samples
temp = temp[~temp['Train Language'].isin(['{en-es-ru-x3}'])]#exclude training with 3x more samples
languages = temp.Language.unique()
temp = temp.pivot(index=['Train Language', 'Detector'], columns='Language', values='AUC ROC').reset_index()
#temp = pd.concat([temp, temp.loc[temp.Category == 'P', :]])

res_df = pd.DataFrame()
for lang in languages[languages != '{all}']:
 for (src, trg) in itertools.combinations_with_replacement(temp['Train Language'].unique(), 2):
   if src == trg: continue
   try:
    res = pg.ttest(temp[(temp["Train Language"] == trg)][lang], temp[(temp["Train Language"] == src)][lang], paired=True)
    if (res['p-val'][0] >= 0.05): continue
    print(f"\nTest language: {lang}, ({trg}, {src})")
    display(res.style.apply(lambda _: np.where(res['p-val'] >= 0.05, 'background-color: yellow', '')))
   except:
    pass


Test language: cs, ({en-es-ru}, es)


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.551696,6,two-sided,0.043391,[0. 0.09],0.884182,2.204,0.501279



Test language: cs, ({en-es-ru}, ru)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.575619,6,two-sided,0.042016,[0. 0.12],0.92457,2.257,0.536261



Test language: pl, ({en-es-ru}, es)


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.86221,6,two-sided,0.028718,[0. 0.06],0.59047,3.001,0.261426



Test language: ar, ({en-es-ru}, en)


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,5.967275,6,two-sided,0.000993,[0.05 0.11],1.678484,41.204,0.955817



Test language: ar, ({en-es-ru}, es)


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.611917,6,two-sided,0.011205,[0.02 0.11],1.377878,6.151,0.856942



Test language: ar, ({en-es-ru}, ru)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.587645,6,two-sided,0.041342,[0. 0.16],1.068175,2.285,0.656184



Test language: es, (es, en)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.194778,6,two-sided,0.018723,[0.02 0.12],1.477617,4.15,0.899547



Test language: es, ({en-es-ru}, en)


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.803569,6,two-sided,0.031018,[0.01 0.11],1.304457,2.832,0.818555



Test language: es, (ru, es)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.60575,6,two-sided,0.04035,[-0.22 -0.01],1.33363,2.326,0.834533


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test language: uk, (ru, en)


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,4.417097,6,two-sided,0.004485,[0.03 0.12],1.574611,12.523,0.931181



Test language: uk, ({en-es-ru}, en)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,4.302046,6,two-sided,0.005081,[0.03 0.12],1.654814,11.359,0.950952



Test language: uk, (ru, es)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,4.73656,6,two-sided,0.003203,[0.03 0.1 ],1.362947,16.305,0.849625



Test language: uk, ({en-es-ru}, es)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.990462,6,two-sided,0.007197,[0.03 0.11],1.430763,8.663,0.88087



Test language: nl, (ru, en)


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.590386,6,two-sided,0.041191,[-0.09 -0. ],0.867628,2.291,0.486906



Test language: nl, (ru, es)


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-3.15916,6,two-sided,0.019586,[-0.08 -0.01],0.766069,4.011,0.399597



Test language: nl, ({en-es-ru}, ru)


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.420671,6,two-sided,0.014132,[0.02 0.11],1.177693,5.146,0.73828



Test language: el, ({en-es-ru}, es)


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.643051,6,two-sided,0.038384,[0. 0.11],1.163117,2.415,0.727977



Test language: ca, (es, en)


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,5.490707,6,two-sided,0.001528,[0.04 0.09],1.829579,29.243,0.978402



Test language: ca, ({en-es-ru}, en)


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,7.193571,6,two-sided,0.000365,[0.05 0.11],2.347866,91.774,0.999031


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test language: ca, ({en-es-ru}, ru)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.679957,6,two-sided,0.036538,[0.01 0.2 ],1.272657,2.505,0.800054


  if (res['p-val'][0] >= 0.05): continue



Test language: ro, (ru, es)


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.611057,6,two-sided,0.040064,[-0.09 -0. ],0.785325,2.339,0.41593


  if (res['p-val'][0] >= 0.05): continue



Test language: ro, ({en-es-ru}, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.02001,6,two-sided,0.023397,[0. 0.03],0.57749,3.504,0.252258



Test language: ro, ({en-es-ru}, ru)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.845412,6,two-sided,0.029358,[0.01 0.12],1.103709,2.952,0.683943


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test language: pt, (ru, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.596372,6,two-sided,0.040861,[-0.11 -0. ],1.066586,2.305,0.654921



Test language: pt, ({en-es-ru}, en)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.888329,6,two-sided,0.027754,[0. 0.03],0.85145,3.08,0.472867


  if (res['p-val'][0] >= 0.05): continue



Test language: pt, (ru, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.926758,6,two-sided,0.026398,[-0.12 -0.01],1.191524,3.198,0.747863



Test language: pt, ({en-es-ru}, es)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.723572,6,two-sided,0.034479,[0. 0.02],0.793335,2.616,0.422765



Test language: pt, ({en-es-ru}, ru)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.16211,6,two-sided,0.019513,[0.02 0.14],1.450872,4.022,0.88917


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test language: et, ({en-es-ru}, es)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.112408,6,two-sided,0.020785,[0.01 0.06],0.779129,3.833,0.410658


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test language: de, (ru, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-2.576962,6,two-sided,0.04194,[-0.07 -0. ],0.975584,2.26,0.579936



Test language: de, ({en-es-ru}, en)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,5.30698,6,two-sided,0.001818,[0.02 0.04],1.063759,25.487,0.652669



Test language: de, (ru, es)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-4.26815,6,two-sided,0.005274,[-0.08 -0.02],1.281825,11.034,0.805503


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test language: de, ({en-es-ru}, ru)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,4.059291,6,two-sided,0.006656,[0.03 0.11],1.668534,9.205,0.953822


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test language: ru, (ru, en)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,4.08481,6,two-sided,0.006467,[0.03 0.11],1.67438,9.413,0.955003



Test language: ru, ({en-es-ru}, en)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.567262,6,two-sided,0.011823,[0.02 0.1 ],1.483724,5.902,0.901811


  if (res['p-val'][0] >= 0.05): continue



Test language: ru, (ru, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,4.147974,6,two-sided,0.006025,[0.02 0.09],1.513362,9.947,0.912268



Test language: ru, ({en-es-ru}, es)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.156975,6,two-sided,0.01964,[0.01 0.09],1.30476,4.002,0.818726


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test language: bg, ({en-es-ru}, es)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.770331,6,two-sided,0.032408,[0.01 0.1 ],0.775228,2.741,0.407346


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test language: zh, ({en-es-ru}, en)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.782827,6,two-sided,0.00915,[0.05 0.22],1.110192,7.192,0.6889


  if (res['p-val'][0] >= 0.05): continue



Test language: zh, ({en-es-ru}, es)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.891516,6,two-sided,0.027639,[0.02 0.24],1.054612,3.089,0.645342


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test language: hu, ({en-es-ru}, es)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.250023,6,two-sided,0.017466,[0.01 0.04],0.731369,4.376,0.370598



Test language: hu, ({en-es-ru}, ru)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.715377,6,two-sided,0.034856,[0.01 0.12],1.093902,2.595,0.676381


  if (res['p-val'][0] >= 0.05): continue



Test language: hr, ({en-es-ru}, es)


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.590828,6,two-sided,0.041166,[0. 0.05],0.540801,2.292,0.22731



Test language: hr, ({en-es-ru}, ru)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.470639,6,two-sided,0.048416,[0. 0.1],0.804265,2.032,0.432126



Test language: en, (es, en)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-3.901944,6,two-sided,0.007966,[-0.1 -0.02],1.861041,8.006,0.981591



Test language: en, (ru, en)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-4.070857,6,two-sided,0.006569,[-0.14 -0.03],1.998118,9.299,0.991209


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test language: en, ({en-es-ru}, es)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.401772,6,two-sided,0.014464,[0.01 0.09],1.674619,5.055,0.955051



Test language: en, ({en-es-ru}, ru)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.637307,6,two-sided,0.01087,[0.03 0.13],1.862422,6.297,0.981721


In [82]:
#statistical significance between monolingually and multilingually finetuned detectors
temp = results_language.copy()
temp = temp[~temp.Language.isin(['ga', 'gd', 'sk', 'sl'])]#exclude languages with not enough samples
temp = temp[~temp['Train Language'].isin(['{en-es-ru-x3}'])]#exclude training with 3x more samples
languages = temp.Language.unique()
temp = temp.pivot(index=['Train Language', 'Detector'], columns='Language', values='AUC ROC').reset_index()
#temp = pd.concat([temp, temp.loc[temp.Category == 'P', :]])

res_df = pd.DataFrame()
#for (src, trg) in itertools.combinations_with_replacement(temp['Train Language'].unique(), 2):
#  if src == trg: continue
for lang in languages[languages != '{all}']:
   try:
    res = pg.ttest(temp[(temp["Train Language"] == '{en-es-ru}')][lang], temp[(temp["Train Language"] != '{en-es-ru}')][lang], paired=True)
    #res = pg.ttest(temp[(temp["Train Language"] == '{en-es-ru}')][lang], temp[(temp["Train Language"] != '{en-es-ru}')].groupby('Detector')[lang].mean(), paired=True)
    if (res['p-val'][0] >= 0.05): continue
    print(f"\nTest language: {lang}")
    display(res.style.apply(lambda _: np.where(res['p-val'] >= 0.05, 'background-color: yellow', '')))
   except:
    pass




Test language: cs


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.25233,21.336179,two-sided,0.034949,[0. 0.09],0.714576,2.177,0.351051



Test language: ar


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.615912,20.867671,two-sided,0.001634,[0.03 0.12],1.157363,24.078,0.723386



Test language: es


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.59179,25.031569,two-sided,0.015708,[0.01 0.1 ],0.692595,3.673,0.333293



Test language: uk


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.629753,18.806776,two-sided,0.016594,[0.01 0.09],0.876071,3.908,0.489291





Test language: el


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.093797,25.670038,two-sided,0.004724,[0.02 0.08],0.889244,8.782,0.500869


  if (res['p-val'][0] >= 0.05): continue



Test language: ca


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.233279,24.118661,two-sided,0.003528,[0.02 0.11],0.970992,11.393,0.572328



Test language: pt


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.8179,25.556772,two-sided,0.009199,[0.01 0.06],0.763956,5.367,0.392119


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test language: de


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.560688,13.637285,two-sided,0.02301,[0.01 0.07],0.970162,3.493,0.57161



Test language: ru


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.236778,17.249877,two-sided,0.038783,[0. 0.07],0.770105,2.128,0.397331


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test language: zh


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.838212,19.28253,two-sided,0.010407,[0.03 0.23],0.936575,5.559,0.542394



Test language: en


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.413493,22.678633,two-sided,0.002414,[0.02 0.07],0.87719,16.111,0.490274


In [83]:
#statistical significance between monolingually and multilingually finetuned detectors autoregressive-only
temp = results_language.copy()
temp = temp[~temp.Language.isin(['ga', 'gd', 'sk', 'sl'])]#exclude languages with not enough samples
temp = temp[~temp['Train Language'].isin(['{en-es-ru-x3}'])]#exclude training with 3x more samples
temp = temp[temp.Detector.isin(autoregressive)]#exclude not autoregressive models
languages = temp.Language.unique()
temp = temp.pivot(index=['Train Language', 'Detector'], columns='Language', values='AUC ROC').reset_index()
#temp = pd.concat([temp, temp.loc[temp.Category == 'P', :]])

res_df = pd.DataFrame()
#for (src, trg) in itertools.combinations_with_replacement(temp['Train Language'].unique(), 2):
#  if src == trg: continue
for lang in languages[languages != '{all}']:
   try:
    res = pg.ttest(temp[(temp["Train Language"] == '{en-es-ru}')][lang], temp[(temp["Train Language"] != '{en-es-ru}')][lang], paired=True)
    #res = pg.ttest(temp[(temp["Train Language"] == '{en-es-ru}')][lang], temp[(temp["Train Language"] != '{en-es-ru}')].groupby('Detector')[lang].mean(), paired=True)
    if (res['p-val'][0] >= 0.05): continue
    print(f"\nTest language: {lang}")
    display(res.style.apply(lambda _: np.where(res['p-val'] >= 0.05, 'background-color: yellow', '')))
   except:
    pass




Test language: cs


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.843354,8.174722,two-sided,0.004727,[0.03 0.12],1.787533,17.195,0.820757


  if (res['p-val'][0] >= 0.05): continue



Test language: ar


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,4.53537,5.5217,two-sided,0.004858,[0.05 0.18],2.522101,48.563,0.981821





Test language: es


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.703101,12.662639,two-sided,0.01844,[0.02 0.16],0.914608,3.462,0.314567



Test language: uk


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.451552,6.470851,two-sided,0.046777,[0. 0.14],1.257945,2.519,0.52733


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test language: el


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,4.314645,13.981802,two-sided,0.000715,[0.04 0.11],1.57918,34.807,0.720468



Test language: ca


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.408614,13.465928,two-sided,0.004456,[0.04 0.16],1.298446,9.114,0.553146



Test language: ro


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.465583,13.036637,two-sided,0.028324,[0.01 0.09],0.957138,2.563,0.339186



Test language: pt


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.09848,13.795759,two-sided,0.007973,[0.02 0.09],1.094127,5.885,0.422837


  if (res['p-val'][0] >= 0.05): continue



Test language: de


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.018914,7.660714,two-sided,0.017446,[0.01 0.1 ],1.440448,5.275,0.641183


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test language: zh


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.204484,5.692791,two-sided,0.019898,[0.05 0.37],1.75257,6.822,0.805719



Test language: hu


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.45901,9.126779,two-sided,0.03587,[0. 0.1],1.097308,2.543,0.424839



Test language: en


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,3.686266,12.589695,two-sided,0.002883,[0.03 0.11],1.244836,13.639,0.518943


In [84]:
#statistical significance between platform-differently finetuned detectors
temp = results_platform.copy()
temp = temp[~temp['Train Platform'].str.contains('x5')]#exclude training with 5x more samples
platforms = temp.Platform.unique()
temp = temp.pivot(index=['Train Platform', 'Detector'], columns='Platform', values='AUC ROC').reset_index()
#temp = pd.concat([temp, temp.loc[temp.Category == 'P', :]])
#display(temp)

res_df = pd.DataFrame()
for (src, trg) in itertools.combinations_with_replacement(temp['Train Platform'].unique(), 2):
  if src == trg: continue
  for lang in platforms[platforms != '{all}']:
   try:
    res = pg.ttest(temp[(temp["Train Platform"] == trg)][lang], temp[(temp["Train Platform"] == src)][lang], paired=True)
    if (res['p-val'][0] >= 0.05): continue
    print(f"\nTest platform: {lang}, ({trg}, {src})")
    display(res.style.apply(lambda _: np.where(res['p-val'] >= 0.05, 'background-color: yellow', '')))
   except:
    pass

  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test platform: Gab, (Gab, Discord)


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,5.825367,2,two-sided,0.028226,[0.03 0.19],6.274932,3.576,0.997002


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test platform: Discord, (Twitter, Discord)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-4.501344,2,two-sided,0.045976,[-0.04 -0. ],3.11696,2.688,0.770574



Test platform: Gab, (Twitter, Discord)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,5.235296,2,two-sided,0.034603,[0.01 0.12],4.184489,3.178,0.926621


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test platform: Discord, (WhatsApp, Discord)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-4.312694,2,two-sided,0.049785,[-0.04 -0. ],2.379491,2.564,0.58495



Test platform: Gab, (WhatsApp, Discord)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,5.823259,2,two-sided,0.028246,[0.02 0.11],3.706589,3.574,0.872622


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test platform: Gab, (all, Discord)


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,4.373394,2,two-sided,0.04851,[0. 0.17],4.584626,2.604,0.956076


  if (res['p-val'][0] >= 0.05): continue



Test platform: Twitter, (Telegram, Gab)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,17.005818,2,two-sided,0.00344,[0.01 0.01],0.288645,11.257,0.061505


  if (res['p-val'][0] >= 0.05): continue



Test platform: Telegram, (Telegram, Gab)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,6.360729,2,two-sided,0.023836,[0.01 0.05],3.726769,3.939,0.875386


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test platform: Gab, (Twitter, Gab)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-6.960058,2,two-sided,0.020025,[-0.07 -0.02],5.123404,4.346,0.97956



Test platform: Twitter, (WhatsApp, Gab)


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-5.137992,2,two-sided,0.035855,[-0.05 -0. ],1.310183,3.113,0.260916


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test platform: Gab, (all, Gab)


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-4.919427,2,two-sided,0.038924,[-0.04 -0. ],1.376226,2.967,0.279846


  if (res['p-val'][0] >= 0.05): continue



Test platform: Telegram, (Twitter, Telegram)


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-5.815158,2,two-sided,0.028321,[-0.09 -0.01],4.959048,3.569,0.973954


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test platform: Twitter, (WhatsApp, Telegram)


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-6.776219,2,two-sided,0.021092,[-0.06 -0.01],1.646238,4.221,0.360869


  if (res['p-val'][0] >= 0.05): continue



Test platform: Telegram, (WhatsApp, Telegram)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-5.096585,2,two-sided,0.036409,[-0.07 -0.01],3.965144,3.085,0.904697



Test platform: WhatsApp, (WhatsApp, Telegram)


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,5.495194,2,two-sided,0.031557,[0. 0.03],2.346384,3.353,0.575344


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test platform: Twitter, (WhatsApp, Twitter)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-9.512343,2,two-sided,0.010872,[-0.08 -0.03],3.875337,6.094,0.894359



Test platform: Telegram, (WhatsApp, Twitter)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,9.255128,2,two-sided,0.011474,[0.01 0.02],1.546771,5.917,0.330482


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test platform: WhatsApp, (WhatsApp, Twitter)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,4.764757,2,two-sided,0.041335,[0.01 0.1 ],4.515875,2.864,0.951867



Test platform: Twitter, (all, Twitter)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-4.573857,2,two-sided,0.044625,[-0.03 -0. ],1.670475,2.737,0.368339



Test platform: Telegram, (all, Twitter)


  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,8.394854,2,two-sided,0.013895,[0.02 0.05],3.017736,5.327,0.749213


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue



Test platform: Twitter, (all, WhatsApp)


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,4.361766,2,two-sided,0.048751,[0. 0.07],2.45733,2.596,0.607186


  if (res['p-val'][0] >= 0.05): continue



Test platform: Telegram, (all, WhatsApp)


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,6.09387,2,two-sided,0.025887,[0.01 0.03],1.834795,3.758,0.419369


  if (res['p-val'][0] >= 0.05): continue



Test platform: WhatsApp, (all, WhatsApp)


  if (res['p-val'][0] >= 0.05): continue
  if (res['p-val'][0] >= 0.05): continue


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-5.799327,2,two-sided,0.02847,[-0.04 -0.01],4.667736,3.558,0.960748


In [85]:
#statistical significance between monoplatform and multiplatform finetuning of detectors
temp = results_platform.copy()
temp = temp[~temp['Train Platform'].str.contains('x5')]#exclude training with 5x more samples
platforms = temp.Platform.unique()
temp = temp.pivot(index=['Train Platform', 'Detector'], columns='Platform', values='AUC ROC').reset_index()
#temp = pd.concat([temp, temp.loc[temp.Category == 'P', :]])
#display(temp)

res_df = pd.DataFrame()
#for (src, trg) in itertools.combinations_with_replacement(temp['Train Platform'].unique(), 2):
#  if src == trg: continue
for lang in platforms[platforms != '{all}']:
   try:
    res = pg.ttest(temp[(temp["Train Platform"] == 'all')][lang], temp[(temp["Train Platform"] != 'all')][lang], paired=True)
    #if (res['p-val'][0] >= 0.05): continue
    print(f"\nTest platform: {lang}")
    display(res.style.apply(lambda _: np.where(res['p-val'] >= 0.05, 'background-color: yellow', '')))
   except:
    pass




Test platform: Twitter


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.877462,15.985249,two-sided,0.078826,[-0. 0.08],0.549837,1.336,0.129492



Test platform: Telegram




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,2.009185,11.791828,two-sided,0.067972,[-0. 0.05],0.669272,1.533,0.168999



Test platform: Discord




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,0.058547,2.797362,two-sided,0.957241,[-0.02 0.03],0.038025,0.493,0.050367



Test platform: Gab




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.604018,7.800241,two-sided,0.148343,[-0.01 0.06],0.597453,1.031,0.144262



Test platform: WhatsApp




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.941411,15.978352,two-sided,0.070056,[-0. 0.04],0.567905,1.427,0.134943


In [86]:
#statistical significance between monoplatform and multiplatform finetuning of detectors without mdeberta
temp = results_platform.copy()
temp = temp[~temp['Train Platform'].str.contains('x5')]#exclude training with 5x more samples
temp = temp[~temp.Detector.str.contains('mDeBERTa')]#exclude the small mdeberta model
platforms = temp.Platform.unique()
temp = temp.pivot(index=['Train Platform', 'Detector'], columns='Platform', values='AUC ROC').reset_index()
#temp = pd.concat([temp, temp.loc[temp.Category == 'P', :]])
#display(temp)

res_df = pd.DataFrame()
#for (src, trg) in itertools.combinations_with_replacement(temp['Train Platform'].unique(), 2):
#  if src == trg: continue
for lang in platforms[platforms != '{all}']:
   try:
    res = pg.ttest(temp[(temp["Train Platform"] == 'all')][lang], temp[(temp["Train Platform"] != 'all')][lang], paired=True)
    #if (res['p-val'][0] >= 0.05): continue
    print(f"\nTest platform: {lang}")
    display(res.style.apply(lambda _: np.where(res['p-val'] >= 0.05, 'background-color: yellow', '')))
   except:
    pass


Test platform: Twitter




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.336011,9.961581,two-sided,0.211263,[-0.02 0.1 ],0.464185,0.867,0.084506



Test platform: Telegram




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.465887,4.973727,two-sided,0.20288,[-0.02 0.07],0.626992,0.946,0.113668



Test platform: Discord




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,-0.078218,1.403998,two-sided,0.947306,[-0.07 0.07],0.062077,0.549,0.050607





Test platform: Gab


Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.538056,5.411776,two-sided,0.180243,[-0.02 0.08],0.644834,0.996,0.117428



Test platform: WhatsApp




Unnamed: 0,T,dof,alternative,p-val,CI95%,cohen-d,BF10,power
T-test,1.69875,9.999679,two-sided,0.120211,[-0.01 0.05],0.595596,1.124,0.107324


## News

In [87]:

#news test data - th calibration for macroF1 based on train data
auc_dict = {}
for detector in tqdm(test_results, total=len(test_results)):
  for model,data in detector.items():
    train = data[(data.split == 'train') & (data.domain == 'social_media') & (~data.multi_label.str.contains('gemini'))]
    #print(data[(data.split == 'test') & (data.domain == 'news')].multi_label.unique())
    temp = data[(data.split == 'test') & (data.domain == 'news') & (~data.multi_label.str.contains('lama'))]
  if (to_category[model] == 'F') and ('social-media' not in model): continue
  #if (temp['prediction_probs'].min() >= 0.5) and (temp['prediction_probs'].max() <= 1.0):
    #temp.loc[temp.predictions == 'human', 'prediction_probs'] = 1 - temp['prediction_probs'] #adjust human prediction prob to be in range <0, 0.5)
  labels = [label2id[x] for x in temp['label']]
  fpr, tpr, thresholds = roc_curve(labels, temp['prediction_probs'])
  auc_dict[model] = {'auc':  auc_roc_reliable(fpr, tpr, labels), 'th_optim': thresholds[np.argmax(tpr - fpr)]}#, 'tpr_1%fpr': tpr[fpr <= 0.01][-1], 'tpr_5%fpr': tpr[fpr <= 0.05][-1], 'th_1%fpr': thresholds[fpr <= 0.01][-1], 'th_3%fpr': thresholds[fpr <= 0.03][-1], 'th_5%fpr': thresholds[fpr <= 0.05][-1], 'th_10%fpr': thresholds[fpr <= 0.10][-1], 'th_15%fpr': thresholds[fpr <= 0.15][-1], 'th_20%fpr': thresholds[fpr <= 0.20][-1], 'th_25%fpr': thresholds[fpr <= 0.25][-1], 'th_30%fpr': thresholds[fpr <= 0.30][-1], 'th_40%fpr': thresholds[fpr <= 0.40][-1], 'th_50%fpr': thresholds[fpr <= 0.50][-1]}

  train_labels = [label2id[x] for x in train['label']]
  fpr, tpr, thresholds = roc_curve(train_labels, train['prediction_probs'])
  predictions = [1 if ((x >= thresholds[fpr <= 0.05][-1])) else 0 for x in temp['prediction_probs']]
  auc_dict[model]['cr_5'] = report_todict(*report_np(np.array(labels), np.array(predictions), 2))
  predictions = [1 if ((x >= thresholds[np.argmax(tpr - fpr)])) else 0 for x in temp['prediction_probs']]
  auc_dict[model]['cr_o'] = report_todict(*report_np(np.array(labels), np.array(predictions), 2))
results_all = pd.DataFrame()
for model, v in tqdm(auc_dict.items(), total=len(auc_dict)):
  temp = pd.DataFrame({'Detector': rename_detector(model), 'Category': to_category[model], 'AUC ROC': v['auc'], 'MacroF1@5%FPR': v['cr_5']['macro avg']['f1-score'], 'MacroF1@optim': v['cr_o']['macro avg']['f1-score']}, index=[0])
  results_all = pd.concat([results_all, temp])
results_all = results_all.sort_values(by=['AUC ROC'], ascending=False).reset_index(drop=True)
results_all.index = results_all.index + 1
display(results_all.style.format(na_rep=0, precision=4))
display(results_all.style.apply(highlight_categories, axis=1).hide(['Category', 'MacroF1@optim'], axis=1))
print(results_all.reset_index().style.apply(highlight_categories, axis=1).hide(['Category', 'MacroF1@optim'], axis=1).format(na_rep=0, precision=4).applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).hide(
    axis=0).to_latex(convert_css=True))

100%|███████████████████████████████████████████████████████████████████████████████████| 83/83 [01:07<00:00,  1.23it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 2479.16it/s]


Unnamed: 0,Detector,Category,AUC ROC,MacroF1@5%FPR,MacroF1@optim
1,Llama-3-8b-MultiSocial,F,0.9273,0.7988,0.7999
2,Aya-101-MultiSocial,F,0.9262,0.8008,0.801
3,mDeBERTa-v3-base-MultiSocial,F,0.9025,0.7512,0.7748
4,Mistral-7b-v0.1-MultiSocial,F,0.8988,0.7937,0.7892
5,XLM-RoBERTa-large-MultiSocial,F,0.8309,0.7306,0.684
6,Binoculars,S,0.8303,0.4041,0.5539
7,Fast-Detect-GPT,S,0.8104,0.6361,0.5961
8,Falcon-rw-1b-MultiSocial,F,0.7592,0.6394,0.6599
9,BLOOMZ-3b-MultiSocial,F,0.7071,0.5731,0.5959
10,LLM-Deviation,S,0.6568,0.3568,0.4649


Unnamed: 0,Detector,AUC ROC,MacroF1@5%FPR
1,Llama-3-8b-MultiSocial,0.927322,0.798831
2,Aya-101-MultiSocial,0.926166,0.800815
3,mDeBERTa-v3-base-MultiSocial,0.9025,0.751232
4,Mistral-7b-v0.1-MultiSocial,0.898766,0.793683
5,XLM-RoBERTa-large-MultiSocial,0.830872,0.730627
6,Binoculars,0.830301,0.404108
7,Fast-Detect-GPT,0.810407,0.636138
8,Falcon-rw-1b-MultiSocial,0.759205,0.639355
9,BLOOMZ-3b-MultiSocial,0.707063,0.573052
10,LLM-Deviation,0.65679,0.356787


\begin{tabular}{rlrr}
\bfseries index & \bfseries Detector & \bfseries AUC ROC & \bfseries MacroF1@5%FPR \\
{\cellcolor[HTML]{B6D7A8}} 1 & {\cellcolor[HTML]{B6D7A8}} Llama-3-8b-MultiSocial & {\cellcolor[HTML]{B6D7A8}} 0.9273 & {\cellcolor[HTML]{B6D7A8}} 0.7988 \\
{\cellcolor[HTML]{B6D7A8}} 2 & {\cellcolor[HTML]{B6D7A8}} Aya-101-MultiSocial & {\cellcolor[HTML]{B6D7A8}} 0.9262 & {\cellcolor[HTML]{B6D7A8}} 0.8008 \\
{\cellcolor[HTML]{B6D7A8}} 3 & {\cellcolor[HTML]{B6D7A8}} mDeBERTa-v3-base-MultiSocial & {\cellcolor[HTML]{B6D7A8}} 0.9025 & {\cellcolor[HTML]{B6D7A8}} 0.7512 \\
{\cellcolor[HTML]{B6D7A8}} 4 & {\cellcolor[HTML]{B6D7A8}} Mistral-7b-v0.1-MultiSocial & {\cellcolor[HTML]{B6D7A8}} 0.8988 & {\cellcolor[HTML]{B6D7A8}} 0.7937 \\
{\cellcolor[HTML]{B6D7A8}} 5 & {\cellcolor[HTML]{B6D7A8}} XLM-RoBERTa-large-MultiSocial & {\cellcolor[HTML]{B6D7A8}} 0.8309 & {\cellcolor[HTML]{B6D7A8}} 0.7306 \\
{\cellcolor[HTML]{F9CB9C}} 6 & {\cellcolor[HTML]{F9CB9C}} Binoculars & {\cellcolor[HTML]{F9CB9C}}

  print(results_all.reset_index().style.apply(highlight_categories, axis=1).hide(['Category', 'MacroF1@optim'], axis=1).format(na_rep=0, precision=4).applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).hide(


In [88]:

#(OOD) news test data of Llama-2-70b generator - th calibration for macroF1 based on train data
auc_dict = {}
for detector in tqdm(test_results, total=len(test_results)):
  for model,data in detector.items():
    train = data[(data.split == 'train') & (data.domain == 'social_media') & (~data.multi_label.str.contains('gemini'))]
    #print(data[(data.split == 'test') & (data.domain == 'news')].multi_label.unique())
    temp = data[(data.split == 'test') & (data.domain == 'news') & (data.multi_label.str.contains('lama') | data.multi_label.str.contains('human'))]
  if (to_category[model] == 'F') and ('social-media' not in model): continue
  #if (temp['prediction_probs'].min() >= 0.5) and (temp['prediction_probs'].max() <= 1.0):
    #temp.loc[temp.predictions == 'human', 'prediction_probs'] = 1 - temp['prediction_probs'] #adjust human prediction prob to be in range <0, 0.5)
  labels = [label2id[x] for x in temp['label']]
  fpr, tpr, thresholds = roc_curve(labels, temp['prediction_probs'])
  auc_dict[model] = {'auc':  auc_roc_reliable(fpr, tpr, labels), 'th_optim': thresholds[np.argmax(tpr - fpr)]}#, 'tpr_1%fpr': tpr[fpr <= 0.01][-1], 'tpr_5%fpr': tpr[fpr <= 0.05][-1], 'th_1%fpr': thresholds[fpr <= 0.01][-1], 'th_3%fpr': thresholds[fpr <= 0.03][-1], 'th_5%fpr': thresholds[fpr <= 0.05][-1], 'th_10%fpr': thresholds[fpr <= 0.10][-1], 'th_15%fpr': thresholds[fpr <= 0.15][-1], 'th_20%fpr': thresholds[fpr <= 0.20][-1], 'th_25%fpr': thresholds[fpr <= 0.25][-1], 'th_30%fpr': thresholds[fpr <= 0.30][-1], 'th_40%fpr': thresholds[fpr <= 0.40][-1], 'th_50%fpr': thresholds[fpr <= 0.50][-1]}

  train_labels = [label2id[x] for x in train['label']]
  fpr, tpr, thresholds = roc_curve(train_labels, train['prediction_probs'])
  predictions = [1 if ((x >= thresholds[fpr <= 0.05][-1])) else 0 for x in temp['prediction_probs']]
  auc_dict[model]['cr_5'] = report_todict(*report_np(np.array(labels), np.array(predictions), 2))
  predictions = [1 if ((x >= thresholds[np.argmax(tpr - fpr)])) else 0 for x in temp['prediction_probs']]
  auc_dict[model]['cr_o'] = report_todict(*report_np(np.array(labels), np.array(predictions), 2))
results_all = pd.DataFrame()
for model, v in tqdm(auc_dict.items(), total=len(auc_dict)):
  temp = pd.DataFrame({'Detector': rename_detector(model), 'Category': to_category[model], 'AUC ROC': v['auc'], 'MacroF1@5%FPR': v['cr_5']['macro avg']['f1-score'], 'MacroF1@optim': v['cr_o']['macro avg']['f1-score']}, index=[0])
  results_all = pd.concat([results_all, temp])
results_all = results_all.sort_values(by=['AUC ROC'], ascending=False).reset_index(drop=True)
results_all.index = results_all.index + 1
display(results_all.style.format(na_rep=0, precision=4))
display(results_all.style.apply(highlight_categories, axis=1).hide(['Category', 'MacroF1@optim'], axis=1))
print(results_all.reset_index().style.apply(highlight_categories, axis=1).hide(['Category', 'MacroF1@optim'], axis=1).format(na_rep=0, precision=4).applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).hide(
    axis=0).to_latex(convert_css=True))

100%|███████████████████████████████████████████████████████████████████████████████████| 83/83 [00:56<00:00,  1.47it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 1860.24it/s]


Unnamed: 0,Detector,Category,AUC ROC,MacroF1@5%FPR,MacroF1@optim
1,Fast-Detect-GPT,S,0.9238,0.8471,0.5841
2,Binoculars,S,0.9048,0.7568,0.4473
3,mDeBERTa-v3-base-MultiSocial,F,0.8871,0.8011,0.7916
4,Mistral-7b-v0.1-MultiSocial,F,0.8614,0.7673,0.7732
5,Aya-101-MultiSocial,F,0.8574,0.7556,0.7562
6,Llama-3-8b-MultiSocial,F,0.8549,0.7352,0.7451
7,XLM-RoBERTa-large-MultiSocial,F,0.7928,0.7108,0.6032
8,Falcon-rw-1b-MultiSocial,F,0.771,0.6912,0.6707
9,DetectLLM-LRR,S,0.7559,0.7121,0.332
10,LLM-Deviation,S,0.7257,0.5931,0.3319


Unnamed: 0,Detector,AUC ROC,MacroF1@5%FPR
1,Fast-Detect-GPT,0.923793,0.847065
2,Binoculars,0.904783,0.756778
3,mDeBERTa-v3-base-MultiSocial,0.887064,0.801123
4,Mistral-7b-v0.1-MultiSocial,0.861438,0.767348
5,Aya-101-MultiSocial,0.857379,0.755611
6,Llama-3-8b-MultiSocial,0.85489,0.735153
7,XLM-RoBERTa-large-MultiSocial,0.79276,0.710824
8,Falcon-rw-1b-MultiSocial,0.770981,0.691165
9,DetectLLM-LRR,0.75592,0.712063
10,LLM-Deviation,0.725709,0.593085


\begin{tabular}{rlrr}
\bfseries index & \bfseries Detector & \bfseries AUC ROC & \bfseries MacroF1@5%FPR \\
{\cellcolor[HTML]{F9CB9C}} 1 & {\cellcolor[HTML]{F9CB9C}} Fast-Detect-GPT & {\cellcolor[HTML]{F9CB9C}} 0.9238 & {\cellcolor[HTML]{F9CB9C}} 0.8471 \\
{\cellcolor[HTML]{F9CB9C}} 2 & {\cellcolor[HTML]{F9CB9C}} Binoculars & {\cellcolor[HTML]{F9CB9C}} 0.9048 & {\cellcolor[HTML]{F9CB9C}} 0.7568 \\
{\cellcolor[HTML]{B6D7A8}} 3 & {\cellcolor[HTML]{B6D7A8}} mDeBERTa-v3-base-MultiSocial & {\cellcolor[HTML]{B6D7A8}} 0.8871 & {\cellcolor[HTML]{B6D7A8}} 0.8011 \\
{\cellcolor[HTML]{B6D7A8}} 4 & {\cellcolor[HTML]{B6D7A8}} Mistral-7b-v0.1-MultiSocial & {\cellcolor[HTML]{B6D7A8}} 0.8614 & {\cellcolor[HTML]{B6D7A8}} 0.7673 \\
{\cellcolor[HTML]{B6D7A8}} 5 & {\cellcolor[HTML]{B6D7A8}} Aya-101-MultiSocial & {\cellcolor[HTML]{B6D7A8}} 0.8574 & {\cellcolor[HTML]{B6D7A8}} 0.7556 \\
{\cellcolor[HTML]{B6D7A8}} 6 & {\cellcolor[HTML]{B6D7A8}} Llama-3-8b-MultiSocial & {\cellcolor[HTML]{B6D7A8}} 0.8549 & {\ce

  print(results_all.reset_index().style.apply(highlight_categories, axis=1).hide(['Category', 'MacroF1@optim'], axis=1).format(na_rep=0, precision=4).applymap_index(lambda v: "font-weight: bold;", axis=0).applymap_index(lambda v: "font-weight: bold;", axis=1).hide(


In [89]:
#news test data

auc_dict = {}
for detector in tqdm(test_results, total=len(test_results)):
  for model,data in detector.items():
    temp = data[(data.split == 'test') & (data.domain == 'news') & (~data.multi_label.str.contains('lama'))]
    #temp = data[(data.split == 'test') & (data.domain == 'news') & (data.multi_label.str.contains('lama') | data.multi_label.str.contains('human'))]
  if (to_category[model] == 'F') and ('social-media' not in model): continue
  #if (temp['prediction_probs'].min() >= 0.5) and (temp['prediction_probs'].max() <= 1.0):
    #temp.loc[temp.predictions == 'human', 'prediction_probs'] = 1 - temp['prediction_probs'] #adjust human prediction prob to be in range <0, 0.5)
  temp['prediction_probs']= temp['prediction_probs'].astype(float)
  labels = [label2id[x] for x in temp['label']]
  fpr, tpr, thresholds = roc_curve(labels, temp['prediction_probs'])
  auc_dict[model] = {'auc':  auc_roc_reliable(fpr, tpr, labels), 'th_optim': thresholds[np.argmax(tpr - fpr)], 'tpr_1%fpr': tpr[fpr <= 0.01][-1], 'tpr_5%fpr': tpr[fpr <= 0.05][-1], 'th_1%fpr': thresholds[fpr <= 0.01][-1], 'th_3%fpr': thresholds[fpr <= 0.03][-1], 'th_5%fpr': thresholds[fpr <= 0.05][-1], 'th_10%fpr': thresholds[fpr <= 0.10][-1], 'th_15%fpr': thresholds[fpr <= 0.15][-1], 'th_20%fpr': thresholds[fpr <= 0.20][-1], 'th_25%fpr': thresholds[fpr <= 0.25][-1], 'th_30%fpr': thresholds[fpr <= 0.30][-1], 'th_40%fpr': thresholds[fpr <= 0.40][-1], 'th_50%fpr': thresholds[fpr <= 0.50][-1]}
  for test_language in temp.language.unique():
    temp2 = temp[temp.language == test_language].reset_index(drop=True)
    labels = [label2id[x] for x in temp2['label']]
    fpr, tpr, thresholds = roc_curve(labels, temp2['prediction_probs'])
    auc_dict[model][test_language] = {'auc':  auc_roc_reliable(fpr, tpr, labels), 'th_optim': thresholds[np.argmax(tpr - fpr)], 'tpr_1%fpr': tpr[fpr <= 0.01][-1], 'tpr_5%fpr': tpr[fpr <= 0.05][-1], 'th_1%fpr': thresholds[fpr <= 0.01][-1], 'th_3%fpr': thresholds[fpr <= 0.03][-1], 'th_5%fpr': thresholds[fpr <= 0.05][-1], 'th_10%fpr': thresholds[fpr <= 0.10][-1], 'th_15%fpr': thresholds[fpr <= 0.15][-1], 'th_20%fpr': thresholds[fpr <= 0.20][-1], 'th_25%fpr': thresholds[fpr <= 0.25][-1], 'th_30%fpr': thresholds[fpr <= 0.30][-1], 'th_40%fpr': thresholds[fpr <= 0.40][-1], 'th_50%fpr': thresholds[fpr <= 0.50][-1]}
results_all = pd.DataFrame()
for model, v in tqdm(auc_dict.items(), total=len(auc_dict)):
  temp = pd.DataFrame({'Detector': model, 'Language': '{all}', 'AUC ROC': v['auc']}, index=[0])
  results_all = pd.concat([results_all, temp])
  for test_language,val in v.items():
    if (test_language == 'auc') or ('_' in test_language): continue
    temp = pd.DataFrame({'Detector': model, 'Language': test_language, 'AUC ROC': val['auc']}, index=[0])
    results_all = pd.concat([results_all, temp])
results_all.set_index(['Detector', 'Language']).unstack().sort_values(by=('AUC ROC','{all}'), ascending=False).style.format(precision=2).apply(lambda x: ["background-color: lightyellow" if v >= 0.6 else "" for v in x], axis = 0, subset='AUC ROC').highlight_max(props='font-weight: bold;', axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['prediction_probs']= temp['prediction_probs'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['prediction_probs']= temp['prediction_probs'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['prediction_probs']= temp['prediction_probs'].astype(float)
A value is

Unnamed: 0_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Language,ar,bg,ca,cs,de,el,en,es,et,ga,gd,hr,hu,nl,pl,pt,ro,ru,sk,sl,uk,zh,{all}
Detector,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
Meta-Llama-3-8B-first-social-media,0.98,0.95,0.86,0.99,0.98,0.97,0.93,0.93,0.93,0.97,0.92,0.93,0.97,0.92,0.97,0.94,0.94,0.96,0.98,0.95,0.98,0.96,0.93
aya-101-first-social-media,0.98,0.98,0.85,0.98,0.97,0.96,0.88,0.9,0.96,0.99,0.95,0.95,0.98,0.91,0.98,0.9,0.95,0.91,0.98,0.98,0.95,0.94,0.93
mdeberta-v3-base-first-social-media,0.95,0.95,0.86,0.97,0.95,0.91,0.87,0.86,0.92,0.93,0.83,0.95,0.93,0.91,0.94,0.89,0.91,0.9,0.98,0.96,0.92,0.9,0.9
Mistral-7B-v0.1-first-social-media,0.95,0.97,0.93,0.98,0.94,0.87,0.93,0.92,0.85,0.92,0.77,0.95,0.94,0.93,0.95,0.95,0.95,0.96,0.97,0.96,0.98,0.94,0.9
xlm-roberta-large-first-social-media,0.9,0.87,0.85,0.94,0.88,0.78,0.86,0.8,0.87,0.89,0.68,0.88,0.89,0.86,0.89,0.83,0.87,0.79,0.92,0.9,0.81,0.87,0.83
BinocularsMetric_threshold,0.72,0.67,0.92,0.88,0.94,0.76,0.98,0.96,0.84,0.83,0.68,0.87,0.9,0.94,0.91,0.96,0.94,0.72,0.8,0.87,0.65,0.65,0.83
fast-detect-gpt,0.67,0.72,0.96,0.87,0.89,0.53,0.93,0.94,0.64,0.79,0.73,0.88,0.88,0.93,0.89,0.93,0.94,0.73,0.71,0.84,0.72,0.82,0.81
falcon-rw-1b-first-social-media,0.83,0.62,0.87,0.8,0.85,0.72,0.82,0.81,0.78,0.82,0.69,0.81,0.79,0.9,0.82,0.81,0.81,0.67,0.88,0.84,0.7,0.88,0.76
bloomz-3b-first-social-media,0.9,0.78,0.78,0.78,0.76,0.73,0.78,0.76,0.69,0.76,0.65,0.73,0.7,0.87,0.76,0.79,0.68,0.78,0.81,0.75,0.82,0.81,0.71
gpt-j-6b_llm_deviationMetric_threshold,0.63,0.62,0.93,0.86,0.77,0.49,0.82,0.84,0.66,0.85,0.72,0.85,0.8,0.9,0.81,0.81,0.9,0.59,0.79,0.84,0.58,0.7,0.66


In [90]:
#news test data - OOD

auc_dict = {}
for detector in tqdm(test_results, total=len(test_results)):
  for model,data in detector.items():
    #temp = data[(data.split == 'test') & (data.domain == 'news') & (~data.multi_label.str.contains('lama'))]
    temp = data[(data.split == 'test') & (data.domain == 'news') & (data.multi_label.str.contains('lama') | data.multi_label.str.contains('human'))]
  if (to_category[model] == 'F') and ('social-media' not in model): continue
  #if (temp['prediction_probs'].min() >= 0.5) and (temp['prediction_probs'].max() <= 1.0):
    #temp.loc[temp.predictions == 'human', 'prediction_probs'] = 1 - temp['prediction_probs'] #adjust human prediction prob to be in range <0, 0.5)
  temp['prediction_probs']= temp['prediction_probs'].astype(float)
  labels = [label2id[x] for x in temp['label']]
  fpr, tpr, thresholds = roc_curve(labels, temp['prediction_probs'])
  auc_dict[model] = {'auc':  auc_roc_reliable(fpr, tpr, labels), 'th_optim': thresholds[np.argmax(tpr - fpr)], 'tpr_1%fpr': tpr[fpr <= 0.01][-1], 'tpr_5%fpr': tpr[fpr <= 0.05][-1], 'th_1%fpr': thresholds[fpr <= 0.01][-1], 'th_3%fpr': thresholds[fpr <= 0.03][-1], 'th_5%fpr': thresholds[fpr <= 0.05][-1], 'th_10%fpr': thresholds[fpr <= 0.10][-1], 'th_15%fpr': thresholds[fpr <= 0.15][-1], 'th_20%fpr': thresholds[fpr <= 0.20][-1], 'th_25%fpr': thresholds[fpr <= 0.25][-1], 'th_30%fpr': thresholds[fpr <= 0.30][-1], 'th_40%fpr': thresholds[fpr <= 0.40][-1], 'th_50%fpr': thresholds[fpr <= 0.50][-1]}
  for test_language in temp.language.unique():
    temp2 = temp[temp.language == test_language].reset_index(drop=True)
    labels = [label2id[x] for x in temp2['label']]
    fpr, tpr, thresholds = roc_curve(labels, temp2['prediction_probs'])
    auc_dict[model][test_language] = {'auc':  auc_roc_reliable(fpr, tpr, labels), 'th_optim': thresholds[np.argmax(tpr - fpr)], 'tpr_1%fpr': tpr[fpr <= 0.01][-1], 'tpr_5%fpr': tpr[fpr <= 0.05][-1], 'th_1%fpr': thresholds[fpr <= 0.01][-1], 'th_3%fpr': thresholds[fpr <= 0.03][-1], 'th_5%fpr': thresholds[fpr <= 0.05][-1], 'th_10%fpr': thresholds[fpr <= 0.10][-1], 'th_15%fpr': thresholds[fpr <= 0.15][-1], 'th_20%fpr': thresholds[fpr <= 0.20][-1], 'th_25%fpr': thresholds[fpr <= 0.25][-1], 'th_30%fpr': thresholds[fpr <= 0.30][-1], 'th_40%fpr': thresholds[fpr <= 0.40][-1], 'th_50%fpr': thresholds[fpr <= 0.50][-1]}
results_all = pd.DataFrame()
for model, v in tqdm(auc_dict.items(), total=len(auc_dict)):
  temp = pd.DataFrame({'Detector': model, 'Language': '{all}', 'AUC ROC': v['auc']}, index=[0])
  results_all = pd.concat([results_all, temp])
  for test_language,val in v.items():
    if (test_language == 'auc') or ('_' in test_language): continue
    temp = pd.DataFrame({'Detector': model, 'Language': test_language, 'AUC ROC': val['auc']}, index=[0])
    results_all = pd.concat([results_all, temp])
results_all.set_index(['Detector', 'Language']).unstack().sort_values(by=('AUC ROC','{all}'), ascending=False).style.format(precision=2).apply(lambda x: ["background-color: lightyellow" if v >= 0.6 else "" for v in x], axis = 0, subset='AUC ROC').highlight_max(props='font-weight: bold;', axis=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['prediction_probs']= temp['prediction_probs'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['prediction_probs']= temp['prediction_probs'].astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['prediction_probs']= temp['prediction_probs'].astype(float)
A value is

Unnamed: 0_level_0,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC,AUC ROC
Language,ar,bg,ca,cs,de,el,en,es,et,ga,gd,hr,hu,nl,pl,pt,ro,ru,sk,sl,uk,zh,{all}
Detector,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2
fast-detect-gpt,0.89,0.91,0.99,0.98,0.96,0.5,1.0,0.98,0.9,0.96,0.9,0.98,0.98,0.99,0.97,0.98,1.0,0.9,0.78,0.98,0.95,0.94,0.92
BinocularsMetric_threshold,0.93,0.7,0.97,0.96,0.96,0.89,1.0,0.99,0.95,0.93,0.75,0.91,0.93,0.98,0.97,0.98,0.98,0.74,0.91,0.94,0.65,0.85,0.9
mdeberta-v3-base-first-social-media,0.99,0.94,0.87,0.96,0.94,0.97,0.86,0.84,0.94,0.92,0.86,0.92,0.91,0.83,0.93,0.87,0.86,0.87,0.98,0.95,0.89,0.84,0.89
Mistral-7B-v0.1-first-social-media,0.94,0.95,0.89,0.95,0.9,0.87,0.9,0.87,0.84,0.96,0.83,0.91,0.9,0.87,0.91,0.91,0.89,0.89,0.96,0.95,0.96,0.91,0.86
aya-101-first-social-media,0.99,0.94,0.77,0.9,0.91,0.99,0.87,0.84,0.92,0.98,0.97,0.86,0.96,0.75,0.94,0.81,0.9,0.81,0.94,0.95,0.85,0.9,0.86
Meta-Llama-3-8B-first-social-media,0.99,0.83,0.69,0.96,0.94,0.99,0.92,0.86,0.93,0.97,0.93,0.79,0.92,0.79,0.94,0.88,0.86,0.86,0.98,0.9,0.93,0.92,0.85
xlm-roberta-large-first-social-media,0.91,0.79,0.85,0.9,0.84,0.84,0.89,0.79,0.86,0.94,0.82,0.83,0.85,0.77,0.84,0.78,0.8,0.69,0.86,0.84,0.77,0.85,0.79
falcon-rw-1b-first-social-media,0.62,0.62,0.9,0.76,0.88,0.77,0.92,0.87,0.85,0.88,0.77,0.81,0.8,0.92,0.81,0.84,0.82,0.7,0.87,0.78,0.72,0.91,0.77
gpt-j-6b_lrrMetric_threshold,0.82,0.77,0.99,0.95,0.91,0.66,0.99,0.96,0.86,0.94,0.78,0.96,0.9,0.97,0.91,0.95,0.98,0.72,0.88,0.95,0.74,0.77,0.76
gpt-j-6b_llm_deviationMetric_threshold,0.76,0.66,0.98,0.92,0.84,0.63,0.98,0.94,0.81,0.93,0.77,0.95,0.87,0.97,0.86,0.93,0.96,0.66,0.85,0.93,0.7,0.77,0.73
