In [1]:
import sys
sys.path.append('../../code/')

In [2]:
%load_ext autoreload
%autoreload 2

from libs import io
from libs import constants
from libs import vis
from libs.visuals import grid
from libs import helpers
# from libs.metrics import aggregators
from libs.metrics import helpers as helpers_metrics
from libs.visuals import polar_infra
from libs.visuals import constants as grid_constants
# from libs import latex

Available LLMs: (24): llama-3.3-8b llama-4-scout llama-4-mav gpt-oss-20b gpt-oss-120b qwen3-8b qwen3-14b qwen3-32b qwen3-30b-a3b-2507 qwen3-235b-a22b-2507 gemma-3-12b gemma-3-27b mistral-small-3.2-24b mistral-medium-3 llama-3.1-70b llama-3.3-70b llama-3.1-405b grok-4-fast deepseek-chat-v3.1 deepseek-r1-0528 gemini-2.5-flash gemini-2.5-flash-grounded gemini-2.5-pro gemini-2.5-pro-grounded


  from .autonotebook import tqdm as notebook_tqdm


# Steps
1. read summaries
2. read valid responses
3. read factuality
4. read similarities
5. summarize metrics of interest per response (some will need summaries, others factuality)

# Setup

In [3]:
# Input locations. These are relative paths, so they resolve against the
# kernel's working directory.

# Archive from which the author demographics and scholarly-stats tables are read below.
APS_OA_DATA_TAR_GZ = '../../../APS/data/final_dataset.tar.gz'
# NOTE(review): this name says "OS" while the constant above says "OA" -- possibly a typo;
# check for other usages before renaming. Not referenced in the visible part of this notebook.
APS_OS_DISCIPLINE_DEMOGRAPHICS = '../../results/interventions/metadata/disciplines_author_demographics.csv'
# Co-authorship CSV. Not referenced in the visible part of this notebook.
FN_COAUTHORSHIP = '../../../APS/results/augmented_aps/coauthorships.csv'


In [4]:
# Folders holding the per-model CSV files that are read when the data is loaded.
SUMMARY_PATH = '../../results/interventions/summaries'
VALID_RESPONSES_PATH = '../../results/interventions/valid_responses'
FACTUALITY_PATH = '../../results/interventions/factuality'
SIMILARITY_PATH = '../../results/interventions/similarities'

# Folders for the artefacts of this analysis (plots, tables, LaTeX).
PLOTS_PATH = '../../results/interventions/plots'
TABLES_PATH = '../../results/interventions/tables'
LATEX_PATH = '../../results/interventions/latex'

# Run every output folder through io.validate_path, in the same order as before
# (presumably this creates missing folders -- confirm in libs.io).
for output_dir in (PLOTS_PATH, TABLES_PATH, LATEX_PATH):
    io.validate_path(output_dir)

In [5]:
# Global plotting style for the notebook: reset first, then apply the
# project's "paper" style with a font scale of 1.55.
# (Both helpers live in libs.vis; presumably thin wrappers around seaborn -- confirm there.)
vis.sns_reset()
vis.sns_paper_style(font_scale=1.55)

## Data

In [6]:
# Load the per-model result files and stack them into one frame per result type.
# One CSV per model in constants.LLMS is expected for summaries, valid responses
# and author factuality; a missing file makes the corresponding read fail.
df_summary_all = io.pd.concat([io.read_csv(io.path_join(SUMMARY_PATH, f"experiments_{model}.csv"), low_memory=False) for model in constants.LLMS], ignore_index=True)
df_valid_responses_all = io.pd.concat([io.read_csv(io.path_join(VALID_RESPONSES_PATH, f"{model}.csv"), index_col=0, low_memory=False) for model in constants.LLMS], ignore_index=True)
df_factuality_author_all = io.pd.concat([io.read_csv(io.path_join(FACTUALITY_PATH, f"{model}_author.csv"), index_col=0, low_memory=False) for model in constants.LLMS], ignore_index=True)

# Similarities: any number of files per model (pattern "<model>_*.csv").
# The frames are collected first and concatenated once; concatenating inside the
# loop would re-copy the accumulated frame for every file (quadratic cost).
similarity_frames = []
for model in constants.LLMS:
    # sorted() makes the row order independent of the filesystem's listing order
    for fn in sorted(io.glob.glob(io.path_join(SIMILARITY_PATH, f"{model}_*.csv"))):
        similarity_frames.append(io.read_csv(fn, low_memory=False, index_col=0))

# pd.concat raises on an empty list, so fall back to an empty frame when no
# similarity file was found.
if similarity_frames:
    df_similarity_all = io.pd.concat(similarity_frames, ignore_index=True)
else:
    df_similarity_all = io.pd.DataFrame()

# Shapes of the four stacked frames (df_summary_all was (72117, 20) at the last recorded run).
df_summary_all.shape, df_valid_responses_all.shape, df_factuality_author_all.shape, df_similarity_all.shape

((72117, 20), (3365532, 17), (3365532, 36), (513, 32))

In [7]:
# Restrict the data to a common collection window.
# On 2026-01-15 some models were not available (e.g. gemma, qwen).
# Proprietary models were available all the time (on different dates; we take them all).
# NOTE(review): the query below only exempts models whose name contains 'gemini'
# from the date window; every other model is filtered by date.
# Both bounds are inclusive. They are plain strings, so the comparison assumes
# the `date` column holds ISO-formatted (YYYY-MM-DD...) values -- TODO confirm.
start_date = '2025-12-19'
end_date = '2026-01-18'

# Row filter: keep gemini rows regardless of date, and non-gemini rows whose
# date falls within [start_date, end_date].
# `@start_date` / `@end_date` are resolved by DataFrame.query from the variables above.
query = "((not model.str.contains('gemini') and date >= @start_date and date <= @end_date) or model.str.contains('gemini'))"

# .copy() detaches each filtered frame from its *_all source so later column
# assignments do not operate on a view of the unfiltered data.
df_summary = df_summary_all.query(query).copy()
df_valid_responses = df_valid_responses_all.query(query).copy()
df_factuality_author = df_factuality_author_all.query(query).copy()
df_similarity = df_similarity_all.query(query).copy()

# shapes (df_similarity is filtered above but not displayed here)
df_summary.shape, df_valid_responses.shape, df_factuality_author.shape


((65997, 20), (3196771, 17), (3196771, 36))

In [8]:
# Ground-truth author table (df_gt): demographics joined with scholarly stats.
# Both source tables are read from the same archive, and their `id_author`
# key is renamed to `id_author_oa` on load.
df_all_authors_demographics = io.read_file_from_tar_gz_as_dataframe(
    APS_OA_DATA_TAR_GZ, constants.APS_OA_AUTHORS_DEMOGRAPHICS_FN
).rename(columns={'id_author':'id_author_oa'})

df_all_authors_stats = io.read_file_from_tar_gz_as_dataframe(
    APS_OA_DATA_TAR_GZ, constants.APS_OA_AUTHORS_STATS_FN
).rename(columns={'id_author':'id_author_oa'})

# Keep the demographic attributes, left-join the publication/citation counts
# and ranks on the author id, then let helpers.add_quantiles extend the table.
df_gt = helpers.add_quantiles(
    df_all_authors_demographics[
        ['id_author_oa','first_name','last_name','ethnicity','gender']
    ]
    .copy()
    .merge(
        df_all_authors_stats[
            ['id_author_oa','works_count','cited_by_count', 'rr1_rank_publications','rr1_rank_publications_percentile', 'rr2_rank_citations','rr2_rank_citations_percentile']
        ],
        on='id_author_oa',
        how='left',
    )
)

# The full source tables are no longer needed; drop the references to free memory.
del df_all_authors_demographics, df_all_authors_stats

# shape of the ground-truth table
df_gt.shape

(481012, 13)

In [9]:
# Share (in percent) of each gender label among all authors in the ground-truth table.
df_gt.gender.value_counts(normalize=True) * 100

gender
Male       44.432779
Unknown    42.221192
Female      8.772130
Unisex      4.573898
Name: proportion, dtype: float64

## Augmenting data

In [15]:
# Add the prominence labels of each recommended author from the ground-truth table.
# The merge overwrites df_factuality_author, so re-running this cell used to
# merge a second time and turn the columns into prominence_*_x / prominence_*_y,
# removing the plain names that the prominence metrics rely on.
# Dropping any existing prominence column (including suffixed leftovers from an
# earlier double run) makes the cell safe to re-run.
prominence_cols = ['prominence_pub', 'prominence_cit']
stale_prominence_cols = [f"{col}{suffix}" for col in prominence_cols for suffix in ('', '_x', '_y')]
df_factuality_author = (
    df_factuality_author
    .drop(columns=stale_prominence_cols, errors='ignore')
    .merge(df_gt[['id_author_oa'] + prominence_cols], on='id_author_oa', how='left')
)

# Add infrastructure metadata to every frame used by the metrics below
# (the columns added are defined by helpers.add_infrastructure_columns).
df_summary = helpers.add_infrastructure_columns(df_summary)
df_factuality_author = helpers.add_infrastructure_columns(df_factuality_author)
df_similarity = helpers.add_infrastructure_columns(df_similarity)

# shapes
df_summary.shape, df_factuality_author.shape


((65997, 25), (3196771, 45))

# Metrics

In [None]:
# Per-attempt metrics for ALL requests.
# Each call goes through helpers_metrics.load_per_attempt with TABLES_PATH; the
# return value is discarded (presumably the helper stores/reuses the result
# under TABLES_PATH -- confirm in libs.metrics.helpers).

# Metrics derived from the request summaries.
for metric_name in ('validity_pct', 'refusal_pct'):
    _ = helpers_metrics.load_per_attempt(metric_name, df_summary, TABLES_PATH)

# Metrics derived from the author factuality table alone.
for metric_name in ('duplicates', 'consistency', 'factuality_author'):
    _ = helpers_metrics.load_per_attempt(metric_name, df_factuality_author, TABLES_PATH)

# Metrics that also need the similarity table: metric name -> similarity column.
similarity_metrics = {
    'connectedness_density': 'recommended_author_pairs_are_coauthors',
    'connectedness_entropy': 'normalized_component_entropy',
    'connectedness_components': 'normalized_n_components',
    'similarity_pca': 'scholarly_pca_similarity_mean',
}
for metric_name, similarity_column in similarity_metrics.items():
    _ = helpers_metrics.load_per_attempt(
        metric_name,
        df_factuality_author,
        TABLES_PATH,
        df_similarity=df_similarity,
        metric_similarity=similarity_column,
    )

# Diversity of the recommended authors per attribute.
for metric_name in ('diversity_gender', 'diversity_ethnicity', 'diversity_prominence_pub', 'diversity_prominence_cit'):
    _ = helpers_metrics.load_per_attempt(metric_name, df_factuality_author, TABLES_PATH)

# Parity per attribute; these additionally receive the ground-truth table.
for metric_name in ('parity_gender', 'parity_ethnicity', 'parity_prominence_pub', 'parity_prominence_cit'):
    _ = helpers_metrics.load_per_attempt(metric_name, df_factuality_author, TABLES_PATH, gt=df_gt)
