In [1]:
import sys
sys.path.append('../../code/')

In [2]:
from tqdm.auto import tqdm
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
%load_ext autoreload
%autoreload 2

from libs import io
from libs import constants
from libs.network import fragmentation
from libs import helpers

Available LLMs: (24): llama-3.3-8b llama-4-scout llama-4-mav gpt-oss-20b gpt-oss-120b qwen3-8b qwen3-14b qwen3-32b qwen3-30b-a3b-2507 qwen3-235b-a22b-2507 gemma-3-12b gemma-3-27b mistral-small-3.2-24b mistral-medium-3 llama-3.1-70b llama-3.3-70b llama-3.1-405b grok-4-fast deepseek-chat-v3.1 deepseek-r1-0528 gemini-2.5-flash gemini-2.5-flash-grounded gemini-2.5-pro gemini-2.5-pro-grounded


In [4]:
APS_OA_DATA_TAR_GZ = '../../../APS/data/final_dataset.tar.gz'
APS_OS_DISCIPLINE_DEMOGRAPHICS = '../../results/interventions/metadata/disciplines_author_demographics.csv'

In [None]:
# demographics
df_all_authors_demographics = io.read_file_from_tar_gz_as_dataframe(APS_OA_DATA_TAR_GZ, constants.APS_OA_AUTHORS_DEMOGRAPHICS_FN)
df_all_authors_demographics.rename(columns={'id_author':'id_author_oa'}, inplace=True)

# scholarly stats
df_all_authors_stats = io.read_file_from_tar_gz_as_dataframe(APS_OA_DATA_TAR_GZ, constants.APS_OA_AUTHORS_STATS_FN)
df_all_authors_stats.rename(columns={'id_author':'id_author_oa'}, inplace=True)

# per discipline
df_gt_stats = io.read_csv(APS_OS_DISCIPLINE_DEMOGRAPHICS, index_col=0, header=[0, 1])

# shapes
df_all_authors_demographics.shape, df_gt_stats.shape, df_gt_stats.shape

((481012, 14), (17, 19), (17, 19), (481012, 13))

In [14]:
df_all_authors_demographics.columns

Index(['id_author_oa', 'created_date', 'updated_date', 'display_name', 'orcid',
       'gender_nq', 'alternative_names', 'longest_name', 'last_name',
       'first_name', 'ethnicity_dx', 'ethnicity_ec', 'ethnicity', 'gender'],
      dtype='object')

In [15]:
df_all_authors_stats.columns

Index(['id_author_oa', 'created_date', 'updated_date', 'name', 'orcid',
       'two_year_mean_citedness', 'h_index', 'i10_index', 'works_count',
       'cited_by_count', 'ID', 'e_index', 'career_age', 'max_year', 'min_year',
       'citations_per_paper_age', 'rr1_rank_publications',
       'rr1_rank_publications_percentile', 'rr2_rank_citations',
       'rr2_rank_citations_percentile', 'rr3_rank_h_index',
       'rr3_rank_h_index_percentile', 'rr4_rank_i10_index',
       'rr4_rank_i10_index_percentile', 'rr5_rank_e_index',
       'rr5_rank_e_index_percentile', 'rr6_rank_citation_publication_age',
       'rr6_rank_citation_publication_age_percentile',
       'rr7_rank_mean_citedness_2yr',
       'rr7_rank_mean_citedness_2yr_percentile'],
      dtype='object')

In [None]:
scholar_metrics = ['works_count','cited_by_count', 'two_year_mean_citedness','h_index', 'i10_index','e_index', 'career_age', 'max_year', 'min_year', 'citations_per_paper_age']
df_gt

Unnamed: 0,id_author_oa,first_name,last_name,ethnicity,gender,works_count,cited_by_count,rr1_rank_publications,rr1_rank_publications_percentile,rr2_rank_citations,rr2_rank_citations_percentile,prominence_pub,prominence_cit
0,5053051063,Ilan,Ben‐Zvi,White,Male,783,6034,4927.0,98.975909,47898.0,90.042452,elite,high
1,5067224934,Thomas,Roser,White,Male,632,5487,7893.0,98.359292,52722.0,89.039567,elite,high
2,5012146130,Massimo,Ferrario,White,Male,585,6813,9302.0,98.066368,42178.0,91.231612,elite,high
3,5051894783,Jean,Vay,Black or African American,Male,564,5161,10018.0,97.917516,55960.0,88.366403,elite,high
4,5041648606,Alex,Friedman,Asian,Male,518,3967,11881.0,97.530207,71570.0,85.121161,elite,high
...,...,...,...,...,...,...,...,...,...,...,...,...,...
481007,5071154606,Anton,Karazeev,White,Male,2,1087,429658.0,10.676449,175054.0,63.607353,low,mid
481008,5100688005,Lawrence,Wu,Asian,Male,2,14,429658.0,10.676449,453260.0,5.769711,low,low
481009,5103896844,Diego,Rodriguez,Unknown,Male,2,21,429658.0,10.676449,442162.0,8.076929,low,low
481010,5021481124,H.,Telle,Asian,Unknown,1,50,450627.0,6.317098,407488.0,15.285481,low,low


In [6]:
FN_COAUTHORSHIP = '../../../APS/results/augmented_aps/coauthorships.csv'
FACTUALITY_PATH = '../../results/interventions/factuality'
RESULTS_PATH = '../../results/interventions/tables'

In [7]:
# Load the summary data for each model (all outputs)
df_factuality_author_all = io.pd.concat([io.read_csv(io.path_join(FACTUALITY_PATH, f"{model}_author.csv"), index_col=0, low_memory=False) for model in constants.LLMS], ignore_index=True)
df_factuality_author_all.shape


(3365532, 36)

In [8]:
# filter period (valid for open-weight models)
# 2026-01-15 some models were not available (eg. gemma, qwen)
start_date = '2025-12-19'
end_date = '2026-01-18'

# filter out interventions
query = "((not model.str.contains('gemini') and date >= @start_date and date <= @end_date) or model.str.contains('gemini'))"

df_factuality_author_all = df_factuality_author_all.query(query).copy()

# shapes
df_factuality_author_all.shape


(3196771, 36)

In [9]:
group_cols = ['model','grounded','temperature','date','time','task_name','task_param','task_attempt']
df_factuality_author_all.groupby(group_cols).ngroups

31765

In [None]:
group_cols = ['model','grounded','temperature','date','time','task_name','task_param','task_attempt']
results = []

g = df_factuality_author_all.groupby(group_cols)

for group, df in tqdm(g, total=g.ngroups, desc="Processing groups"):
 
        rec_ids = df.id_author_oa.dropna().unique()

        similarity = None

        obj = {c: group[i] for i, c in enumerate(group_cols)}
        obj |= {'metric': similarity}
        
        results.append(obj)

df_summary = io.pd.DataFrame(results)
io.save_csv(df_summary, io.path_join(RESULTS_PATH, 'per_attempt_scholar_similarity.csv'))

Processing groups: 100%|██████████| 31765/31765 [26:43<00:00, 19.80it/s]


[2026-01-28 01:24:37] Data successfully saved to ../../results/interventions/tables/per_attempt_connectedness.csv


In [15]:
len(results)

31765

In [16]:
df_summary.shape



(31765, 13)

In [17]:
df_summary

Unnamed: 0,model,grounded,temperature,date,time,task_name,task_param,task_attempt,nrecs,n_components,metric,n_edges_rows,n_edges_undirected_unique
0,deepseek-chat-v3.1,False,0.0,2025-12-19,08:00,biased_top_k,top_100_bias_citations_high,2,44,43,0.991674,2.0,1.0
1,deepseek-chat-v3.1,False,0.0,2025-12-19,08:00,biased_top_k,top_100_bias_diverse,1,92,84,0.970008,16.0,8.0
2,deepseek-chat-v3.1,False,0.0,2025-12-19,08:00,biased_top_k,top_100_bias_ethnicity_asian,2,34,34,1.000000,0.0,0.0
3,deepseek-chat-v3.1,False,0.0,2025-12-19,08:00,biased_top_k,top_100_bias_ethnicity_black,2,4,4,1.000000,0.0,0.0
4,deepseek-chat-v3.1,False,0.0,2025-12-19,08:00,biased_top_k,top_100_bias_ethnicity_equal,1,96,86,0.965974,20.0,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31760,qwen3-8b,False,0.5,2026-01-14,16:00,twins,famous_male,1,5,5,1.000000,0.0,0.0
31761,qwen3-8b,False,0.5,2026-01-14,16:00,twins,movie_female,1,9,7,0.833333,6.0,3.0
31762,qwen3-8b,False,0.5,2026-01-14,16:00,twins,movie_male,1,9,8,0.929897,2.0,1.0
31763,qwen3-8b,False,0.5,2026-01-14,16:00,twins,random_female,1,4,4,1.000000,0.0,0.0


In [18]:
model = 'qwen3-8b'
grounded = False	
temperature = 0.5
date = '2026-01-14'
time = '16:00'
task_name = 'twins'
task_param = 'random_male'
task_attempt = 1
tmp = df_factuality_author.query("model == @model and grounded == @grounded and temperature == @temperature and date == @date and time == @time and task_name == @task_name and task_param == @task_param and task_attempt == @task_attempt")
tmp[['id_author_oa','clean_name']]

Unnamed: 0,id_author_oa,clean_name
654268,5103535000.0,david r smith
654269,5022669000.0,nader engheta
654270,5009224000.0,stefan a maier
654271,5007779000.0,vladimir m shalaev
654272,,martin wegener
654273,5070415000.0,sergey i bozhevolnyi
654274,5052925000.0,costas m soukoulis


In [19]:
df_coauthorships.query("src in @tmp.id_author_oa and dst in @tmp.id_author_oa")

Unnamed: 0,src,dst
11323815,5070415188,5007779336
13681390,5009223920,5007779336
18909349,5007779336,5009223920
18909403,5007779336,5070415188
