# Data for Visualization Tool
Auditing Algorithms using Network data

Note: This notebook contains `version=0` (from `dev/_vis_workshop.ipynb` by `LE`) and `version=1` (from `dev/_vis_workshop_updates.ipynb` by `DB`).

## Libraries

In [1]:
import sys 
sys.path.append('../code')

In [2]:
import pandas as pd
import os
import glob

In [3]:
%load_ext autoreload
%autoreload 2

from libs import constants as cons
from libs import io 
from libs import text

In [4]:
# Root of the APS dataset; the co-authorship, demographics and stats inputs
# are located relative to this folder.
APS_PATH = '/data/datasets/LLMScholar-Audits/APS'
# Root of the audit results; per-task CSVs are read from ROOT/<kind>/*<task>.csv.
ROOT = '/data/datasets/LLMScholar-Audits/Auditor/backups/results_v2_arxiv'
# Every file generated by this notebook is written below this folder.
RESULTS = os.path.join(ROOT, 'vistool')
# Task names accepted when loading kind='factuality' data.
FACT_TASKS = ['author', 'epoch', 'field', 'seniority']

### SUBSET data (1 week):
# VALID_DATE_RANGE = pd.date_range(start='2024-12-09', end='2024-12-15', freq='D')

### ALL data (1 month): 2024-12-09 to 2025-01-08
# None means "no date filtering" when the audit files are loaded.
VALID_DATE_RANGE = None

## Functions

In [5]:

# Model name (values of the `model` column) -> short code used as the first
# component of `run_id`.
MODEL_MAPPING = {
    'gemma2-9b': 'g2',
    'mixtral-8x7b': 'mx',
    'llama3-70b': 'l3_70',
    'llama3-8b': 'l3_8',
    'llama-3.1-8b': 'l31_8',
    'llama-3.1-70b': 'l31_70'
}

# Task parameter abbreviations: value of the `task_param` column -> short code
# used as the task component of `run_id`.
# NOTE(review): the 'tw*' codes presumably belong to the "twins" task — confirm.
TASK_MAPPING = {
    '1950s': '1950s',
    'PER': 'per',
    'top_5': 'top5',
    '2000s': '2000s',
    'CM&MP': 'cmmp',
    'early_career': 'ec',
    'senior': 'sr',
    'top_100': 'top100',
    'famous_female': 'twff',
    'famous_male': 'twfm',
    'fictitious_female': 'twficf',
    'fictitious_male': 'twficm',
    'movie_female': 'twmvf',
    'movie_male': 'twmvm',
    'politic_female': 'twpolf',
    'politic_male': 'twpolm',
    'random_female': 'twranf',
    'random_male': 'twranm'
}

def _update_factuality_author(kind, task, df):
    """Update factuality author DataFrame with corrected fields."""
    if kind == 'factuality' and task == 'author':
        # Corrected factuality based on APS presence
        df['is_in_aps'] = df.id_author_oa.apply(lambda x: False if pd.isnull(x) else True)
    return df

def _update_factuality_epoch(kind, task, df):
    """Update factuality epoch DataFrame with corrected fields."""
    if kind == 'factuality' and task == 'epoch':
        df['requested_epoch'] = df['task_param'].apply(lambda x: int(x[:-1]) if x in ['1950s', '2000s'] else x)
        
        # @TODO: Review this logic as it seems to be the same as fact_epoch_overlap
        # df['fact_epoch_requested'] = (
        #     df['year_first_publication'].notna() & 
        #     df['year_last_publication'].notna() &
        #     ~((df['year_last_publication'] < df['requested_epoch']) | 
        #     (df['year_first_publication'] > df['requested_epoch'] + 10))
        # )

        df.drop(columns=['requested_epoch'], inplace=True)

    return df

def _add_runid(df):
    """Add a ``run_id`` column identifying the audit run of every row.

    Format: ``{model_short}_{task_short}_{date_short}_{hour}`` where

    * ``model_short`` is ``MODEL_MAPPING[model]``,
    * ``task_short`` is ``TASK_MAPPING[task_param]``,
    * ``date_short`` is ``date`` with the ``'2024-'`` prefix and all dashes
      removed; only that prefix is stripped, so dates from other years keep
      their year (``'2025-01-08'`` -> ``'20250108'``),
    * ``hour`` is the part of ``time`` before the first ``':'``.

    Rows whose ``model`` or ``task_param`` has no abbreviation end up with a
    missing ``run_id``.

    Args:
        df: DataFrame with ``model``, ``date``, ``task_param`` and ``time``
            columns; ``date`` and ``time`` must hold strings.

    Returns:
        The same ``df`` object, with ``run_id`` appended as the last column.
    """
    model_short = df['model'].map(MODEL_MAPPING)
    # regex=False: these are literal substrings; being explicit avoids the
    # pandas-version-dependent default of Series.str.replace.
    date_short = (
        df['date']
        .str.replace('2024-', '', regex=False)
        .str.replace('-', '', regex=False)
    )
    task_short = df['task_param'].map(TASK_MAPPING)
    time_short = df['time'].str.split(':').str[0]  # Extract hour (00, 08, 16)

    # The components are kept as local Series so no temporary columns are
    # written into (and then dropped from) the caller's frame.
    df['run_id'] = model_short + '_' + task_short + '_' + date_short + '_' + time_short
    return df

def _get_df_audit(kind='factuality', task='author', date_range=None):
    """Load every audit CSV of one kind/task into a single DataFrame.

    Files matching ``ROOT/<kind>/*<task>.csv`` are read, optionally restricted
    to ``date_range``, concatenated, stripped of the columns listed below, and
    given a ``run_id`` column.

    Args:
        kind: ``'factuality'`` or ``'similarities'``.
        task: One of ``FACT_TASKS`` when ``kind`` is ``'factuality'``, otherwise
            one of ``cons.EXPERIMENT_TASKS``.
        date_range: Optional ordered sequence of dates. Rows whose ``date``
            falls outside ``[date_range[0], date_range[-1]]`` are discarded.
            ``None`` keeps every row.

    Returns:
        The concatenated DataFrame; the per-file index values are preserved,
        so the index is generally not unique.

    Raises:
        ValueError: If ``kind`` or ``task`` is not recognised, or if no file
            contributes any row for the requested selection.
    """
    if kind not in ['factuality', 'similarities']:
        raise ValueError("kind must be either 'factuality' or 'similarities'")

    if kind == 'factuality':
        if task not in FACT_TASKS:
            raise ValueError(f"task must be either {', '.join(FACT_TASKS)}")
    elif kind == 'similarities':
        if task not in cons.EXPERIMENT_TASKS:
            raise ValueError(f"task must be either {cons.EXPERIMENT_TASKS}")

    # Collect the per-file frames and concatenate once at the end; growing a
    # DataFrame inside the loop copies all previous rows on every iteration.
    frames = []
    # sorted(): glob order depends on the filesystem; sorting keeps the row
    # order of the generated files reproducible.
    for fn in sorted(glob.glob(os.path.join(ROOT, kind, f'*{task}.csv'))):
        df = pd.read_csv(fn, index_col=0)
        df['date_dt'] = pd.to_datetime(df['date'])

        if date_range is not None:
            df = df[(df['date_dt'] >= date_range[0]) & (df['date_dt'] <= date_range[-1])]

        if 'llm_model' in df.columns:
            # Net effect: the values of 'llm_model' end up in 'model_fullname'
            # and 'model' keeps its values, but the two columns trade
            # positions. The swap is kept as-is to preserve column order.
            df.loc[:,'_tmp_'] = df.loc[:,'llm_model']
            df.loc[:, 'llm_model'] = df.loc[:, 'model']
            df.loc[:, 'model'] = df.loc[:, '_tmp_']
            df.rename(columns={'model':'model_fullname', 'llm_model':'model'}, inplace=True)

        if df.empty:
            print(f"Skipping empty DataFrame for {fn}")
            continue

        df.drop(columns=['date_dt', '_tmp_'], inplace=True, errors='ignore')
        frames.append(df)

    if not frames:
        # Fail with a clear message instead of an opaque KeyError/AttributeError
        # raised later by the helpers on a column-less frame.
        raise ValueError(f"No data found for kind='{kind}', task='{task}' in {os.path.join(ROOT, kind)}")

    df_rs = pd.concat(frames)

    # Remove unnecessary fields (missing ones are ignored)
    cols_to_remove = ['level_5']

    if kind == 'factuality':
        cols_to_remove.extend(['valid_attempt', 'model_fullname'])

        if task == 'author':
            cols_to_remove.extend(['fact_author_score', 'id_author_aps_list', 'ethnicity_dx', 'ethnicity_ec', 'ethnicity', 'gender', 'works_count', 'cited_by_count', 'h_index', 'i10_index', 'e_index', 'two_year_mean_citedness', 'year_first_publication', 'year_last_publication', 'academic_age', 'age_now', 'seniority_active', 'seniority_now'])
        elif task == 'field':
            # Extend rather than overwrite, so the columns removed for every
            # other factuality task are removed for 'field' as well.
            cols_to_remove.append('fact_doi_score')

    df_rs.drop(columns=cols_to_remove, inplace=True, errors='ignore')

    # Updates
    df_rs = _update_factuality_author(kind, task, df_rs)
    df_rs = _update_factuality_epoch(kind, task, df_rs)

    # create run ids
    df_rs = _add_runid(df_rs)

    return df_rs

## Generating Files

### Factuality

In [6]:
# FACTUALITY
def get_factuality_df(task='author', date_range=None):
    """Load all factuality data for one task into a single DataFrame.

    Thin wrapper around ``_get_df_audit`` with ``kind='factuality'``; ``task``
    and ``date_range`` are forwarded unchanged.
    """
    return _get_df_audit(kind='factuality', task=task, date_range=date_range)

In [7]:
# Factuality
# Export one CSV per factuality task to RESULTS/audit/factuality_<task>.csv.
for task in FACT_TASKS:
    df_factuality = get_factuality_df(task=task, date_range=VALID_DATE_RANGE)
    print(f"Factuality DataFrame for {task} loaded with {len(df_factuality)} rows.")
    fn = io.path_join(RESULTS, 'audit', f'factuality_{task}.csv')
    # NOTE(review): presumably makes sure the target folder exists — confirm in libs.io
    io.validate_path(fn)
    df_factuality.to_csv(fn)

Factuality DataFrame for author loaded with 138439 rows.
Factuality DataFrame for epoch loaded with 11760 rows.
Factuality DataFrame for field loaded with 15861 rows.
Skipping empty DataFrame for /data/datasets/LLMScholar-Audits/Auditor/backups/results_v2_arxiv/factuality/gemma2-9b_seniority.csv
Factuality DataFrame for seniority loaded with 10860 rows.


### Similarity

In [8]:
# SIMILARITY SCORES
def get_similarity_df(task='top_k', date_range=None):
    """Load all similarity scores for one task into a single DataFrame.

    Thin wrapper around ``_get_df_audit`` with ``kind='similarities'``.

    Args:
        task: Similarity task name; must be one of ``cons.EXPERIMENT_TASKS``.
            The default is ``'top_k'`` (underscore), the spelling of the task
            names this notebook iterates over; the former default ``'top-k'``
            was not among them and was rejected by ``_get_df_audit``.
        date_range: Optional ordered sequence of dates forwarded unchanged;
            ``None`` keeps every row.

    Returns:
        The DataFrame produced by ``_get_df_audit``.
    """
    return _get_df_audit(kind='similarities', task=task, date_range=date_range)

In [9]:
# Similarities
# Export one CSV per similarity task to RESULTS/audit/similarity_<task>.csv.
for task in cons.EXPERIMENT_TASKS:
    df_similarity = get_similarity_df(task=task, date_range=VALID_DATE_RANGE)
    print(f"Similarity DataFrame for {task} loaded with {len(df_similarity)} rows.")
    fn = io.path_join(RESULTS, 'audit', f'similarity_{task}.csv')
    # NOTE(review): presumably makes sure the target folder exists — confirm in libs.io
    io.validate_path(fn)
    df_similarity.to_csv(fn)

Similarity DataFrame for top_k loaded with 822 rows.
Similarity DataFrame for field loaded with 831 rows.
Similarity DataFrame for epoch loaded with 757 rows.
Similarity DataFrame for seniority loaded with 752 rows.
Similarity DataFrame for twins loaded with 4086 rows.


### Coauthorship

In [10]:
def get_coauthors_and_authors(authors_in_llm_lst, fn_coauthors):
    """Collect the co-authorship edges and the author ids around a list of authors.

    Args:
        authors_in_llm_lst: Iterable of OpenAlex author ids (anything ``int()``
            accepts).
        fn_coauthors: Path of the co-authorship file read with
            ``io.read_list_of_dicts``; each record provides ``openalex_id``,
            ``id_author`` and ``aps_co_authors``.

    Returns:
        ``(edge_list, authors)``: a set of ``"<author>\\t<coauthor>"`` strings
        (OpenAlex ids) and the set of every author id seen, i.e. the given
        authors plus their co-authors.
    """
    # Numeric OpenAlex id (the 'A' characters removed) -> co-authorship record.
    records_by_oa = {}
    for record in io.read_list_of_dicts(fn_coauthors):
        records_by_oa[int(record['openalex_id'].replace('A',''))] = record

    # Co-authors are listed by 'id_author'; this maps them back to OpenAlex ids.
    oa_by_aps = {record['id_author']: oa_id for oa_id, record in records_by_oa.items()}

    authors = set()
    edge_list = set()

    for raw_id in authors_in_llm_lst:
        source_id = int(raw_id)
        authors.add(source_id)

        if source_id not in records_by_oa:
            continue

        neighbours = []
        for aps_id in records_by_oa[source_id]['aps_co_authors']:
            # Skip placeholder / missing co-author ids.
            if aps_id in cons.NONE or pd.isna(aps_id):
                continue
            neighbours.append(oa_by_aps[aps_id])

        authors.update(neighbours)
        edge_list.update(f'{source_id}\t{neighbour}' for neighbour in neighbours)

    return edge_list, authors

In [11]:
fn_coauthors = io.path_join(APS_PATH, 'results/organised_data/aps_coauthor_networks.json')
# Re-read the factuality/author export written earlier in this notebook and
# keep only the non-null author ids.
authors_in_llm = pd.read_csv(io.path_join(RESULTS, 'audit', f'factuality_author.csv'))['id_author_oa'].dropna().values

# `authors` (those ids plus their co-authors) is reused further down to subset
# the demographics and stats tables.
edge_list, authors = get_coauthors_and_authors(authors_in_llm, fn_coauthors)

fn = io.path_join(RESULTS, 'ground_truth', f'coauthorships_edgelist.txt')
io.validate_path(fn)
io.save_list_to_file(edge_list, fn)

### Metadata

In [12]:
def get_PCA(df_stats, n_components=2):
    """Project the author statistics onto their first principal components.

    The input is copied, cleaned (see inline comments), standardised with
    ``StandardScaler`` and reduced with scikit-learn's ``PCA``.

    Args:
        df_stats: Author statistics; must contain the ``orcid`` and
            ``aps_years_of_activity`` columns (both are dropped) and otherwise
            only columns that can be averaged and scaled.
        n_components: Number of principal components to keep.

    Returns:
        ``(results, loadings)`` where ``results`` is a dict with

        * ``'reduction'``: the cleaned features plus one ``dim<i>`` column per
          component,
        * ``'variance'``: ``pca.explained_variance_``,
        * ``'variance_ratio'``: ``pca.explained_variance_ratio_``,

        and ``loadings`` is a DataFrame of ``pca.components_`` transposed,
        indexed by feature name with one ``PC<i>`` column per component.
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA

    data = df_stats.copy()
    # Missing values and cons.INF entries of these two columns are set to 0.
    data = data.fillna({'e_index':0, 'citations_per_paper_age':0}).replace({'e_index':cons.INF, 'citations_per_paper_age':cons.INF}, 0)
    # Drop the rr*_rank_* columns together with 'orcid' and 'aps_years_of_activity'.
    data = data.drop(columns=[c for c in data.columns if c.startswith('rr') and '_rank_' in c] + ['orcid', 'aps_years_of_activity'])
    print(f"Original number of features: {df_stats.shape[1]}, now: {data.shape[1]}")

    # Handle missing values (if any): impute with the column mean
    data = data.fillna(data.mean())
    combined_data = data.copy()

    # Standardize the data
    scaler = StandardScaler()
    data_normalized = scaler.fit_transform(data)

    # Apply dimensionality reduction (e.g., PCA)
    pca = PCA(n_components=n_components)
    reduced_data = pca.fit_transform(data_normalized)

    # Use the number of components actually produced, so the labels below
    # always match the shape of the PCA output.
    n_kept = reduced_data.shape[1]

    # Add reduced dimensions to the combined DataFrame
    for i in range(n_kept):
        combined_data[f'dim{i+1}'] = reduced_data[:, i]

    # appending the results
    results = {'reduction':combined_data,
               'variance':pca.explained_variance_,
               'variance_ratio':pca.explained_variance_ratio_}

    # Loadings of every kept component; the column labels are generated from
    # the component count instead of being fixed to two names, so any
    # n_components works.
    loadings = pd.DataFrame(pca.components_.T,     # Transpose to align features with PCs
                            index=data.columns,    # Original feature names
                            columns=[f"PC{i+1}" for i in range(n_kept)]  # Principal components
                            )

    return results, loadings

def get_demographics(fn_demographics, fn_nobel_prize_winners, threshold=5):
    """Load author demographics and attach a Nobel-prize likelihood score.

    Args:
        fn_demographics: CSV of author demographics; its ``id_author`` column
            becomes the ``id_author_oa`` index.
        fn_nobel_prize_winners: ``;``-separated CSV of laureates with
            ``Firstname`` and ``Surname`` columns.
        threshold: Forwarded to ``text.find_matching_texts``.

    Returns:
        Demographics indexed by ``id_author_oa`` (one row per author) with an
        extra ``nobel_prize_likelihood`` column: ``total_matches`` divided by
        the number of evaluated column pairs; missing for authors without a
        match.
    """
    demographics = pd.read_csv(fn_demographics).rename(columns={'id_author':'id_author_oa'})

    # Laureates: build the full name used for matching and expose the surname
    # under the column name shared with the demographics table.
    laureates = pd.read_csv(fn_nobel_prize_winners, sep=';')
    laureates['Fullname'] = laureates.apply(lambda row: f"{row.Firstname} {row.Surname}", axis=1)
    laureates = laureates.rename(columns={'Surname':'last_name'})

    # keeping only necessary columns
    unused_cols = ['created_date', 'updated_date', 'gender_nq', 'ethnicity_dx', 'ethnicity_ec']
    demographics = demographics.drop(columns=unused_cols).set_index('id_author_oa')

    # adding nobel prize tag
    # NOTE(review): each tuple looks like (laureate column, demographics column,
    # similarity method, cut-off, output name) — confirm in libs.text.
    match_rules = [
        ("Fullname", "display_name","jarowinkler",0.85,"display_name"),
        ("Fullname", "longest_name","jarowinkler",0.85,"d_longest_name"),
        ("Fullname", "alternative_names","jarowinkler",0.7,"d_alternative_names"),
        ("Firstname", "first_name","jarowinkler",0.7,"first_name"),
        ("last_name", "last_name","jarowinkler",0.7,"last_name"),
        ("Fullname", "last_name","jarowinkler",0.7,"d_last_name"),
        ("Fullname", "first_name","jarowinkler",0.7,"d_first_name"),
    ]
    matches = text.find_matching_texts(
        laureates,
        demographics,
        column_block='last_name',
        column_pairs_to_evaluate=match_rules,
        threshold=threshold,
    )
    matches = matches.set_index('id_author_oa')

    # adding new information (mapping)
    demographics = demographics.join(matches[['total_matches']], how='left')
    demographics = demographics.rename(columns={"total_matches": "nobel_prize_likelihood"})
    demographics.loc[:, 'nobel_prize_likelihood'] = demographics.nobel_prize_likelihood / len(match_rules)

    # The join can repeat an author; keep the first row per id.
    deduplicated = demographics.reset_index().drop_duplicates(subset='id_author_oa')
    return deduplicated.set_index('id_author_oa')

def get_stats(fn_aps_stats, fn_oa_stats):
    """Merge the APS and OpenAlex author statistics into one table.

    Both CSVs are indexed by their ``id_author`` column (renamed to
    ``id_author_oa``). The columns ``created_date``, ``updated_date``, ``ID``
    and ``name`` are removed from the OpenAlex table before the APS columns
    are joined onto it.

    Column order of the result: ``orcid``, then the APS columns in file order,
    then the remaining OpenAlex columns (non-``rr*`` ones sorted by name,
    followed by the ``rr*`` ones in file order).

    Returns:
        The combined DataFrame indexed by ``id_author_oa``.
    """
    id_rename = {'id_author':'id_author_oa'}

    aps = pd.read_csv(fn_aps_stats).rename(columns=id_rename).set_index('id_author_oa')

    oa = pd.read_csv(fn_oa_stats).rename(columns=id_rename)
    oa = oa.drop(columns=['created_date', 'updated_date', 'ID', 'name']).set_index('id_author_oa')

    print(f"APS stats data: {aps.shape}")
    print(f"OA stats data: {oa.shape}")

    # Split the OpenAlex columns into rr* columns and the rest.
    rank_cols, plain_cols = [], []
    for col in oa.columns:
        (rank_cols if col.startswith('rr') else plain_cols).append(col)
    oa_order = sorted(plain_cols) + rank_cols

    final_order = ['orcid'] + aps.columns.tolist() + [col for col in oa_order if col != 'orcid']
    return oa.join(aps)[final_order]

### Demographics

In [13]:
# Demographics (all authors)
fn_demographics = io.path_join(APS_PATH, 'results/augmented_aps/authors_demographics.csv')
# The laureates list is read from the vistool folder (RESULTS/extra).
fn_nobel_prize_winners = io.path_join(RESULTS, 'extra', 'nobel-prize-laureates.csv')
df_demographics = get_demographics(fn_demographics, fn_nobel_prize_winners)
df_demographics.shape

(481012, 9)

In [14]:
# Demographics (only subset)
# Keep only the ids in `authors` (returned by get_coauthors_and_authors).
df_demographics_sample = df_demographics.query("id_author_oa in @authors").copy()
fn = io.path_join(RESULTS, 'ground_truth', f'authors_demographics.csv')
io.validate_path(fn)
df_demographics_sample.to_csv(fn)

### Stats

In [15]:
# Stats (all authors)
# Two inputs: statistics prefixed with `aps_` and the OpenAlex statistics.
fn_aps_stats = io.path_join(APS_PATH, 'results/augmented_aps/authors_aps_stats.csv')
fn_oa_stats = io.path_join(APS_PATH, 'results/augmented_aps/authors_stats.csv')

df_stats = get_stats(fn_aps_stats, fn_oa_stats)
df_stats.shape

APS stats data: (481012, 8)
OA stats data: (481012, 25)


(481012, 33)

In [16]:
# PCA (all authors)
# NOTE: despite its name, `df_PCA` is the results dict returned by get_PCA
# (keys 'reduction', 'variance', 'variance_ratio'); `df_PCA_summary` holds the
# component loadings.
df_PCA, df_PCA_summary = get_PCA(df_stats, n_components=2)

# saving PCA reduced dimensions per author (all authors)
# Same path helper + validation as every other export in this notebook, so the
# cell does not depend on an earlier cell having prepared the folder.
fn = io.path_join(RESULTS, 'ground_truth', f'authors_PCA.csv')
io.validate_path(fn)
df_PCA['reduction'][['dim1','dim2']].to_csv(fn)

# saving summary: loadings followed by one row each for the explained variance
# and its ratio; the column labels are taken from the loadings so both parts
# always line up.
df_PCA_summary_final = pd.concat([df_PCA_summary, pd.DataFrame(index=['variance','variance_ratio'], columns=df_PCA_summary.columns, data=[df_PCA['variance'], df_PCA['variance_ratio']])])
fn = io.path_join(RESULTS, 'ground_truth', f'summary_PCA.csv')
io.validate_path(fn)
df_PCA_summary_final.to_csv(fn)

Original number of features: 33, now: 17


In [17]:
# Stats (only subset)
# Keep only the ids in `authors` (returned by get_coauthors_and_authors).
df_stats_sample = df_stats.query("id_author_oa in @authors").copy()
fn = io.path_join(RESULTS, 'ground_truth', f'authors_stats.csv')
io.validate_path(fn)
df_stats_sample.to_csv(fn)

df_stats_sample.shape

(61475, 33)