In [1]:
import pandas as pd
import json
import os
import numpy as np
from copy import deepcopy

In [2]:
# Name of the model whose results are aggregated. It is used as a folder name
# in every results path below and in the file name of the exported CSV.
model = 'Meta-Llama-3-8B-Instruct'

In [3]:
# Root folders of the per-benchmark results (relative paths).
base_path_emotions = '../results/emotions'
# Prefix only: the loading cell appends '-mcq' / '-qa' to build the real folder.
base_path_ai_persona = '../results/ai-risk'
base_path_truth = '../results/truthfulqa'
base_path_trivia = '../results/triviaqa'
# Reported under the 'AdvBench' label in the aggregated table.
base_path_safety = '../results/safety'
# Reported under the 'JailbreakBench' label in the aggregated table.
base_path_jbb = '../results/jbb'
base_path_toxicity = '../results/toxicity'

In [4]:
# Load the per-method results of every emotion into
# emo_results[emotion] = {method: stats}.
emotions = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']
emo_results = {}
for emotion in emotions:
    # Start from an empty entry so an emotion without a results file still has
    # a key (and simply contributes no rows downstream).
    emo_results[emotion] = {}
    res_path = os.path.join(base_path_emotions,
                   f"{emotion}/{model}/all_methods_results.json")
    if not os.path.exists(res_path):
        # Never fall through with a stale `res` from a previous iteration:
        # that would attribute another emotion's numbers to this one.
        print(f"No results file for emotion '{emotion}' at {res_path}; skipping.")
        continue
    with open(res_path, 'r') as f:
        res = json.load(f)
    emo_results[emotion] = res['methods']

In [5]:
# Load the AI-persona results for every (behavior, question type) pair into
# risk_results['<behavior>-<type>'] = {method: stats}.
types = ['mcq', 'qa']
behaviors = ['power', 'wealth']
risk_results = {}
for t in types:
    for b in behaviors:
        b_name = b + '-' + t
        # Start from an empty entry so a pair without a results file still has
        # a key (and simply contributes no rows downstream).
        risk_results[b_name] = {}
        # The question type selects the results folder: '<prefix>-mcq' / '<prefix>-qa'.
        res_path = os.path.join(base_path_ai_persona+'-'+t,
                       f"{b}/{model}/all_methods_results.json")
        if not os.path.exists(res_path):
            # Never fall through with a stale `res` from a previous iteration:
            # that would attribute another pair's numbers to this one.
            print(f"No results file for '{b_name}' at {res_path}; skipping.")
            continue
        with open(res_path, 'r') as f:
            res = json.load(f)
        risk_results[b_name] = res['methods']

In [6]:
def get_results_single(base_path,
                       model='Meta-Llama-3-8B-Instruct', flip_t_f = False):
    """Load the per-method results stored for one benchmark and one model.

    Reads ``<base_path>/<model>/all_methods_results.json`` and returns the
    content of its ``'methods'`` entry.

    Args:
        base_path: Folder holding one sub-folder per model.
        model: Name of the model sub-folder to read.
        flip_t_f: When true, every method's ``'score'`` is replaced by
            ``1 - score`` and its ``'score_ci'`` bounds are complemented and
            swapped, so the interval stays ordered.

    Returns:
        The ``{method: stats}`` mapping from the file, or an empty dict when
        the file does not exist.
    """
    res_path = os.path.join(base_path, f"{model}/all_methods_results.json")
    if not os.path.exists(res_path):
        return {}

    with open(res_path, 'r') as f:
        results = json.load(f)['methods']

    if not flip_t_f:
        return results

    for stats in results.values():
        stats['score'] = 1 - stats['score']
        ci = stats['score_ci']
        # Complementing reverses the order, so the old upper bound yields the
        # new lower bound and vice versa.
        ci['lower'], ci['upper'] = 1 - ci['upper'], 1 - ci['lower']
    return results

In [7]:
# TruthfulQA: scores are used as stored (no flipping).
truthful_res = get_results_single(base_path_truth, model=model, flip_t_f=False)

In [8]:
# TriviaQA: scores are used as stored (no flipping).
trivia_res = get_results_single(base_path_trivia, model=model, flip_t_f=False)

In [9]:
# Safety results (labelled 'AdvBench' in the aggregated table): scores used as stored.
safety_res = get_results_single(base_path_safety, model=model, flip_t_f=False)

In [10]:
# JBB results (labelled 'JailbreakBench' in the aggregated table): scores used as stored.
jbb_res = get_results_single(base_path_jbb, model=model, flip_t_f=False)

In [11]:
# Toxicity is the only benchmark loaded with flip_t_f=True: the score becomes
# 1 - score and the CI bounds are flipped to match.
# NOTE(review): presumably so that a higher value is better here too - confirm.
toxicity_res = get_results_single(base_path_toxicity, model=model, flip_t_f=True)

In [12]:
def process_data(data, res_dict, method, subcase=None):
    """Flatten the stats of one method into a single table row.

    Args:
        data: Benchmark label stored in the ``'Data'`` column.
        res_dict: ``{method: stats}`` mapping; ``stats`` must provide the keys
            ``'true'``, ``'false'``, ``'unsure'``, ``'total'``, ``'score'``,
            ``'score_ci'``, ``'fluency'`` and ``'fluency_ci'`` (both ``*_ci``
            entries with ``'lower'`` and ``'upper'``).
        method: Key of the method to extract from ``res_dict``.
        subcase: Label stored in the ``'Subcase'`` column; falls back to
            ``data`` when omitted.

    Returns:
        A dict with one entry per output column. ``'Success rate'`` and
        ``'Average score'`` both carry ``stats['score']``. The ``'Postives'``
        key is spelled exactly as in the original code because it is part of
        the row's column names.
    """
    stats = res_dict[method]
    return {
        'Data': data,
        'Subcase': data if subcase is None else subcase,
        'Method': method,
        'Postives': stats['true'],
        'Negatives': stats['false'],
        'Unsure': stats['unsure'],
        'Total': stats['total'],
        'Success rate': stats['score'],
        'Score CI - Lower': stats['score_ci']['lower'],
        'Score CI - Upper': stats['score_ci']['upper'],
        'Average score': stats['score'],
        'Fluency': stats['fluency'],
        'Fluency CI - Lower': stats['fluency_ci']['lower'],
        'Fluency CI - Upper': stats['fluency_ci']['upper'],
    }

In [13]:
# Flatten every loaded result into one row per (benchmark, subcase, method).
rates_list_all = []

# Benchmarks split into sub-cases: {subcase: {method: stats}}.
grouped_results = [
    ('Emotion', emo_results),
    ('AI Persona', risk_results),
]
for data, per_subcase in grouped_results:
    for subcase, per_method in per_subcase.items():
        for method in per_method:
            rates_list_all.append(process_data(data, per_method, method, subcase))

# Benchmarks with a single result set: {method: stats}. The subcase is left
# out so that process_data falls back to the benchmark label.
# The order of this list fixes the row order of the table.
single_results = [
    ('TruthfulQA', truthful_res),
    ('AdvBench', safety_res),
    ('JailbreakBench', jbb_res),
    ('TriviaQA', trivia_res),
    ('Toxicity', toxicity_res),
]
for data, per_method in single_results:
    for method in per_method:
        rates_list_all.append(process_data(data, per_method, method))

rates_all_df = pd.DataFrame(rates_list_all)

In [14]:
# Display names for the method keys found in the results files.
method_map = {'baseline': 'Default',
              'linear': 'Linear',
              'mean': 'DiffMean',
              'pca': 'PCAct',
              'repe': 'PCDiff',
              'random': 'Random',
              'refusal': 'Projection',
              'prompt': 'Prompt-only',
              'prompt-attention': 'InstA-Boost'
             }
# Fall back to the original name when it has no entry in method_map.
# Series.map(dict) would turn such names into NaN, which silently drops
# unknown methods and blanks the whole column if this cell is run twice
# (the display names are not keys of the map).
rates_all_df['Method'] = rates_all_df['Method'].map(
    lambda name: method_map.get(name, name))
rates_all_df.to_csv(f'../results/steering_results_{model}.csv', index=False)