In [2]:
import os
import json
import pandas as pd
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score

## Codex Binary

In [17]:
preds = pd.read_json('../../../data/moral/inferred/codex/twitter/prompt/baseline-1ex-n1-t0.0/test.jsonl', lines=True)
preds['pred'] = preds['instance_code'].apply(lambda x: x[0]).str[0].apply(lambda x: int(x))
preds['label'] = preds['input'].apply(lambda x: x['original_example']['label'])

# calculate the f1, auc, acc of the predictions
f1 = f1_score(preds['label'], preds['pred'])
auc = roc_auc_score(preds['label'], preds['pred'])
acc = accuracy_score(preds['label'], preds['pred'])
print("F1: ", f1)
print("AUC: ", auc)
print("ACC: ", acc)

F1:  0.7714401294498382
AUC:  0.679943106563911
ACC:  0.7026315789473684


## Codex Text Multiclass

In [39]:
preds = pd.read_json('../../../data/moral/inferred/codex/twitter/prompt/baseline-50exwxp-n1-t0.0/test.jsonl', lines=True)
# preds = pd.read_json('../../../data/moral/inferred/codex/twitter/prompt/baseline-3expmv-n1-t0.0/test.jsonl', lines=True)
moral_value_list = ['care_or_harm', 'fairness_or_cheating', 'loyalty_or_betrayal', 'authority_or_subversion', 'purity_or_degradation', 'non-moral']
preds['label'] = preds['input'].apply(lambda x: [1 if x['original_example'][moral_value] == 1 else 0 for moral_value in moral_value_list])
preds['pred'] = preds['instance_code'].apply(lambda x:[1 if moral_value in x[0] else 0 for moral_value in moral_value_list])
label = preds['label'].tolist()
pred = preds['pred'].tolist()

f1 = f1_score(label, pred, average='micro')
auc = roc_auc_score(label, pred, average='micro')
acc = accuracy_score(label, pred)

print("F1: ", f1)
print("AUC: ", auc)
print("ACC: ", acc)

f1_no_avg = f1_score(label, pred, average=None)
auc_no_avg = roc_auc_score(label, pred, average=None)
df = pd.DataFrame({'moral_value': moral_value_list, 'f1': f1_no_avg, 'auc': auc_no_avg})
df

F1:  0.5370493253704932
AUC:  0.7332285505529831
ACC:  0.42134831460674155


Unnamed: 0,moral_value,f1,auc
0,care_or_harm,0.583784,0.750674
1,fairness_or_cheating,0.55481,0.778727
2,loyalty_or_betrayal,0.392927,0.687867
3,authority_or_subversion,0.368263,0.695774
4,purity_or_degradation,0.25,0.67351
5,non-moral,0.685675,0.735175


## Codex Code Multiclass

### Twitter

In [5]:
TWITTER_MORAL_SETS_REVERSE = {
    'care': 'care_or_harm',
    'harm': 'care_or_harm',
    'fairness': 'fairness_or_cheating',
    'cheating': 'fairness_or_cheating',
    'loyalty': 'loyalty_or_betrayal',
    'betrayal': 'loyalty_or_betrayal',
    'authority': 'authority_or_subversion',
    'subversion': 'authority_or_subversion',
    'purity': 'purity_or_degradation',
    'degradation': 'purity_or_degradation',
    'non-moral': 'non-moral'
}

preds = pd.read_json('../../../data/moral/inferred/codex/twitter/code/baseline-2expmv-n1-t0.0/test.jsonl', lines=True)
moral_value_list = ['care_or_harm', 'fairness_or_cheating', 'loyalty_or_betrayal', 'authority_or_subversion', 'purity_or_degradation', 'non-moral']
for moral_value in moral_value_list[:-1]:
    preds[moral_value] = preds['instance_code'].apply(lambda x: int(moral_value in x[0]))
preds['non-moral'] = preds[moral_value_list[:-1]].apply(lambda x: 1 if x.sum() == 0 else 0, axis=1)
preds['annotation'] = preds['input'].apply(lambda x: [TWITTER_MORAL_SETS_REVERSE[moral_value] for moral_value in x['original_example']['gold_classes']])
preds['label'] = preds['annotation'].apply(lambda x: [1 if moral_value in x else 0 for moral_value in moral_value_list])
preds['pred'] = preds[moral_value_list].apply(lambda x: [1 if x[moral_value] == 1 else 0 for moral_value in moral_value_list], axis=1)

label = preds['label'].tolist()
pred = preds['pred'].tolist()

f1 = f1_score(label, pred, average='micro')
auc = roc_auc_score(label, pred, average='micro')
acc = accuracy_score(label, pred)

print("F1: ", f1)
print("AUC: ", auc)
print("ACC: ", acc)

KeyError: 'gold_classes'

### Reddit

In [2]:
preds = pd.read_json('../../../data/moral/inferred/codex/reddit/code/baseline-1ex-n1-t0.0/test.jsonl', lines=True)
moral_value_list = ['Purity', 'Loyalty', 'Authority', 'Equality', 'Care', 'Proportionality', 'Non-Moral', 'Thin Morality']
for moral_value in moral_value_list[:-2] + ['Thin Morality']:
    preds[moral_value] = preds['instance_code'].apply(lambda x: int(moral_value in x[0]))
preds['Non-Moral'] = preds[moral_value_list[:-2] + ['Thin Morality']].apply(lambda x: 1 if x.sum() == 0 or x['Thin Morality'] == 1 else 0, axis=1)
preds.drop(columns=['Thin Morality'], inplace=True)
preds['pred'] = preds[moral_value_list[:-1]].apply(lambda x: [1 if x[moral_value] == 1 else 0 for moral_value in moral_value_list[:-1]], axis=1)
preds['label'] = preds['input'].apply(lambda x: [1 if moral_value in x['original_example']['gold_classes'] else 0 for moral_value in moral_value_list[:-1]])

label = preds['label'].tolist()
pred = preds['pred'].tolist()

f1 = f1_score(label, pred, average='micro')
auc = roc_auc_score(label, pred, average='micro')
acc = accuracy_score(label, pred)

print("F1: ", f1)
print("AUC: ", auc)
print("ACC: ", acc)

F1:  0.1499323410013532
AUC:  0.4458123812565292
ACC:  0.1392156862745098
