In [4]:
import os

# Conda environment used for these experiments.
# NOTE(review): absolute, machine-specific paths — adjust when running elsewhere.
env_site_packages = '/home/user/conda/envs/lm-exps/lib/python3.10/site-packages'
env_bin = '/home/user/conda/envs/lm-exps/bin'

# Setting PYTHONPATH here only reaches child processes started from this
# kernel; the running interpreter's sys.path is not re-read from it.
os.environ['PYTHONPATH'] = env_site_packages

# Put the environment's bin directory first on PATH instead of replacing PATH
# outright, so system executables stay reachable from subprocesses. Dropping
# an existing occurrence first keeps the cell idempotent when re-run.
path_entries = [p for p in os.environ.get("PATH", "").split(os.pathsep) if p and p != env_bin]
os.environ["PATH"] = os.pathsep.join([env_bin, *path_entries])

import datasets
import json
import numpy as np
import pandas as pd
from collections import defaultdict

from sklearn.metrics import roc_auc_score
from lm_polygraph.utils.manager import UEManager, _delete_nans, _recombine_data
from lm_polygraph.ue_metrics.risk_cov_curve import RiskCoverageCurveAUC
from lm_polygraph.ue_metrics.pred_rej_area import PredictionRejectionArea
from transformers import AutoTokenizer

# Shared PredictionRejectionArea metric instance.
# NOTE(review): not referenced by any cell visible in this part of the notebook — confirm it is still needed.
prr = PredictionRejectionArea()

# CoQA

## Vicuna 7b

In [46]:
# CoQA, vicuna-7b-v1.5: table of normalized PRR per estimator.
path = "lm-polygraph/workdir/output/qa/lmsys/vicuna-7b-v1.5/coqa/2024-04-09/22-25-24/ue_manager_seed1"
man = UEManager.load(path)

dataset_label = 'CoQA'
metric_names = ['Accuracy', 'AlignScore']

# Store each score under its generation metric (key[-2]) instead of appending
# to a list, so row labels never depend on the iteration order of man.metrics
# and an estimator missing a metric yields NaN rather than a length mismatch.
# If an (estimator, metric) pair occurs more than once, the last value wins.
prr_scores = {}
for key, value in man.metrics.items():
    if key[-1] == 'prr_normalized' and key[-2] in metric_names:
        prr_scores.setdefault(key[1], {})[key[-2]] = value

# One column per estimator (key[1]), one row per entry of metric_names.
df0 = pd.DataFrame(
    {
        estimator: [by_metric.get(m, np.nan) for m in metric_names]
        for estimator, by_metric in prr_scores.items()
    },
    index=[f'{dataset_label}, {m}, PRR' for m in metric_names],
)
# Estimators as rows, sorted descending by the last metric in metric_names.
df0.T.sort_values(f"{dataset_label}, {metric_names[-1]}, PRR", ascending=False).style.background_gradient(axis=0)

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7fc48b67cf40>]


Unnamed: 0,"CoQA, Accuracy, PRR","CoQA, AlignScore, PRR"
MaximumSequenceProbability,0.485674,0.426848
"TokenMahalanobisDistance_decoder (mean, accuracy, 1.0)",0.727515,0.398484
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.9)",0.716602,0.395771
"RelativeMahalanobisDistance_decoder (sum, accuracy, 1.0)",0.769119,0.394056
"RelativeMahalanobisDistance_decoder (mean, accuracy, 1.0)",0.738503,0.393038
"TokenMahalanobisDistance_decoder (sum, accuracy, 1.0)",0.755343,0.392636
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.9)",0.66599,0.391263
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.9)",0.744779,0.390905
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.743296,0.388285
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.0)",0.740978,0.383136


In [47]:
# Mean of the scores stored under ('sequence', 'Accuracy') for the run currently held in `man`.
np.mean(man.gen_metrics['sequence', 'Accuracy'])

0.311

## Vicuna 13b

In [49]:
# CoQA, vicuna-13b-v1.5: table of normalized PRR per estimator.
path = "lm-polygraph/workdir/output/qa/lmsys/vicuna-13b-v1.5/coqa/2024-04-09/22-25-46/ue_manager_seed1"
man = UEManager.load(path)

dataset_label = 'CoQA'
metric_names = ['Accuracy', 'AlignScore']

# Store each score under its generation metric (key[-2]) instead of appending
# to a list, so row labels never depend on the iteration order of man.metrics
# and an estimator missing a metric yields NaN rather than a length mismatch.
# If an (estimator, metric) pair occurs more than once, the last value wins.
prr_scores = {}
for key, value in man.metrics.items():
    if key[-1] == 'prr_normalized' and key[-2] in metric_names:
        prr_scores.setdefault(key[1], {})[key[-2]] = value

# One column per estimator (key[1]), one row per entry of metric_names.
df0 = pd.DataFrame(
    {
        estimator: [by_metric.get(m, np.nan) for m in metric_names]
        for estimator, by_metric in prr_scores.items()
    },
    index=[f'{dataset_label}, {m}, PRR' for m in metric_names],
)
# Estimators as rows, sorted descending by the last metric in metric_names.
df0.T.sort_values(f"{dataset_label}, {metric_names[-1]}, PRR", ascending=False).style.background_gradient(axis=0)

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7fc48b67cf40>]


Unnamed: 0,"CoQA, Accuracy, PRR","CoQA, AlignScore, PRR"
MaximumSequenceProbability,0.571644,0.476971
"TokenMahalanobisDistance_decoder (mean, accuracy, 1.0)",0.669625,0.424662
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.9)",0.594895,0.415991
"RelativeMahalanobisDistance_decoder (mean, accuracy, 1.0)",0.682688,0.398452
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.582878,0.39733
"RelativeMahalanobisDistance_decoder (sum, accuracy, 1.0)",0.711412,0.355334
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.9)",0.614735,0.354281
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.0)",0.568651,0.353172
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.9)",0.598521,0.350778
"TokenMahalanobisDistance_decoder (sum, accuracy, 1.0)",0.698367,0.344116


In [50]:
# Mean of the scores stored under ('sequence', 'Accuracy') for the run currently held in `man`.
np.mean(man.gen_metrics['sequence', 'Accuracy'])

0.398

## Mistral 7b

In [121]:
# CoQA, Mistral-7B-v0.2-hf: table of normalized PRR per estimator.
path = "lm-polygraph/workdir/output/qa/alpindale/Mistral-7B-v0.2-hf/coqa/2024-04-10/05-00-05/ue_manager_seed1"
man = UEManager.load(path)

dataset_label = 'CoQA'
metric_names = ['Accuracy', 'AlignScore']

# Store each score under its generation metric (key[-2]) instead of appending
# to a list, so row labels never depend on the iteration order of man.metrics
# and an estimator missing a metric yields NaN rather than a length mismatch.
# If an (estimator, metric) pair occurs more than once, the last value wins.
prr_scores = {}
for key, value in man.metrics.items():
    if key[-1] == 'prr_normalized' and key[-2] in metric_names:
        prr_scores.setdefault(key[1], {})[key[-2]] = value

# One column per estimator (key[1]), one row per entry of metric_names.
df0 = pd.DataFrame(
    {
        estimator: [by_metric.get(m, np.nan) for m in metric_names]
        for estimator, by_metric in prr_scores.items()
    },
    index=[f'{dataset_label}, {m}, PRR' for m in metric_names],
)
# Estimators as rows, sorted descending by the last metric in metric_names.
df0.T.sort_values(f"{dataset_label}, {metric_names[-1]}, PRR", ascending=False).style.background_gradient(axis=0)

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7fc48b67cf40>]


Unnamed: 0,"CoQA, Accuracy, PRR","CoQA, AlignScore, PRR"
MaximumSequenceProbability,0.085285,0.225358
Perplexity,0.086268,0.221819
MeanTokenEntropy,0.061717,0.205033
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.9)",0.225555,0.001932
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.241225,-0.004349
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.9)",0.229649,-0.006377
"TokenMahalanobisDistance_decoder (sum, accuracy, 1.0)",0.2511,-0.009197
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.246404,-0.011738
"TokenMahalanobisDistance_decoder (mean, accuracy, 1.0)",0.256077,-0.016848
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.0)",0.199988,-0.056343


In [122]:
# Mean of the scores stored under ('sequence', 'Accuracy') for the run currently held in `man`.
np.mean(man.gen_metrics['sequence', 'Accuracy'])

0.5705

# SciQ

## Vicuna 7b

In [60]:
# SciQ, vicuna-7b-v1.5: table of normalized PRR per estimator.
path = "lm-polygraph/workdir/output/qa/lmsys/vicuna-7b-v1.5/sciq/2024-04-09/22-25-46/ue_manager_seed1"
man = UEManager.load(path)

dataset_label = 'SciQ'
metric_names = ['Accuracy', 'AlignScore']

# Store each score under its generation metric (key[-2]) instead of appending
# to a list, so row labels never depend on the iteration order of man.metrics
# and an estimator missing a metric yields NaN rather than a length mismatch.
# If an (estimator, metric) pair occurs more than once, the last value wins.
prr_scores = {}
for key, value in man.metrics.items():
    if key[-1] == 'prr_normalized' and key[-2] in metric_names:
        prr_scores.setdefault(key[1], {})[key[-2]] = value

# One column per estimator (key[1]), one row per entry of metric_names.
df0 = pd.DataFrame(
    {
        estimator: [by_metric.get(m, np.nan) for m in metric_names]
        for estimator, by_metric in prr_scores.items()
    },
    index=[f'{dataset_label}, {m}, PRR' for m in metric_names],
)
# Estimators as rows, sorted descending by the last metric in metric_names.
df0.T.sort_values(f"{dataset_label}, {metric_names[-1]}, PRR", ascending=False).style.background_gradient(axis=0)

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7fc48b67cf40>]


Unnamed: 0,"SciQ, Accuracy, PRR","SciQ, AlignScore, PRR"
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.9)",0.665993,0.584277
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.64452,0.56672
"TokenMahalanobisDistance_decoder (mean, accuracy, 1.0)",0.902777,0.564961
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.9)",0.697955,0.550695
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.0)",0.635907,0.54977
"RelativeMahalanobisDistance_decoder (mean, accuracy, 1.0)",0.905488,0.548268
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.9)",0.629997,0.534538
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.9)",0.89028,0.507877
"RelativeMahalanobisDistance_decoder (sum, accuracy, 1.0)",0.92768,0.506569
"TokenMahalanobisDistance_decoder (sum, accuracy, 1.0)",0.920234,0.501223


In [54]:
# Mean of the scores stored under ('sequence', 'Accuracy') for the run currently held in `man`.
np.mean(man.gen_metrics['sequence', 'Accuracy'])

0.211

## Vicuna 13b

In [61]:
# SciQ, vicuna-13b-v1.5: table of normalized PRR per estimator.
path = "lm-polygraph/workdir/output/qa/lmsys/vicuna-13b-v1.5/sciq/2024-04-09/22-25-26/ue_manager_seed1"
man = UEManager.load(path)

dataset_label = 'SciQ'
metric_names = ['Accuracy', 'AlignScore']

# Store each score under its generation metric (key[-2]) instead of appending
# to a list, so row labels never depend on the iteration order of man.metrics
# and an estimator missing a metric yields NaN rather than a length mismatch.
# If an (estimator, metric) pair occurs more than once, the last value wins.
prr_scores = {}
for key, value in man.metrics.items():
    if key[-1] == 'prr_normalized' and key[-2] in metric_names:
        prr_scores.setdefault(key[1], {})[key[-2]] = value

# One column per estimator (key[1]), one row per entry of metric_names.
df0 = pd.DataFrame(
    {
        estimator: [by_metric.get(m, np.nan) for m in metric_names]
        for estimator, by_metric in prr_scores.items()
    },
    index=[f'{dataset_label}, {m}, PRR' for m in metric_names],
)
# Estimators as rows, sorted descending by the last metric in metric_names.
df0.T.sort_values(f"{dataset_label}, {metric_names[-1]}, PRR", ascending=False).style.background_gradient(axis=0)

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7fc48b67cf40>]


Unnamed: 0,"SciQ, Accuracy, PRR","SciQ, AlignScore, PRR"
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.9)",0.673459,0.698069
"TokenMahalanobisDistance_decoder (mean, accuracy, 1.0)",0.763938,0.687435
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.649234,0.681515
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.0)",0.637699,0.670883
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.9)",0.701557,0.658282
"RelativeMahalanobisDistance_decoder (mean, accuracy, 1.0)",0.770314,0.65536
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.9)",0.717094,0.637501
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.678222,0.630164
"RelativeMahalanobisDistance_decoder (sum, accuracy, 1.0)",0.787178,0.62608
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.650041,0.603264


In [62]:
# Mean of the scores stored under ('sequence', 'Accuracy') for the run currently held in `man`.
np.mean(man.gen_metrics['sequence', 'Accuracy'])

0.518

## Mistral 7b

In [124]:
# SciQ, Mistral-7B-v0.2-hf: table of normalized PRR per estimator.
path = "lm-polygraph/workdir/output/qa/alpindale/Mistral-7B-v0.2-hf/sciq/2024-04-10/05-00-23/ue_manager_seed1"
man = UEManager.load(path)

dataset_label = 'SciQ'
metric_names = ['Accuracy', 'AlignScore']

# Store each score under its generation metric (key[-2]) instead of appending
# to a list, so row labels never depend on the iteration order of man.metrics
# and an estimator missing a metric yields NaN rather than a length mismatch.
# If an (estimator, metric) pair occurs more than once, the last value wins.
prr_scores = {}
for key, value in man.metrics.items():
    if key[-1] == 'prr_normalized' and key[-2] in metric_names:
        prr_scores.setdefault(key[1], {})[key[-2]] = value

# One column per estimator (key[1]), one row per entry of metric_names.
df0 = pd.DataFrame(
    {
        estimator: [by_metric.get(m, np.nan) for m in metric_names]
        for estimator, by_metric in prr_scores.items()
    },
    index=[f'{dataset_label}, {m}, PRR' for m in metric_names],
)
# Estimators as rows, sorted descending by the last metric in metric_names.
df0.T.sort_values(f"{dataset_label}, {metric_names[-1]}, PRR", ascending=False).style.background_gradient(axis=0)

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7fc48b67cf40>]


Unnamed: 0,"SciQ, Accuracy, PRR","SciQ, AlignScore, PRR"
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.9)",-0.265819,0.428868
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.9)",-0.331701,0.414744
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.5)",-0.259495,0.406263
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.9)",-0.326745,0.393589
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.5)",-0.325507,0.39192
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.9)",-0.331136,0.390634
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.0)",-0.257455,0.383041
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.5)",-0.321503,0.374006
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.5)",-0.325209,0.371871
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.0)",-0.323484,0.368324


In [64]:
# Mean of the scores stored under ('sequence', 'Accuracy') for the run currently held in `man`.
np.mean(man.gen_metrics['sequence', 'Accuracy'])

0.146

# TriviaQA

## Vicuna 7b

In [65]:
# TriviaQA, vicuna-7b-v1.5: table of normalized PRR per estimator.
path = "lm-polygraph/workdir/output/qa/lmsys/vicuna-7b-v1.5/['trivia_qa', 'rc.nocontext']/2024-04-09/22-25-24/ue_manager_seed1"
man = UEManager.load(path)

dataset_label = 'TriviaQA'
metric_names = ['Accuracy', 'AlignScore']

# Store each score under its generation metric (key[-2]) instead of appending
# to a list, so row labels never depend on the iteration order of man.metrics
# and an estimator missing a metric yields NaN rather than a length mismatch.
# If an (estimator, metric) pair occurs more than once, the last value wins.
prr_scores = {}
for key, value in man.metrics.items():
    if key[-1] == 'prr_normalized' and key[-2] in metric_names:
        prr_scores.setdefault(key[1], {})[key[-2]] = value

# One column per estimator (key[1]), one row per entry of metric_names.
df0 = pd.DataFrame(
    {
        estimator: [by_metric.get(m, np.nan) for m in metric_names]
        for estimator, by_metric in prr_scores.items()
    },
    index=[f'{dataset_label}, {m}, PRR' for m in metric_names],
)
# Estimators as rows, sorted descending by the last metric in metric_names.
df0.T.sort_values(f"{dataset_label}, {metric_names[-1]}, PRR", ascending=False).style.background_gradient(axis=0)

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7fc48b67cf40>]


Unnamed: 0,"TriviaQA, Accuracy, PRR","TriviaQA, AlignScore, PRR"
MaximumSequenceProbability,0.547689,0.572455
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.9)",0.581011,0.420545
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.577732,0.416744
"TokenMahalanobisDistance_decoder (sum, accuracy, 1.0)",0.595045,0.413152
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.0)",0.563798,0.398236
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.9)",0.438955,0.33429
MeanTokenEntropy,0.12714,0.329241
"TokenMahalanobisDistance_decoder (mean, accuracy, 1.0)",0.516962,0.328341
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.428726,0.32481
"RelativeMahalanobisDistance_decoder (sum, accuracy, 1.0)",0.542539,0.318825


In [66]:
# Mean of the scores stored under ('sequence', 'Accuracy') for the run currently held in `man`.
np.mean(man.gen_metrics['sequence', 'Accuracy'])

0.335

## Vicuna 13b

In [67]:
# TriviaQA, vicuna-13b-v1.5: table of normalized PRR per estimator.
path = "lm-polygraph/workdir/output/qa/lmsys/vicuna-13b-v1.5/['trivia_qa', 'rc.nocontext']/2024-04-09/22-25-24/ue_manager_seed1"
man = UEManager.load(path)

dataset_label = 'TriviaQA'
metric_names = ['Accuracy', 'AlignScore']

# Store each score under its generation metric (key[-2]) instead of appending
# to a list, so row labels never depend on the iteration order of man.metrics
# and an estimator missing a metric yields NaN rather than a length mismatch.
# If an (estimator, metric) pair occurs more than once, the last value wins.
prr_scores = {}
for key, value in man.metrics.items():
    if key[-1] == 'prr_normalized' and key[-2] in metric_names:
        prr_scores.setdefault(key[1], {})[key[-2]] = value

# One column per estimator (key[1]), one row per entry of metric_names.
df0 = pd.DataFrame(
    {
        estimator: [by_metric.get(m, np.nan) for m in metric_names]
        for estimator, by_metric in prr_scores.items()
    },
    index=[f'{dataset_label}, {m}, PRR' for m in metric_names],
)
# Estimators as rows, sorted descending by the last metric in metric_names.
df0.T.sort_values(f"{dataset_label}, {metric_names[-1]}, PRR", ascending=False).style.background_gradient(axis=0)

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7fc48b67cf40>]


Unnamed: 0,"TriviaQA, Accuracy, PRR","TriviaQA, AlignScore, PRR"
MaximumSequenceProbability,0.649271,0.596263
MeanTokenEntropy,0.473291,0.581673
Perplexity,0.493253,0.54853
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.9)",0.490199,0.410863
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.483249,0.400115
"TokenMahalanobisDistance_decoder (mean, accuracy, 1.0)",0.541686,0.382376
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.9)",0.46697,0.341137
"RelativeMahalanobisDistance_decoder (sum, accuracy, 1.0)",0.538213,0.331156
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.9)",0.438831,0.329843
"RelativeMahalanobisDistance_decoder (mean, accuracy, 1.0)",0.509496,0.313983


In [68]:
# Mean of the scores stored under ('sequence', 'Accuracy') for the run currently held in `man`.
np.mean(man.gen_metrics['sequence', 'Accuracy'])

0.528

## Mistral 7b

In [126]:
# TriviaQA, Mistral-7B-v0.2-hf: table of normalized PRR per estimator.
path = "lm-polygraph/workdir/output/qa/alpindale/Mistral-7B-v0.2-hf/['trivia_qa', 'rc.nocontext']/2024-04-10/05-00-05/ue_manager_seed1"
man = UEManager.load(path)

dataset_label = 'TriviaQA'
metric_names = ['Accuracy', 'AlignScore']

# Store each score under its generation metric (key[-2]) instead of appending
# to a list, so row labels never depend on the iteration order of man.metrics
# and an estimator missing a metric yields NaN rather than a length mismatch.
# If an (estimator, metric) pair occurs more than once, the last value wins.
prr_scores = {}
for key, value in man.metrics.items():
    if key[-1] == 'prr_normalized' and key[-2] in metric_names:
        prr_scores.setdefault(key[1], {})[key[-2]] = value

# One column per estimator (key[1]), one row per entry of metric_names.
df0 = pd.DataFrame(
    {
        estimator: [by_metric.get(m, np.nan) for m in metric_names]
        for estimator, by_metric in prr_scores.items()
    },
    index=[f'{dataset_label}, {m}, PRR' for m in metric_names],
)
# Estimators as rows, sorted descending by the last metric in metric_names.
df0.T.sort_values(f"{dataset_label}, {metric_names[-1]}, PRR", ascending=False).style.background_gradient(axis=0)

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7fc48b67cf40>]


Unnamed: 0,"TriviaQA, Accuracy, PRR","TriviaQA, AlignScore, PRR"
MeanTokenEntropy,0.088757,0.269215
MaximumSequenceProbability,0.069616,0.258446
Perplexity,0.069616,0.258446
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.9)",0.344637,0.138475
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.9)",0.344637,0.138475
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.37789,0.13034
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.37789,0.13034
"TokenMahalanobisDistance_decoder (mean, accuracy, 1.0)",0.398288,0.11854
"TokenMahalanobisDistance_decoder (sum, accuracy, 1.0)",0.398288,0.11854
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.9)",0.347659,0.115769


In [127]:
# Mean of the scores stored under ('sequence', 'Accuracy') for the run currently held in `man`.
np.mean(man.gen_metrics['sequence', 'Accuracy'])

0.558

# MMLU

## Vicuna 7b

In [74]:
# MMLU, vicuna-7b-v1.5: table of normalized PRR per estimator.
path = "lm-polygraph/workdir/output/qa/lmsys/vicuna-7b-v1.5/['cais/mmlu', 'all']/2024-04-09/22-25-26/ue_manager_seed1"
man = UEManager.load(path)

dataset_label = 'MMLU'
metric_names = ['Accuracy']

# Store each score under its generation metric (key[-2]) instead of appending
# to a list, so row labels never depend on the iteration order of man.metrics
# and an estimator missing a metric yields NaN rather than a length mismatch.
# If an (estimator, metric) pair occurs more than once, the last value wins.
prr_scores = {}
for key, value in man.metrics.items():
    if key[-1] == 'prr_normalized' and key[-2] in metric_names:
        prr_scores.setdefault(key[1], {})[key[-2]] = value

# One column per estimator (key[1]), one row per entry of metric_names.
df0 = pd.DataFrame(
    {
        estimator: [by_metric.get(m, np.nan) for m in metric_names]
        for estimator, by_metric in prr_scores.items()
    },
    index=[f'{dataset_label}, {m}, PRR' for m in metric_names],
)
# Estimators as rows, sorted descending by the last metric in metric_names.
df0.T.sort_values(f"{dataset_label}, {metric_names[-1]}, PRR", ascending=False).style.background_gradient(axis=0)

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7fc48b67cf40>]


Unnamed: 0,"MMLU, Accuracy, PRR"
MeanTokenEntropy,0.474275
MaximumSequenceProbability,0.449569
Perplexity,0.449569
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.250325
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.250325
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.248022
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.248022
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.0)",0.2206
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.0)",0.2206
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.0)",0.219658


In [75]:
# Mean of the scores stored under ('sequence', 'Accuracy') for the run currently held in `man`.
np.mean(man.gen_metrics['sequence', 'Accuracy'])

0.403

## Vicuna 13b

In [76]:
# MMLU, vicuna-13b-v1.5: table of normalized PRR per estimator.
path = "lm-polygraph/workdir/output/qa/lmsys/vicuna-13b-v1.5/['cais/mmlu', 'all']/2024-04-09/22-25-26/ue_manager_seed1"
man = UEManager.load(path)

dataset_label = 'MMLU'
metric_names = ['Accuracy']

# Store each score under its generation metric (key[-2]) instead of appending
# to a list, so row labels never depend on the iteration order of man.metrics
# and an estimator missing a metric yields NaN rather than a length mismatch.
# If an (estimator, metric) pair occurs more than once, the last value wins.
prr_scores = {}
for key, value in man.metrics.items():
    if key[-1] == 'prr_normalized' and key[-2] in metric_names:
        prr_scores.setdefault(key[1], {})[key[-2]] = value

# One column per estimator (key[1]), one row per entry of metric_names.
df0 = pd.DataFrame(
    {
        estimator: [by_metric.get(m, np.nan) for m in metric_names]
        for estimator, by_metric in prr_scores.items()
    },
    index=[f'{dataset_label}, {m}, PRR' for m in metric_names],
)
# Estimators as rows, sorted descending by the last metric in metric_names.
df0.T.sort_values(f"{dataset_label}, {metric_names[-1]}, PRR", ascending=False).style.background_gradient(axis=0)

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7fc48b67cf40>]


Unnamed: 0,"MMLU, Accuracy, PRR"
MeanTokenEntropy,0.653613
MaximumSequenceProbability,0.642444
Perplexity,0.642444
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.0)",0.160387
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.0)",0.160387
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.0)",0.158379
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.0)",0.158379
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.127214
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.127214
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.127011


In [77]:
# Mean of the scores stored under ('sequence', 'Accuracy') for the run currently held in `man`.
np.mean(man.gen_metrics['sequence', 'Accuracy'])

0.4865

## Mistral 7b

In [129]:
# MMLU, Mistral-7B-v0.2-hf: table of normalized PRR per estimator.
path = "lm-polygraph/workdir/output/qa/alpindale/Mistral-7B-v0.2-hf/['cais/mmlu', 'all']/2024-04-10/05-00-23/ue_manager_seed1"
man = UEManager.load(path)

dataset_label = 'MMLU'
metric_names = ['Accuracy']

# Store each score under its generation metric (key[-2]) instead of appending
# to a list, so row labels never depend on the iteration order of man.metrics
# and an estimator missing a metric yields NaN rather than a length mismatch.
# If an (estimator, metric) pair occurs more than once, the last value wins.
prr_scores = {}
for key, value in man.metrics.items():
    if key[-1] == 'prr_normalized' and key[-2] in metric_names:
        prr_scores.setdefault(key[1], {})[key[-2]] = value

# One column per estimator (key[1]), one row per entry of metric_names.
df0 = pd.DataFrame(
    {
        estimator: [by_metric.get(m, np.nan) for m in metric_names]
        for estimator, by_metric in prr_scores.items()
    },
    index=[f'{dataset_label}, {m}, PRR' for m in metric_names],
)
# Estimators as rows, sorted descending by the last metric in metric_names.
df0.T.sort_values(f"{dataset_label}, {metric_names[-1]}, PRR", ascending=False).style.background_gradient(axis=0)

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7fc48b67cf40>]


Unnamed: 0,"MMLU, Accuracy, PRR"
MeanTokenEntropy,0.695688
MaximumSequenceProbability,0.678909
Perplexity,0.678909
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.151506
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.151506
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.150787
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.150787
"TokenMahalanobisDistance_decoder (mean, accuracy, 1.0)",0.111301
"TokenMahalanobisDistance_decoder (sum, accuracy, 1.0)",0.111301
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.9)",0.111301


In [79]:
# Mean of the scores stored under ('sequence', 'Accuracy') for the run currently held in `man`.
np.mean(man.gen_metrics['sequence', 'Accuracy'])

0.5575

# GSM8k

## Vicuna 7b

In [80]:
# GSM8k, vicuna-7b-v1.5: table of normalized PRR per estimator.
path = "lm-polygraph/workdir/output/qa/lmsys/vicuna-7b-v1.5/['gsm8k', 'main']/2024-04-09/23-14-31/ue_manager_seed1"
man = UEManager.load(path)

dataset_label = 'GSM8k'
metric_names = ['Accuracy']

# Store each score under its generation metric (key[-2]) instead of appending
# to a list, so row labels never depend on the iteration order of man.metrics
# and an estimator missing a metric yields NaN rather than a length mismatch.
# If an (estimator, metric) pair occurs more than once, the last value wins.
prr_scores = {}
for key, value in man.metrics.items():
    if key[-1] == 'prr_normalized' and key[-2] in metric_names:
        prr_scores.setdefault(key[1], {})[key[-2]] = value

# One column per estimator (key[1]), one row per entry of metric_names.
df0 = pd.DataFrame(
    {
        estimator: [by_metric.get(m, np.nan) for m in metric_names]
        for estimator, by_metric in prr_scores.items()
    },
    index=[f'{dataset_label}, {m}, PRR' for m in metric_names],
)
# Estimators as rows, sorted descending by the last metric in metric_names.
df0.T.sort_values(f"{dataset_label}, {metric_names[-1]}, PRR", ascending=False).style.background_gradient(axis=0)

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7fc48b67cf40>]


Unnamed: 0,"GSM8k, Accuracy, PRR"
"RelativeMahalanobisDistance_decoder (sum, accuracy, 1.0)",0.45793
"TokenMahalanobisDistance_decoder (mean, accuracy, 1.0)",0.448746
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.9)",0.431895
"RelativeMahalanobisDistance_decoder (mean, accuracy, 1.0)",0.430294
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.425684
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.9)",0.412565
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.0)",0.411843
MaximumSequenceProbability,0.372873
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.360804
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.0)",0.339123


In [81]:
# Mean of the scores stored under ('sequence', 'Accuracy') for the run currently held in `man`.
np.mean(man.gen_metrics['sequence', 'Accuracy'])

0.129

## Vicuna 13b

In [82]:
# GSM8k, vicuna-13b-v1.5: table of normalized PRR per estimator.
path = "lm-polygraph/workdir/output/qa/lmsys/vicuna-13b-v1.5/['gsm8k', 'main']/2024-04-09/23-14-30/ue_manager_seed1"
man = UEManager.load(path)

dataset_label = 'GSM8k'
metric_names = ['Accuracy']

# Store each score under its generation metric (key[-2]) instead of appending
# to a list, so row labels never depend on the iteration order of man.metrics
# and an estimator missing a metric yields NaN rather than a length mismatch.
# If an (estimator, metric) pair occurs more than once, the last value wins.
prr_scores = {}
for key, value in man.metrics.items():
    if key[-1] == 'prr_normalized' and key[-2] in metric_names:
        prr_scores.setdefault(key[1], {})[key[-2]] = value

# One column per estimator (key[1]), one row per entry of metric_names.
df0 = pd.DataFrame(
    {
        estimator: [by_metric.get(m, np.nan) for m in metric_names]
        for estimator, by_metric in prr_scores.items()
    },
    index=[f'{dataset_label}, {m}, PRR' for m in metric_names],
)
# Estimators as rows, sorted descending by the last metric in metric_names.
df0.T.sort_values(f"{dataset_label}, {metric_names[-1]}, PRR", ascending=False).style.background_gradient(axis=0)

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7fc48b67cf40>]


Unnamed: 0,"GSM8k, Accuracy, PRR"
"TokenMahalanobisDistance_decoder (mean, accuracy, 1.0)",0.581796
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.9)",0.542334
"RelativeMahalanobisDistance_decoder (mean, accuracy, 1.0)",0.531925
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.528363
"RelativeMahalanobisDistance_decoder (sum, accuracy, 1.0)",0.522828
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.0)",0.518332
"TokenMahalanobisDistance_decoder (sum, accuracy, 1.0)",0.509051
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.9)",0.50294
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.492718
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.9)",0.486378


In [83]:
# Mean of the scores stored under ('sequence', 'Accuracy') for the run currently held in `man`.
np.mean(man.gen_metrics['sequence', 'Accuracy'])

0.191

## Mistral 7b

In [131]:
# GSM8k, Mistral-7B-v0.2-hf: table of normalized PRR per estimator.
path = "lm-polygraph/workdir/output/qa/alpindale/Mistral-7B-v0.2-hf/['gsm8k', 'main']/2024-04-09/23-14-29/ue_manager_seed1"
man = UEManager.load(path)

dataset_label = 'GSM8k'
metric_names = ['Accuracy']

# Store each score under its generation metric (key[-2]) instead of appending
# to a list, so row labels never depend on the iteration order of man.metrics
# and an estimator missing a metric yields NaN rather than a length mismatch.
# If an (estimator, metric) pair occurs more than once, the last value wins.
prr_scores = {}
for key, value in man.metrics.items():
    if key[-1] == 'prr_normalized' and key[-2] in metric_names:
        prr_scores.setdefault(key[1], {})[key[-2]] = value

# One column per estimator (key[1]), one row per entry of metric_names.
df0 = pd.DataFrame(
    {
        estimator: [by_metric.get(m, np.nan) for m in metric_names]
        for estimator, by_metric in prr_scores.items()
    },
    index=[f'{dataset_label}, {m}, PRR' for m in metric_names],
)
# Estimators as rows, sorted descending by the last metric in metric_names.
df0.T.sort_values(f"{dataset_label}, {metric_names[-1]}, PRR", ascending=False).style.background_gradient(axis=0)

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7fc48b67cf40>]


Unnamed: 0,"GSM8k, Accuracy, PRR"
"TokenMahalanobisDistance_decoder (mean, accuracy, 1.0)",0.319444
"TokenMahalanobisDistance_decoder (sum, accuracy, 1.0)",0.319444
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.310199
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.310199
"RelativeMahalanobisDistance_decoder (sum, accuracy, 1.0)",0.307177
"RelativeMahalanobisDistance_decoder (mean, accuracy, 1.0)",0.307177
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.299965
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.299965
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.0)",0.287797
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.0)",0.287797


In [85]:
# Mean of the scores stored under ('sequence', 'Accuracy') for the run currently held in `man`.
np.mean(man.gen_metrics['sequence', 'Accuracy'])

0.323

# XSUM

## Vicuna 7b

In [87]:
# XSUM, vicuna-7b-v1.5: table of normalized PRR per estimator.
path = "lm-polygraph/workdir/output/ats/lmsys/vicuna-7b-v1.5/xsum/2024-04-09/23-14-30/ue_manager_seed1"
man = UEManager.load(path)

dataset_label = 'XSUM'
metric_names = ['Rouge_rougeL', 'AlignScore']

# Store each score under its generation metric (key[-2]) instead of appending
# to a list, so row labels never depend on the iteration order of man.metrics
# and an estimator missing a metric yields NaN rather than a length mismatch.
# If an (estimator, metric) pair occurs more than once, the last value wins.
prr_scores = {}
for key, value in man.metrics.items():
    if key[-1] == 'prr_normalized' and key[-2] in metric_names:
        prr_scores.setdefault(key[1], {})[key[-2]] = value

# One column per estimator (key[1]), one row per entry of metric_names.
df0 = pd.DataFrame(
    {
        estimator: [by_metric.get(m, np.nan) for m in metric_names]
        for estimator, by_metric in prr_scores.items()
    },
    index=[f'{dataset_label}, {m}, PRR' for m in metric_names],
)
# Estimators as rows, sorted descending by the last metric in metric_names.
df0.T.sort_values(f"{dataset_label}, {metric_names[-1]}, PRR", ascending=False).style.background_gradient(axis=0)

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7fc48b67cf40>]


Unnamed: 0,"XSUM, Rouge_rougeL, PRR","XSUM, AlignScore, PRR"
MeanTokenEntropy,0.024997,0.14844
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.075167,0.109446
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.126173,0.109117
Perplexity,0.017226,0.090535
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.0)",0.040631,0.020652
MaximumSequenceProbability,-0.009734,0.019434
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.0)",-0.055818,6.1e-05
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.062274,-0.038587
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.0)",0.067524,-0.05099
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.037828,-0.059794


In [88]:
# Mean of the scores stored under ('sequence', 'AlignScore') for the run currently held in `man`.
np.mean(man.gen_metrics['sequence', 'AlignScore'])

0.0621609894928406

## Vicuna 13b

In [89]:
# XSUM, vicuna-13b-v1.5: table of normalized PRR per estimator.
path = "lm-polygraph/workdir/output/ats/lmsys/vicuna-13b-v1.5/xsum/2024-04-09/23-14-29/ue_manager_seed1"
man = UEManager.load(path)

dataset_label = 'XSUM'
metric_names = ['Rouge_rougeL', 'AlignScore']

# Store each score under its generation metric (key[-2]) instead of appending
# to a list, so row labels never depend on the iteration order of man.metrics
# and an estimator missing a metric yields NaN rather than a length mismatch.
# If an (estimator, metric) pair occurs more than once, the last value wins.
prr_scores = {}
for key, value in man.metrics.items():
    if key[-1] == 'prr_normalized' and key[-2] in metric_names:
        prr_scores.setdefault(key[1], {})[key[-2]] = value

# One column per estimator (key[1]), one row per entry of metric_names.
df0 = pd.DataFrame(
    {
        estimator: [by_metric.get(m, np.nan) for m in metric_names]
        for estimator, by_metric in prr_scores.items()
    },
    index=[f'{dataset_label}, {m}, PRR' for m in metric_names],
)
# Estimators as rows, sorted descending by the last metric in metric_names.
df0.T.sort_values(f"{dataset_label}, {metric_names[-1]}, PRR", ascending=False).style.background_gradient(axis=0)

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7fc48b67cf40>]


Unnamed: 0,"XSUM, Rouge_rougeL, PRR","XSUM, AlignScore, PRR"
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.127219,0.187453
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.160918,0.167808
MeanTokenEntropy,0.088062,0.151454
Perplexity,0.084116,0.13502
MaximumSequenceProbability,0.135675,0.120805
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.0)",0.085816,0.112239
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.173773,0.081399
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.159304,0.05637
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.0)",0.144362,0.023399
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.0)",0.052968,-0.013338


In [90]:
# Mean of the scores stored under ('sequence', 'AlignScore') for the run currently held in `man`.
np.mean(man.gen_metrics['sequence', 'AlignScore'])

0.08629431277797267

## Mistral 7b

In [133]:
# XSUM, Mistral-7B-v0.2-hf: table of normalized PRR per estimator.
path = "lm-polygraph/workdir/output/ats/alpindale/Mistral-7B-v0.2-hf/xsum/2024-04-09/23-14-31/ue_manager_seed1"
man = UEManager.load(path)

dataset_label = 'XSUM'
metric_names = ['Rouge_rougeL', 'AlignScore']

# Store each score under its generation metric (key[-2]) instead of appending
# to a list, so row labels never depend on the iteration order of man.metrics
# and an estimator missing a metric yields NaN rather than a length mismatch.
# If an (estimator, metric) pair occurs more than once, the last value wins.
prr_scores = {}
for key, value in man.metrics.items():
    if key[-1] == 'prr_normalized' and key[-2] in metric_names:
        prr_scores.setdefault(key[1], {})[key[-2]] = value

# One column per estimator (key[1]), one row per entry of metric_names.
df0 = pd.DataFrame(
    {
        estimator: [by_metric.get(m, np.nan) for m in metric_names]
        for estimator, by_metric in prr_scores.items()
    },
    index=[f'{dataset_label}, {m}, PRR' for m in metric_names],
)
# Estimators as rows, sorted descending by the last metric in metric_names.
df0.T.sort_values(f"{dataset_label}, {metric_names[-1]}, PRR", ascending=False).style.background_gradient(axis=0)

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7fc48b67cf40>]


Unnamed: 0,"XSUM, Rouge_rougeL, PRR","XSUM, AlignScore, PRR"
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.334371,0.103817
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.347345,0.08041
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.5)",-0.194326,0.067441
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.5)",-0.189549,0.066426
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.0)",0.365192,0.014786
Perplexity,-0.295978,0.013115
MeanTokenEntropy,-0.27447,0.009041
MaximumSequenceProbability,-0.092869,-0.001631
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.0)",-0.213619,-0.019585
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.0)",-0.019231,-0.020377


In [134]:
# Means of the ('sequence', 'AlignScore') and ('sequence', 'Rouge_rougeL') scores,
# in that order, for the run currently held in `man`.
tuple(
    np.mean(man.gen_metrics['sequence', metric_name])
    for metric_name in ('AlignScore', 'Rouge_rougeL')
)

(0.034991165048508265, 0.20618126565166967)

## StableLM 2 12b

In [136]:
# XSUM, stablelm-2-12b: table of normalized PRR per estimator.
path = "lm-polygraph/workdir/output/ats/stabilityai/stablelm-2-12b/xsum/2024-04-10/05-39-18/ue_manager_seed1"
man = UEManager.load(path)

dataset_label = 'XSUM'
metric_names = ['Rouge_rougeL', 'AlignScore']

# Store each score under its generation metric (key[-2]) instead of appending
# to a list, so row labels never depend on the iteration order of man.metrics
# and an estimator missing a metric yields NaN rather than a length mismatch.
# If an (estimator, metric) pair occurs more than once, the last value wins.
prr_scores = {}
for key, value in man.metrics.items():
    if key[-1] == 'prr_normalized' and key[-2] in metric_names:
        prr_scores.setdefault(key[1], {})[key[-2]] = value

# One column per estimator (key[1]), one row per entry of metric_names.
df0 = pd.DataFrame(
    {
        estimator: [by_metric.get(m, np.nan) for m in metric_names]
        for estimator, by_metric in prr_scores.items()
    },
    index=[f'{dataset_label}, {m}, PRR' for m in metric_names],
)
# Estimators as rows, sorted descending by the last metric in metric_names.
df0.T.sort_values(f"{dataset_label}, {metric_names[-1]}, PRR", ascending=False).style.background_gradient(axis=0)

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7fc48b67cf40>]


Unnamed: 0,"XSUM, Rouge_rougeL, PRR","XSUM, AlignScore, PRR"
Perplexity,-0.094104,0.073626
MeanTokenEntropy,-0.086495,0.066653
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.5)",-0.040553,0.04843
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.022527,0.039754
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.0)",-0.033884,0.034142
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.0)",-0.025711,0.006194
MaximumSequenceProbability,0.156525,0.001631
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.308204,-0.055999
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.307004,-0.056375
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.0)",0.32688,-0.069938


In [137]:
# Mean of the scores stored under ('sequence', 'AlignScore') for the run currently held in `man`.
np.mean(man.gen_metrics['sequence', 'AlignScore'])

0.08725634816825187

# WMT 14 De-En

## Vicuna 7b

In [110]:
# WMT14 De-En, vicuna-7b-v1.5: table of normalized PRR per estimator.
path = "lm-polygraph/workdir/output/nmt/lmsys/vicuna-7b-v1.5/['wmt14', 'de-en']/2024-04-09/23-14-31/ue_manager_seed1"
man = UEManager.load(path)

dataset_label = 'WMT14 De-En'
metric_names = ['Comet', 'AlignScore']

# Store each score under its generation metric (key[-2]) instead of appending
# to a list, so row labels never depend on the iteration order of man.metrics
# and an estimator missing a metric yields NaN rather than a length mismatch.
# If an (estimator, metric) pair occurs more than once, the last value wins.
prr_scores = {}
for key, value in man.metrics.items():
    if key[-1] == 'prr_normalized' and key[-2] in metric_names:
        prr_scores.setdefault(key[1], {})[key[-2]] = value

# One column per estimator (key[1]), one row per entry of metric_names.
df0 = pd.DataFrame(
    {
        estimator: [by_metric.get(m, np.nan) for m in metric_names]
        for estimator, by_metric in prr_scores.items()
    },
    index=[f'{dataset_label}, {m}, PRR' for m in metric_names],
)
# Estimators as rows, sorted descending by the last metric in metric_names.
df0.T.sort_values(f"{dataset_label}, {metric_names[-1]}, PRR", ascending=False).style.background_gradient(axis=0)

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7fc48b67cf40>]


Unnamed: 0,"WMT14 De-En, Comet, PRR","WMT14 De-En, AlignScore, PRR"
MeanTokenEntropy,0.274388,0.551702
Perplexity,0.233358,0.512977
MaximumSequenceProbability,0.244103,0.480413
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.199104,0.224835
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.0)",0.193842,0.223493
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.143306,0.116263
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.140829,0.114453
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.0)",0.131894,0.109233
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.0)",0.136024,0.101627
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.068077,0.048075


In [111]:
# Mean of the scores stored under ('sequence', 'Comet') for the run currently held in `man`.
np.mean(man.gen_metrics['sequence', 'Comet'])

0.8373594695329666

In [112]:
# WMT14 De-En, vicuna-13b-v1.5: table of normalized PRR per estimator.
# (This cell has no markdown heading of its own; the model is identified by the path.)
path = "lm-polygraph/workdir/output/nmt/lmsys/vicuna-13b-v1.5/['wmt14', 'de-en']/2024-04-09/23-14-31/ue_manager_seed1"
man = UEManager.load(path)

dataset_label = 'WMT14 De-En'
metric_names = ['Comet', 'AlignScore']

# Store each score under its generation metric (key[-2]) instead of appending
# to a list, so row labels never depend on the iteration order of man.metrics
# and an estimator missing a metric yields NaN rather than a length mismatch.
# If an (estimator, metric) pair occurs more than once, the last value wins.
prr_scores = {}
for key, value in man.metrics.items():
    if key[-1] == 'prr_normalized' and key[-2] in metric_names:
        prr_scores.setdefault(key[1], {})[key[-2]] = value

# One column per estimator (key[1]), one row per entry of metric_names.
df0 = pd.DataFrame(
    {
        estimator: [by_metric.get(m, np.nan) for m in metric_names]
        for estimator, by_metric in prr_scores.items()
    },
    index=[f'{dataset_label}, {m}, PRR' for m in metric_names],
)
# Estimators as rows, sorted descending by the last metric in metric_names.
df0.T.sort_values(f"{dataset_label}, {metric_names[-1]}, PRR", ascending=False).style.background_gradient(axis=0)

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7fc48b67cf40>]


Unnamed: 0,"WMT14 De-En, Comet, PRR","WMT14 De-En, AlignScore, PRR"
MeanTokenEntropy,0.258356,0.542357
Perplexity,0.238769,0.493736
MaximumSequenceProbability,0.216374,0.431071
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.0)",0.229777,0.342794
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.232429,0.341478
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.166717,0.157677
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.0)",0.163551,0.154636
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.165375,0.123471
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.0)",0.162751,0.115593
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.0)",0.036914,0.010969


In [113]:
# Mean of the scores stored under ('sequence', 'Comet') for the run currently held in `man`.
np.mean(man.gen_metrics['sequence', 'Comet'])

0.8482999328076839

## Mistral 7b

In [114]:
# WMT14 De-En, Mistral-7B-v0.2-hf: table of normalized PRR per estimator.
path = "lm-polygraph/workdir/output/nmt/alpindale/Mistral-7B-v0.2-hf/['wmt14', 'de-en']/2024-04-10/05-39-15/ue_manager_seed1"
man = UEManager.load(path)

dataset_label = 'WMT14 De-En'
metric_names = ['Comet', 'AlignScore']

# Store each score under its generation metric (key[-2]) instead of appending
# to a list, so row labels never depend on the iteration order of man.metrics
# and an estimator missing a metric yields NaN rather than a length mismatch.
# If an (estimator, metric) pair occurs more than once, the last value wins.
prr_scores = {}
for key, value in man.metrics.items():
    if key[-1] == 'prr_normalized' and key[-2] in metric_names:
        prr_scores.setdefault(key[1], {})[key[-2]] = value

# One column per estimator (key[1]), one row per entry of metric_names.
df0 = pd.DataFrame(
    {
        estimator: [by_metric.get(m, np.nan) for m in metric_names]
        for estimator, by_metric in prr_scores.items()
    },
    index=[f'{dataset_label}, {m}, PRR' for m in metric_names],
)
# Estimators as rows, sorted descending by the last metric in metric_names.
df0.T.sort_values(f"{dataset_label}, {metric_names[-1]}, PRR", ascending=False).style.background_gradient(axis=0)

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7fc48b67cf40>]


Unnamed: 0,"WMT14 De-En, Comet, PRR","WMT14 De-En, AlignScore, PRR"
MeanTokenEntropy,0.008118,0.100074
MaximumSequenceProbability,-0.027159,0.033613
Perplexity,-0.022617,0.029431
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.0)",0.174806,-0.557457
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.0)",0.174661,-0.557478
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.175602,-0.559307
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.175552,-0.559364
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.0)",0.077897,-0.568182
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.080204,-0.569053
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.0)",0.101214,-0.59623


In [116]:
# Mean of the scores stored under ('sequence', 'Comet') for the run currently held in `man`.
np.mean(man.gen_metrics['sequence', 'Comet'])

0.5119356378316879

## StableLM 2 12b

In [119]:
# WMT14 De-En, stablelm-2-12b: table of normalized PRR per estimator.
path = "lm-polygraph/workdir/output/nmt/stabilityai/stablelm-2-12b/['wmt14', 'de-en']/2024-04-10/05-39-15/ue_manager_seed1"
man = UEManager.load(path)

dataset_label = 'WMT14 De-En'
metric_names = ['Comet', 'AlignScore']

# Store each score under its generation metric (key[-2]) instead of appending
# to a list, so row labels never depend on the iteration order of man.metrics
# and an estimator missing a metric yields NaN rather than a length mismatch.
# If an (estimator, metric) pair occurs more than once, the last value wins.
prr_scores = {}
for key, value in man.metrics.items():
    if key[-1] == 'prr_normalized' and key[-2] in metric_names:
        prr_scores.setdefault(key[1], {})[key[-2]] = value

# One column per estimator (key[1]), one row per entry of metric_names.
df0 = pd.DataFrame(
    {
        estimator: [by_metric.get(m, np.nan) for m in metric_names]
        for estimator, by_metric in prr_scores.items()
    },
    index=[f'{dataset_label}, {m}, PRR' for m in metric_names],
)
# Estimators as rows, sorted descending by the last metric in metric_names.
df0.T.sort_values(f"{dataset_label}, {metric_names[-1]}, PRR", ascending=False).style.background_gradient(axis=0)

Stat calculators: [<lm_polygraph.stat_calculators.greedy_probs.GreedyProbsCalculator object at 0x7fc48b67cf40>]


Unnamed: 0,"WMT14 De-En, Comet, PRR","WMT14 De-En, AlignScore, PRR"
MaximumSequenceProbability,0.262681,0.729212
MeanTokenEntropy,0.247645,0.587531
Perplexity,0.249943,0.579881
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.080823,0.44989
"TokenMahalanobisDistance_decoder (sum, alignscore, 0.0)",0.080889,0.449403
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.156284,0.016247
"TokenMahalanobisDistance_decoder (mean, alignscore, 0.0)",0.147046,-0.058706
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.5)",0.123638,-0.625219
"RelativeMahalanobisDistance_decoder (mean, alignscore, 0.0)",0.123479,-0.649622
"RelativeMahalanobisDistance_decoder (sum, alignscore, 0.5)",0.097356,-0.761433


In [120]:
# Mean of the scores stored under ('sequence', 'Comet') for the run currently held in `man`.
np.mean(man.gen_metrics['sequence', 'Comet'])

0.822441279232502