In [1]:
import magnet
from magnet.predictor import Predictor

In [2]:
# Load the demo HELM outputs bundled with `magnet` and take the path of
# the first suite they contain; later cells use `suite_path` as input.
outputs = magnet.HelmOutputs.demo()
suite_path = outputs.suites()[0].path

In [3]:
# Build a predictor and have it prepare its inputs from the suite path.
# NOTE(review): `num_eval_samples=5` presumably sets how many evaluation
# samples are kept -- confirm against the Predictor documentation.
base_predictor = Predictor(num_eval_samples=5)
prepared_predict_inputs = base_predictor.prepare_predict_inputs(suite_path)

# The predictor receives a "train" and a "sequestered_test" split,
# each being a container for HELM data, with the "sequestered_test"
# split excluding any "stats" data
train_split, sequestered_test_split = prepared_predict_inputs

# Note that to produce these dataframes the original HELM data is
# flattened (in the case of nested objects); lists, however, are not
# flattened and are included in the dataframe directly as lists
train_run_specs_df = train_split.run_specs
train_scenario_states_df = train_split.scenario_state
train_stats_df = train_split.stats
train_per_instance_stats_df = train_split.per_instance_stats

# Only the run specs and scenario states are read from the
# "sequestered_test" split here.
eval_run_specs_df = sequestered_test_split.run_specs
eval_scenario_states_df = sequestered_test_split.scenario_state

In [4]:
# The "run_specs" dataframes (e.g. `train_run_specs_df`) contain the
# top-level information about each HELM run, such as which dataset and
# model were used for the run, along with the dataset augmentation (if
# any) and the metrics.  The "scenario_states" and "stats" dataframes
# link back to the "run_specs" dataframe via the "run_spec.name" column.
train_run_specs_df

Unnamed: 0,run_spec.name,run_spec.scenario_spec.class_name,run_spec.scenario_spec.args.subject,run_spec.adapter_spec.method,run_spec.adapter_spec.global_prefix,run_spec.adapter_spec.global_suffix,run_spec.adapter_spec.instructions,run_spec.adapter_spec.input_prefix,run_spec.adapter_spec.input_suffix,run_spec.adapter_spec.reference_prefix,...,run_spec.metric_specs,run_spec.data_augmenter_spec.perturbation_specs,run_spec.data_augmenter_spec.should_augment_train_instances,run_spec.data_augmenter_spec.should_include_original_train,run_spec.data_augmenter_spec.should_skip_unchanged_train,run_spec.data_augmenter_spec.should_augment_eval_instances,run_spec.data_augmenter_spec.should_include_original_eval,run_spec.data_augmenter_spec.should_skip_unchanged_eval,run_spec.data_augmenter_spec.seeds_per_instance,run_spec.groups
0,"mmlu:subject=anatomy,method=multiple_choice_jo...",helm.benchmark.scenarios.mmlu_scenario.MMLUSce...,anatomy,multiple_choice_joint,,,The following are multiple choice questions (w...,Question:,\n,A.,...,[{'class_name': 'helm.benchmark.metrics.basic_...,[],False,False,False,False,False,False,1,[mmlu]
0,"mmlu:subject=anatomy,method=multiple_choice_jo...",helm.benchmark.scenarios.mmlu_scenario.MMLUSce...,anatomy,multiple_choice_joint,,,The following are multiple choice questions (w...,Question:,\n,A.,...,[{'class_name': 'helm.benchmark.metrics.basic_...,[],False,False,False,False,False,False,1,[mmlu]
0,"mmlu:subject=philosophy,method=multiple_choice...",helm.benchmark.scenarios.mmlu_scenario.MMLUSce...,philosophy,multiple_choice_joint,,,The following are multiple choice questions (w...,Question:,\n,A.,...,[{'class_name': 'helm.benchmark.metrics.basic_...,[],False,False,False,False,False,False,1,[mmlu]


In [5]:
# The "scenario_states" dataframes contain the individual request and
# response entries for a given "run_spec".  This is where you can find
# the prompt that was provided to the model, along with its
# responses.
train_scenario_states_df

Unnamed: 0,run_spec.name,scenario_state.adapter_spec.chain_of_thought_prefix,scenario_state.adapter_spec.chain_of_thought_suffix,scenario_state.adapter_spec.global_prefix,scenario_state.adapter_spec.global_suffix,scenario_state.adapter_spec.input_prefix,scenario_state.adapter_spec.input_suffix,scenario_state.adapter_spec.instance_prefix,scenario_state.adapter_spec.instructions,scenario_state.adapter_spec.max_eval_instances,...,scenario_state.request_states.request.temperature,scenario_state.request_states.request.top_k_per_token,scenario_state.request_states.request.top_p,scenario_state.request_states.result.cached,scenario_state.request_states.result.completions,scenario_state.request_states.result.embedding,scenario_state.request_states.result.request_datetime,scenario_state.request_states.result.request_time,scenario_state.request_states.result.success,scenario_state.request_states.train_trial_index
0,"mmlu:subject=anatomy,method=multiple_choice_jo...",,\n,,,Question:,\n,\n,The following are multiple choice questions (w...,7,...,0.0,5,1,False,"[{'text': ' D', 'logprob': 0.0, 'tokens': [{'t...",[],1755892592,0.087804,True,0
1,"mmlu:subject=anatomy,method=multiple_choice_jo...",,\n,,,Question:,\n,\n,The following are multiple choice questions (w...,7,...,0.0,5,1,False,"[{'text': ' D', 'logprob': 0.0, 'tokens': [{'t...",[],1755892592,0.060737,True,0
2,"mmlu:subject=anatomy,method=multiple_choice_jo...",,\n,,,Question:,\n,\n,The following are multiple choice questions (w...,7,...,0.0,5,1,False,"[{'text': ' D', 'logprob': 0.0, 'tokens': [{'t...",[],1755892592,0.065824,True,0
3,"mmlu:subject=anatomy,method=multiple_choice_jo...",,\n,,,Question:,\n,\n,The following are multiple choice questions (w...,7,...,0.0,5,1,False,"[{'text': ' D', 'logprob': 0.0, 'tokens': [{'t...",[],1755892592,0.064173,True,0
4,"mmlu:subject=anatomy,method=multiple_choice_jo...",,\n,,,Question:,\n,\n,The following are multiple choice questions (w...,7,...,0.0,5,1,False,"[{'text': ' D', 'logprob': 0.0, 'tokens': [{'t...",[],1755892592,0.063471,True,0
5,"mmlu:subject=anatomy,method=multiple_choice_jo...",,\n,,,Question:,\n,\n,The following are multiple choice questions (w...,7,...,0.0,5,1,False,"[{'text': ' D', 'logprob': 0.0, 'tokens': [{'t...",[],1755892592,0.064719,True,0
6,"mmlu:subject=anatomy,method=multiple_choice_jo...",,\n,,,Question:,\n,\n,The following are multiple choice questions (w...,7,...,0.0,5,1,False,"[{'text': ' D', 'logprob': 0.0, 'tokens': [{'t...",[],1755892592,0.063231,True,0
0,"mmlu:subject=anatomy,method=multiple_choice_jo...",,\n,,,Question:,\n,\n,The following are multiple choice questions (w...,7,...,0.0,5,1,False,"[{'text': ' D', 'logprob': 0.0, 'tokens': [{'t...",[],1755892588,0.020622,True,0
1,"mmlu:subject=anatomy,method=multiple_choice_jo...",,\n,,,Question:,\n,\n,The following are multiple choice questions (w...,7,...,0.0,5,1,False,"[{'text': ' D', 'logprob': 0.0, 'tokens': [{'t...",[],1755892588,0.018926,True,0
2,"mmlu:subject=anatomy,method=multiple_choice_jo...",,\n,,,Question:,\n,\n,The following are multiple choice questions (w...,7,...,0.0,5,1,False,"[{'text': ' D', 'logprob': 0.0, 'tokens': [{'t...",[],1755892588,0.020449,True,0


In [6]:
# The "stats" dataframes contain the statistics for each of the
# metrics for a given "run_spec" (identified by the "run_spec.name"
# column).  Note that there may be multiple records for the same
# metric, one per split and/or perturbation.
train_stats_df

Unnamed: 0,run_spec.name,stats.count,stats.max,stats.mean,stats.min,stats.name.name,stats.name.perturbation.computed_on,stats.name.perturbation.fairness,stats.name.perturbation.name,stats.name.perturbation.robustness,stats.name.split,stats.stddev,stats.sum,stats.sum_squared,stats.variance
0,"mmlu:subject=anatomy,method=multiple_choice_jo...",1,4.000000,4.000000,4.000000,num_references,,,,,test,0.0,4.000000,16.000000,0.0
1,"mmlu:subject=anatomy,method=multiple_choice_jo...",1,1.000000,1.000000,1.000000,num_train_trials,,,,,test,0.0,1.000000,1.000000,0.0
2,"mmlu:subject=anatomy,method=multiple_choice_jo...",1,347.833333,347.833333,347.833333,num_prompt_tokens,,,,,test,0.0,347.833333,120988.027778,0.0
3,"mmlu:subject=anatomy,method=multiple_choice_jo...",1,1.000000,1.000000,1.000000,num_completion_tokens,,,,,test,0.0,1.000000,1.000000,0.0
4,"mmlu:subject=anatomy,method=multiple_choice_jo...",1,1.000000,1.000000,1.000000,num_output_tokens,,,,,test,0.0,1.000000,1.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,"mmlu:subject=philosophy,method=multiple_choice...",1,1.000000,1.000000,1.000000,num_perplexity_tokens,worst,True,fairness,False,valid,0.0,1.000000,1.000000,0.0
158,"mmlu:subject=philosophy,method=multiple_choice...",1,2.000000,2.000000,2.000000,num_bytes,worst,False,robustness,True,valid,0.0,2.000000,4.000000,0.0
159,"mmlu:subject=philosophy,method=multiple_choice...",1,2.000000,2.000000,2.000000,num_bytes,worst,True,fairness,False,valid,0.0,2.000000,4.000000,0.0
160,"mmlu:subject=philosophy,method=multiple_choice...",1,6.000000,6.000000,6.000000,num_instances,,,,,test,0.0,6.000000,36.000000,0.0


In [7]:
# The "per_instance_stats" dataframes contain the statistics for each
# metric for a given dataset "instance" and perturbation.  This means
# that if a perturbation or data augmentation is applied via HELM
# during the run, there may be multiple "stats" for the same
# "instance", with the perturbation fields distinguishing them.
train_per_instance_stats_df

Unnamed: 0,run_spec.name,per_instance_stats.instance_id,per_instance_stats.stats.count,per_instance_stats.stats.max,per_instance_stats.stats.mean,per_instance_stats.stats.min,per_instance_stats.stats.name.name,per_instance_stats.stats.name.split,per_instance_stats.stats.stddev,per_instance_stats.stats.sum,per_instance_stats.stats.sum_squared,per_instance_stats.stats.variance,per_instance_stats.train_trial_index
0,"mmlu:subject=anatomy,method=multiple_choice_jo...",id138,1,4.0,4.0,4.0,num_references,test,0.0,4.0,16.0,0.0,0
1,"mmlu:subject=anatomy,method=multiple_choice_jo...",id138,1,1.0,1.0,1.0,num_train_trials,test,0.0,1.0,1.0,0.0,0
2,"mmlu:subject=anatomy,method=multiple_choice_jo...",id138,1,334.0,334.0,334.0,num_prompt_tokens,test,0.0,334.0,111556.0,0.0,0
3,"mmlu:subject=anatomy,method=multiple_choice_jo...",id138,1,1.0,1.0,1.0,num_completion_tokens,test,0.0,1.0,1.0,0.0,0
4,"mmlu:subject=anatomy,method=multiple_choice_jo...",id138,1,1.0,1.0,1.0,num_output_tokens,test,0.0,1.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
184,"mmlu:subject=philosophy,method=multiple_choice...",id131,1,0.0,0.0,0.0,quasi_prefix_exact_match,test,0.0,0.0,0.0,0.0,0
185,"mmlu:subject=philosophy,method=multiple_choice...",id131,1,0.0,0.0,0.0,quasi_prefix_exact_match@5,test,0.0,0.0,0.0,0.0,0
186,"mmlu:subject=philosophy,method=multiple_choice...",id131,1,0.0,0.0,0.0,logprob,test,0.0,0.0,0.0,0.0,0
187,"mmlu:subject=philosophy,method=multiple_choice...",id131,1,1.0,1.0,1.0,num_perplexity_tokens,test,0.0,1.0,1.0,0.0,0


In [8]:
# Same format as "train_run_specs_df" above, but for the "eval" run(s),
# i.e. the run(s) you're making predictions for.
eval_run_specs_df

Unnamed: 0,run_spec.name,run_spec.scenario_spec.class_name,run_spec.scenario_spec.args.subject,run_spec.adapter_spec.method,run_spec.adapter_spec.global_prefix,run_spec.adapter_spec.global_suffix,run_spec.adapter_spec.instructions,run_spec.adapter_spec.input_prefix,run_spec.adapter_spec.input_suffix,run_spec.adapter_spec.reference_prefix,...,run_spec.metric_specs,run_spec.data_augmenter_spec.perturbation_specs,run_spec.data_augmenter_spec.should_augment_train_instances,run_spec.data_augmenter_spec.should_include_original_train,run_spec.data_augmenter_spec.should_skip_unchanged_train,run_spec.data_augmenter_spec.should_augment_eval_instances,run_spec.data_augmenter_spec.should_include_original_eval,run_spec.data_augmenter_spec.should_skip_unchanged_eval,run_spec.data_augmenter_spec.seeds_per_instance,run_spec.groups
0,"mmlu:subject=philosophy,method=multiple_choice...",helm.benchmark.scenarios.mmlu_scenario.MMLUSce...,philosophy,multiple_choice_joint,,,The following are multiple choice questions (w...,Question:,\n,A.,...,[{'class_name': 'helm.benchmark.metrics.basic_...,[],False,False,False,False,False,False,1,[mmlu]


In [9]:
# Same format as "train_scenario_states_df" above, but for the "eval"
# run(s), and containing only a subsample of the rows of the full
# dataframe.
# NOTE(review): the subsample size presumably comes from the
# predictor's `num_eval_samples` setting -- confirm.
eval_scenario_states_df

Unnamed: 0,run_spec.name,scenario_state.adapter_spec.chain_of_thought_prefix,scenario_state.adapter_spec.chain_of_thought_suffix,scenario_state.adapter_spec.global_prefix,scenario_state.adapter_spec.global_suffix,scenario_state.adapter_spec.input_prefix,scenario_state.adapter_spec.input_suffix,scenario_state.adapter_spec.instance_prefix,scenario_state.adapter_spec.instructions,scenario_state.adapter_spec.max_eval_instances,...,scenario_state.request_states.request.temperature,scenario_state.request_states.request.top_k_per_token,scenario_state.request_states.request.top_p,scenario_state.request_states.result.cached,scenario_state.request_states.result.completions,scenario_state.request_states.result.embedding,scenario_state.request_states.result.request_datetime,scenario_state.request_states.result.request_time,scenario_state.request_states.result.success,scenario_state.request_states.train_trial_index
0,"mmlu:subject=philosophy,method=multiple_choice...",,\n,,,Question:,\n,\n,The following are multiple choice questions (w...,7,...,0.0,5,1,False,"[{'text': ' D', 'logprob': 0.0, 'tokens': [{'t...",[],1755892585,0.465341,True,0
2,"mmlu:subject=philosophy,method=multiple_choice...",,\n,,,Question:,\n,\n,The following are multiple choice questions (w...,7,...,0.0,5,1,False,"[{'text': ' D', 'logprob': 0.0, 'tokens': [{'t...",[],1755892586,0.020233,True,0
3,"mmlu:subject=philosophy,method=multiple_choice...",,\n,,,Question:,\n,\n,The following are multiple choice questions (w...,7,...,0.0,5,1,False,"[{'text': ' D', 'logprob': 0.0, 'tokens': [{'t...",[],1755892586,0.019465,True,0
4,"mmlu:subject=philosophy,method=multiple_choice...",,\n,,,Question:,\n,\n,The following are multiple choice questions (w...,7,...,0.0,5,1,False,"[{'text': ' D', 'logprob': 0.0, 'tokens': [{'t...",[],1755892586,0.019573,True,0
5,"mmlu:subject=philosophy,method=multiple_choice...",,\n,,,Question:,\n,\n,The following are multiple choice questions (w...,7,...,0.0,5,1,False,"[{'text': ' D', 'logprob': 0.0, 'tokens': [{'t...",[],1755892586,0.018493,True,0
