In [12]:
##!pip install lighteval
#!pip install "lighteval[all]"
#!pip install langdetect
#!pip install lighteval[multilingual]

In [13]:
from transformers import AutoConfig, AutoModelForCausalLM
from biatron import BiatronForCausalLM, BiatronConfig
#AutoConfig.register("Biatron", BiatronConfig)
#AutoModelForCausalLM.register(BiatronConfig, BiatronForCausalLM)

In [14]:
import torch
import dotenv
# Read variables from a local .env file into the process environment
# (presumably credentials/tokens needed to download the models — TODO confirm).
dotenv.load_dotenv()
# Accumulator for benchmark scores: the evaluation loop stores one dict per
# model name here, mapping each task key to its 'acc' value.
bench  = {}

#biamodel = BiatronForCausalLM.from_pretrained("", torch_dtype=torch.bfloat16, device_map="cuda", use_cache=False, _attn_implementation='sdpa', revision="checkpoint-152000")


In [None]:


import gc

from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.models.transformers.transformers_model import TransformersModel, TransformersModelConfig
from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters

# Comma-separated lighteval task names that every model is evaluated on.
BENCHMARKS = "enem_por_mcf,oab_exams_por_mcf,exams_por_mcf,m3exams_por_mcf,openai_mmlu_por_mcf"

# Model ids to evaluate, in order.
MODEL_NAMES = [
    'Fazzioni/biatron-345m',
    'google/gemma-3-270m',
    'google/gemma-3-1B-pt',
    'TucanoBR/Tucano-630m',
    'HuggingFaceTB/SmolLM2-360M',
    'Qwen/Qwen3-0.6B-Base',
]

# Evaluate each model in turn and record its per-task 'acc' in `bench`
# (defined in an earlier cell), releasing the model before loading the next.
for model_name in MODEL_NAMES:
    print("STARTING EVAL FOR MODEL:", model_name)

    # A fresh tracker and parameter object per model keeps the runs independent.
    evaluation_tracker = EvaluationTracker(output_dir="./results")
    pipeline_params = PipelineParameters(
        launcher_type=ParallelismManager.NONE,
        load_tasks_multilingual=True,
    )

    # Biatron checkpoints are loaded with the project's own model class and
    # pinned to a specific revision; all other models use AutoModelForCausalLM.
    is_biatron = 'biatron' in model_name
    CLASS_NAME = BiatronForCausalLM if is_biatron else AutoModelForCausalLM
    kwargs = {'revision': 'checkpoint-152000'} if is_biatron else {}

    model = CLASS_NAME.from_pretrained(model_name, device_map="cuda", dtype=torch.bfloat16, **kwargs)

    # Wrap the already-loaded model so lighteval evaluates it directly.
    config = TransformersModelConfig(model_name=model.config._name_or_path, batch_size=1)
    Transmodel = TransformersModel.from_model(model, config)

    pipeline = Pipeline(
        model=Transmodel,
        pipeline_parameters=pipeline_params,
        evaluation_tracker=evaluation_tracker,
        tasks=BENCHMARKS,
    )

    pipeline.evaluate()
    pipeline.show_results()
    results = pipeline.get_results()

    # Keep only the 'acc' metric of every task entry for this model.
    bench[model_name] = {task: metrics['acc'] for task, metrics in results['results'].items()}

    # Transmodel and pipeline were built from the model, so deleting `model`
    # alone would leave it reachable; drop all three names before collecting
    # so the weights can actually be freed ahead of the next load.
    del model, Transmodel, pipeline
    gc.collect()
    torch.cuda.empty_cache()
    print("\n\n\n")
    

In [23]:
# Sanity check: show which models have an entry in `bench`.
bench.keys()

dict_keys(['Fazzioni/biatron-345m', 'google/gemma-3-270m', 'google/gemma-3-1B-pt', 'TucanoBR/Tucano-630m', 'HuggingFaceTB/SmolLM2-360M', 'Qwen/Qwen3-0.6B-Base'])

In [None]:

import pandas as pd

# Build a table from `bench` (one column per model, one row per task key)
# and print it as Markdown.
scores_table = pd.DataFrame(bench)
print(scores_table.to_markdown())

|                                                           |   Fazzioni/biatron-345m |   google/gemma-3-270m |   google/gemma-3-1B-pt |   TucanoBR/Tucano-630m |   HuggingFaceTB/SmolLM2-360M |   Qwen/Qwen3-0.6B-Base |
|:----------------------------------------------------------|------------------------:|----------------------:|-----------------------:|-----------------------:|-----------------------------:|-----------------------:|
| m3exams_por_mcf:0                                         |                0.225    |              0.202273 |               0.196591 |               0.201136 |                     0.197727 |               0.197727 |
| enem_por_mcf:2022:0                                       |                0.212291 |              0.195531 |               0.206704 |               0.178771 |                     0.195531 |               0.206704 |
| enem_por_mcf:2023:0                                       |                0.24581  |              0.24581  |               0.