# Introduction

We provide an evaluation pipeline to analyze results on **lmgame-Bench**. 🎮

- **Code:** <https://github.com/lmgame-org/GamingAgent>  
- **Leaderboard:** <https://huggingface.co/spaces/lmgame/game_arena_bench>  
- **Paper:** <https://arxiv.org/abs/2505.15146>

You can evaluate your model in two ways 📊:

1. **Entire cache directory:** Copy the entire `cache` directory, containing all cached episodes, into the working directory to obtain aggregate performance across all games.  
2. **Single-episode JSONL:** Place exactly one agent config file (`agent_config*.json`) and exactly one episode log file (e.g., `episode_001_log.jsonl`) in the working directory to evaluate performance for a single game episode.


In [1]:
import os
import glob

In [2]:
def _find_single_file(pattern):
    """Return the only path matching *pattern*, or ``None`` if not unique.

    The pattern is resolved by ``glob.glob`` relative to the current working
    directory. When no path or more than one path matches, an error is
    printed and ``None`` is returned, because there is no way to know which
    file was intended.
    """
    # Sort so the "multiple files" message is stable; glob order depends on
    # the underlying filesystem.
    matches = sorted(glob.glob(pattern))

    if not matches:
        print(f"Error: No '{pattern}' file found.")
        return None
    if len(matches) > 1:
        print(f"Error: Multiple '{pattern}' files found: {matches}. Please ensure only one is present.")
        return None
    return matches[0]


def check_evaluation_files(cache_dir_status, single_json_status):
    """Locate the inputs needed for each requested evaluation mode.

    Args:
        cache_dir_status: When truthy, check that a directory named
            ``cache`` exists in the current working directory.
        single_json_status: When truthy, look for exactly one
            ``agent_config*.json`` file and exactly one ``episode_*.jsonl``
            file in the current working directory.

    Returns:
        A dict containing only the keys for the modes that were requested:

        * ``"cache_directory"``: ``"cache"`` if the directory exists,
          otherwise ``None``.
        * ``"json_files"``: ``[agent_config_path, episode_log_path]``; an
          entry is ``None`` when that file is missing or ambiguous.

        Problems are reported with ``print`` rather than raised, so the
        caller always receives a report.
    """
    report = {}

    if cache_dir_status:
        cache_dir = "cache"
        if os.path.isdir(cache_dir):
            report["cache_directory"] = cache_dir
        else:
            report["cache_directory"] = None
            print(f"Error: Cache directory '{cache_dir}' not found.")

    if single_json_status:
        # Agent config is resolved first so its error (if any) prints first.
        agent_file_path = _find_single_file("agent_config*.json")
        episode_file_path = _find_single_file("episode_*.jsonl")
        report["json_files"] = [agent_file_path, episode_file_path]

    return report

# Evaluation mode switches: enable either mode, or both.
use_cache_dir = True    # look for the "cache" directory
use_single_json = True  # look for one agent config and one episode log

# Left as a bare expression so the notebook displays the returned report.
check_evaluation_files(
    cache_dir_status=use_cache_dir,
    single_json_status=use_single_json,
)

{'cache_directory': None, 'agent_config_files': [], 'episode_log_files': []}