Results filenames handling fix #1926

Merged: 5 commits, Jun 11, 2024
Changes from 3 commits
18 changes: 7 additions & 11 deletions lm_eval/loggers/evaluation_tracker.py
@@ -17,9 +17,14 @@

from lm_eval.utils import (
eval_logger,
get_file_datetime,
get_file_task_name,
get_results_filenames,
get_sample_results_filenames,
handle_non_serializable,
hash_string,
sanitize_list,
sanitize_task_name,
)


@@ -319,23 +324,14 @@ def recreate_metadata_card(self) -> None:
Creates a metadata card for the evaluation results dataset and pushes it to the Hugging Face hub.
"""

def get_file_task_name(filename: str) -> str:
return filename[filename.find("_") + 1 : filename.rfind("_")]

def get_file_datetime(filename: str) -> str:
return filename[filename.rfind("_") + 1 :].replace(".json", "")

def sanitize_task_name(task_name: str) -> str:
return re.sub(r"\W", "_", task_name)

eval_logger.info("Recreating metadata card")
repo_id = (
self.hub_results_repo if self.public_repo else self.hub_results_repo_private
)

files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset")
results_files = [f for f in files_in_repo if "/results_" in f and ".json" in f]
sample_files = [f for f in files_in_repo if "/samples_" in f and ".json" in f]
results_files = get_results_filenames(files_in_repo)
sample_files = get_sample_results_filenames(files_in_repo)

# Build a dictionary to store the latest evaluation datetime for:
# - Each tested model and its aggregated results
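The refactor in recreate_metadata_card replaces the inline helper functions and list comprehensions with the shared helpers now exported from lm_eval.utils. A minimal sketch of the new filtering step, using hypothetical repository filenames:

from lm_eval.utils import get_results_filenames, get_sample_results_filenames

# Hypothetical listing as returned by HfApi.list_repo_files for a results dataset.
files_in_repo = [
    "my-model/results_2024-06-10T12-00-00.json",
    "my-model/samples_hellaswag_2024-06-10T12-00-00.json",
    "my-model/README.md",
]

results_files = get_results_filenames(files_in_repo)        # keeps ".../results_*.json" entries
sample_files = get_sample_results_filenames(files_in_repo)  # keeps ".../samples_*.json" entries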
42 changes: 42 additions & 0 deletions lm_eval/utils.py
@@ -152,6 +152,48 @@ def general_detokenize(string):
return string


def get_file_task_name(filename: str) -> str:
"""
Given the sample results filenames, extracts and returns the task name.
"""
return filename[filename.find("_") + 1 : filename.rfind("_")]


def get_file_datetime(filename: str) -> str:
"""
Given the results and sample results filenames, extracts and returns the datetime.
"""
return filename[filename.rfind("_") + 1 :].replace(".json", "")


def sanitize_task_name(task_name: str) -> str:
"""
Given the task name, returns a sanitized version of it.
"""
return re.sub(r"\W", "_", task_name)


def get_latest_filename(filenames: List[str]) -> str:
"""
Given a list of filenames, returns the filename with the latest datetime.
"""
return max(filenames, key=lambda f: get_file_datetime(f))


def get_results_filenames(filenames: List[str]) -> List[str]:
"""
Extracts filenames that correspond to aggregated results.
"""
return [f for f in filenames if "/results_" in f and ".json" in f]


def get_sample_results_filenames(filenames: List[str]) -> List[str]:
"""
Extracts filenames that correspond to sample results.
"""
return [f for f in filenames if "/samples_" in f and ".json" in f]


def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len):
"""
- context_len allows for a rolling window context, allowing each prediction window to potentially
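These helpers assume the filename layout the tracker writes, roughly results_<datetime>.json for aggregated results and samples_<task>_<datetime>.json for per-sample results. A rough illustration of what each helper returns, with made-up filenames:

from lm_eval.utils import (
    get_file_datetime,
    get_file_task_name,
    get_latest_filename,
    sanitize_task_name,
)

# Made-up filenames following the "<prefix>_<task>_<datetime>.json" pattern assumed above.
sample_file = "samples_mmlu_flan_cot_2024-06-10T12-03-44.json"

get_file_task_name(sample_file)      # "mmlu_flan_cot" (text between the first and last "_")
get_file_datetime(sample_file)       # "2024-06-10T12-03-44"
sanitize_task_name("mmlu (5-shot)")  # "mmlu__5_shot_" (every non-word character becomes "_")

get_latest_filename([
    "results_2024-06-09T08-00-00.json",
    "results_2024-06-10T12-03-44.json",
])  # "results_2024-06-10T12-03-44.json"

Note that get_latest_filename relies on the embedded datetimes comparing correctly as plain strings, which holds for ISO-style timestamps like the ones above.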
61 changes: 52 additions & 9 deletions scripts/zeno_visualize.py
@@ -3,11 +3,17 @@
import os
import re
from pathlib import Path
from typing import List

import pandas as pd
from zeno_client import ZenoClient, ZenoMetric

from lm_eval.utils import eval_logger
from lm_eval.utils import (
eval_logger,
get_latest_filename,
get_results_filenames,
get_sample_results_filenames,
)


def parse_args():
@@ -45,13 +51,23 @@ def main():

assert len(models) > 0, "No model directories found in the data_path."

tasks = set(tasks_for_model(models[0], args.data_path))
# Get the tasks from the latest results file of the first model.
model_dir = Path(args.data_path, models[0])
model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
model_results_filenames = get_results_filenames(model_files)
latest_results = get_latest_filename(model_results_filenames)
tasks = set(tasks_for_results(latest_results))

for model in models: # Make sure that all models have the same tasks.
# Get tasks names from the latest results file for each model
# Get intersection of tasks for all models
for model in models:
old_tasks = tasks.copy()
task_count = len(tasks)

model_tasks = tasks_for_model(model, args.data_path)
model_dir = Path(args.data_path, model)
model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
model_results_filenames = get_results_filenames(model_files)
latest_results = get_latest_filename(model_results_filenames)
model_tasks = set(tasks_for_results(Path(latest_results)))
tasks.intersection(set(model_tasks))

if task_count != len(tasks):
@@ -66,22 +82,36 @@
for task in tasks:
# Upload data for all models
for model_index, model in enumerate(models):
# Get latest results and sample results for a model
model_dir = Path(args.data_path, model)
model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
model_results_filenames = get_results_filenames(model_files)
model_sample_filenames = get_sample_results_filenames(model_files)
latest_results = get_latest_filename(
[Path(f).name for f in model_results_filenames]
)
latest_sample_results = get_latest_filename(
[Path(f).name for f in model_sample_filenames if task in f]
)
model_args = re.sub(
r"[\"<>:/\|\\?\*\[\]]+",
"__",
json.load(
open(Path(args.data_path, model, "results.json"), encoding="utf-8")
open(Path(args.data_path, model, latest_results), encoding="utf-8")
)["config"]["model_args"],
)
print(model_args)
data = []
with open(
Path(args.data_path, model, f"{model_args}_{task}.jsonl"),
Path(args.data_path, model, latest_sample_results),
"r",
encoding="utf-8",
) as file:
data = json.loads(file.read())
for line in file:
data.append(json.loads(line.strip()))

configs = json.load(
open(Path(args.data_path, model, "results.json"), encoding="utf-8")
open(Path(args.data_path, model, latest_results), encoding="utf-8")
)["configs"]
config = configs[task]

@@ -115,6 +145,19 @@
)


def tasks_for_results(results_filename: str) -> List[str]:
"""Get the tasks from a specific results file.

Args:
results_filename (str): The path to the results file.

Returns:
list: A list of tasks for the model.
"""
config = (json.load(open(results_filename, encoding="utf-8"))["configs"],)
return list(config[0].keys())


def tasks_for_model(model: str, data_path: str):
"""Get the tasks for a specific model.

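The new tasks_for_results helper reads the "configs" mapping from a results JSON and returns its keys, so the set of uploaded tasks now follows whichever results file is newest rather than a hard-coded results.json. A hedged sketch of the per-model selection logic, with a hypothetical output directory:

from pathlib import Path
from lm_eval.utils import get_latest_filename, get_results_filenames

model_dir = Path("output/my-model")  # hypothetical model output directory
model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
latest_results = get_latest_filename(get_results_filenames(model_files))
tasks = tasks_for_results(latest_results)  # e.g. ["hellaswag", "mmlu"] for that run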