Results filenames handling fix #1926

Merged: 5 commits, Jun 11, 2024
Changes from 3 commits
18 changes: 7 additions & 11 deletions lm_eval/loggers/evaluation_tracker.py
@@ -17,9 +17,14 @@

from lm_eval.utils import (
eval_logger,
get_file_datetime,
get_file_task_name,
get_results_filenames,
get_sample_results_filenames,
handle_non_serializable,
hash_string,
sanitize_list,
sanitize_task_name,
)


@@ -319,23 +324,14 @@ def recreate_metadata_card(self) -> None:
Creates a metadata card for the evaluation results dataset and pushes it to the Hugging Face hub.
"""

def get_file_task_name(filename: str) -> str:
return filename[filename.find("_") + 1 : filename.rfind("_")]

def get_file_datetime(filename: str) -> str:
return filename[filename.rfind("_") + 1 :].replace(".json", "")

def sanitize_task_name(task_name: str) -> str:
return re.sub(r"\W", "_", task_name)

eval_logger.info("Recreating metadata card")
repo_id = (
self.hub_results_repo if self.public_repo else self.hub_results_repo_private
)

files_in_repo = self.api.list_repo_files(repo_id=repo_id, repo_type="dataset")
results_files = [f for f in files_in_repo if "/results_" in f and ".json" in f]
sample_files = [f for f in files_in_repo if "/samples_" in f and ".json" in f]
results_files = get_results_filenames(files_in_repo)
sample_files = get_sample_results_filenames(files_in_repo)

# Build a dictionary to store the latest evaluation datetime for:
# - Each tested model and its aggregated results
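The refactor in recreate_metadata_card replaces the inline helper functions and list comprehensions with the shared helpers now exported from lm_eval.utils. A minimal sketch of the new filtering step, using hypothetical repository filenames:

from lm_eval.utils import get_results_filenames, get_sample_results_filenames

# Hypothetical listing as returned by HfApi.list_repo_files for a results dataset.
files_in_repo = [
    "my-model/results_2024-06-10T12-00-00.json",
    "my-model/samples_hellaswag_2024-06-10T12-00-00.json",
    "my-model/README.md",
]

results_files = get_results_filenames(files_in_repo)        # keeps ".../results_*.json" entries
sample_files = get_sample_results_filenames(files_in_repo)  # keeps ".../samples_*.json" entries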
42 changes: 42 additions & 0 deletions lm_eval/utils.py
@@ -152,6 +152,48 @@ def general_detokenize(string):
return string


def get_file_task_name(filename: str) -> str:
"""
Given the sample results filenames, extracts and returns the task name.
"""
return filename[filename.find("_") + 1 : filename.rfind("_")]


def get_file_datetime(filename: str) -> str:
"""
Given the results and sample results filenames, extracts and returns the datetime.
"""
return filename[filename.rfind("_") + 1 :].replace(".json", "")


def sanitize_task_name(task_name: str) -> str:
"""
Given the task name, returns a sanitized version of it.
"""
return re.sub(r"\W", "_", task_name)


def get_latest_filename(filenames: List[str]) -> str:
"""
Given a list of filenames, returns the filename with the latest datetime.
"""
return max(filenames, key=lambda f: get_file_datetime(f))


def get_results_filenames(filenames: List[str]) -> List[str]:
"""
Extracts filenames that correspond to aggregated results.
"""
return [f for f in filenames if "/results_" in f and ".json" in f]


def get_sample_results_filenames(filenames: List[str]) -> List[str]:
"""
Extracts filenames that correspond to sample results.
"""
return [f for f in filenames if "/samples_" in f and ".json" in f]


def get_rolling_token_windows(token_list, prefix_token, max_seq_len, context_len):
"""
- context_len allows for a rolling window context, allowing each prediction window to potentially
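These helpers assume the filename layout the tracker writes, roughly results_<datetime>.json for aggregated results and samples_<task>_<datetime>.json for per-sample results. A rough illustration of what each helper returns, with made-up filenames:

from lm_eval.utils import (
    get_file_datetime,
    get_file_task_name,
    get_latest_filename,
    sanitize_task_name,
)

# Made-up filenames following the "<prefix>_<task>_<datetime>.json" pattern assumed above.
sample_file = "samples_mmlu_flan_cot_2024-06-10T12-03-44.json"

get_file_task_name(sample_file)      # "mmlu_flan_cot" (text between the first and last "_")
get_file_datetime(sample_file)       # "2024-06-10T12-03-44"
sanitize_task_name("mmlu (5-shot)")  # "mmlu__5_shot_" (every non-word character becomes "_")

get_latest_filename([
    "results_2024-06-09T08-00-00.json",
    "results_2024-06-10T12-03-44.json",
])  # "results_2024-06-10T12-03-44.json"

Note that get_latest_filename relies on the embedded datetimes comparing correctly as plain strings, which holds for ISO-style timestamps like the ones above.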
61 changes: 52 additions & 9 deletions scripts/zeno_visualize.py
@@ -3,11 +3,17 @@
import os
import re
from pathlib import Path
from typing import List

import pandas as pd
from zeno_client import ZenoClient, ZenoMetric

from lm_eval.utils import eval_logger
from lm_eval.utils import (
eval_logger,
get_latest_filename,
get_results_filenames,
get_sample_results_filenames,
)


def parse_args():
@@ -45,13 +51,23 @@ def main():

assert len(models) > 0, "No model directories found in the data_path."

tasks = set(tasks_for_model(models[0], args.data_path))
# Get the tasks from the latest results file of the first model.
model_dir = Path(args.data_path, models[0])
model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
model_results_filenames = get_results_filenames(model_files)
latest_results = get_latest_filename(model_results_filenames)
tasks = set(tasks_for_results(latest_results))

for model in models: # Make sure that all models have the same tasks.
# Get tasks names from the latest results file for each model
# Get intersection of tasks for all models
for model in models:
old_tasks = tasks.copy()
task_count = len(tasks)

model_tasks = tasks_for_model(model, args.data_path)
model_dir = Path(args.data_path, model)
model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
model_results_filenames = get_results_filenames(model_files)
latest_results = get_latest_filename(model_results_filenames)
model_tasks = set(tasks_for_results(Path(latest_results)))
tasks.intersection(set(model_tasks))

if task_count != len(tasks):
@@ -66,22 +82,36 @@
for task in tasks:
# Upload data for all models
for model_index, model in enumerate(models):
# Get latest results and sample results for a model
model_dir = Path(args.data_path, model)
model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
model_results_filenames = get_results_filenames(model_files)
model_sample_filenames = get_sample_results_filenames(model_files)
latest_results = get_latest_filename(
[Path(f).name for f in model_results_filenames]
)
latest_sample_results = get_latest_filename(
[Path(f).name for f in model_sample_filenames if task in f]
)
model_args = re.sub(
r"[\"<>:/\|\\?\*\[\]]+",
"__",
json.load(
open(Path(args.data_path, model, "results.json"), encoding="utf-8")
open(Path(args.data_path, model, latest_results), encoding="utf-8")
)["config"]["model_args"],
)
print(model_args)
data = []
with open(
Path(args.data_path, model, f"{model_args}_{task}.jsonl"),
Path(args.data_path, model, latest_sample_results),
"r",
encoding="utf-8",
) as file:
data = json.loads(file.read())
for line in file:
data.append(json.loads(line.strip()))

configs = json.load(
open(Path(args.data_path, model, "results.json"), encoding="utf-8")
open(Path(args.data_path, model, latest_results), encoding="utf-8")
)["configs"]
config = configs[task]

@@ -115,6 +145,19 @@
)


def tasks_for_results(results_filename: str) -> List[str]:
"""Get the tasks from a specific results file.

Args:
results_filename (str): The path to the results file.

Returns:
list: A list of tasks for the model.
"""
config = (json.load(open(results_filename, encoding="utf-8"))["configs"],)
return list(config[0].keys())


def tasks_for_model(model: str, data_path: str):
"""Get the tasks for a specific model.

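The new tasks_for_results helper reads the "configs" mapping from a results JSON and returns its keys, so the set of uploaded tasks now follows whichever results file is newest rather than a hard-coded results.json. A hedged sketch of the per-model selection logic, with a hypothetical output directory:

from pathlib import Path
from lm_eval.utils import get_latest_filename, get_results_filenames

model_dir = Path("output/my-model")  # hypothetical model output directory
model_files = [f.as_posix() for f in model_dir.iterdir() if f.is_file()]
latest_results = get_latest_filename(get_results_filenames(model_files))
tasks = tasks_for_results(latest_results)  # e.g. ["hellaswag", "mmlu"] for that run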