diff --git a/benchmarks/README.md b/benchmarks/README.md
index 716931c..3e82680 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -28,8 +28,9 @@ Provide run.py Python script with following arguments:
 - -p, prompt size(s) to benchmark, size of an input prompt; multiple prompt sizes can be provided and they will be treated as separate cases to benchmark
 - -r, thread-range, e.g., on an 80-thread system, it should be input as 0-79, unless user wants to use just a subset of available threads, say 16-63 (48 threads indexed 16<>63)
 - -fa, 0/1, disable/enable flash attention, default: 0
+- -d, docker_image, docker image used for benchmarking, default: amperecomputingai/llama.cpp:latest
 
 ```bash
-python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 10 16 32 40 64 80 -b 1 2 4 8 16 32 64 -p 512 -r 0-79 -fa 1
+python3 run.py -m Meta-Llama-3-8B-Instruct.Q8_0.gguf -t 10 16 32 40 64 80 -b 1 2 4 8 16 32 64 -p 512 -r 0-79 -fa 1 -d amperecomputingai/llama.cpp:3.2.1-ampereone
 ```
 ## Quick run on 80t OCI A1 system
diff --git a/benchmarks/run.py b/benchmarks/run.py
index fd58448..a09978b 100644
--- a/benchmarks/run.py
+++ b/benchmarks/run.py
@@ -6,24 +6,24 @@ import subprocess
 
 from utils.benchmark import parse_threads_range
 
-
 def get_file_dir():
     return os.path.dirname(os.path.realpath(__file__))
 
-
 def docker_init(node):
-    tag = "amperecomputingai/llama.cpp:3.1.2"
-    if subprocess.run(
-            ["docker", "pull", tag]).returncode != 0:
-        print("Docker pull process failed!")
-        sys.exit(1)
+
+    if subprocess.run(["docker", "inspect", docker_image], capture_output=True).returncode != 0:
+        print(f"Docker image {docker_image} doesn't exist, trying to pull it.")
+        if subprocess.run(["docker", "pull", docker_image], capture_output=True).returncode != 0:
+            print("Docker pull process failed!")
+            sys.exit(1)
+
     container_name = f"llama_benchmark_n{node}"
-    subprocess.run(["docker", "rm", "-f", container_name])
+    subprocess.run(["docker", "rm", "-f", container_name], capture_output=True)
     memory = (psutil.virtual_memory().total >> 30) - 30  # leave 30GB for OS
     assert memory > 10, "less than 10GB of memory available on the system for llama.cpp"
     if subprocess.run(
             ["docker", "run", "--privileged=true", "--cpuset-mems", f"{str(node)}", "--name", container_name, "-d", "-m", f"{str(memory)}g", "-v",
-             f"{get_file_dir()}:/runner", "--entrypoint", "/bin/bash", "-it", tag]).returncode != 0:
+             f"{get_file_dir()}:/runner", "--entrypoint", "/bin/bash", "-it", docker_image], capture_output=True).returncode != 0:
         print("Docker run process failed!")
         sys.exit(1)
     return container_name
@@ -33,13 +33,13 @@ def docker_restart(docker_name):
     break_time = 15
 
     def docker_stop():
-        if subprocess.run(["docker", "stop", docker_name]).returncode != 0:
+        if subprocess.run(["docker", "stop", docker_name], capture_output=True).returncode != 0:
             print(f"Stopping docker container {docker_name} failed, retrying in {break_time} seconds.")
             time.sleep(break_time)
             docker_stop()
 
     def docker_start():
-        if subprocess.run(["docker", "start", docker_name]).returncode != 0:
+        if subprocess.run(["docker", "start", docker_name], capture_output=True).returncode != 0:
             print(f"Starting docker container {docker_name} failed, retrying in {break_time} seconds.")
             time.sleep(break_time)
             docker_start()
@@ -94,6 +94,9 @@ def parse_args():
     parser.add_argument("-m", "--model_names",
                         type=str, required=True, nargs="+",
                         help="model names, e.g.
'Meta-Llama-3-8B-Instruct.Q8_0.gguf'")
+    parser.add_argument("-d", "--docker_image",
+                        type=str, default="amperecomputingai/llama.cpp:latest",
+                        help="docker image used for benchmarking, default: amperecomputingai/llama.cpp:latest")
     parser.add_argument("-t", "--num_threads",
                         type=int, required=True, nargs="+",
                         help="number of threads per process to use")
@@ -121,7 +124,11 @@ def main():
+    global docker_image
+
     args = parse_args()
+    docker_image = args.docker_image
+    print(f"Testing with docker image {docker_image}")
     benchmark(docker_init(args.numa), args)
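A note for reviewers: the core of this patch is the inspect-then-pull logic added to `docker_init`. Below is a minimal, standalone sketch of that pattern; the `ensure_image` wrapper and the example tag are illustrative, not part of the patch:

```python
import subprocess
import sys


def ensure_image(docker_image):
    # `docker inspect` exits non-zero when no local object matches the name,
    # so a failing inspect means the image is absent and must be pulled.
    # capture_output=True keeps docker's own stdout/stderr off the console.
    if subprocess.run(["docker", "inspect", docker_image],
                      capture_output=True).returncode != 0:
        print(f"Docker image {docker_image} doesn't exist, trying to pull it.")
        if subprocess.run(["docker", "pull", docker_image],
                          capture_output=True).returncode != 0:
            print("Docker pull process failed!")
            sys.exit(1)


if __name__ == "__main__":
    ensure_image("amperecomputingai/llama.cpp:latest")  # example tag
```

Compared to the old behavior (unconditionally pulling a hard-coded `tag`), this avoids a network round-trip when the requested image is already present locally and lets the image be chosen per run via `-d/--docker_image`.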