You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
lm_eval --model vllm \
    --model_args pretrained=${MODELDIR},tokenizer_mode="slow",tensor_parallel_size=$NUM_GPU,dtype=auto,gpu_memory_utilization=0.8 \
    --tasks arc_challenge \
    --batch_size auto \
    --output_path ${OUTDIR}/${FILE_PREFIX}_arc_challenge_25shot.json \
    --num_fewshot 25 \
    --device cuda
Error
INFO 12-07 17:37:01 llm_engine.py:73] Initializing an LLM engine with config: model='/share5/users/ravi/output/meta/llama2_70b_4k_v15_rs/checkpoint-958/', tokenizer='/share5/users/ravi/output/meta/llama2_70b_4k_v15_rs/checkpoint-958/', tokenizer_mode=slow,
revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=8, quantization=None, seed=1234)
WARNING 12-07 17:37:01 tokenizer.py:79] Using a slow tokenizer. This might cause a significant slowdown. Consider using a fast tokenizer instead.
Traceback (most recent call last):
File "/usr/local/bin/lm_eval", line 33, in <module>
sys.exit(load_entry_point('lm-eval', 'console_scripts', 'lm_eval')())
File "/data/users/ravi/experiments/summarization-research/FastChat/lm-evaluation-harness/lm_eval/__main__.py", line 207, in cli_evaluate
results = evaluator.simple_evaluate(
File "/data/users/ravi/experiments/summarization-research/FastChat/lm-evaluation-harness/lm_eval/utils.py", line 402, in _wrapper
return fn(*args, **kwargs)
File "/data/users/ravi/experiments/summarization-research/FastChat/lm-evaluation-harness/lm_eval/evaluator.py", line 102, in simple_evaluate
lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
File "/data/users/ravi/experiments/summarization-research/FastChat/lm-evaluation-harness/lm_eval/api/model.py", line 136, in create_from_arg_string
return cls(**args, **args2)
File "/data/users/ravi/experiments/summarization-research/FastChat/lm-evaluation-harness/lm_eval/models/vllm_causallms.py", line 53, in __init__
self.model = LLM(
File "/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/llm.py", line 93, in __init__
self.llm_engine = LLMEngine.from_engine_args(engine_args)
File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 246, in from_engine_args
engine = cls(*engine_configs,
File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 112, in __init__
self._init_cache()
File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 208, in _init_cache
num_blocks = self._run_workers(
File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 750, in _run_workers
self._run_workers_in_batch(workers, method, *args, **kwargs))
File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 727, in _run_workers_in_batch
all_outputs = ray.get(all_outputs)
File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2563, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): ray::RayWorkerVllm.execute_method() (pid=31808, ip=10.233.96.98, actor_id=71039832389f57d3ac54592101000000, repr=<vllm.engine.ray_utils.RayWorkerVllm object at 0x7f0eab259120>)
File "/usr/local/lib/python3.10/dist-packages/vllm/engine/ray_utils.py", line 31, in execute_method
return executor(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py", line 88, in profile_num_available_blocks
self.model_runner.profile_run()
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 321, in profile_run
self.execute_model(seqs, kv_caches)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 279, in execute_model
hidden_states = self.model(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/llama.py", line 294, in forward
hidden_states = self.model(input_ids, positions, kv_caches,
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/llama.py", line 255, in forward
hidden_states = self.embed_tokens(input_ids)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 101, in forward
output = tensor_model_parallel_all_reduce(output_parallel)
File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/parallel_utils/communication_op.py", line 18, in tensor_model_parallel_all_reduce
torch.distributed.all_reduce(input,
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 2050, in all_reduce
work = group.allreduce([tensor], opts)
RuntimeError: Inplace update to inference tensor outside InferenceMode is not allowed. You can make a clone to get a normal tensor before doing inplace update. See pytorch/rfcs#17 for more details.
Command:
lm_eval --model vllm \
    --model_args pretrained=${MODELDIR},tokenizer_mode="slow",tensor_parallel_size=$NUM_GPU,dtype=auto,gpu_memory_utilization=0.8 \
    --tasks arc_challenge \
    --batch_size auto \
    --output_path ${OUTDIR}/${FILE_PREFIX}_arc_challenge_25shot.json \
    --num_fewshot 25 \
    --device cuda
Error
INFO 12-07 17:37:01 llm_engine.py:73] Initializing an LLM engine with config: model='/share5/users/ravi/output/meta/llama2_70b_4k_v15_rs/checkpoint-958/', tokenizer='/share5/users/ravi/output/meta/llama2_70b_4k_v15_rs/checkpoint-958/', tokenizer_mode=slow,
revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=auto, tensor_parallel_size=8, quantization=None, seed=1234)
WARNING 12-07 17:37:01 tokenizer.py:79] Using a slow tokenizer. This might cause a significant slowdown. Consider using a fast tokenizer instead.
Traceback (most recent call last):
File "/usr/local/bin/lm_eval", line 33, in <module>
sys.exit(load_entry_point('lm-eval', 'console_scripts', 'lm_eval')())
File "/data/users/ravi/experiments/summarization-research/FastChat/lm-evaluation-harness/lm_eval/__main__.py", line 207, in cli_evaluate
results = evaluator.simple_evaluate(
File "/data/users/ravi/experiments/summarization-research/FastChat/lm-evaluation-harness/lm_eval/utils.py", line 402, in _wrapper
return fn(*args, **kwargs)
File "/data/users/ravi/experiments/summarization-research/FastChat/lm-evaluation-harness/lm_eval/evaluator.py", line 102, in simple_evaluate
lm = lm_eval.api.registry.get_model(model).create_from_arg_string(
File "/data/users/ravi/experiments/summarization-research/FastChat/lm-evaluation-harness/lm_eval/api/model.py", line 136, in create_from_arg_string
return cls(**args, **args2)
File "/data/users/ravi/experiments/summarization-research/FastChat/lm-evaluation-harness/lm_eval/models/vllm_causallms.py", line 53, in __init__
self.model = LLM(
File "/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/llm.py", line 93, in __init__
self.llm_engine = LLMEngine.from_engine_args(engine_args)
File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 246, in from_engine_args
engine = cls(*engine_configs,
File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 112, in __init__
self._init_cache()
File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 208, in _init_cache
num_blocks = self._run_workers(
File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 750, in _run_workers
self._run_workers_in_batch(workers, method, *args, **kwargs))
File "/usr/local/lib/python3.10/dist-packages/vllm/engine/llm_engine.py", line 727, in _run_workers_in_batch
all_outputs = ray.get(all_outputs)
File "/usr/local/lib/python3.10/dist-packages/ray/_private/auto_init_hook.py", line 24, in auto_init_wrapper
return fn(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/ray/_private/worker.py", line 2563, in get
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(RuntimeError): ray::RayWorkerVllm.execute_method() (pid=31808, ip=10.233.96.98, actor_id=71039832389f57d3ac54592101000000, repr=<vllm.engine.ray_utils.RayWorkerVllm object at 0x7f0eab259120>)
File "/usr/local/lib/python3.10/dist-packages/vllm/engine/ray_utils.py", line 31, in execute_method
return executor(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py", line 88, in profile_num_available_blocks
self.model_runner.profile_run()
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 321, in profile_run
self.execute_model(seqs, kv_caches)
File "/usr/local/lib/python3.10/dist-packages/torch/utils/_contextlib.py", line 115, in decorate_context
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/vllm/worker/model_runner.py", line 279, in execute_model
hidden_states = self.model(
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/llama.py", line 294, in forward
hidden_states = self.model(input_ids, positions, kv_caches,
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/models/llama.py", line 255, in forward
hidden_states = self.embed_tokens(input_ids)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1518, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/nn/modules/module.py", line 1527, in _call_impl
return forward_call(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/layers/vocab_parallel_embedding.py", line 101, in forward
output = tensor_model_parallel_all_reduce(output_parallel)
File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/parallel_utils/communication_op.py", line 18, in tensor_model_parallel_all_reduce
torch.distributed.all_reduce(input,
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/c10d_logger.py", line 47, in wrapper
return func(*args, **kwargs)
File "/usr/local/lib/python3.10/dist-packages/torch/distributed/distributed_c10d.py", line 2050, in all_reduce
work = group.allreduce([tensor], opts)
RuntimeError: Inplace update to inference tensor outside InferenceMode is not allowed. You can make a clone to get a normal tensor before doing inplace update. See pytorch/rfcs#17 for more details.
pip install vllm==0.2.2
pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu121
The text was updated successfully, but these errors were encountered: