diff --git a/converter/download_and_convert_model.sh b/converter/download_and_convert_model.sh
index d2a99b0..da36afd 100755
--- a/converter/download_and_convert_model.sh
+++ b/converter/download_and_convert_model.sh
@@ -17,6 +17,8 @@ fi
 
 echo ${MODEL_TYPE}
 
+export HUGGINGFACE_HUB_CACHE=/root/.cache/huggingface/hub
+
 # If the model type is codegen, then we need to download the codegen and convert it:
 if [[ ${MODEL_TYPE} == codegen ]]; then
     exit
diff --git a/converter/huggingface_opt_convert.py b/converter/huggingface_opt_convert.py
index 795344d..3d72b0e 100644
--- a/converter/huggingface_opt_convert.py
+++ b/converter/huggingface_opt_convert.py
@@ -401,7 +401,7 @@ def split_and_convert(args):
     parser.add_argument('-trained_gpu_num', '-t_g', type=int, help='How many gpus for inference', default=1)
     parser.add_argument('-infer_gpu_num', '-i_g', type=int, help='How many gpus for inference', required=True)
     parser.add_argument("-processes", "-p", type=int, help="How many processes to spawn for conversion (default: 4)", default=4)
-    parser.add_argument("-weight_data_type", type=str, default="fp32", choices=["fp32", "fp16"])
+    parser.add_argument("-weight_data_type", type=str, default="fp16", choices=["fp32", "fp16"])
     parser.add_argument("-quantize", help="Store selected in int8, run the models to determine scaling factors", action="store_true")
 
diff --git a/converter/models/galactica-125m-1gpu/fastertransformer/config.pbtxt b/converter/models/galactica-125m-1gpu/fastertransformer/config.pbtxt
index 3d63710..6e3409e 100644
--- a/converter/models/galactica-125m-1gpu/fastertransformer/config.pbtxt
+++ b/converter/models/galactica-125m-1gpu/fastertransformer/config.pbtxt
@@ -93,7 +93,7 @@ input [
   },
   {
     name: "random_seed"
-    data_type: TYPE_UINT64
+    data_type: TYPE_INT32
     dims: [ 1 ]
     reshape: { shape: [ ] }
     optional: true
diff --git a/converter/models/galactica-6.7b-1gpu/fastertransformer/config.pbtxt b/converter/models/galactica-6.7b-1gpu/fastertransformer/config.pbtxt
new file mode 100644
index 0000000..061d75d
--- /dev/null
+++ b/converter/models/galactica-6.7b-1gpu/fastertransformer/config.pbtxt
@@ -0,0 +1,331 @@
+name: "fastertransformer"
+backend: "fastertransformer"
+default_model_filename: "facebook/galactica-6.7b"
+max_batch_size: 1024
+
+model_transaction_policy {
+  decoupled: False
+}
+
+
+dynamic_batching {
+  max_queue_delay_microseconds: 50000
+}
+
+batch_input [
+  {
+    kind: BATCH_ITEM_SHAPE
+    target_name: "input_ids_item_shape"
+    data_type: TYPE_INT32
+    source_input: "input_ids"
+  }
+]
+
+# this is not needed when not using request prompt embedding
+#batch_input [
+#  {
+#    kind: BATCH_ITEM_SHAPE
+#    target_name: "request_prompt_embedding_item_shape"
+#    data_type: TYPE_INT32
+#    source_input: "request_prompt_embedding"
+#  }
+#]
+
+input [
+  {
+    name: "input_ids"
+    data_type: TYPE_UINT32
+    dims: [ -1 ]
+    allow_ragged_batch: true
+  },
+  {
+    name: "input_lengths"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+  },
+  {
+    name: "request_output_len"
+    data_type: TYPE_UINT32
+    dims: [ -1 ]
+  },
+  {
+    name: "runtime_top_k"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "runtime_top_p"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "beam_search_diversity_rate"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "temperature"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "len_penalty"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "repetition_penalty"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "random_seed"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "is_return_log_probs"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "is_return_context_embeddings"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "beam_width"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "start_id"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "end_id"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "stop_words_list"
+    data_type: TYPE_INT32
+    dims: [ 2, -1 ]
+    optional: true
+  },
+  {
+    name: "bad_words_list"
+    data_type: TYPE_INT32
+    dims: [ 2, -1 ]
+    optional: true
+  },
+  {
+    name: "prompt_learning_task_name_ids"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "request_prompt_embedding"
+    data_type: TYPE_FP16
+    dims: [ -1, -1 ]
+    optional: true
+    allow_ragged_batch: false
+  },
+  {
+    name: "request_prompt_lengths"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "request_prompt_type"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "top_p_decay"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "top_p_min"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "top_p_reset_ids"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  }
+]
+output [
+  {
+    name: "output_ids"
+    data_type: TYPE_UINT32
+    dims: [ -1, -1 ]
+  },
+  {
+    name: "sequence_length"
+    data_type: TYPE_UINT32
+    dims: [ -1 ]
+  },
+  {
+    name: "response_input_lengths"
+    data_type: TYPE_INT32
+    dims: [ -1 ]
+  },
+  {
+    name: "cum_log_probs"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+  },
+  {
+    name: "output_log_probs"
+    data_type: TYPE_FP32
+    dims: [ -1, -1 ]
+  },
+  {
+    name: "context_embeddings"
+    data_type: TYPE_FP32
+    dims: [ -1, -1 ]
+  }
+]
+instance_group [
+  {
+    count: 1
+    kind : KIND_CPU
+  }
+]
+parameters {
+  key: "tensor_para_size"
+  value: {
+    string_value: "1"
+  }
+}
+parameters {
+  key: "pipeline_para_size"
+  value: {
+    string_value: "1"
+  }
+}
+parameters {
+  key: "max_seq_len"
+  value: {
+    string_value: "2048"
+  }
+}
+parameters {
+  key: "is_half"
+  value: {
+    string_value: "1"
+  }
+}
+parameters {
+  key: "head_num"
+  value: {
+    string_value: "32"
+  }
+}
+parameters {
+  key: "size_per_head"
+  value: {
+    string_value: "128"
+  }
+}
+parameters {
+  key: "inter_size"
+  value: {
+    string_value: "16384"
+  }
+}
+parameters {
+  key: "vocab_size"
+  value: {
+    string_value: "50000"
+  }
+}
+parameters {
+  key: "start_id"
+  value: {
+    string_value: "0"
+  }
+}
+parameters {
+  key: "end_id"
+  value: {
+    string_value: "2"
+  }
+}
+parameters {
+  key: "decoder_layers"
+  value: {
+    string_value: "32"
+  }
+}
+parameters {
+  key: "data_type"
+  value: {
+    string_value: "fp16"
+  }
+}
+parameters {
+  key: "model_type"
+  value: {
+    string_value: "GPT"
+  }
+}
+parameters {
+  key: "model_checkpoint_path"
+  value: {
+    string_value: "/model/fastertransformer/1/1-gpu"
+  }
+}
+parameters {
+  key: "int8_mode"
+  value: {
+    string_value: "0"
+  }
+}
+parameters {
+  key: "enable_custom_all_reduce"
+  value: {
+    string_value: "0"
+  }
+}
diff --git a/converter/triton_config_gen.py b/converter/triton_config_gen.py
index caf80e5..8d74b11 100644
--- a/converter/triton_config_gen.py
+++ b/converter/triton_config_gen.py
@@ -3,7 +3,7 @@ import argparse
 import os
 from string import Template
 
-from transformers import GPTJConfig, AutoTokenizer
+from transformers import AutoTokenizer, AutoConfig
 import torch
 
 def round_up(x, multiple):
@@ -42,9 +42,10 @@
 # Global options
 if args.hf_model_dir.endswith('/'):
     args.hf_model_dir = args.hf_model_dir[:-1]
-config = GPTJConfig.from_pretrained(args.hf_model_dir)
+
+config = AutoConfig.from_pretrained(args.hf_model_dir)
 tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
-max_seq_len = config.n_positions
+max_seq_len = config.n_positions if hasattr(config, 'n_positions') else config.max_position_embeddings
 is_half = '1' if config.torch_dtype == torch.float16 else '0'
 
 # Read in the template config file
@@ -58,15 +59,16 @@ def round_up(x, multiple):
 params['name'] = model_name
 params['max_seq_len'] = max_seq_len
 params['is_half'] = is_half
-params['head_num'] = config.n_head
-params['size_per_head'] = config.n_embd // config.n_head
-params['inter_size'] = 4*config.n_embd
+params['head_num'] = config.n_head if hasattr(config, 'n_head') else config.num_attention_heads
+n_embd = config.n_embd if hasattr(config, 'n_embd') else config.hidden_size
+params['size_per_head'] = n_embd // params['head_num']
+params['inter_size'] = 4*n_embd
 # Vocab size gets rounded up to a multiple of 1024
 params['vocab_size'] = round_up(tokenizer.vocab_size, 1024)
 params['start_id'] = tokenizer.eos_token_id
 params['end_id'] = tokenizer.eos_token_id
-params['decoder_layers'] = config.n_layer
-params['rotary_embedding'] = config.rotary_dim
+params['decoder_layers'] = config.n_layer if hasattr(config, 'n_layer') else config.num_hidden_layers
+params['rotary_embedding'] = config.rotary_dim if hasattr(config, 'rotary_dim') else 0
 # NOTE: this assumes that the model dir follows the format used by the other conversion scripts
 model_dir = os.path.join(args.model_store, f'{model_name}-{args.num_gpu}gpu')
 weights_path = os.path.join(model_dir, 'fastertransformer', f'{version}', f'{args.num_gpu}-gpu')
@@ -86,4 +88,4 @@ def round_up(x, multiple):
 print(f'Created config file for {model_name}')
 print(f' Config: {config_path}')
 print(f' Weights: {weights_path}')
-print('==========================================================')
\ No newline at end of file
+print('==========================================================')
diff --git a/copilot_proxy/models.py b/copilot_proxy/models.py
index 7722bc4..74493c0 100644
--- a/copilot_proxy/models.py
+++ b/copilot_proxy/models.py
@@ -4,7 +4,7 @@
 
 
 class OpenAIinput(BaseModel):
-    model: str = "py-model"
+    model: str = "fastertransformer"
     prompt: Optional[str]
     suffix: Optional[str]
     max_tokens: Optional[int] = 16
diff --git a/copilot_proxy/utils/codegen.py b/copilot_proxy/utils/codegen.py
index 43ecab9..a4a8f44 100644
--- a/copilot_proxy/utils/codegen.py
+++ b/copilot_proxy/utils/codegen.py
@@ -18,7 +18,7 @@ class CodeGenProxy:
     def __init__(self, host: str = 'triton', port: int = 8001, verbose: bool = False):
         #self.tokenizer = Tokenizer.from_file('/python-docker/cgtok/tokenizer.json')
-        self.tokenizer = Tokenizer.from_pretrained('facebook/galactica-30b')
+        self.tokenizer = Tokenizer.from_pretrained('facebook/galactica-6.7b')
         self.client = client_util.InferenceServerClient(url=f'{host}:{port}', verbose=verbose)
         self.PAD_CHAR = 1
diff --git a/setup.sh b/setup.sh
index e5d9dcf..eb04bc4 100755
--- a/setup.sh
+++ b/setup.sh
@@ -121,13 +121,12 @@ function fastertransformer_backend(){
#        rm -f "$ARCHIVE"
#    else
         echo "Downloading and converting the model, this will take a while..."
-        docker run --rm -v "${MODELS_ROOT_DIR}":/models -e MODEL=${MODEL} -e NUM_GPUS="${NUM_GPUS}" fauxpilot-converter
+        echo docker run --rm -v $(realpath converter):/workspace -v ~/.cache/huggingface:/root/.cache/huggingface -v "${MODELS_ROOT_DIR}":/models -e MODEL=${MODEL} -e NUM_GPUS="${NUM_GPUS}" fauxpilot-converter
#    fi
     fi
 
     # Not used for this backend but needs to be present
-    HF_CACHE_DIR="$(pwd)/.hf_cache"
-    mkdir -p "$HF_CACHE_DIR"
+    HF_CACHE_DIR=$(realpath ~/.cache/huggingface)
     echo "HF_CACHE_DIR=${HF_CACHE_DIR}" >> .env
 }