forked from fauxpilot/fauxpilot
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add Galactica models to FasterTransformer fauxpilot#2
Triton with FasterTransformer for Galactica 6.7b works now
- Loading branch information
Showing
8 changed files
with
350 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
331 changes (331 additions, 0 deletions): converter/models/galactica-6.7b-1gpu/fastertransformer/config.pbtxt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,331 @@ | ||
# Triton Inference Server model configuration (protobuf text format) for
# serving facebook/galactica-6.7b through the FasterTransformer backend
# on a single GPU (tensor_para_size=1, pipeline_para_size=1).
name: "fastertransformer"
backend: "fastertransformer"
default_model_filename: "facebook/galactica-6.7b"
max_batch_size: 1024

model_transaction_policy {
  # Non-decoupled: one response per request (no token streaming).
  decoupled: False
}

dynamic_batching {
  # Wait up to 50 ms to batch incoming requests together.
  max_queue_delay_microseconds: 50000
}

# Per-item shape of the ragged "input_ids" input, so the backend can
# recover individual sequence shapes from the ragged batch.
batch_input [
  {
    kind: BATCH_ITEM_SHAPE
    target_name: "input_ids_item_shape"
    data_type: TYPE_INT32
    source_input: "input_ids"
  }
]

# this is not needed when not using request prompt embedding
#batch_input [
#  {
#    kind: BATCH_ITEM_SHAPE
#    target_name: "request_prompt_embedding_item_shape"
#    data_type: TYPE_INT32
#    source_input: "request_prompt_embedding"
#  }
#]

input [
  {
    # Tokenized prompt; ragged so sequences of different lengths batch together.
    name: "input_ids"
    data_type: TYPE_UINT32
    dims: [ -1 ]
    allow_ragged_batch: true
  },
  {
    # Length of each prompt in tokens.
    name: "input_lengths"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
  },
  {
    # Number of tokens to generate.
    name: "request_output_len"
    data_type: TYPE_UINT32
    dims: [ -1 ]
  },
  {
    name: "runtime_top_k"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "runtime_top_p"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "beam_search_diversity_rate"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "temperature"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "len_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "repetition_penalty"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "random_seed"
    data_type: TYPE_INT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "is_return_log_probs"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "is_return_context_embeddings"
    data_type: TYPE_BOOL
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "beam_width"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "start_id"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "end_id"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    # [2, -1]: token-id rows and offset rows, per FT stop/bad-words encoding.
    name: "stop_words_list"
    data_type: TYPE_INT32
    dims: [ 2, -1 ]
    optional: true
  },
  {
    name: "bad_words_list"
    data_type: TYPE_INT32
    dims: [ 2, -1 ]
    optional: true
  },
  {
    name: "prompt_learning_task_name_ids"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "request_prompt_embedding"
    data_type: TYPE_FP16
    dims: [ -1, -1 ]
    optional: true
    allow_ragged_batch: false
  },
  {
    name: "request_prompt_lengths"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "request_prompt_type"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "top_p_decay"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "top_p_min"
    data_type: TYPE_FP32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  },
  {
    name: "top_p_reset_ids"
    data_type: TYPE_UINT32
    dims: [ 1 ]
    reshape: { shape: [ ] }
    optional: true
  }
]
output [
  {
    # Generated token ids: [beam_width, seq_len].
    name: "output_ids"
    data_type: TYPE_UINT32
    dims: [ -1, -1 ]
  },
  {
    name: "sequence_length"
    data_type: TYPE_UINT32
    dims: [ -1 ]
  },
  {
    name: "response_input_lengths"
    data_type: TYPE_INT32
    dims: [ -1 ]
  },
  {
    name: "cum_log_probs"
    data_type: TYPE_FP32
    dims: [ -1 ]
  },
  {
    name: "output_log_probs"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  },
  {
    name: "context_embeddings"
    data_type: TYPE_FP32
    dims: [ -1, -1 ]
  }
]
instance_group [
  {
    # KIND_CPU is conventional for the FT backend; GPU placement is
    # managed by the backend itself via tensor/pipeline parallel params.
    count: 1
    kind: KIND_CPU
  }
]
parameters {
  key: "tensor_para_size"
  value: {
    string_value: "1"
  }
}
parameters {
  key: "pipeline_para_size"
  value: {
    string_value: "1"
  }
}
parameters {
  key: "max_seq_len"
  value: {
    string_value: "2048"
  }
}
parameters {
  key: "is_half"
  value: {
    string_value: "1"
  }
}
# Model hyperparameters below are for galactica-6.7b (32 layers, 32 heads,
# head dim 128, FFN 16384, vocab 50000) — NOTE(review): verify against the
# Hugging Face model config if the checkpoint is regenerated.
parameters {
  key: "head_num"
  value: {
    string_value: "32"
  }
}
parameters {
  key: "size_per_head"
  value: {
    string_value: "128"
  }
}
parameters {
  key: "inter_size"
  value: {
    string_value: "16384"
  }
}
parameters {
  key: "vocab_size"
  value: {
    string_value: "50000"
  }
}
parameters {
  key: "start_id"
  value: {
    string_value: "0"
  }
}
parameters {
  key: "end_id"
  value: {
    string_value: "2"
  }
}
parameters {
  key: "decoder_layers"
  value: {
    string_value: "32"
  }
}
parameters {
  key: "data_type"
  value: {
    string_value: "fp16"
  }
}
parameters {
  key: "model_type"
  value: {
    string_value: "GPT"
  }
}
parameters {
  key: "model_checkpoint_path"
  value: {
    string_value: "/model/fastertransformer/1/1-gpu"
  }
}
parameters {
  key: "int8_mode"
  value: {
    string_value: "0"
  }
}
parameters {
  key: "enable_custom_all_reduce"
  value: {
    string_value: "0"
  }
}
Oops, something went wrong.