diff --git a/converter/download_and_convert_model.sh b/converter/download_and_convert_model.sh
index d2a99b0..da36afd 100755
--- a/converter/download_and_convert_model.sh
+++ b/converter/download_and_convert_model.sh
@@ -17,6 +17,8 @@ fi
 
 echo ${MODEL_TYPE}
 
+export HUGGINGFACE_HUB_CACHE=/root/.cache/huggingface/hub
+
 # If the model type is codegen, then we need to download the codegen and convert it:
 if [[ ${MODEL_TYPE} == codegen ]]; then
     exit
diff --git a/converter/huggingface_opt_convert.py b/converter/huggingface_opt_convert.py
index 795344d..3d72b0e 100644
--- a/converter/huggingface_opt_convert.py
+++ b/converter/huggingface_opt_convert.py
@@ -401,7 +401,7 @@ def split_and_convert(args):
     parser.add_argument('-trained_gpu_num', '-t_g', type=int, help='How many gpus for inference', default=1)
     parser.add_argument('-infer_gpu_num', '-i_g', type=int, help='How many gpus for inference', required=True)
     parser.add_argument("-processes", "-p", type=int, help="How many processes to spawn for conversion (default: 4)", default=4)
-    parser.add_argument("-weight_data_type", type=str, default="fp32", choices=["fp32", "fp16"])
+    parser.add_argument("-weight_data_type", type=str, default="fp16", choices=["fp32", "fp16"])
     parser.add_argument("-quantize", help="Store selected in int8, run the models to determine scaling factors", action="store_true")
 
diff --git a/converter/models/galactica-125m-1gpu/fastertransformer/config.pbtxt b/converter/models/galactica-125m-1gpu/fastertransformer/config.pbtxt
index 3d63710..6e3409e 100644
--- a/converter/models/galactica-125m-1gpu/fastertransformer/config.pbtxt
+++ b/converter/models/galactica-125m-1gpu/fastertransformer/config.pbtxt
@@ -93,7 +93,7 @@ input [
   },
   {
     name: "random_seed"
-    data_type: TYPE_UINT64
+    data_type: TYPE_INT32
     dims: [ 1 ]
     reshape: { shape: [ ] }
     optional: true
diff --git a/converter/models/galactica-6.7b-1gpu/fastertransformer/config.pbtxt b/converter/models/galactica-6.7b-1gpu/fastertransformer/config.pbtxt
new file mode 100644
index 0000000..061d75d
--- /dev/null
+++ b/converter/models/galactica-6.7b-1gpu/fastertransformer/config.pbtxt
@@ -0,0 +1,331 @@
+name: "fastertransformer"
+backend: "fastertransformer"
+default_model_filename: "facebook/galactica-6.7b"
+max_batch_size: 1024
+
+model_transaction_policy {
+  decoupled: False
+}
+
+
+dynamic_batching {
+  max_queue_delay_microseconds: 50000
+}
+
+batch_input [
+  {
+    kind: BATCH_ITEM_SHAPE
+    target_name: "input_ids_item_shape"
+    data_type: TYPE_INT32
+    source_input: "input_ids"
+  }
+]
+
+# this is not needed when not using request prompt embedding
+#batch_input [
+#  {
+#    kind: BATCH_ITEM_SHAPE
+#    target_name: "request_prompt_embedding_item_shape"
+#    data_type: TYPE_INT32
+#    source_input: "request_prompt_embedding"
+#  }
+#]
+
+input [
+  {
+    name: "input_ids"
+    data_type: TYPE_UINT32
+    dims: [ -1 ]
+    allow_ragged_batch: true
+  },
+  {
+    name: "input_lengths"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+  },
+  {
+    name: "request_output_len"
+    data_type: TYPE_UINT32
+    dims: [ -1 ]
+  },
+  {
+    name: "runtime_top_k"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "runtime_top_p"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "beam_search_diversity_rate"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "temperature"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "len_penalty"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "repetition_penalty"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "random_seed"
+    data_type: TYPE_INT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "is_return_log_probs"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "is_return_context_embeddings"
+    data_type: TYPE_BOOL
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "beam_width"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "start_id"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "end_id"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "stop_words_list"
+    data_type: TYPE_INT32
+    dims: [ 2, -1 ]
+    optional: true
+  },
+  {
+    name: "bad_words_list"
+    data_type: TYPE_INT32
+    dims: [ 2, -1 ]
+    optional: true
+  },
+  {
+    name: "prompt_learning_task_name_ids"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "request_prompt_embedding"
+    data_type: TYPE_FP16
+    dims: [ -1, -1 ]
+    optional: true
+    allow_ragged_batch: false
+  },
+  {
+    name: "request_prompt_lengths"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "request_prompt_type"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "top_p_decay"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "top_p_min"
+    data_type: TYPE_FP32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  },
+  {
+    name: "top_p_reset_ids"
+    data_type: TYPE_UINT32
+    dims: [ 1 ]
+    reshape: { shape: [ ] }
+    optional: true
+  }
+]
+output [
+  {
+    name: "output_ids"
+    data_type: TYPE_UINT32
+    dims: [ -1, -1 ]
+  },
+  {
+    name: "sequence_length"
+    data_type: TYPE_UINT32
+    dims: [ -1 ]
+  },
+  {
+    name: "response_input_lengths"
+    data_type: TYPE_INT32
+    dims: [ -1 ]
+  },
+  {
+    name: "cum_log_probs"
+    data_type: TYPE_FP32
+    dims: [ -1 ]
+  },
+  {
+    name: "output_log_probs"
+    data_type: TYPE_FP32
+    dims: [ -1, -1 ]
+  },
+  {
+    name: "context_embeddings"
+    data_type: TYPE_FP32
+    dims: [ -1, -1 ]
+  }
+]
+instance_group [
+  {
+    count: 1
+    kind : KIND_CPU
+  }
+]
+parameters {
+  key: "tensor_para_size"
+  value: {
+    string_value: "1"
+  }
+}
+parameters {
+  key: "pipeline_para_size"
+  value: {
+    string_value: "1"
+  }
+}
+parameters {
+  key: "max_seq_len"
+  value: {
+    string_value: "2048"
+  }
+}
+parameters {
+  key: "is_half"
+  value: {
+    string_value: "1"
+  }
+}
+parameters {
+  key: "head_num"
+  value: {
+    string_value: "32"
+  }
+}
+parameters {
+  key: "size_per_head"
+  value: {
+    string_value: "128"
+  }
+}
+parameters {
+  key: "inter_size"
+  value: {
+    string_value: "16384"
+  }
+}
+parameters {
+  key: "vocab_size"
+  value: {
+    string_value: "50000"
+  }
+}
+parameters {
+  key: "start_id"
+  value: {
+    string_value: "0"
+  }
+}
+parameters {
+  key: "end_id"
+  value: {
+    string_value: "2"
+  }
+}
+parameters {
+  key: "decoder_layers"
+  value: {
+    string_value: "32"
+  }
+}
+parameters {
+  key: "data_type"
+  value: {
+    string_value: "fp16"
+  }
+}
+parameters {
+  key: "model_type"
+  value: {
+    string_value: "GPT"
+  }
+}
+parameters {
+  key: "model_checkpoint_path"
+  value: {
+    string_value: "/model/fastertransformer/1/1-gpu"
+  }
+}
+parameters {
+  key: "int8_mode"
+  value: {
+    string_value: "0"
+  }
+}
+parameters {
+  key: "enable_custom_all_reduce"
+  value: {
+    string_value: "0"
+  }
+}
diff --git a/converter/triton_config_gen.py b/converter/triton_config_gen.py
index caf80e5..8d74b11 100644
--- a/converter/triton_config_gen.py
+++ b/converter/triton_config_gen.py
@@ -3,7 +3,7 @@ import argparse
 import os
 from string import Template
 
-from transformers import GPTJConfig, AutoTokenizer
+from transformers import AutoTokenizer, AutoConfig
 import torch
 
 def round_up(x, multiple):
@@ -42,9 +42,10 @@
 # Global options
 if args.hf_model_dir.endswith('/'):
     args.hf_model_dir = args.hf_model_dir[:-1]
-config = GPTJConfig.from_pretrained(args.hf_model_dir)
+
+config = AutoConfig.from_pretrained(args.hf_model_dir)
 tokenizer = AutoTokenizer.from_pretrained(args.tokenizer)
-max_seq_len = config.n_positions
+max_seq_len = config.n_positions if hasattr(config, 'n_positions') else config.max_position_embeddings
 is_half = '1' if config.torch_dtype == torch.float16 else '0'
 
 # Read in the template config file
@@ -58,15 +59,16 @@ def round_up(x, multiple):
 params['name'] = model_name
 params['max_seq_len'] = max_seq_len
 params['is_half'] = is_half
-params['head_num'] = config.n_head
-params['size_per_head'] = config.n_embd // config.n_head
-params['inter_size'] = 4*config.n_embd
+params['head_num'] = config.n_head if hasattr(config, 'n_head') else config.num_attention_heads
+n_embd = config.n_embd if hasattr(config, 'n_embd') else config.hidden_size
+params['size_per_head'] = n_embd // params['head_num']
+params['inter_size'] = 4*n_embd
 # Vocab size gets rounded up to a multiple of 1024
 params['vocab_size'] = round_up(tokenizer.vocab_size, 1024)
 params['start_id'] = tokenizer.eos_token_id
 params['end_id'] = tokenizer.eos_token_id
-params['decoder_layers'] = config.n_layer
-params['rotary_embedding'] = config.rotary_dim
+params['decoder_layers'] = config.n_layer if hasattr(config, 'n_layer') else config.num_hidden_layers
+params['rotary_embedding'] = config.rotary_dim if hasattr(config, 'rotary_dim') else 0
 # NOTE: this assumes that the model dir follows the format used by the other conversion scripts
 model_dir = os.path.join(args.model_store, f'{model_name}-{args.num_gpu}gpu')
 weights_path = os.path.join(model_dir, 'fastertransformer', f'{version}', f'{args.num_gpu}-gpu')
@@ -86,4 +88,4 @@ def round_up(x, multiple):
 print(f'Created config file for {model_name}')
 print(f' Config: {config_path}')
 print(f' Weights: {weights_path}')
-print('==========================================================')
\ No newline at end of file
+print('==========================================================')
diff --git a/copilot_proxy/models.py b/copilot_proxy/models.py
index 7722bc4..74493c0 100644
--- a/copilot_proxy/models.py
+++ b/copilot_proxy/models.py
@@ -4,7 +4,7 @@
 
 
 class OpenAIinput(BaseModel):
-    model: str = "py-model"
+    model: str = "fastertransformer"
     prompt: Optional[str]
     suffix: Optional[str]
     max_tokens: Optional[int] = 16
diff --git a/copilot_proxy/utils/codegen.py b/copilot_proxy/utils/codegen.py
index 43ecab9..a4a8f44 100644
--- a/copilot_proxy/utils/codegen.py
+++ b/copilot_proxy/utils/codegen.py
@@ -18,7 +18,7 @@ class CodeGenProxy:
     def __init__(self, host: str = 'triton', port: int = 8001, verbose: bool = False):
         #self.tokenizer = Tokenizer.from_file('/python-docker/cgtok/tokenizer.json')
-        self.tokenizer = Tokenizer.from_pretrained('facebook/galactica-30b')
+        self.tokenizer = Tokenizer.from_pretrained('facebook/galactica-6.7b')
         self.client = client_util.InferenceServerClient(url=f'{host}:{port}', verbose=verbose)
         self.PAD_CHAR = 1
diff --git a/setup.sh b/setup.sh
index e5d9dcf..eb04bc4 100755
--- a/setup.sh
+++ b/setup.sh
@@ -121,13 +121,12 @@ function fastertransformer_backend(){
#        rm -f "$ARCHIVE"
#    else
         echo "Downloading and converting the model, this will take a while..."
-        docker run --rm -v "${MODELS_ROOT_DIR}":/models -e MODEL=${MODEL} -e NUM_GPUS="${NUM_GPUS}" fauxpilot-converter
+        echo docker run --rm -v $(realpath converter):/workspace -v ~/.cache/huggingface:/root/.cache/huggingface -v "${MODELS_ROOT_DIR}":/models -e MODEL=${MODEL} -e NUM_GPUS="${NUM_GPUS}" fauxpilot-converter
#    fi
     fi
 
     # Not used for this backend but needs to be present
-    HF_CACHE_DIR="$(pwd)/.hf_cache"
-    mkdir -p "$HF_CACHE_DIR"
+    HF_CACHE_DIR=$(realpath ~/.cache/huggingface)
     echo "HF_CACHE_DIR=${HF_CACHE_DIR}" >> .env
 }