Update SonicTriton client to Triton version 2.3 #31715

Merged · 5 commits · Oct 14, 2020
16 changes: 9 additions & 7 deletions HeterogeneousCore/SonicTriton/README.md
@@ -23,23 +23,25 @@ The model information from the server can be printed by enabling `verbose` output
* `modelName`: name of model with which to perform inference
* `modelVersion`: version number of model (default: -1, use latest available version on server)
* `batchSize`: number of objects sent per request
* can also be set on per-event basis
* can also be set on per-event basis using `setBatchSize()`
* some models don't support batching
* `address`: server IP address
* `port`: server port
* `timeout`: maximum time a request is allowed to take
* currently not used, will be supported in next Triton version
* `timeout`: maximum allowed time for a request
* `outputs`: optional, specify which output(s) the server should send
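
As a rough illustration of how a producer might consume these parameters, here is a minimal sketch, assuming the values arrive via an `edm::ParameterSet`; the struct and function names below are hypothetical:

```cpp
//minimal sketch, assuming the client parameters listed above arrive via an edm::ParameterSet;
//TritonClientConfig and makeConfig are hypothetical names used only for illustration
#include "FWCore/ParameterSet/interface/ParameterSet.h"

#include <string>

struct TritonClientConfig {
  std::string modelName;
  int modelVersion;
  unsigned batchSize;
  std::string address;
  unsigned port;
  unsigned timeout;
};

TritonClientConfig makeConfig(const edm::ParameterSet& pset) {
  TritonClientConfig cfg;
  cfg.modelName = pset.getParameter<std::string>("modelName");
  cfg.modelVersion = pset.getParameter<int>("modelVersion");  //-1: use latest available version
  cfg.batchSize = pset.getParameter<unsigned>("batchSize");
  cfg.address = pset.getParameter<std::string>("address");
  cfg.port = pset.getParameter<unsigned>("port");
  cfg.timeout = pset.getParameter<unsigned>("timeout");
  return cfg;
}
```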

Useful `TritonData` accessors include:
* `dims()`: return dimensions (provided by server)
* `variableDims()`: return true if any variable dimensions
* `sizeDims()`: return product of dimensions (-1 if any variable dimensions)
* `shape()`: return concrete shape (if any variable dimensions), otherwise `dims()`
* a non-`const` accessor is also provided to modify `shape()` directly (for specifying concrete values)
* `shape()`: return actual shape (list of dimensions)
* `sizeShape()`: return product of shape dimensions (returns `sizeDims()` if no variable dimensions)
* `byteSize()`: return # bytes for data type
* `byteSize()`: return number of bytes for data type
* `dname()`: return name of data type
* `batchSize()`: return current batch size

To update the `TritonData` shape in the variable-dimension case:
* `setShape(const std::vector<int64_t>& newShape)`: update all (variable) dimensions with values provided in `newShape`
* `setShape(unsigned loc, int64_t val)`: update variable dimension at `loc` with `val`
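
For example, a producer handling events with a variable number of objects can update the shape per event; a minimal sketch using only the accessors documented above (the function name and the choice of dimension 0 are hypothetical):

```cpp
//minimal sketch using the TritonData accessors documented above;
//the function name and the choice of dimension 0 are hypothetical
#include "HeterogeneousCore/SonicTriton/interface/TritonData.h"

void prepareInput(TritonInputData& input, unsigned nObjects) {
  if (input.variableDims()) {
    input.setShape(0, nObjects);  //set the variable dimension to this event's multiplicity
  }
  //shape() now holds concrete values; sizeShape() is their product
  int64_t entriesPerBatchElement = input.sizeShape();
  (void)entriesPerBatchElement;  //would be used to size the input vectors
}
```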

There are specific local input and output containers that should be used in producers.
Here, `T` is a primitive type, and the two aliases listed below are passed to `TritonInputData::toServer()`
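
(The rest of this passage is collapsed in the diff view.) As a rough illustration of how these containers are meant to be used, here is a sketch, assuming `toServer()` accepts a shared pointer to a `TritonInput<DT>`; its exact signature is hidden by the collapsed lines:

```cpp
//sketch only: assumes toServer() takes std::shared_ptr<TritonInput<DT>>;
//the exact signature is collapsed in this diff view
#include "HeterogeneousCore/SonicTriton/interface/TritonData.h"

#include <memory>

void fillAndRead(TritonInputData& input, const TritonOutputData& output, unsigned batchSize) {
  //one inner vector per batch element
  auto data = std::make_shared<TritonInput<float>>(batchSize);
  for (auto& entry : *data)
    entry.assign(input.sizeShape(), 0.f);  //placeholder values
  input.toServer(data);

  //after inference completes, outputs are viewed without copying via edm::Span
  TritonOutput<float> results = output.fromServer<float>();
  for (const auto& span : results) {
    //one span of const float values per batch element
    (void)span;
  }
}
```
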
37 changes: 20 additions & 17 deletions HeterogeneousCore/SonicTriton/interface/TritonClient.h
@@ -12,18 +12,20 @@
#include <exception>
#include <unordered_map>

#include "request_grpc.h"
#include "grpc_client.h"
#include "grpc_service.pb.h"

class TritonClient : public SonicClient<TritonInputMap, TritonOutputMap> {
public:
using ModelStatus = nvidia::inferenceserver::ModelStatus;
using InferContext = nvidia::inferenceserver::client::InferContext;

struct ServerSideStats {
uint64_t request_count_;
uint64_t cumul_time_ns_;
uint64_t inference_count_;
uint64_t execution_count_;
uint64_t success_count_;
uint64_t cumm_time_ns_;
uint64_t queue_time_ns_;
uint64_t compute_time_ns_;
uint64_t compute_input_time_ns_;
uint64_t compute_infer_time_ns_;
uint64_t compute_output_time_ns_;
};

//constructor
@@ -40,28 +42,29 @@ class TritonClient : public SonicClient<TritonInputMap, TritonOutputMap> {

protected:
//helper
bool getResults(std::map<std::string, std::unique_ptr<InferContext::Result>>& results);
bool getResults(std::shared_ptr<nvidia::inferenceserver::client::InferResult> results);

void evaluate() override;

void reportServerSideStats(const ServerSideStats& stats) const;
ServerSideStats summarizeServerStats(const ModelStatus& start_status, const ModelStatus& end_status) const;
ServerSideStats summarizeServerStats(const inference::ModelStatistics& start_status,
const inference::ModelStatistics& end_status) const;

ModelStatus getServerSideStatus() const;
inference::ModelStatistics getServerSideStatus() const;

//members
std::string url_;
unsigned timeout_;
std::string modelName_;
int modelVersion_;
unsigned maxBatchSize_;
unsigned batchSize_;
bool noBatch_;
bool verbose_;

std::unique_ptr<InferContext> context_;
std::unique_ptr<nvidia::inferenceserver::client::ServerStatusContext> serverCtx_;
std::unique_ptr<InferContext::Options> options_;
//IO pointers for triton
std::vector<nvidia::inferenceserver::client::InferInput*> inputsTriton_;
std::vector<const nvidia::inferenceserver::client::InferRequestedOutput*> outputsTriton_;

std::unique_ptr<nvidia::inferenceserver::client::InferenceServerGrpcClient> client_;
//stores timeout, model name and version
nvidia::inferenceserver::client::InferOptions options_;
};

#endif
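
For orientation, the new members above map onto the standalone Triton 2.3 client API roughly as follows; a hedged sketch in which the server address, model name, tensor names, shape, and datatype are all hypothetical:

```cpp
//sketch of the bare Triton 2.3 gRPC client calls that the new members above wrap;
//all literal values are hypothetical
#include "grpc_client.h"

#include <memory>
#include <vector>

namespace nic = nvidia::inferenceserver::client;

void sketch() {
  //corresponds to client_: one gRPC client per server address
  std::unique_ptr<nic::InferenceServerGrpcClient> client;
  nic::InferenceServerGrpcClient::Create(&client, "localhost:8001", /*verbose=*/false);

  //corresponds to options_: carries model name, version, and timeout
  nic::InferOptions options("mymodel");
  options.model_version_ = "1";

  //correspond to inputsTriton_/outputsTriton_: created via factory functions
  nic::InferInput* input;
  nic::InferInput::Create(&input, "input__0", {1, 10}, "FP32");
  nic::InferRequestedOutput* requestedOutput;
  nic::InferRequestedOutput::Create(&requestedOutput, "output__0");

  std::vector<nic::InferInput*> inputs{input};
  std::vector<const nic::InferRequestedOutput*> outputs{requestedOutput};

  //blocking inference call; the result is owned by the caller
  nic::InferResult* result;
  client->Infer(&result, options, inputs, outputs);
  delete result;
}
```
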
62 changes: 41 additions & 21 deletions HeterogeneousCore/SonicTriton/interface/TritonData.h
@@ -12,7 +12,11 @@
#include <memory>
#include <any>

#include "request_grpc.h"
#include "grpc_client.h"
#include "grpc_service.pb.h"

//forward declaration
class TritonClient;

//aliases for local input and output types
template <typename DT>
@@ -24,16 +28,16 @@ using TritonOutput = std::vector<edm::Span<const DT*>>;
template <typename IO>
class TritonData {
public:
using Result = nvidia::inferenceserver::client::InferContext::Result;
using Result = nvidia::inferenceserver::client::InferResult;
using TensorMetadata = inference::ModelMetadataResponse_TensorMetadata;
using ShapeView = edm::Span<const int64_t*>;

//constructor
TritonData(const std::string& name, std::shared_ptr<IO> data);
TritonData(const std::string& name, const TensorMetadata& model_info, bool noBatch);

//some members can be modified
std::vector<int64_t>& shape() { return shape_; }
void reset();
void setBatchSize(unsigned bsize) { batchSize_ = bsize; }
void setResult(std::unique_ptr<Result> result) { result_ = std::move(result); }
bool setShape(const std::vector<int64_t>& newShape) { return setShape(newShape, true); }
bool setShape(unsigned loc, int64_t val) { return setShape(loc, val, true); }

//io accessors
template <typename DT>
@@ -42,8 +46,7 @@
TritonOutput<DT> fromServer() const;

//const accessors
const std::vector<int64_t>& dims() const { return dims_; }
const std::vector<int64_t>& shape() const { return shape_.empty() ? dims() : shape_; }
const ShapeView& shape() const { return shape_; }
int64_t byteSize() const { return byteSize_; }
const std::string& dname() const { return dname_; }
unsigned batchSize() const { return batchSize_; }
@@ -52,35 +55,48 @@
bool variableDims() const { return variableDims_; }
int64_t sizeDims() const { return productDims_; }
//default to dims if shape isn't filled
int64_t sizeShape() const { return shape_.empty() ? sizeDims() : dimProduct(shape_); }
int64_t sizeShape() const { return variableDims_ ? dimProduct(shape_) : sizeDims(); }

private:
friend class TritonClient;

//private accessors only used by client
bool setShape(const std::vector<int64_t>& newShape, bool canThrow);
bool setShape(unsigned loc, int64_t val, bool canThrow);
void setBatchSize(unsigned bsize);
void reset();
void setResult(std::shared_ptr<Result> result) { result_ = result; }
IO* data() { return data_.get(); }

//helpers
bool anyNeg(const std::vector<int64_t>& vec) const {
bool anyNeg(const ShapeView& vec) const {
return std::any_of(vec.begin(), vec.end(), [](int64_t i) { return i < 0; });
}
int64_t dimProduct(const std::vector<int64_t>& vec) const {
int64_t dimProduct(const ShapeView& vec) const {
return std::accumulate(vec.begin(), vec.end(), 1, std::multiplies<int64_t>());
}
void createObject(IO** ioptr) const;

//members
std::string name_;
std::shared_ptr<IO> data_;
std::vector<int64_t> dims_;
const std::vector<int64_t> dims_;
bool noBatch_;
unsigned batchSize_;
std::vector<int64_t> fullShape_;
ShapeView shape_;
bool variableDims_;
int64_t productDims_;
nvidia::inferenceserver::DataType dtype_;
std::string dname_;
inference::DataType dtype_;
int64_t byteSize_;
std::vector<int64_t> shape_;
unsigned batchSize_;
std::any holder_;
std::unique_ptr<Result> result_;
std::shared_ptr<Result> result_;
};

using TritonInputData = TritonData<nvidia::inferenceserver::client::InferContext::Input>;
using TritonInputData = TritonData<nvidia::inferenceserver::client::InferInput>;
using TritonInputMap = std::unordered_map<std::string, TritonInputData>;
using TritonOutputData = TritonData<nvidia::inferenceserver::client::InferContext::Output>;
using TritonOutputData = TritonData<nvidia::inferenceserver::client::InferRequestedOutput>;
using TritonOutputMap = std::unordered_map<std::string, TritonOutputData>;

//avoid "explicit specialization after instantiation" error
@@ -94,9 +110,13 @@ template <>
void TritonInputData::reset();
template <>
void TritonOutputData::reset();
template <>
void TritonInputData::createObject(nvidia::inferenceserver::client::InferInput** ioptr) const;
template <>
void TritonOutputData::createObject(nvidia::inferenceserver::client::InferRequestedOutput** ioptr) const;

//explicit template instantiation declarations
extern template class TritonData<nvidia::inferenceserver::client::InferContext::Input>;
extern template class TritonData<nvidia::inferenceserver::client::InferContext::Output>;
extern template class TritonData<nvidia::inferenceserver::client::InferInput>;
extern template class TritonData<nvidia::inferenceserver::client::InferRequestedOutput>;

#endif
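
The new `createObject()` specializations declared above presumably forward to the Triton 2.3 factory functions; a sketch of what the implementations (which live in the corresponding `.cc` file) might look like:

```cpp
//sketch only: plausible implementations of the createObject() specializations,
//assuming they forward name, shape, and datatype to the Triton factory functions
template <>
void TritonInputData::createObject(nvidia::inferenceserver::client::InferInput** ioptr) const {
  nvidia::inferenceserver::client::InferInput::Create(ioptr, name_, fullShape_, dname_);
}

template <>
void TritonOutputData::createObject(nvidia::inferenceserver::client::InferRequestedOutput** ioptr) const {
  nvidia::inferenceserver::client::InferRequestedOutput::Create(ioptr, name_);
}
```
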
6 changes: 4 additions & 2 deletions HeterogeneousCore/SonicTriton/interface/triton_utils.h
@@ -1,12 +1,14 @@
#ifndef HeterogeneousCore_SonicTriton_triton_utils
#define HeterogeneousCore_SonicTriton_triton_utils

#include "FWCore/Utilities/interface/Span.h"

#include <string>
#include <string_view>
#include <vector>
#include <unordered_set>

#include "request_grpc.h"
#include "grpc_client.h"

namespace triton_utils {

@@ -23,7 +25,7 @@

} // namespace triton_utils

extern template std::string triton_utils::printColl(const std::vector<int64_t>& coll, const std::string& delim);
extern template std::string triton_utils::printColl(const edm::Span<const int64_t*>& coll, const std::string& delim);
extern template std::string triton_utils::printColl(const std::vector<uint8_t>& coll, const std::string& delim);
extern template std::string triton_utils::printColl(const std::vector<float>& coll, const std::string& delim);
extern template std::string triton_utils::printColl(const std::unordered_set<std::string>& coll,
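
The `printColl` declarations above suggest a small generic formatting helper; a sketch of a consistent implementation (the real definition and explicit instantiations live in the corresponding source file):

```cpp
//sketch only: a generic printColl() consistent with the extern template
//declarations above; the actual definition lives in the corresponding .cc file
#include <sstream>
#include <string>

namespace triton_utils {
  template <typename C>
  std::string printColl(const C& coll, const std::string& delim) {
    std::stringstream msg;
    for (auto it = coll.begin(); it != coll.end(); ++it) {
      if (it != coll.begin())
        msg << delim;
      msg << *it;  //stream each element, separated by delim
    }
    return msg.str();
  }
}  // namespace triton_utils
```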