# ONNX に変換したモデルを Kubernetes 上にデプロイ

In [None]:
import os
from azureml.core import Workspace
from azureml.core.conda_dependencies import CondaDependencies 
from azureml.core.model import InferenceConfig
from azureml.core.environment import Environment
from azureml.core.webservice import AksWebservice
from azureml.core.compute.aks import AksCompute 
from azureml.core.model import Model
import random
import numpy as np
import torch
import json
import requests

In [None]:
# Obtain the Azure ML workspace handle through the SDK's from_config() loader;
# every later cell passes this `ws` object to the SDK calls it makes.
ws = Workspace.from_config()
# Bare expression: in a notebook this renders the workspace object as the cell output.
ws

In [None]:
# Define the inference environment from its conda specification file and
# pair it with the scoring entry script.

# Both files are expected under the local "src" directory.
env_file_path = os.path.join("src", "environment.yml")
score_file_path = os.path.join("src", "score.py")

env = Environment.from_conda_specification(
    name="rinna-predict-env",
    file_path=env_file_path,
)
# Register the environment with the workspace.
env.register(workspace=ws)

inference_config = InferenceConfig(
    environment=env,
    entry_script=score_file_path,
)

In [None]:
# Deployment settings for the AKS web service.
# NOTE(review): auth_enabled=False means the scoring endpoint is created
# without authentication — confirm this is acceptable outside of a demo.
deploy_config = AksWebservice.deploy_configuration(
    description='rinna gpt-2',
    tags={'framework': 'onnx'},
    cpu_cores=1,
    memory_gb=4,
    auth_enabled=False,
)

# Compute target named 'aml-cluster' in the workspace; used as the deployment target.
target_aks = AksCompute(ws, 'aml-cluster')

In [None]:
# Select the model to deploy, referenced by name within the workspace.
model = Model(workspace=ws, name='rinna-GPT2-quantized-model')

In [None]:
# Deploy the model as a web service on the AKS target and wait for completion.
service_name = 'rinna-gpt2-aks'
print(f"Service {service_name}")

webservice = Model.deploy(
    workspace=ws,
    name=service_name,
    models=[model],
    inference_config=inference_config,
    deployment_config=deploy_config,
    deployment_target=target_aks,
)
# Wait for the deployment, streaming its progress output.
webservice.wait_for_deployment(show_output=True)
print(webservice.state)

In [None]:
# Inference: send a sample prompt to the deployed service's scoring endpoint.
endpoint = webservice.scoring_uri
input_data = json.dumps({'data': "機械学習"})

# requests has no default timeout, so without one this call can block forever
# if the service never answers. Values are (connect, read) seconds.
res = requests.post(
    url=endpoint,
    data=input_data,
    headers={'Content-Type': 'application/json'},
    timeout=(10, 120),
)
# Fail loudly on an HTTP error status instead of trying to decode an error
# page as JSON below.
res.raise_for_status()
# Bare expression: in a notebook this renders the decoded JSON response.
res.json()