In [1]:
!pip install "ray==2.6.1"
!pip install "ray[serve]" requests diffusers transformers fastapi==0.96

Collecting ray==2.6.1
  Downloading ray-2.6.1-cp310-cp310-manylinux2014_x86_64.whl (56.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.9/56.9 MB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting filelock (from ray==2.6.1)
  Using cached filelock-3.12.4-py3-none-any.whl (11 kB)
Collecting aiosignal (from ray==2.6.1)
  Using cached aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting frozenlist (from ray==2.6.1)
  Using cached frozenlist-1.4.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (225 kB)
Installing collected packages: frozenlist, filelock, aiosignal, ray
Successfully installed aiosignal-1.3.1 filelock-3.12.4 frozenlist-1.4.0 ray-2.6.1
Collecting diffusers
  Downloading diffusers-0.21.4-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hCollecting transformers
  Usi

In [2]:
model_id = "EleutherAI/gpt-j-6B"
revision = "float16"  # use float16 weights to fit in 16GB GPUs

In [3]:
import ray

In [4]:
ray.init(
    address="ray://example-cluster-kuberay-head-svc:10001",
    runtime_env={
        "pip": [
            "IPython",
            "boto3==1.26",
            "botocore==1.29", 
            "datasets",
            "fastapi==0.96",
            "accelerate>=0.16.0",
            "transformers>=4.26.0",
            "numpy<1.24",  # remove when mlflow updates beyond 2.2
            "torch",
        ]
    }
)

0,1
Python version:,3.10.8
Ray version:,2.6.1
Dashboard:,http://10.56.1.3:8265


In [5]:
import pandas as pd

from ray import serve
from starlette.requests import Request


@serve.deployment(ray_actor_options={"num_gpus": 1})
class PredictDeployment:
    def __init__(self, model_id: str, revision: str = None):
        from transformers import AutoModelForCausalLM, AutoTokenizer
        import torch

        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            revision=revision,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
            device_map="auto",  # automatically makes use of all GPUs available to the Actor
        )
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)

    def generate(self, text: str) -> pd.DataFrame:
        input_ids = self.tokenizer(text, return_tensors="pt").input_ids.to(
            self.model.device
        )

        gen_tokens = self.model.generate(
            input_ids,
            do_sample=True,
            temperature=0.9,
            max_length=100,
        )
        return pd.DataFrame(
            self.tokenizer.batch_decode(gen_tokens), columns=["responses"]
        )

    async def __call__(self, http_request: Request) -> str:
        json_request: str = await http_request.json()
        prompts = []
        for prompt in json_request:
            text = prompt["text"]
            if isinstance(text, list):
                prompts.extend(text)
            else:
                prompts.append(text)
        return self.generate(prompts)

In [None]:
deployment = PredictDeployment.bind(model_id=model_id, revision=revision)
serve.run(deployment, host="0.0.0.0")

In [8]:
import requests

prompt = (
    "Israel said it brought in special forces to try to wrest control of four Israeli sites "+
    "from Hamas fighters, including two kibbutzim that militants entered earlier in their attacks. "+
    "Footage released by Israeli police from one area showed forces kneeling in tall grass as they "+
    "exchanged fire with Hamas militants across an open field."
)

sample_input = {"text": prompt}

output = requests.post("http://example-cluster-kuberay-head-svc:8000/", json=[sample_input]).json()
print(output)

[2m[36m(ServeReplica:default_PredictDeployment pid=5871)[0m The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
[2m[36m(ServeReplica:default_PredictDeployment pid=5871)[0m Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'responses': 'Israel said it brought in special forces to try to wrest control of four Israeli sites from Hamas fighters, including two kibbutzim that militants entered earlier in their attacks. Footage released by Israeli police from one area showed forces kneeling in tall grass as they exchanged fire with Hamas militants across an open field.\n\nMeanwhile, the Israeli military said a "humanitarian" rocket had hit southern Israel earlier on Friday.\n\nIt said the projectile did not cause casualties or damage. Israel accused Gaza'}]


[2m[36m(ServeReplica:default_PredictDeployment pid=5871)[0m INFO 2023-10-08 21:43:45,355 default_PredictDeployment default_PredictDeployment#VwVcPr nwvZviHUvW / default replica.py:723 - __CALL__ OK 2679.9ms
