## Add required dependencies
These will be moved to requirements.txt file

In [None]:
!pip install "ray==2.6.1"
!pip install "ray[serve]" 
!pip install requests 
!pip install diffusers 
!pip install transformers 
!pip install fastapi==0.96
!pip install tensorflow
!pip install langchain
!pip install torch torchvision

Collecting ray==2.6.1
  Using cached ray-2.6.1-cp310-cp310-manylinux2014_x86_64.whl (56.9 MB)
Collecting filelock (from ray==2.6.1)
  Using cached filelock-3.12.4-py3-none-any.whl (11 kB)
Collecting aiosignal (from ray==2.6.1)
  Using cached aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting frozenlist (from ray==2.6.1)
  Using cached frozenlist-1.4.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (225 kB)
Installing collected packages: frozenlist, filelock, aiosignal, ray
Successfully installed aiosignal-1.3.1 filelock-3.12.4 frozenlist-1.4.0 ray-2.6.1
Collecting virtualenv<20.21.1,>=20.0.24 (from ray[serve])
  Using cached virtualenv-20.21.0-py3-none-any.whl (8.7 MB)
Collecting py-spy>=0.2.0 (from ray[serve])
  Using cached py_spy-0.3.14-py2.py3-none-manylinux_2_5_x86_64.manylinux1_x86_64.whl (3.0 MB)
Collecting colorful (from ray[serve])
  Using cached colorful-0.5.5-py2.py3-none-any.whl (201 kB)
Collecting aiohttp-cors (from ray[

In [None]:
# https://python.langchain.com/docs/integrations/llms/huggingface_hub.html
# next: try using https://python.langchain.com/docs/integrations/llms/huggingface_pipelines

from langchain.llms import HuggingFaceHub
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import AIMessage, HumanMessage, SystemMessage
from langchain.llms import OpenAI
from langchain.chains import SimpleSequentialChain
from langchain.chains import LLMChain
from langchain import PromptTemplate


text_model_name = "google/flan-t5-xxl"
# text_model_name = "bigscience/bloom-1b7"
# text_model_name = "chavinlo/gpt4-x-alpaca"
# model_name="gpt-3.5-turbo"

template1 = """Give me a fact about {topic}. """
template2 = "Translate to french: {fact}"

# create the prompt
prompt = PromptTemplate(
    input_variables=["topic"],
    template=template1,
)

# create the second prompt
second_prompt = PromptTemplate(
    input_variables=["fact"],
    template=template2,
)

def create_chain (llm):
    # create two chains 
    fact_chain = LLMChain(llm=llm, prompt=prompt)
    translate_chain = LLMChain(llm=llm, prompt=second_prompt)

    # combine the fact chain with the translate chain
    overall_chain = SimpleSequentialChain(chains=[fact_chain, translate_chain], verbose=True)
    return overall_chain
    

In [24]:
from langchain.llms import HuggingFacePipeline

llm = HuggingFacePipeline.from_model_id(
    model_id=text_model_name,
    task="text-generation",
    model_kwargs={"temperature": 0.6, "max_length": 64},
)


#llm = HuggingFaceHub(
#    repo_id=text_model_name, model_kwargs={"temperature": 0.63 + random.uniform(-0.05, 0.05)}
# )
overall_chain = create_chain(llm)

# Run the chain specifying only the input variable for the first chain.
overall_chain.run("football")
print (overall_chain)



[1m> Entering new SimpleSequentialChain chain...[0m
[36;1m[1;3ma football consists of a ball[0m
[33;1m[1;3mchaque football consist de un balle,[0m

[1m> Finished chain.[0m
verbose=True chains=[LLMChain(prompt=PromptTemplate(input_variables=['topic'], template='Give me a fact about {topic}. '), llm=HuggingFaceHub(client=InferenceAPI(api_url='https://api-inference.huggingface.co/pipeline/text2text-generation/google/flan-t5-xxl', task='text2text-generation', options={'wait_for_model': True, 'use_gpu': False}), repo_id='google/flan-t5-xxl', model_kwargs={'temperature': 0.6168526282915378})), LLMChain(prompt=PromptTemplate(input_variables=['fact'], template='Translate to french: {fact}'), llm=HuggingFaceHub(client=InferenceAPI(api_url='https://api-inference.huggingface.co/pipeline/text2text-generation/google/flan-t5-xxl', task='text2text-generation', options={'wait_for_model': True, 'use_gpu': False}), repo_id='google/flan-t5-xxl', model_kwargs={'temperature': 0.6168526282915378

In [None]:
import ray

# initialize ray
ray.init(
    address="ray://example-cluster-kuberay-head-svc:10001",
    runtime_env={
        "pip": [
            "IPython",
            "boto3==1.26",
            "botocore==1.29", 
            "datasets",
            "fastapi==0.96",
            "accelerate>=0.16.0",
            "transformers>=4.26.0",
            "numpy<1.24",  # remove when mlflow updates beyond 2.2
            "torch",
            "langchain"
        ]
    }
)

In [24]:
from ray import serve
from starlette.requests import Request


@serve.deployment(ray_actor_options={"num_gpus": 1})
class DeployLLM:
    def __init__(self):
        print("Hello!")
        
        #llm = HuggingFaceHub(
        #    repo_id=text_model_name, model_kwargs={"temperature": 0.5 + random.uniform(-0.1, 0.1)}
        #)
        
        self.chain = create_chain(llm)

    def _run_chain(self, text: str):
        return self.chain.run(text)

    async def __call__(self, request: Request):
        # 1. Parse the request
        text = request.query_params["text"]
        # 2. Run the chain
        resp = self._run_chain(text)
        # 3. Return the response
        return resp

In [26]:
# Bind the model to deployment
deployment = DeployLLM.bind()

In [None]:
serve.run(deployment, host="0.0.0.0")

In [None]:
import requests

query = "bunny"
response = requests.post(f'http://example-cluster-kuberay-head-svc:8000/?text={query}')
print(response.content.decode())

To delete deployment:
```
serve.get_deployment("default_DeployLLM").delete()
```

To list deployments:
```
serve.list_deployments()
```

To shut down:
```
ray.shutdown()
serve.shutdown()
```