# Chatbot Backend Server Deployment Notebook

In [1]:
import os

import covalent_cloud as cc
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from covalent_cloud.cloud_executor import GPU_TYPE

# Register the Covalent Cloud API key. It is read from the CC_API_KEY
# environment variable (a KeyError is raised if the variable is unset), so the
# secret never appears in the notebook itself.
cc.save_api_key(os.environ["CC_API_KEY"])

In [2]:
# Name of the cloud environment; reused below when building the executor.
ENV_NAME = "chatbot-demo-backend"

# Create the environment with exact package pins so the backend is reproducible.
# NOTE(review): wait=True presumably blocks until the environment is ready — confirm.
cc.create_env(
    name=ENV_NAME,
    pip=[
        "accelerate==0.29.1",
        "sentencepiece==0.2.0",
        "torch==2.2.2",
        "transformers==4.39.3",
    ],
    wait=True,
)

Environment Already Exists.


In [3]:
# Executor describing the hardware requested for the chatbot service.
gpu_executor = cc.CloudExecutor(
    env=ENV_NAME,
    # Accelerator: a single L40 GPU.
    gpu_type=GPU_TYPE.L40,
    num_gpus=1,
    # Host resources.
    num_cpus=24,
    memory="54 GB",
    # Time limit requested for the executor.
    time_limit="15 days",
)

@cc.service(executor=gpu_executor, name="LLM Chatbot Server")
def chatbot_backend(model_path: str, device_map="auto"):
    """Create a Llama2 chatbot server."""
    # The docstring above is kept verbatim: the deployment output in this
    # notebook displays it as the service "Description".
    #
    # model_path -- identifier passed to both `from_pretrained` calls.
    # device_map -- forwarded unchanged to the model loader.
    # Returns a dict with the single key "pipe" holding a text-generation
    # pipeline built from the loaded model and tokenizer.
    #
    # NOTE(review): `do_sample` is a generation option and both endpoints pass
    # it explicitly per call — confirm whether it is needed at load time.
    load_options = dict(
        device_map=device_map,
        torch_dtype=torch.float16,
        do_sample=True,
    )
    llm = AutoModelForCausalLM.from_pretrained(model_path, **load_options)
    tok = AutoTokenizer.from_pretrained(model_path)
    return {"pipe": pipeline(task="text-generation", model=llm, tokenizer=tok)}

In [4]:
@chatbot_backend.endpoint("/generate", name="Generate Response")
def generate(pipe, prompt, max_new_tokens=50):
    """Generate a response to a prompt."""
    # The docstring above is kept verbatim: the deployment output in this
    # notebook displays it as the endpoint "Description".
    #
    # pipe           -- callable text-generation pipeline
    # prompt         -- text passed to the pipeline
    # max_new_tokens -- forwarded to the pipeline call
    # Returns the 'generated_text' field of the first pipeline result.
    sampling_options = {
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "truncation": True,
        "temperature": 0.9,
    }
    results = pipe(prompt, **sampling_options)
    return results[0]['generated_text']

@chatbot_backend.endpoint("/stream", name="Stream Response", streaming=True)
def generate_stream(pipe, prompt, max_new_tokens=200):
    """Generate a response to a prompt, streaming tokens."""
    # The docstring above is kept verbatim (the deployment output shows endpoint
    # docstrings as their "Description").
    #
    # pipe           -- object exposing `.model` and `.tokenizer`
    # prompt         -- text to tokenize and continue
    # max_new_tokens -- upper bound on the number of tokens yielded
    # Yields one decoded token string per step and stops early when the
    # end-of-sequence token is produced.

    def _starts_with_space(tokenizer, token_id):
        # '▁' is presumably the SentencePiece word-boundary marker: tokens that
        # carry it begin a new word, but decoding a single token drops the space.
        token = tokenizer.convert_ids_to_tokens(token_id)
        return token.startswith('▁')

    model = pipe.model
    tokenizer = pipe.tokenizer

    # Put the inputs on whatever device the model was actually loaded onto,
    # rather than assuming "cuda" (the backend's `device_map` is configurable).
    device = model.device
    _input = tokenizer(prompt, return_tensors='pt').to(device)

    # NOTE(review): every step feeds the whole sequence so far back into
    # `generate`, so the cost per token grows with the output length.
    for output_length in range(max_new_tokens):
        # Generate next token
        output = model.generate(
            **_input, max_new_tokens=1, do_sample=True,
            temperature=0.9, pad_token_id=tokenizer.eos_token_id
        )
        # Check for stopping condition (convert to a plain int once and reuse it)
        current_token_id = output[0][-1].item()
        if current_token_id == tokenizer.eos_token_id:
            break
        # Decode token
        current_token = tokenizer.decode(
            current_token_id, skip_special_tokens=True
        )
        # NOTE(review): `> 1` means the first two tokens never get a leading
        # space — confirm this is intended and not meant to be `> 0`.
        if _starts_with_space(tokenizer, current_token_id) and output_length > 1:
            current_token = ' ' + current_token

        yield current_token

        # Update input for next iteration.
        # Output grows in size with each iteration. `ones_like` yields an
        # integer mask with the same shape and device as `output`, avoiding a
        # float CPU tensor plus a host-to-device copy on every token.
        _input = {
            'input_ids': output,
            'attention_mask': torch.ones_like(output),
        }

#### When the deployment is ready, copy the backend base URL (`info.address`) into the Streamlit app

In [5]:
# Submit the service for deployment with the chosen model checkpoint.
pending_deployment = cc.deploy(chatbot_backend)(
    model_path="NousResearch/Llama-2-7b-chat-hf"
)

# Fetch the deployment record by function ID (wait=True is passed to the lookup),
# then show its summary and the base URL clients should call.
info = cc.get_deployment(pending_deployment.function_id, wait=True)
print(info)
print(info.address)

╭──────────────────────── Deployment Information ────────────────────────╮
│  Name          LLM Chatbot Server                                      │
│  Description   Create a Llama2 chatbot server.                         │
│  Function ID   66563a64f7d37dbf2a468ca9                                │
│  Address       https://fn.prod.covalent.xyz/166563a64f7d37dbf2a468ca9  │
│  Status        ACTIVE                                                  │
│  Tags                                                                  │
│  Auth Enabled  Yes                                                     │
╰────────────────────────────────────────────────────────────────────────╯
╭─────────────────────────────────────────────────╮
│ [3m                POST /generate                 [0m │
│  Streaming    No                                │
│  Description  Generate a response to a prompt.  │
╰─────────────────────────────────────────────────╯
╭────────────────────────────────────────────────────────

#### Danger Zone!

In [None]:
# info.teardown()