<a href="https://colab.research.google.com/github/AinzOwl/mysticai-colab/blob/main/mystic-ai%20deploy%20llm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch transformers einops pipeline-ai typing

In [None]:
!pip install torch transformers einops typing pipeline-ai

In [None]:
mystic_input = input("enter mystic api: ")
!pipeline cluster login catalyst-api {mystic_input} -u https://mystic.ai -a

In [None]:
from typing import List

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

from pipeline import Pipeline, Variable, entity, pipe
from pipeline.cloud import compute_requirements, environments, pipelines
from pipeline.objects.graph import InputField, InputSchema

In [None]:
class ModelKwargs(InputSchema):
    # Input schema for the generation options accepted by the pipeline.
    # Every field is optional and carries a default, so ModelKwargs() with no
    # arguments is a valid input.

    # Text placed in the "system" turn of the prompt.
    # NOTE(review): the last two fragments below are joined with no space or
    # newline between them ("...a question.Take a deep breath..."). Reproduced
    # as-is; confirm whether that is intentional.
    system: str | None = InputField(
        default=(
            "I am OrcaPhi. The following is my internal dialogue as an AI assistant.\n"
            "Today is September 15, 2023. I have no access to outside tools, news, or current events.\n"
            "I carefully provide accurate, factual, thoughtful, nuanced answers and am brilliant at reasoning.\n"
            "I think through my answers step-by-step to be sure I always get the right answer.\n"
            "I think more clearly if I write out my thought process in a scratchpad manner first; therefore, I always "
            "explain background context, assumptions, and step-by-step thinking BEFORE trying to answer a question."
            "Take a deep breath and think calmly about everything presented."
        ),
        title="System Prompt",
        description="Enter a description of your system",
    )

    # The options below (except presence_penalty) are forwarded to the
    # generation config by LlamaPipeline.inference.
    do_sample: bool | None = InputField(default=True)
    use_cache: bool | None = InputField(default=True)
    temperature: float | None = InputField(default=0.6)
    repetition_penalty: float | None = InputField(default=1.1)
    top_p: float | None = InputField(default=0.9)
    # Token limit, constrained to the range 1..4096.
    max_length: int | None = InputField(
        default=100,
        ge=1,
        le=4096,
    )
    # Declared as an input, but not read by LlamaPipeline.inference in this file.
    presence_penalty: float | None = InputField(default=1.0)

In [None]:
@entity
class LlamaPipeline:
    """Pipeline entity wrapping the Open-Orca/oo-phi-1_5 causal language model.

    ``load_model`` is registered as a run-once startup step and fills in
    ``model`` and ``tokenizer``; ``inference`` formats one prompt as
    ``<|im_start|>role ... <|im_end|>`` turns and returns the decoded output.
    """

    def __init__(self) -> None:
        # Both are populated by load_model(); they stay None until it has run.
        self.model = None
        self.tokenizer = None

        # Not read by any method of this class; kept so the attribute still exists.
        self.streamer = None

    @pipe(on_startup=True, run_once=True)
    def load_model(self) -> None:
        """Load the model in bfloat16 onto the CUDA device, plus its tokenizer."""
        model_id = "Open-Orca/oo-phi-1_5"

        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
        ).to("cuda")
        # torch_dtype is a model-weight option and means nothing for a
        # tokenizer, so it is not passed here.
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            trust_remote_code=True,
        )

    @pipe
    def inference(self, prompt: str, kwargs: ModelKwargs) -> str:
        """Generate a reply to ``prompt`` and return it as one decoded string.

        Args:
            prompt: User message, placed in the ``user`` turn.
            kwargs: Generation options. ``kwargs.system`` supplies the
                ``system`` turn; when it is ``None`` that turn is omitted.
                ``kwargs.max_length`` limits the number of newly generated
                tokens.

        Returns:
            The first sequence returned by ``generate``, decoded with the
            tokenizer's default decode options.
        """
        turn_open = "<|im_start|>"
        turn_close = "<|im_end|>\n"

        turns = []
        # system is declared as `str | None`; concatenating None would raise
        # TypeError, so the system turn is simply left out in that case.
        if kwargs.system is not None:
            turns.append(turn_open + "system\n" + kwargs.system + turn_close)
        turns.append(turn_open + "user\n" + prompt + turn_close)
        # Open assistant turn: the model continues from here.
        turns.append(turn_open + "assistant\n")
        input_text = "".join(turns)

        # Fall back to EOS when the tokenizer defines no pad token, so the
        # generation config never carries pad_token_id=None.
        pad_token_id = self.tokenizer.pad_token_id
        if pad_token_id is None:
            pad_token_id = self.tokenizer.eos_token_id

        generation_config = GenerationConfig(
            # Passed as max_new_tokens rather than max_length: max_length also
            # counts the prompt tokens, and with the long default system prompt
            # that leaves little or no room for a reply under the default of 100.
            max_new_tokens=kwargs.max_length,
            temperature=kwargs.temperature,
            top_p=kwargs.top_p,
            repetition_penalty=kwargs.repetition_penalty,
            do_sample=kwargs.do_sample,
            use_cache=kwargs.use_cache,
            eos_token_id=self.tokenizer.eos_token_id,
            pad_token_id=pad_token_id,
            transformers_version="4.33.1",
        )

        inputs = self.tokenizer(
            input_text,
            return_tensors="pt",
            return_attention_mask=False,
        ).to("cuda")
        outputs = self.model.generate(**inputs, generation_config=generation_config)

        # A single prompt is submitted, so only the first decoded sequence is returned.
        return self.tokenizer.batch_decode(outputs)[0]


In [None]:
# Build the pipeline graph. Statement order inside the context matters: the
# inputs are declared first, then the model steps, then the output.
with Pipeline() as builder:
    # Pipeline inputs: the user prompt and the generation options.
    prompt = Variable(str)
    kwargs = Variable(ModelKwargs)

    _pipeline = LlamaPipeline()
    # load_model is declared with on_startup=True, run_once=True.
    _pipeline.load_model()
    # NOTE(review): inside the Pipeline context this call presumably records a
    # graph step rather than running the model immediately — confirm.
    out = _pipeline.inference(prompt, kwargs)

    # Mark the inference result as the pipeline's output.
    builder.output(out)


# Pipeline object used by the later cells for running and uploading.
my_pipeline = builder.get_pipeline()


# Create the "oophi1_5" environment with pinned package versions.
# Creation is best-effort so the cell can be re-run, but a failure is reported
# instead of being silently discarded: real problems (authentication, network,
# invalid requirements) would otherwise only show up later and be harder to trace.
try:
    environments.create_environment(
        "oophi1_5",
        python_requirements=[
            "torch==2.0.1",
            "transformers==4.33.1",
            "einops==0.6.1",
        ],
    )
except Exception as exc:
    # NOTE(review): presumably the common cause is that the environment already
    # exists from a previous run — confirm against the pipeline-ai API.
    print(f"Environment 'oophi1_5' was not created: {exc!r}")

In [None]:
# Run the pipeline once with a sample prompt and the default ModelKwargs.
# NOTE(review): presumably this executes in the notebook runtime (so a CUDA
# device is needed, since load_model moves the model to "cuda") — confirm.
output = my_pipeline.run(
    "Tell me a short story about an orca swimming in the sea",
    ModelKwargs(),
)

print(output)

In [None]:
# Upload the pipeline under the pointer "Ainzoil/oo_phi1_5:latest", using the
# "oophi1_5" environment and requesting an NVIDIA A100 with 20,000 MB of VRAM.
# The returned object is kept in `result`; a later cell reads `result.id`.
result = pipelines.upload_pipeline(
    my_pipeline,
    "Ainzoil/oo_phi1_5:latest",
    environment_id_or_name="oophi1_5",
    required_gpu_vram_mb=20_000,
    accelerators=[
        compute_requirements.Accelerator.nvidia_a100,
    ],
)


In [None]:
# Run the pipeline again with a short greeting prompt and the default
# ModelKwargs, then print whatever the run returns.
output = my_pipeline.run(
    "Hello, how are you?",
    ModelKwargs(),
)

print(output)

In [None]:
# Show the ID of the uploaded pipeline; `result` must still be the object
# returned by pipelines.upload_pipeline for `.id` to resolve.
print(f"Pipeline ID: {result.id}")
# NOTE(review): this calls my_pipeline.run on the in-notebook pipeline object,
# not the uploaded pipeline by ID — confirm that is the intent.
output = my_pipeline.run(
    "Hello, how are you?",
    ModelKwargs(),
)


# Printing of the run output is currently disabled.
# print(output)

In [None]:
import webbrowser

from pipeline.cloud.pipelines import run_pipeline

# Run a hosted pipeline by pointer: positional arguments are the pipeline
# pointer (or ID), the prompt, and a dict of model kwargs.
output = run_pipeline(
    # Pipeline pointer or ID
    "stabilityai/stable-diffusion-xl-refiner-1.0:v1",
    # Prompt
    "Mountain winds and babbling springs and moonlight seas.",
    # Model kwargs
    dict(
        denoising_end=0.8,
        num_inference_steps=25,
    ),
)

# Stored under its own name so the global `result` (the upload response whose
# `.id` another cell prints) is not overwritten when cells are re-run.
image_result = output.result.result_array()

# Extract the image URL from the result
url = image_result[0][0]["file"]["url"]

# Open the URL in the default web browser. webbrowser.open returns False when
# no browser could be launched (e.g. on a hosted runtime), so print the URL
# in that case instead of failing silently.
if not webbrowser.open(url):
    print(url)