### This notebook runs open LLMs as a web service

In [0]:
%run "./util/notebook-config"

**This notebook is not required if you want to run the OpenAI model**

In [0]:
# Guard cell: stop the run when the configuration selects the OpenAI model,
# since nothing in this notebook is needed in that case.
if config['model_id'] == "openai":
  # Python 3 cannot raise a bare string; raise a real exception so the
  # explanation is what the user actually sees.
  raise RuntimeError(
    "Notebook not required: use this notebook only when running an open LLM. "
    "Change the config to select an open model."
  )

In [0]:
%pip install torch==2.0.1

In [0]:
# ! rm -rf /dbfs/$user/tgi/*

In [0]:
%sh
# Install the build prerequisites (Rust toolchain, protoc) and then build
# text-generation-inference (TGI) from source under /local_disk0/tmp.
# Abort on the first failing command so a broken download or install is not
# masked by the steps that follow it.
set -e

# install rust (installer is saved to tmp.sh, then run non-interactively)
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs > tmp.sh
sh tmp.sh -y
source "$HOME/.cargo/env"

# install protoc (-f makes curl fail on an HTTP error instead of saving the error page)
PROTOC_ZIP=protoc-21.12-linux-x86_64.zip
curl -fOL https://github.com/protocolbuffers/protobuf/releases/download/v21.12/$PROTOC_ZIP
sudo unzip -o $PROTOC_ZIP -d /usr/local bin/protoc
sudo unzip -o $PROTOC_ZIP -d /usr/local 'include/*'
rm -f $PROTOC_ZIP

# install text-generation-inference from a fresh clone
# (cd and the command after it are separate statements: `set -e` does not
# trigger on a failing non-final element of an `&&` list)
rm -rf  /local_disk0/tmp/text-generation-inference
cd /local_disk0/tmp
git clone https://github.com/huggingface/text-generation-inference.git
cd /local_disk0/tmp/text-generation-inference
make install

In [0]:
%sh
# Build the flash-attention v1 wheels (flash_attn, rotary_emb, dropout_layer_norm)
# at a pinned commit and cache them under /dbfs/$user/tgi/.
# The flash_attn wheel is the "already built" marker checked below, so it is
# copied LAST: the marker only exists once every wheel has been built and copied.
# Abort on the first failing command so a failed build cannot leave a partial cache.
set -e

FILE=/dbfs/$user/tgi/flash_attn-1.0.8-cp310-cp310-linux_x86_64.whl
if test -f "$FILE"; then
    echo "$FILE exists."
else
    export flash_att_commit='3a9bfd076f98746c73362328958dbc68d145fbec'
    mkdir -p /dbfs/$user/tgi/
    rm -rf  /local_disk0/tmp/flash-attention
    cd /local_disk0/tmp
    git clone https://github.com/HazyResearch/flash-attention.git

    cd flash-attention
    git fetch
    git checkout ${flash_att_commit}

    # main flash_attn package (repo root)
    python setup.py build
    python setup.py bdist_wheel

    # extension packages; subshells keep the working directory at the repo root
    (
        cd csrc/rotary
        python setup.py build
        python setup.py bdist_wheel
    )
    (
        cd csrc/layer_norm
        python setup.py build
        python setup.py bdist_wheel
    )

    # publish to the cache; the marker wheel ($FILE) goes last
    cp  csrc/rotary/dist/rotary_emb-0.1-cp310-cp310-linux_x86_64.whl /dbfs/$user/tgi/rotary_emb-0.1-cp310-cp310-linux_x86_64.whl
    cp  csrc/layer_norm/dist/dropout_layer_norm-0.1-cp310-cp310-linux_x86_64.whl /dbfs/$user/tgi/dropout_layer_norm-0.1-cp310-cp310-linux_x86_64.whl
    cp  dist/flash_attn-1.0.8-cp310-cp310-linux_x86_64.whl /dbfs/$user/tgi/flash_attn-1.0.8-cp310-cp310-linux_x86_64.whl
fi

In [0]:
%sh
# Build the flash-attention v2 wheel at a pinned commit unless a cached copy
# is already present under /dbfs/$user/tgi/.
wheel_name=flash_attn-2.0.0.post1-cp310-cp310-linux_x86_64.whl
wheel_path=/dbfs/$user/tgi/$wheel_name

if [ ! -f "$wheel_path" ]; then
    export flash_att_v2_commit='4f285b354796fb17df8636485b9a04df3ebbb7dc'

    # start from a fresh clone in its own directory
    rm -rf  /local_disk0/tmp/flash-attention-v2
    cd /local_disk0/tmp && git clone https://github.com/HazyResearch/flash-attention.git flash-attention-v2

    cd flash-attention-v2 && git fetch && git checkout ${flash_att_v2_commit}
    python setup.py build
    python setup.py bdist_wheel

    # store the built wheel in the cache directory
    cp  dist/$wheel_name $wheel_path
else
    echo "$wheel_path exists."
fi

In [0]:
%sh
# Build the vllm wheel at a pinned commit unless a cached copy is already
# present under /dbfs/$user/tgi/.
wheel_name=vllm-0.0.0-cp310-cp310-linux_x86_64.whl
wheel_path=/dbfs/$user/tgi/$wheel_name

if [ ! -f "$wheel_path" ]; then
    export vllm_commit='d284b831c17f42a8ea63369a06138325f73c4cf9'

    # start from a fresh clone
    rm -rf  /local_disk0/tmp/vllm
    cd /local_disk0/tmp && git clone https://github.com/OlivierDehaene/vllm.git

    cd vllm && git fetch && git checkout ${vllm_commit}
    python setup.py build
    python setup.py bdist_wheel

    # store the built wheel in the cache directory
    cp dist/$wheel_name $wheel_path
else
    echo "$wheel_path exists."
fi

In [0]:
%pip install /dbfs/$user/tgi/flash_attn-2* /dbfs/$user/tgi/dropout_laye* /dbfs/$user/tgi/rotary_emb*  /dbfs/$user/tgi/vllm*  urllib3==1.25.4 protobuf==3.20.*

In [0]:
#  dbutils.library.restartPython() 

In [0]:
import os

# Export the settings that the text-generation-launcher shell cell reads from
# the environment ($model_id, $sharded, optional $quantize, CUDA options).

# When the driver node type name contains "A100", serve unsharded on one GPU;
# otherwise shard across four GPUs.
nodeid = spark.conf.get('spark.databricks.driverNodeTypeId')
if "A100" in nodeid:
  os.environ['sharded'] = 'false'
  os.environ['CUDA_VISIBLE_DEVICES'] = "0"
else:
  os.environ['sharded'] = 'true'
  os.environ['CUDA_VISIBLE_DEVICES'] = "0,1,2,3"

# The Hugging Face token is only exported for Llama-2 model ids.
if "Llama-2" in config['model_id']:
  os.environ['HUGGING_FACE_HUB_TOKEN'] = config['HUGGING_FACE_HUB_TOKEN']
os.environ['HUGGINGFACE_HUB_CACHE'] ='/local_disk0/tmp/'

# get model variables
os.environ['model_id'] = config['model_id']
if "load_in_8bit" in config['model_kwargs']:
  os.environ['quantize'] = "bitsandbytes"
else:
  # Clear a value left over from an earlier run of this cell; otherwise the
  # launcher cell would keep quantizing after the config no longer asks for it.
  os.environ.pop('quantize', None)
# NOTE(review): only the presence of the "load_in_8bit" key is checked, so an
# explicit False value would still enable quantization — confirm intent.

# Memory fraction: "0.95" for the Llama-2 70B chat model, ".9" for every other model.
if config['model_id'] == 'meta-llama/Llama-2-70b-chat-hf':
  os.environ['CUDA_MEMORY_FRACTION'] = "0.95"
else:
  os.environ['CUDA_MEMORY_FRACTION'] = ".9"

In [0]:
from dbruntime.databricks_repl_context import get_context

# Current notebook context; supplies the workspace host name and the cluster id.
ctx = get_context()

# Port number embedded in the driver-proxy URL.
port = "8880"
driver_proxy_api = f"https://{ctx.browserHostName}/driver-proxy-api/o/0/{ctx.clusterId}/{port}"

# Show the connection details as copy-pasteable assignments
# (the empty first and last entries give the leading and trailing blank lines).
connection_summary = [
    "",
    f"driver_proxy_api = '{driver_proxy_api}'",
    f"cluster_id = '{ctx.clusterId}'",
    f"port = {port}",
    "",
]
print("\n".join(connection_summary))

In [0]:
%sh
# Start the text-generation-inference server on port 8880 using the settings
# exported to the environment: $model_id, $sharded and, optionally, $quantize.
# The launcher runs in the foreground of this cell.
source "$HOME/.cargo/env"

# Arguments shared by the quantized and non-quantized launch.
launcher_args=(
    --model-id "$model_id"
    --port 8880
    --trust-remote-code
    --sharded "$sharded"
    --max-input-length 2048
    --max-total-tokens 2500
    --max-batch-prefill-tokens 2500
)

# Add --quantize only when $quantize is set and non-empty; the variable is
# quoted so the test is well-formed in both cases, and the message now
# reports the mode that is actually used.
if [ -n "${quantize:-}" ]; then
    echo "quantize: ${quantize}"
    launcher_args+=(--quantize "${quantize}")
else
    echo "quantize: disabled"
fi

text-generation-launcher "${launcher_args[@]}"

In [0]:
# Force-kill every process whose command line contains "text-generation".
# The [t] bracket keeps the pattern from matching the shell that runs this very
# command (its command line holds the literal "[t]ext-generation", which the
# regex does not match), so the cell no longer kills itself.
! pkill -9 -f '[t]ext-generation'