<a href="https://colab.research.google.com/github/1ucky40nc3/serve_mpt/blob/main/examples/notebooks/run_quant_mpt_with_ctransformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MPT Quantization and Inference

## Install Dependencies

In [None]:
# @title Install Requirements
!pip install einops
!pip install transformers
!pip install accelerate
!pip install sentencepiece
!pip install ctransformers

!git lfs install

In [None]:
# @title Clone the ggml Repository
!git clone https://github.com/ggerganov/ggml

In [None]:
# @title Build the ggml MPT Features
# Create the build directory inside the ggml checkout (no error if it already exists).
!mkdir -p /content/ggml/build
# `%cd` changes the notebook's working directory, so the commands below run in the build directory.
%cd /content/ggml/build
# Configure the build from the parent directory (the ggml checkout).
!cmake ..
# Build only the `mpt` and `mpt-quantize` targets, using 4 parallel jobs.
!make -j4 mpt mpt-quantize

## Prepare a **MosaicML MPT-7B** Pretrained Language Model (PLM)



In [None]:
# @title Download a `mosaicml/mpt-7b*` PLM from [https://huggingface.co](https://huggingface.co)
!mkdir -p /content/models
%cd /content/models

# @markdown Select a pretrained model checkpoint:
model_repository = "mosaicml/mpt-7b-instruct" # @param ['mosaicml/mpt-7b', 'mosaicml/mpt-7b-storywriter', 'mosaicml/mpt-7b-chat', 'mosaicml/mpt-7b-instruct']
model_dir = "_".join(model_repository.split("/")[1:])
huggingface_base_url = "https://huggingface.co"
huggingface_mpt_repository_url = f"{huggingface_base_url}/{model_repository}"

!git clone $huggingface_mpt_repository_url

In [None]:
# @title Import Dependencies
import os

In [None]:
# @title Convert the PLM into the ggml Format
%cd /content/ggml/build

base_dir = "/content/models"
model_dir = "mpt-7b-instruct"
model_path = os.path.join(base_dir, model_dir)

# @markdown Select a floating point precision for your ggml model:
ftype = "f32" # @param ['f32', 'f16']
ftype_types = ["f32", "f16"]
ftype_index = ftype_types.index(ftype)

!python ../examples/mpt/convert-h5-to-ggml.py \
    $model_path \
    $ftype_index

In [None]:
# @title Quantize the PLM
%cd /content/ggml/build

# @markdown Select a quantization config:
quant_config = "q4_0" # @param ['q4_0', 'q4_1', 'q5_0', 'q5_1', 'q8_0']
ggml_model_path = os.path.join(base_dir, model_dir, f"ggml-model-{ftype}.bin")
quant_model_file = f"{model_dir}_ggml_quant-{quant_config}.bin"
quant_model_path = os.path.join(base_dir, model_dir, quant_model_file)

!./bin/mpt-quantize \
    $ggml_model_path \
    $quant_model_path \
    $quant_config

In [None]:
# @title Save the Quantized Model in your Google Drive

from google.colab import drive
drive.mount("/content/gdrive")

# @markdown Set a target path for the quantized model:
gdrive_path = "/content/gdrive/MyDrive/" # @param {"type": "string"}

!cp $quant_model_path $gdrive_path

## Do some Inference!

In [None]:
# @title Import Dependencies
from ctransformers import AutoModelForCausalLM

In [None]:
# @title Load the *Quantized* GGML **MosaicML MPT-7B-Instruct**
# Pass the full path of the quantized file. The bare file name
# (`quant_model_file`) does not resolve from the current working directory
# (/content/ggml/build), because the file was written to /content/models/<model_dir>/.
llm_quant = AutoModelForCausalLM.from_pretrained(
    quant_model_path,
    model_type="mpt"
)

In [None]:
# @title Do some Inference
# Calling the loaded model with a prompt returns its completion, shown as the cell output.
prompt = "AI is going to"
llm_quant(prompt)