<a target="_blank" href="https://colab.research.google.com/github/Blaizzy/LLMOps/blob/main/inference/local/mlx/Summarization.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

## Getting started with Gemma, Google's lightweight family of LLMs

<img src="./assets/gemma_logo.webp" width=500>

In this guide, you'll learn how to use Google's Gemma 2B and 7B to summarise large documents.

### Tools
- Hugging Face
- MLX
- LangChain


In [None]:
!pip install -U -q langchain pypdf langchain_community tqdm
!pip install -U -q mlx-lm # For MacBook
!pip install -U -q huggingface-hub hf-transfer
# !pip install -U -q transformers accelerate bitsandbytes huggingface-hub hf-transfer # For GPU accelerated

In [None]:
import os

# huggingface_hub reads HF_HUB_ENABLE_HF_TRANSFER once, at import time, so the
# flag has to be set before mlx_lm (which imports huggingface_hub) is imported.
# Setting it afterwards has no effect on download speed.
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from mlx_lm import load, generate

# Small instruction-tuned Gemma used for the per-chunk summaries.
model_id = "mlx-community/quantized-gemma-2b-it"
model, tokenizer = load(model_id)

In [None]:
from jinja2 import Template
from typing import List
from pprint import pprint

def apply_chat_template(messages: List, add_generation: bool = True) -> str:
    """Render chat messages into Gemma's turn-based prompt format.

    Every message is wrapped in a ``<start_of_turn>`` / ``<end_of_turn>`` pair
    with the role on the first line and the content after it.

    Gemma only knows the ``user`` and ``model`` roles, so the OpenAI-style
    ``assistant`` role is rewritten to ``model``; any other role is emitted
    unchanged.

    Args:
        messages: Items exposing ``role`` and ``content`` (e.g. dicts), in
            conversation order.
        add_generation: When true, append an opening ``model`` turn after the
            last message so the model answers next. This is emitted even when
            ``messages`` is empty.

    Returns:
        The rendered prompt string.
    """
    template_str = (
        "{% for item in messages %}"
        "<start_of_turn>{{ 'model' if item.role == 'assistant' else item.role }}\n"
        "{{ item.content }}<end_of_turn>\n"
        "{% endfor %}"
        # Kept outside the loop so it does not depend on `loop.last`.
        "{% if add_generation %}<start_of_turn>model\n{% endif %}"
    )

    return Template(template_str).render(
        messages=messages, add_generation=add_generation
    )

In [None]:
# Single-turn conversation used to smoke-test the chat template and the model.
messages = [
    {
        "role": "user",
        "content": "Respond like snoop dogg. Who is Einstein?",
    },
]

In [None]:
# Generate a reply for the test conversation; verbose=True is passed so the
# generation is reported while it runs.
response = generate(
    model,
    tokenizer,
    prompt=apply_chat_template(messages),
    temp=0.1,
    max_tokens=500,
    verbose=True,
)

In [None]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_file(file_name, file_type, chunk_size=5000, chunk_overlap=20, assets_dir="./assets"):
    """Load a PDF from the assets folder and split it into text chunks.

    The file is always read with ``PyPDFLoader``; ``file_type`` is only used
    as the file extension when building the path.

    Args:
        file_name: File name without extension.
        file_type: File extension without the leading dot (e.g. ``"pdf"``).
        chunk_size: Maximum chunk length, measured in characters because
            ``len`` is the length function.
        chunk_overlap: Number of characters shared between neighbouring chunks.
        assets_dir: Directory that contains the file.

    Returns:
        The result of ``loader.load_and_split``: the document split into
        chunks, each exposing its text as ``page_content``.
    """
    loader = PyPDFLoader(f"{assets_dir}/{file_name}.{file_type}")

    text_splitter = RecursiveCharacterTextSplitter(
        # Large chunks keep the number of model calls per document low.
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )
    return loader.load_and_split(text_splitter)

In [None]:
# Load the gemma-report PDF from the assets folder and split it into chunks.
documents = load_file(file_name='gemma-report', file_type='pdf')
documents

In [None]:
# Instruction placed in front of every document chunk that is sent to the model.
prompt = "Create a summary of the following document:"

In [None]:
from tqdm import tqdm

prompt = "Create a summary of the following document:"
summaries = []

# Summarise the first nine chunks one at a time and collect the results.
for doc in tqdm(documents[:9]):
    # The chunk text is appended to the instruction, wrapped in single quotes.
    user_message = {"role": "user", "content": f"{prompt}'{doc.page_content}'"}
    summary = generate(
        model,
        tokenizer=tokenizer,
        prompt=apply_chat_template([user_message]),
        temp=0.1,
        max_tokens=500,
        verbose=True,
    )
    summaries.append(summary)

In [None]:
# Number of characters in the newline-joined summaries
# (roughly 2621 tokens in the author's run).
len("\n".join(summaries))

In [None]:
# Combine the per-chunk summaries into one text, one summary per line break,
# to use as input for the final summary.
summaries_text = "\n".join(summaries)

In [None]:
import gc

# Free up resources to run a bigger model. `del` only unbinds the names, so a
# garbage-collection pass is run as well to reclaim objects that are still
# held alive by reference cycles.
del model, tokenizer
gc.collect()

In [None]:
# Load the larger 7B model; it is used to summarise the combined chunk summaries.
model_id = "mlx-community/quantized-gemma-7b-it"
model, tokenizer = load(model_id)

In [None]:
from tqdm import tqdm

# Ask the model for one summary of all chunk summaries joined together.
prompt_for_long_summary = {
    "role": "user",
    "content": f"Give me a summary of the following document: '{summaries_text}'",
}
final_summary = generate(
    model,
    tokenizer=tokenizer,
    prompt=apply_chat_template([prompt_for_long_summary]),
    temp=0.1,
    max_tokens=1000,
)

In [None]:
# Pretty-print the final summary text.
pprint(final_summary)

## MLX-LM server
You can host your models locally with an OpenAI-compatible API using the MLX-LM server.

Command to run server:

`python -m mlx_lm.server --model 'mlx-community/quantized-gemma-2b-it'`

In [None]:
import requests
import json

# Chat-completions endpoint of the locally running MLX-LM server.
url = "http://localhost:8080/v1/chat/completions"
headers = {"Content-Type": "application/json"}
data = {
    "messages": [{"role": "user", "content": "Hi"}],
    "temperature": 0.7,
    "max_tokens": 100,
}

# requests has no default timeout, so without one this cell can hang forever.
# The tuple is (connect timeout, read timeout) in seconds; the read timeout is
# generous because the reply only arrives once generation has finished.
response = requests.post(url, headers=headers, data=json.dumps(data), timeout=(5, 120))

# Check if the request was successful (status code 200)
if response.status_code == 200:
    result = response.json()
    # Use a separate name for the text so `response` stays the HTTP response.
    reply_text = result['choices'][0]['message']['content'].replace("<eos>", "")
    print(f"Response:\n{reply_text}")
    print("===========")
    print(f"Usage:\n{result['usage']}")
else:
    print(f"Request failed with status code: {response.status_code}")
    print(response.text)


In [None]:
# Same chat-completions request as above, sent with curl from the shell.
!curl localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"messages": [{"role": "user", "content": "Say this is a test!"}],"temperature": 0.7}'