In [None]:
# @title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# LLM Basics with Hugging Face
This notebook demonstrates how to load large language models (LLMs) from Hugging Face and how to query them.
<!--table align="left">
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/google-gemini/gemma-cookbook/blob/main/Gemma/Gemma_Basics_with_HF.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
</table-->


Adapted for EECE.4860/5860 at UMass Lowell

## Prerequisites 

### Account on Intel Tiber AI Cloud (or run on your local GPU if available)

You will need a standard account on Intel Tiber AI Cloud, where we have tested this notebook. Students have been given instructions on how to sign up for an account on Intel Tiber.

### HuggingFace setup

Before we dive into the tutorial, let's get you set up with HuggingFace:

1. **Hugging Face Account:**  If you don't already have one, you can create a free Hugging Face account by clicking [here](https://huggingface.co/join).
2. **LLM Model Access:** Head over to the [Gemma model page](https://huggingface.co/google/gemma-2b) and the [Llama 2 model page](https://huggingface.co/meta-llama/Llama-2-7b-hf) and accept the usage conditions.
3. **Hugging Face Token:**  You need to create a token on Hugging Face and use it to log in from this notebook. Once you are logged in, you can download the models. Check [this guide](https://huggingface.co/docs/hub/en/security-tokens) on how to create a token on HF. Generate a Hugging Face access token (preferably with `write` permission) by clicking [here](https://huggingface.co/settings/tokens). **Save the token in a safe document that you can access**. Once you've completed these steps, you're ready to move on to the next section, where we'll install necessary packages and log into the Hugging Face Hub.


**If there is no error in the previous step, you are all set and ready to explore the possibilities with LLM models!**


**You need to click the next cell to proceed**

## Instantiate the Gemma 2B model (or other models)

Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models. They are text-to-text, decoder-only large language models, available in English, with open weights, pre-trained variants, and instruction-tuned variants. Gemma models are well-suited for a variety of text generation tasks, including question answering, summarization, and reasoning. Their relatively small size makes it possible to deploy them in environments with limited resources such as a laptop, desktop or your own cloud infrastructure, democratizing access to state of the art AI models and helping foster innovation for everyone.

Please note we list here a few variants of the Gemma models for you to play with.

Other models in this example include Llama 2 from Meta.

Let's get started by loading the model from Hugging Face Hub.

In [None]:
# Set the HF_HOME environment variable for this kernel before the Hugging Face
# imports in the cells that follow.
# NOTE(review): presumably this is the root directory the Hugging Face libraries
# use for cached downloads and the saved login token -- confirm against the
# huggingface_hub docs, and adjust the path if /opt/notebooks is not writable
# on your machine.
# Keep this comment on its own lines: text placed after the value on the %env
# line would become part of the value.
%env HF_HOME=/opt/notebooks/.cache/huggingface

### Log into Hugging Face Hub

In [None]:
# Authenticate with the Hugging Face Hub so the models whose usage conditions
# you accepted can be downloaded.
#
# Two paths, chosen automatically:
#   1. If the HF_TOKEN environment variable is set (and non-empty), log in with
#      it directly. This keeps the token out of the notebook and lets the
#      notebook run top to bottom without manual input.
#   2. Otherwise fall back to the interactive input box, where you copy/paste
#      the token.
import os

from huggingface_hub import login, notebook_login

hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    notebook_login()

### Loading the model from HF Hub

In [None]:
import torch

# Hugging Face Hub repository id of the checkpoint loaded by the cells below.
model_id = "google/gemma-2-2b-it"

# Use the first CUDA GPU when PyTorch reports one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

print(f"using device: {device}")

In [None]:
# Let's load the tokenizer first.
# Note: no `device_map` here. That argument controls where *model weights* are
# placed; a tokenizer only maps text to token ids and is not placed on a device.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Quantizing the weights is a common way to shrink the model's memory
# footprint, but this notebook keeps things simple and loads the checkpoint
# without quantization.

# Load the checkpoint selected by `model_id`; device_map="auto" leaves the
# placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
)


### Trying it out

In [None]:
# Open-ended completion: the model continues the sentence.
prompt = "My favourite color is"

# Call the tokenizer itself (not `encode`) so the attention mask is returned
# together with the input ids, and move both to the device the model actually
# lives on. The model was loaded with device_map="auto", so its device is not
# guaranteed to match a separately computed `device` string.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=20)

# Decode the first (and only) returned sequence, dropping special tokens.
text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(text)

In [None]:
# Factual question; the trailing "Answer:" cues the model to respond directly.
prompt = "Who won the 2016 baseball World Series? Answer:"

# Call the tokenizer itself (not `encode`) so the attention mask is returned
# together with the input ids, and move both to the device the model actually
# lives on. The model was loaded with device_map="auto", so its device is not
# guaranteed to match a separately computed `device` string.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=40)

# Decode the first (and only) returned sequence, dropping special tokens.
text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(text)

In [None]:
# Open question with a larger token budget so the answer has room to develop.
prompt = "What can you use an LLM for? Answer:"

# Call the tokenizer itself (not `encode`) so the attention mask is returned
# together with the input ids, and move both to the device the model actually
# lives on. The model was loaded with device_map="auto", so its device is not
# guaranteed to match a separately computed `device` string.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=512)

# Decode the first (and only) returned sequence, dropping special tokens.
text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(text)