In [1]:
!pip install -q transformers torch bitsandbytes accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m47.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
# Interactive Hugging Face Hub login; the CLI prompts for an access token.
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: fineG

In [2]:
import torch
import time
import bitsandbytes as bnb
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig
)

In [3]:
def load_model(model_name, quantization=True, model_sharding=True):
    """
    Load a causal language model and its tokenizer.

    Args:
        model_name: Model identifier passed to ``from_pretrained``
        quantization: If True, load with an 8-bit ``BitsAndBytesConfig``
        model_sharding: If True, pass ``device_map="auto"`` to ``from_pretrained``

    Returns:
        ``(model, tokenizer)`` on success. If anything raises while loading,
        the error is printed and ``(None, None)`` is returned instead.
    """
    try:
        start_time = time.time()

        # Build the optional from_pretrained arguments once instead of
        # repeating the call for every flag combination.
        load_kwargs = {}

        if quantization:
            load_kwargs["quantization_config"] = BitsAndBytesConfig(
                load_in_8bit=True,
                llm_int8_threshold=6.0,
                llm_int8_has_fp16_weight=False
            )

        if model_sharding:
            load_kwargs["device_map"] = "auto"  # model sharding

        model = AutoModelForCausalLM.from_pretrained(model_name, **load_kwargs)

        # Only the model load is timed; the tokenizer is fetched afterwards.
        model_load_time = time.time() - start_time

        print("Model loaded successfully in ", model_load_time)
        print("Quantization = ", quantization)
        print("Sharding = ", model_sharding)

        tokenizer = AutoTokenizer.from_pretrained(model_name)

        return model, tokenizer

    except Exception as e:
        # Best-effort contract: report the failure and let the caller check for None.
        print(f"Loading the model failed: {e}")
        return None, None

In [4]:
def complete_sentence(model, tokenizer, prompt, max_length=100, batch_size=1):
    """
    Complete a sentence using the loaded model

    Args:
        model: Loaded language model
        tokenizer: Corresponding tokenizer
        prompt (str | list[str]): Initial sentence to complete. A single string
            is repeated ``batch_size`` times; a list is used as the batch as-is
            and ``batch_size`` is ignored.
        max_length (int): Passed to ``model.generate`` as ``max_length``
            (in Hugging Face ``generate`` this limit includes the prompt tokens)
        batch_size (int): Number of copies of ``prompt`` to feed when it is a string

    Returns:
        str: Decoded text of the first sequence in the batch

    Raises:
        ValueError: If ``model`` or ``tokenizer`` is None, or the batch is empty
    """
    # load_model() returns (None, None) on failure; fail with a clear message
    # instead of an opaque "'NoneType' object is not callable".
    if model is None or tokenizer is None:
        raise ValueError("model and tokenizer must not be None; did load_model() fail?")

    if isinstance(prompt, str):
        prompts = [prompt] * batch_size
    else:
        prompts = list(prompt)

    if not prompts:
        raise ValueError("prompt batch is empty")

    # Report the size of the batch actually fed to the model, which differs
    # from `batch_size` when the caller passes a list of prompts.
    effective_batch_size = len(prompts)

    inputs = tokenizer(prompts, return_tensors="pt").to(model.device)

    start_time = time.time()

    # Generate text (sampling is enabled, so outputs differ between runs)
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=1,
        do_sample=True,
        temperature=0.7
    )

    inference_time = time.time() - start_time
    print(f"Inference Time for {model.__class__.__name__} is {inference_time} with batch_size = {effective_batch_size}")

    # Only the first sequence of the batch is decoded and returned.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

In [5]:
# English prompt used for every timing run below.
prompt = "The future of artificial intelligence is"
# Hindi prompt (roughly: "A fish went into the water,").
prompt_hi = "एक मछली, पानी में गई,"

## Loading with quantization and sharding

In [6]:
# Both flags on: 8-bit quantization config plus device_map="auto".
model_name = "google/gemma-7b"
model, tokenizer = load_model(model_name, quantization=True, model_sharding=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Model loaded successfully in  112.5585343837738
Quantization =  True
Sharding =  True


In [9]:
# One generation per prompt; batch_size is left at its default of 1.
output = complete_sentence(model, tokenizer, prompt)
output_hi = complete_sentence(model, tokenizer, prompt_hi)

Inference Time for GemmaForCausalLM is 23.182682514190674 with batch_size = 1
Inference Time for GemmaForCausalLM is 17.290358543395996 with batch_size = 1


In [10]:
# Display both completions as the cell output.
output, output_hi

('The future of artificial intelligence is in the hands of those who can make it work for them. For this purpose, we have compiled a list of the best AI companies in Singapore.\n\n<h2>1.  Cogito</h2>\n\nCogito provides a platform that helps companies improve their customers’ experience. It’s an artificial intelligence-powered chatbot with a natural language processing engine. The company’s mission is to help companies build better customer relationships by using AI to understand customer behavior and needs.\n\n',
 'एक मछली, पानी में गई, 40 मीटर की गहराई तक 15 मिनट में जाती है। उसका औसत चाल कितना है?\n\nA) 2 मीटर/ मिनट\n\nB) 3 मीटर/ मिनट\n\nC) 4 मीटर/ मिनट\n\nD) 5 मीटर/ मिनट\n\n<strong>Correct Answer:</strong> C\n\n<strong>Solution :</strong>\n\n<table> <tbody><tr> <td>उत्तर - 4')

In [11]:
# Each prompt is repeated 16 times; only the first decoded sequence of each batch is returned.
output = complete_sentence(model, tokenizer, prompt, batch_size=16)
output_hi = complete_sentence(model, tokenizer, prompt_hi, batch_size=16)

Inference Time for GemmaForCausalLM is 32.874855518341064 with batch_size = 16
Inference Time for GemmaForCausalLM is 33.58133053779602 with batch_size = 16


In [12]:
# Display both completions as the cell output.
output, output_hi

('The future of artificial intelligence is looking brighter than ever.\n\nFrom a self-driving car to a personal AI assistant, we have seen the potential of artificial intelligence in our lives.\n\nBut how do you know if the program on your computer is an AI? How can you tell if it is really artificial intelligence or just another software application?\n\nIf you want to know more about AI, then keep reading!\n\nThis article will cover all the basics of AI and give you some practical tips on',
 'एक मछली, पानी में गई, उसका स्वागत करती है, और तुरंत उससे पूछती है, कि वह कहाँ से आ रही है? मछली कहती है, कि "मैं अपने घर से 5 मीटर की दूरी पर जा रही थी, पर मेरा घर, एक दीवार के सामने था, इसलिए मैं दीवार की सीमा रेखा पर 3 मीटर की दूरी पर रही, और तो')

## 7B model loaded and ran inference successfully: ~17–23 s per generation at batch_size=1, ~33 s at batch_size=16

## Trying to load without quantization or sharding

In [None]:
# Both flags off: from_pretrained is called with no quantization config and no device_map.
model_name = "google/gemma-7b"
model, tokenizer = load_model(model_name, quantization=False, model_sharding=False)

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:  35%|###5      | 1.76G/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# NOTE: load_model returns (None, None) when loading fails, in which case this call cannot succeed.
output = complete_sentence(model, tokenizer, prompt)

## Experiment failed: Ran out of RAM

# Now trying to load with only quantization

In [None]:
# 8-bit quantization only; no device_map is passed.
model_name = "google/gemma-7b"
model, tokenizer = load_model(model_name, quantization=True, model_sharding=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`low_cpu_mem_usage` was None, now default to True since model is quantized.
`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

Model loaded successfully in  100.75733375549316
Quantization =  True
Sharding =  False


tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [None]:
# Prompt repeated 16 times; only the first decoded sequence is printed.
output = complete_sentence(model, tokenizer, prompt, batch_size=16)
print(output)

Inference Time for GemmaForCausalLM is 33.79022002220154 with batch_size = 16
The future of artificial intelligence is not just a technology, it's a philosophy.

The goal of AI is to create machines that can think and act like humans. It's not just about making computers smarter, it's about making humans smarter too.

If you want to be an AI expert, you need to understand the philosophy behind it.

AI is not about replacing humans but making them more efficient and effective in their work.

If you want to be an AI expert


In [None]:
# Single-prompt run for comparison with the batch of 16.
output = complete_sentence(model, tokenizer, prompt, batch_size=1)
print(output)

Inference Time for GemmaForCausalLM is 20.03234314918518 with batch_size = 1
The future of artificial intelligence is bright. The technology is already being used in a variety of industries, and it is only going to become more prevalent in the coming years. Here are five predictions for the future of AI:

Artificial intelligence will be used in more industries. As the technology continues to improve, it will be used in a wider range of industries. We will see AI being used in healthcare, finance, and even education.

AI will become more human-like. One of the


## 7B model loaded and ran inference successfully with sharding=False: ~20 s generation time at batch_size=1, ~34 s at batch_size=16

# Without Quantization, with Sharding

In [7]:
# device_map="auto" only; no quantization config is passed.
model_name = "google/gemma-7b"
model, tokenizer = load_model(model_name, quantization=False, model_sharding=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/2.11G [00:00<?, ?B/s]

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]



Model loaded successfully in  496.50402426719666
Quantization =  False
Sharding =  True


tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [None]:
# Prompt repeated 16 times; only the first decoded sequence is printed.
output = complete_sentence(model, tokenizer, prompt, batch_size=16)
print(output)

Inference Time for GemmaForCausalLM is 3079.313209295273 with batch_size = 16
The future of artificial intelligence is bright. It is changing the way we live, work, and interact with the world. From virtual assistants to self-driving cars, AI is transforming our lives in ways we never imagined. In this article, we will explore the future of AI and how it will impact society in the coming years.

<h2>What is Artificial Intelligence (AI)?</h2>

Artificial intelligence is a branch of computer science that deals with the creation of intelligent machines that can perform tasks that typically


In [8]:
# Single-prompt run for comparison with the batch of 16.
output = complete_sentence(model, tokenizer, prompt, batch_size=1)
print(output)

Inference Time for GemmaForCausalLM is 2880.123885154724 with batch_size = 1
The future of artificial intelligence is already here, and it's not just for the robots. From self-driving cars to virtual assistants, AI is changing the way we live and work. In the future, AI will be able to do things that we never thought possible.

Artificial intelligence (AI) is already having a major impact on the world around us. From self-driving cars to virtual assistants, AI is changing the way we live and work. And the future of AI is even


## Model loaded successfully without quantization and with sharding; generation took ~2880 s at batch_size=1 and ~3079 s at batch_size=16