In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [2]:
!pip install -U git+https://github.com/ridgerchu/matmulfreellm

Collecting git+https://github.com/ridgerchu/matmulfreellm
  Cloning https://github.com/ridgerchu/matmulfreellm to /tmp/pip-req-build-ow3czn3y
  Running command git clone --filter=blob:none --quiet https://github.com/ridgerchu/matmulfreellm /tmp/pip-req-build-ow3czn3y
  Resolved https://github.com/ridgerchu/matmulfreellm to commit ec1c298ffa3db6436831f3e6d46f4e59d0b99194
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting einops (from mmfreelm==0.1)
  Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ninja (from mmfreelm==0.1)
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: mmfreelm
  Building wheel for mmfreelm (setup.py) ... [?25l[?

In [3]:
from mmfreelm.models import HGRNBitConfig

In [4]:
import torch
import time

In [5]:
def measure_performance(model, tokenizer, prompt, max_length=128):
    """Generate a completion for ``prompt`` and measure wall time and peak CUDA memory.

    Args:
        model: Object exposing ``generate(input_ids, **kwargs)``.
        tokenizer: Callable returning an object with ``input_ids`` and
            ``attention_mask``; must also provide ``eos_token_id`` and ``decode``.
        prompt: Text to tokenize and feed to ``model.generate``.
        max_length: Forwarded unchanged to ``generate`` as ``max_length``.
            Defaults to 128, the value that was previously hard-coded.

    Returns:
        tuple: ``(generation_time, memory_consumption, generated_text)`` where
        ``generation_time`` is seconds spent in ``generate``,
        ``memory_consumption`` is ``torch.cuda.max_memory_allocated()`` divided
        by ``1024 ** 2``, and ``generated_text`` is ``outputs[0]`` decoded with
        special tokens skipped.
    """
    inputs = tokenizer(prompt, return_tensors="pt", padding=True)
    input_ids = inputs.input_ids.cuda()
    attention_mask = inputs.attention_mask.cuda()

    # CUDA work is queued asynchronously: drain anything still pending (e.g. the
    # host-to-device copies above) so it is not billed to the timed region, then
    # start the peak-memory counter from a clean slate.
    torch.cuda.synchronize()
    torch.cuda.reset_peak_memory_stats()

    # perf_counter is monotonic and high-resolution; time.time() can jump if the
    # system clock is adjusted mid-run.
    start_time = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            pad_token_id=tokenizer.eos_token_id,
            repetition_penalty=1.1,
            no_repeat_ngram_size=2
        )
    # Wait for every queued kernel to finish before stopping the clock.
    torch.cuda.synchronize()
    end_time = time.perf_counter()

    peak_memory = torch.cuda.max_memory_allocated()  # bytes, peak since the reset above

    generation_time = end_time - start_time
    memory_consumption = peak_memory / (1024 ** 2)

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return generation_time, memory_consumption, generated_text

In [6]:
# Model identifiers to benchmark; each one is handed to `from_pretrained`.
model_names = [
    "ridger/MMfreeLM-2.7B",
]

In [7]:
# Example prompts; every benchmarked model generates one completion per entry.
prompts = [
    "What are the benefits of renewable energy?",
    "Explain the theory of relativity.",
    "Compose a poem about the changing seasons.",
    "Explain the differences between supervised and unsupervised learning in machine learning.",
    "What is one divided by zero",
]

In [8]:
def compare_models(model_name):
    """Load ``model_name`` and benchmark it on every entry of the module-level ``prompts``.

    The tokenizer's pad token falls back to its EOS token when none is set.
    The model is converted with ``.half()`` and moved to CUDA before any
    measurement is taken.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModelForCausalLM.from_pretrained``.

    Returns:
        dict: Maps each prompt string to a dict with keys ``"time_taken"``,
        ``"peak_memory_used"`` and ``"output"``, filled from
        ``measure_performance``. Results are keyed by prompt text, so a
        duplicated prompt keeps only its last measurement.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(model_name).half().cuda()

    results = {}
    try:
        for prompt in prompts:
            time_taken, peak_memory_used, output = measure_performance(model, tokenizer, prompt)
            results[prompt] = {
                "time_taken": time_taken,
                "peak_memory_used": peak_memory_used,
                "output": output
            }
    finally:
        # Drop this frame's reference and return cached blocks even when a
        # prompt raises (e.g. CUDA OOM); otherwise the notebook's stored
        # traceback keeps this frame, and with it the model, alive.
        # Best-effort: other frames in the traceback may still hold references.
        del model
        torch.cuda.empty_cache()
    return results

In [9]:
# Benchmark each configured model. Results are stored as soon as a model
# finishes, so a failure on a later model keeps the earlier measurements.
all_results = {}
for model_name in model_names:
    all_results[model_name] = compare_models(model_name)

# Report: one banner per model, followed by one record per prompt.
for model_name, model_results in all_results.items():
    print(f"Results for model: {model_name}\n{'=' * 50}")
    for prompt, result in model_results.items():
        print(
            f"Prompt: {prompt}",
            f"Time taken: {result['time_taken']:.2f} seconds",
            f"Peak memory used: {result['peak_memory_used']:.2f} MB",
            f"Output: {result['output']}",
            "-" * 50,
            sep="\n",
        )

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/967 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/737 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/38.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/429M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

Results for model: ridger/MMfreeLM-2.7B
Prompt: What are the benefits of renewable energy?
Time taken: 57.31 seconds
Peak memory used: 5499.81 MB
Output: What are the benefits of renewable energy?
The benefits for a community or an individual are that it provides the opportunity to reduce their carbon footprint in the short term and in a long term it reduces the amount of CO2 emissions. The benefits in this case are in terms of the reduction on the cost incurred on electricity bills, in addition in some cases you can get the tax credits in your country in order to make it more attractive in that way in particular in countries like Spain in which they have a very high level of solar energy in their energy mix in general in relation to other countries in Europe in fact the Spanish
--------------------------------------------------
Prompt: Explain the theory of relativity.
Time taken: 34.99 seconds
Peak memory used: 5499.75 MB
Output: Explain the theory of relativity.
203: What is a parti