In [49]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import time
torch.manual_seed(1230)


<torch._C.Generator at 0x7ff5bd55a4f0>

In [50]:
def time_it(start,end):
    nano = end-start
    return nano/1e9

In [73]:
device = "cuda"
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
max_token = 200

## Full Precision Model

In [74]:
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Print model size
print(f"Model size: {model.get_memory_footprint():,} bytes")

Model size: 4,423,265,024 bytes


In [76]:
text = "Hello my name is"
inputs = tokenizer(text, return_tensors="pt").to(device)
start = time.time_ns()
outputs = model.generate(**inputs, max_new_tokens=max_token)
end = time.time_ns()
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
t = time_it(start,end)
print("Seconds:",t)
print("Token/s",len(outputs[0])/t)

Hello my name is John Smith and I am a software engineer. I have been working in the software industry for the past 5 years and have experience in developing web applications using various technologies such as Java, JavaScript, and HTML. I am proficient in using tools such as Git, JIRA, and Slack to manage projects and communicate with team members. I am also skilled in designing and implementing user-friendly interfaces using CSS and HTML. In my free time, I enjoy playing video games, reading books, and spending time with my family and friends. I am passionate about learning new technologies and staying up-to-date with the latest trends in the industry. I am looking forward to working with you and contributing to the development of the project. Thank you for considering my application. Best regards,

[Your Name]
Seconds: 4.424143006
Token/s 40.91187824501349


In [77]:
del model

## INT 8 Quantization

In [78]:
import bitsandbytes as bnb
from transformers import BitsAndBytesConfig
from transformers import AutoModelForCausalLM
import torch

In [79]:
quantization_config = BitsAndBytesConfig(
   load_in_8bit=True,
   bnb_8bit_compute_dtype=torch.bfloat16
)

model_8bit = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
print(f"Model size: {model_8bit.get_memory_footprint():,} bytes")

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Model size: 1,242,749,696 bytes


In [80]:
%%time
text = "Hello my name is"
inputs = tokenizer(text, return_tensors="pt").to(device)
start = time.time_ns()
outputs = model_8bit.generate(**inputs, max_new_tokens=max_token)
end = time.time_ns()
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
t = time_it(start,end)
print("Seconds:",t)
print("Token/s",len(outputs[0])/t)

Hello my name is John Smith and I am a software engineer. I have been working on a project for the past 6 months and I am excited to share it with you.

The project is a web application that allows users to create and manage their own online portfolios. The portfolios can be customized with images, videos, and text, and users can also add links to their social media profiles. The application also includes a search function that allows users to find specific portfolios based on keywords or tags.

The design of the application is clean and modern, with a focus on simplicity and usability. The user interface is intuitive and easy to navigate, with a clear navigation menu and a responsive design that adapts to different screen sizes.

The application is built using React, a popular front-end framework that allows us to create reusable components and manage state efficiently. We have also used Redux, a state management library that allows us to manage state in
Seconds: 24.980280081
Token/s 

In [81]:
del model_8bit

## INT4 Quantization FP4

In [82]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_4bit = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
print(f"Model size: {model_4bit.get_memory_footprint():,} bytes")

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Model size: 758,307,584 bytes


In [83]:
%%time
text = "Hello my name is"
inputs = tokenizer(text, return_tensors="pt").to(device)
start = time.time_ns()
outputs = model_4bit.generate(**inputs, max_new_tokens=max_token)
end = time.time_ns()
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
t = time_it(start,end)
print("Seconds:",t)
print("Token/s",len(outputs[0])/t)

Hello my name is John and I am 25 years old. I am a student and I am studying in the University of London. I am a very enthusiastic and motivated person. I am very interested in sports and fitness. I have a passion for fitness and I love to work out. I am a very hardworking person and I am always looking for ways to improve my fitness. I am a very active person and I love to go out and exercise. I am very passionate about fitness and I love to share my knowledge and experience with others. I am a very friendly and outgoing person and I love to meet new people and make new friends. I am a very positive and optimistic person and I always see the best in people. I am a very hardworking person and I always put in the effort to achieve my goals. I am a very determined person and I always have a positive attitude towards life. I am a very creative and artistic person and
Seconds: 9.891947199
Token/s 20.723927845138913
CPU times: user 9.9 s, sys: 0 ns, total: 9.9 s
Wall time: 9.89 s


In [84]:
del model_4bit

## INT4 Quantization NF4

In [85]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
)

model_4bit = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
print(f"Model size: {model_4bit.get_memory_footprint():,} bytes")

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Model size: 758,307,584 bytes


In [86]:
%%time
text = "Hello my name is"
inputs = tokenizer(text, return_tensors="pt").to(device)
start = time.time_ns()
outputs = model_4bit.generate(**inputs, max_new_tokens=max_token)
end = time.time_ns()
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
t = time_it(start,end)
print("Seconds:",t)
print("Token/s",len(outputs[0])/t)

Hello my name is John Smith and I am a student at the University of XYZ. I am currently enrolled in the Bachelor of Science in Computer Science program. I am currently taking 12 credits and have completed 10 credits. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project
Seconds: 10.365734959
Token/s 19.776697051472432
CP

In [87]:
del model_4bit

## Nested 4Bit Quantization

In [88]:
double_quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
)

model_4bit = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
print(f"Model size: {model_4bit.get_memory_footprint():,} bytes")

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Model size: 758,307,584 bytes


In [89]:
%%time
text = "Hello my name is"
inputs = tokenizer(text, return_tensors="pt").to(device)
start = time.time_ns()
outputs = model_4bit.generate(**inputs, max_new_tokens=max_token)
end = time.time_ns()
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
t = time_it(start,end)
print("Seconds:",t)
print("Token/s",len(outputs[0])/t)

Hello my name is John Smith and I am a student at the University of XYZ. I am currently enrolled in the Bachelor of Science in Computer Science program. I am currently taking 12 credits and have completed 10 credits. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project
Seconds: 10.141085694
Token/s 20.214798117847362
CP

In [90]:
del model_4bit

## All Quantization feature together

In [91]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

model_4bit = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
print(f"Model size: {model_4bit.get_memory_footprint():,} bytes")

`low_cpu_mem_usage` was None, now set to True since model is quantized.


Model size: 758,307,584 bytes


In [92]:
%%time
text = "Hello my name is"
inputs = tokenizer(text, return_tensors="pt").to(device)
start = time.time_ns()
outputs = model_4bit.generate(**inputs, max_new_tokens=max_token)
end = time.time_ns()
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
t = time_it(start,end)
print("Seconds:",t)
print("Token/s",len(outputs[0])/t)

Hello my name is John Smith and I am a student at the University of XYZ. I am currently enrolled in the Bachelor of Science in Computer Science program. I am currently taking 12 credits and have completed 10 credits. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project for my final project. I am currently working on a project
Seconds: 9.325512288
Token/s 21.982706544046106
CPU

| Quant | Memory (GB) | Inference (Tokens/s) |
| ------ | -------- | ------- |
| Full Precision | 4,4 | 40.91 |
| 8bit | 1,2 | 8.2 |
| 4 bit FP4 | 0.750 | 20.72 | 
| 4 bit Normal Float 4 | 0.750 |19.77 |
| Nested 4 bit | 0.758 | 20.21 | 
| All together | 0.750 | 21.98 | 
