In [1]:
import os
import time
from pathlib import Path
from dotenv import load_dotenv

# The .env file is expected one directory above the current working directory.
# Build the location with pathlib and hand dotenv a plain string path.
env_path = str(Path.cwd().parent / ".env")
# Kept as the cell's last expression so the notebook displays load_dotenv's result.
load_dotenv(env_path)

True

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# quantization_config = BitsAndBytesConfig(load_in_8bit=True)

### CPU

In [15]:
# CPU baseline: load the Gemma 2B instruction-tuned tokenizer and model and
# report how long the load takes. No device_map is passed in this cell.
#
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() follows the wall clock and can jump if the
# system time is adjusted mid-run.
start_time = time.perf_counter()

# Read the Hugging Face access token once; it comes from the environment
# (populated by load_dotenv earlier in the notebook). None if the variable is unset.
hf_token = os.environ.get("hugging_face_token")

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it", token=hf_token)
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2b-it",
    token=hf_token,
    use_cache=True,  # reuse past key/values between decoding steps
)
print(f"Model loaded in {time.perf_counter() - start_time}s")

Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:11<00:00,  5.72s/it]


Model loaded in 14.775222063064575s


In [16]:
# Prompt used for the CPU run.
input_text = "What is coroutine in Python? Explain with an example."
# Tokenize into PyTorch tensors; the encoding is not moved to another device here.
# Despite its name, `input_ids` holds the whole tokenizer output and is
# unpacked with ** into generate() in a later cell.
input_ids = tokenizer(text=input_text, return_tensors="pt")

In [17]:
# Time a single generation on the CPU-loaded model.
# perf_counter() is used instead of time.time() because it is monotonic and
# intended for interval measurement.
inf_start_time = time.perf_counter()
# max_length=1000 caps the total sequence length passed to generate().
outputs = model.generate(**input_ids, max_length=1000)
print(f"Inference completed and output generated in {time.perf_counter() - inf_start_time}s")

Inference completed and output generated in 193.66823482513428s


In [18]:
# Convert the first generated sequence back into text and show it.
decoded_text = tokenizer.decode(outputs[0])
print(decoded_text)

<bos>What is coroutine in Python? Explain with an example.

Coroutine is a special type of function in Python that allows you to define a function that can be paused and resumed later. This makes it possible to perform long-running tasks without blocking the main thread.

Here's an example of how to use coroutines:

```python
import asyncio

async def my_coroutine():
    print("Starting coroutine...")
    await asyncio.sleep(2)
    print("Coroutine finished!")

asyncio.run(my_coroutine())
```

**Output:**

```
Starting coroutine...
Coroutine finished!
```

**Explanation:**

1. The `async` keyword is used to define an async function.
2. The `await` keyword is used to pause the execution of the function until it completes.
3. The `sleep(2)` function blocks the main thread for 2 seconds.
4. After 2 seconds, the `await` keyword resumes the execution of the function.
5. The `print("Coroutine finished!")` statement is executed when the coroutine finishes.

**Benefits of using coroutines:**



### GPU

In [11]:
# GPU (Apple MPS) run: load the same checkpoint onto the "mps" device and
# report how long the load takes.
#
# perf_counter() is a monotonic clock intended for interval measurement,
# unlike time.time(), which follows the adjustable wall clock.
start_time = time.perf_counter()

# Read the Hugging Face access token once from the environment (None if unset).
hf_token_gpu = os.environ.get("hugging_face_token")

# A tokenizer holds no weights to place on a device, so device_map is not
# passed here; the encoded tensors are moved to "mps" after tokenization.
tokenizer_gpu = AutoTokenizer.from_pretrained("google/gemma-2b-it", token=hf_token_gpu)
model_gpu = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2b-it",
    device_map="mps",
    token=hf_token_gpu,
    # Match the CPU run (use_cache=True). With the cache disabled, decoding
    # cannot reuse past key/values, so the two timings are not comparable.
    use_cache=True,
)
print(f"Model loaded in {time.perf_counter() - start_time}s")

Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:11<00:00,  5.88s/it]


Model loaded in 14.285676956176758s


In [12]:
# Prompt used for the MPS run.
# NOTE(review): this wording ("executable example") differs from the CPU
# prompt ("example"), so the two runs are not generating from identical input
# - confirm whether that is intentional.
input_text_gpu = "What is coroutine in Python? Explain with an executable example."
# Tokenize into PyTorch tensors first, then move the encoding to the "mps" device.
encoded_gpu = tokenizer_gpu(text=input_text_gpu, return_tensors="pt")
input_ids_gpu = encoded_gpu.to("mps")

In [13]:
# Sanity check: dimensions of the tokenized prompt tensor.
input_ids_gpu["input_ids"].size()

torch.Size([1, 14])

In [14]:
# Time a single generation on the MPS-loaded model.
# perf_counter() is used instead of time.time() because it is monotonic and
# intended for interval measurement.
inf_start_time = time.perf_counter()
# max_length=1000 caps the total sequence length passed to generate().
outputs_gpu = model_gpu.generate(**input_ids_gpu, max_length=1000)
print(f"Inference completed and output generated in {time.perf_counter() - inf_start_time}s")

KeyboardInterrupt: 

In [None]:
# Decode the MPS run's own result. The previous code decoded `outputs`, which
# is the CPU run's generation, so it would have re-printed the CPU answer
# (or raised NameError on a fresh kernel where only the GPU cells were run).
print(tokenizer_gpu.decode(outputs_gpu[0]))