In [1]:
import os
import time
from pathlib import Path
from dotenv import load_dotenv
from IPython.display import display, Markdown

# The .env file lives one directory above the notebook's working directory.
# str() keeps env_path a plain string path, as before.
env_path = str(Path.cwd().parent / ".env")
# Left as the cell's last expression so its return value is shown as the cell output.
load_dotenv(env_path)

True

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# quantization_config = BitsAndBytesConfig(load_in_8bit=True)

### Function

In [9]:
def colorize_text(text):
    """Highlight the prompt's section labels for Markdown rendering.

    Every ``"Instruction:"`` that is directly followed by a newline is
    replaced by a bold, blue ``<font>`` label followed by a blank line.
    Every ``"Response:"`` that is preceded by a blank line is replaced by a
    bold, green ``<font>`` label (the blank line before it is kept).

    Args:
        text: The string to decorate, e.g. a decoded model output that was
            produced from ``template``.

    Returns:
        A new string with the labels wrapped in markup; text without the
        label patterns is returned unchanged.
    """
    instruction_label = "**<font color='blue'>Instruction:</font>**\n\n"
    response_label = "\n\n**<font color='green'>Response:</font>**"
    highlighted = text.replace("Instruction:\n", instruction_label)
    return highlighted.replace("\n\nResponse:", response_label)


# Prompt layout used for templated inference; `{instruction}` is filled in
# with str.format before tokenization.
template = "Instruction:\n{instruction}\n\nResponse:\n"

### CPU

In [4]:
# Load the Gemma 2B instruction-tuned tokenizer and weights for this notebook's
# CPU section (no device_map is passed) and report how long the load took.
start_time = time.time()
tokenizer = AutoTokenizer.from_pretrained(
    "google/gemma-2b-it",
    token=os.environ.get("hugging_face_token"),  # access token read from the environment
    # device_map="auto",
)
model = AutoModelForCausalLM.from_pretrained(
    "google/gemma-2b-it",
    use_cache=True,
    token=os.environ.get("hugging_face_token"),
    # device_map="auto",
    # quantization_config=quantization_config
)
print(f"Model loaded in {time.time() - start_time}s")

Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:09<00:00,  4.96s/it]


Model loaded in 12.508863925933838s


In [5]:
# Raw question sent to the model without any prompt template.
input_text = "What is coroutine in Python? Explain with an example."
# Tokenize to PyTorch tensors ("pt"). Despite its name, `input_ids` holds the
# tokenizer's full output, which a later cell unpacks with ** into generate().
# The device move is commented out, so no .to(...) is applied here.
input_ids = tokenizer(input_text, return_tensors="pt")#.to("mps")

In [7]:
# Time one generate() call on the un-templated prompt.
inf_start_time = time.time()
# NOTE(review): max_length presumably bounds prompt + generated tokens together
# in Hugging Face generate() -- confirm, or consider max_new_tokens instead.
outputs = model.generate(**input_ids, max_length=1000)
print(f"Inference completed and output generated in {time.time() - inf_start_time}s")

Inference completed and output generated in 449.9799690246582s


In [11]:
# Render the first generated sequence as Markdown. Special tokens are skipped
# because "<bos>"/"<eos>" are not Markdown and would otherwise be emitted into
# the rendered document as raw, unknown HTML-like tags.
display(Markdown(tokenizer.decode(outputs[0], skip_special_tokens=True)))

<bos>What is coroutine in Python? Explain with an example.

Coroutine is a special type of function in Python that allows you to define a function that can be paused and resumed later. This makes it possible to perform long-running tasks without blocking the main thread.

Here's an example of how to use coroutines:

```python
import asyncio

async def my_coroutine():
    print("Starting coroutine...")
    await asyncio.sleep(2)
    print("Coroutine finished!")

asyncio.run(my_coroutine())
```

**Output:**

```
Starting coroutine...
Coroutine finished!
```

**Explanation:**

1. The `async` keyword is used to define an async function.
2. The `await` keyword is used to pause the execution of the function until it completes.
3. The `sleep(2)` function blocks the main thread for 2 seconds.
4. After 2 seconds, the `await` keyword resumes the execution of the function.
5. The `print("Coroutine finished!")` statement is executed when the coroutine finishes.

**Benefits of using coroutines:**

* **Non-blocking:** Coroutines do not block the main thread, allowing other tasks to be executed while they are running.
* **Efficient:** Coroutines are more efficient than threads, as they do not need to create new threads for each task.
* **Easy to use:** Coroutines are easy to use, with the `async` and `await` keywords providing a clear way to define and execute them.

**Use cases for coroutines:**

* **Long-running tasks:** Coroutines can be used to perform long-running tasks without blocking the main thread.
* **UI updates:** Coroutines can be used to perform UI updates without blocking the main thread.
* **Asynchronous operations:** Coroutines can be used to perform asynchronous operations, such as network requests.<eos>

In [12]:
# Plain-text view of the same generation; special tokens such as <bos> are
# left in the decoded string here.
print(tokenizer.decode(outputs[0]))

<bos>What is coroutine in Python? Explain with an example.

Coroutine is a special type of function in Python that allows you to define a function that can be paused and resumed later. This makes it possible to perform long-running tasks without blocking the main thread.

Here's an example of how to use coroutines:

```python
import asyncio

async def my_coroutine():
    print("Starting coroutine...")
    await asyncio.sleep(2)
    print("Coroutine finished!")

asyncio.run(my_coroutine())
```

**Output:**

```
Starting coroutine...
Coroutine finished!
```

**Explanation:**

1. The `async` keyword is used to define an async function.
2. The `await` keyword is used to pause the execution of the function until it completes.
3. The `sleep(2)` function blocks the main thread for 2 seconds.
4. After 2 seconds, the `await` keyword resumes the execution of the function.
5. The `print("Coroutine finished!")` statement is executed when the coroutine finishes.

**Benefits of using coroutines:**



In [16]:
# Inference using the prompt template: the same question as the un-templated
# run, wrapped in the Instruction/Response layout before tokenization.
# The timer starts before formatting and tokenization, so both are included
# in the reported duration along with generate().
prompt_inf_start_time = time.time()
instruction = "What is coroutine in Python? Explain with an example."
prompt = template.format(instruction=instruction)
# The device move is commented out, so no .to(...) is applied here.
prompt_input_ids = tokenizer(prompt, return_tensors="pt")#.to("mps")
# Uses `model`, i.e. the model loaded in the CPU section.
out = model.generate(**prompt_input_ids, max_length=1000)
print(f"Inference completed and output generated in {time.time() - prompt_inf_start_time}s")

Inference completed and output generated in 0.00023293495178222656s


In [18]:
# Render the templated generation as Markdown with colored section labels.
# Special tokens are skipped so "<bos>"/"<eos>" do not end up in the rendered
# document as raw, unknown HTML-like tags.
display(Markdown(colorize_text(tokenizer.decode(out[0], skip_special_tokens=True))))

<bos>**<font color='blue'>Instruction:</font>**

What is coroutine in Python? Explain with an example.

**<font color='green'>Response:</font>**
Coroutine is a special type of function in Python that allows you to run multiple functions concurrently without blocking the main thread. This means that you can perform long-running tasks without slowing down the UI or other parts of the application.

Here's an example of how to use coroutines:

```python
import asyncio

async def long_running_task():
    print("Starting long-running task...")
    await asyncio.sleep(2)
    print("Long-running task finished!")

async def main():
    await long_running_task()

asyncio.run(main())
```

**Explanation:**

1. The `async` keyword is used to define an async function.
2. The `async def` syntax defines a function that returns a coroutine object.
3. The `await` keyword is used to pause the execution of the function until it completes.
4. The `await long_running_task()` line starts the `long_running_task` function and pauses the execution of `main` until it finishes.
5. The `asyncio.run(main())` line starts an asynchronous event loop and runs the `main` function on it.
6. The `asyncio.run` function takes an argument `main`, which is the function to run on the event loop.
7. The `main` function uses the `async` keyword to define an async function called `long_running_task`.
8. The `await` keyword is used to pause the execution of `main` until the `long_running_task` function finishes.
9. The `asyncio.run` function starts an asynchronous event loop and runs the `main` function on it.

**Output:**

When you run the code, you will see the following output:

```
Starting long-running task...
Long-running task finished!
```

This shows that the `long_running_task` function was executed concurrently with the `main` function, without blocking the main thread.<eos>

### GPU

In [11]:
# Load a second tokenizer/model pair for the GPU run and report the load time.
start_time = time.time()
# The tokenizer is loaded exactly as in the CPU section: it takes no device
# placement here; the encoded tensors are moved with .to("mps") when the
# prompt is tokenized.
tokenizer_gpu = AutoTokenizer.from_pretrained("google/gemma-2b-it",
                                              token=os.environ.get("hugging_face_token")
                                             )
model_gpu = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it",
                                                 # Place the weights on the "mps" device.
                                                 device_map="mps",
                                                 token=os.environ.get("hugging_face_token"),
                                                 # Same setting as the CPU model so the two runs are
                                                 # like-for-like. NOTE(review): use_cache=False may
                                                 # also make each decoding step recompute the whole
                                                 # sequence -- confirm for the installed version.
                                                 use_cache=True
                                                 # quantization_config=quantization_config
                                                )
print(f"Model loaded in {time.time() - start_time}s")

Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:11<00:00,  5.88s/it]


Model loaded in 14.285676956176758s


In [12]:
# Prompt for the GPU run.
# NOTE(review): the wording differs from the CPU prompt ("an executable
# example" vs. "an example"), so outputs and timings are not like-for-like.
input_text_gpu = "What is coroutine in Python? Explain with an executable example."
# Tokenize to PyTorch tensors and move them to the "mps" device, where
# model_gpu was placed.
input_ids_gpu = tokenizer_gpu(input_text_gpu, return_tensors="pt").to("mps")

In [13]:
# Inspect the token-id tensor's shape -- presumably (batch, prompt length in tokens).
input_ids_gpu['input_ids'].shape

torch.Size([1, 14])

In [14]:
# Time one generate() call on the GPU model. This reuses (overwrites) the
# inf_start_time name from the CPU timing cell.
inf_start_time = time.time()
# NOTE(review): max_length presumably bounds prompt + generated tokens together
# in Hugging Face generate() -- confirm, or consider max_new_tokens instead.
outputs_gpu = model_gpu.generate(**input_ids_gpu, max_length=1000)
print(f"Inference completed and output generated in {time.time() - inf_start_time}s")

KeyboardInterrupt: 

In [None]:
# Decode the GPU model's generation. `outputs_gpu` is the result of
# model_gpu.generate(); `outputs` belongs to the CPU run and would silently
# print the wrong model's answer.
print(tokenizer_gpu.decode(outputs_gpu[0]))