In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# ⚡ AGI Voice Agent - GPU & CPU Optimization (Notebook 23)

### Objective:
- Maximize performance on both CPU & GPU
- Enable device selection (CUDA or CPU)
- Reduce latency for inference






✅ 1. Detect and Configure Device

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)




✅ 2. Load Models with Device Assignment

Whisper (for Voice-to-Text):

import whisper

whisper_model = whisper.load_model("base").to(device)

Transformers (Reasoning Model):

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)




✅ 3. Efficient Inference with FP16 (GPU)

model.half()  # Reduces memory and speeds up on GPU

input_ids = tokenizer("Define artificial general intelligence.", return_tensors='pt').input_ids.to(device)
outputs = model.generate(input_ids, max_length=100)
response = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(response)



✅ 4. Optimize CPU with ONNX Runtime

pip install onnx onnxruntime

Convert transformer to ONNX:

from transformers.onnx import export
from pathlib import Path

export(
    preprocessor=tokenizer,
    model=model,
    config=model.config,
    opset=12,
    output=Path("gpt2.onnx"),
    tokenizer=tokenizer
)

Run with ONNX Runtime:

import onnxruntime as ort

session = ort.InferenceSession("gpt2.onnx")




✅ 5. Benchmarking CPU vs GPU

import time

start = time.time()
_ = model.generate(input_ids, max_length=100)
end = time.time()
print(f"Inference Time: {end - start:.2f} seconds on {device}")




✅ 6. Bonus: Parallel GPU Inference with DeepSpeed / Huggingface Accelerate

pip install deepspeed accelerate

Initialize with DeepSpeed configs for multi-GPU inference.

SyntaxError: invalid character '✅' (U+2705) (1684979588.py, line 13)