# **Установка и импорт библиотек**

In [1]:
# Установка OpenAI SDK
!pip install -q openai

In [2]:
# Импорт необходимых библиотек
import time
from openai import OpenAI
from google.colab import userdata

# **Загрузка API-ключа и инициализация клиента**

In [5]:
# Read the API key from Google Colab Secrets (stored under "OPENAI_API_KEY")
API_KEY = userdata.get("OPENAI_API_KEY")

# Create the OpenAI client; call_model() sends every request through it
client = OpenAI(api_key=API_KEY)

# **Конфигурация моделей и параметров**
### **Назначение:** централизованная настройка моделей, промптов и оценки стоимости

In [6]:
# Per-role model settings, keyed by the role name passed to call_model().
# NOTE: price_* values are ESTIMATES used only to compare strategies,
# they are not real billing figures.

CONFIG = dict(
    # Classifier that labels a query as simple or hard
    intent=dict(
        model="gpt-4o-mini",
        system_prompt=(
            "You are a helpful assistant. "
            "Classify the following user query as either 'simple' or 'hard'. "
            "Return ONLY one word: simple or hard."
        ),
        max_tokens=5,  # the prompt asks for a single word
        price_input=0.15,
        price_output=0.60,
    ),
    # Answering model for queries classified as simple
    easy=dict(
        model="gpt-4o-mini",
        system_prompt="You are a helpful assistant.",
        max_tokens=None,  # no explicit completion-length limit configured
        price_input=0.15,
        price_output=0.60,
    ),
    # Answering model for queries classified as hard
    hard=dict(
        model="gpt-4o",
        system_prompt="You are a helpful assistant.",
        max_tokens=None,  # no explicit completion-length limit configured
        price_input=5.00,
        price_output=15.00,
    ),
)

# **Функция отправки запроса в модель**
### **Назначение:** единая точка общения с LLM, замер latency и оценки стоимости

In [7]:
def call_model(prompt: str, model_type: str):
    """
    Send ``prompt`` to the model configured under ``CONFIG[model_type]``.

    Prints the model's reply and returns the measurements used to compare
    routing strategies.

    Args:
        prompt: User message, sent after the configured system prompt.
        model_type: Key into ``CONFIG`` ("intent", "easy" or "hard").

    Returns:
        A tuple ``(estimated_price, latency, intent)``:
        - estimated_price: token counts multiplied by the configured
          ``price_input`` / ``price_output``; a relative figure for
          comparison, not a billed amount.
        - latency: elapsed seconds spent in the API call.
        - intent: "hard" or "simple" when ``model_type == "intent"``,
          otherwise ``None``.

    Raises:
        KeyError: if ``model_type`` is not a key of ``CONFIG``.
    """
    settings = CONFIG[model_type]

    # perf_counter() is monotonic, so the interval cannot be skewed by
    # system clock adjustments the way time.time() can.
    started = time.perf_counter()

    response = client.chat.completions.create(
        model=settings["model"],
        messages=[
            {"role": "system", "content": settings["system_prompt"]},
            {"role": "user", "content": prompt},
        ],
        max_tokens=settings["max_tokens"],
    )

    latency = time.perf_counter() - started

    # message.content is optional in the SDK response type; treat a missing
    # reply as empty text instead of crashing on None.strip().
    content = (response.choices[0].message.content or "").strip()

    # Intent is only meaningful for the classifier; any reply that does not
    # mention "hard" (including an empty one) is treated as "simple".
    intent = None
    if model_type == "intent":
        intent = "hard" if "hard" in content.lower() else "simple"

    # usage is optional in the SDK response type; count zero tokens when absent.
    usage = response.usage
    tokens_input = usage.prompt_tokens if usage is not None else 0
    tokens_output = usage.completion_tokens if usage is not None else 0

    # NOTE(review): if price_* are meant as USD per 1M tokens, this figure must
    # be divided by 1_000_000 to read as dollars — confirm the intended units.
    estimated_price = (
        tokens_input * settings["price_input"]
        + tokens_output * settings["price_output"]
    )

    print(f"[MODEL: {model_type}] → {content}")

    return estimated_price, latency, intent

# **Тестовые запросы**
### **Назначение:** подготовка запросов разной сложности

In [8]:
# A query that a small model is expected to handle
EASY_QUERY = "Give me a recipe for a classic margarita cocktail."

# A technical systems question intended for the stronger model
HARD_QUERY = (
    "How do memory-mapped files (mmap) provide advantages "
    "over traditional read/write system calls "
    "for certain types of I/O operations?"
)

# **Тестируем классификатор намерений (Intent Classifier)**
### **Назначение:** проверяем, как модель различает сложность задач

In [9]:
print("Testing intent classification:\n")

# Run the classifier on one easy and one hard query; call_model() prints
# the model's reply for each call.
call_model(EASY_QUERY, "intent")
# As the last expression of the cell, this call's return tuple
# (estimated_price, latency, intent) is also shown as the cell output.
call_model(HARD_QUERY, "intent")

Testing intent classification:

[MODEL: intent] → simple
[MODEL: intent] → hard


(10.649999999999999, 1.6125001907348633, 'hard')

# **Используем только мощную модель (baseline)**
### **Назначение:** измеряем latency и стоимость без оптимизации

In [10]:
print("\nUsing ONLY powerful model (gpt-4o):\n")

# Baseline: both queries go straight to the "hard" configuration;
# the intent slot of the result is unused here.
price_1, latency_1, _ = call_model(EASY_QUERY, "hard")
price_2, latency_2, _ = call_model(HARD_QUERY, "hard")

# Aggregate the cost estimate and the elapsed time over the two requests
total_price = sum((price_1, price_2))
total_latency = sum((latency_1, latency_2))

print(
    "\n-------------------------------",
    f"TOTAL PRICE (estimate): {total_price:.2f}",
    f"TOTAL LATENCY: {total_latency:.2f} sec",
    sep="\n",
)


Using ONLY powerful model (gpt-4o):

[MODEL: hard] → A classic margarita is a refreshing and simple cocktail that's perfect for any occasion. Here’s how you can make one:

### Ingredients:
- 2 oz (60 ml) tequila (preferably blanco)
- 1 oz (30 ml) Cointreau or triple sec
- 1 oz (30 ml) freshly squeezed lime juice
- Salt for rimming the glass (optional)
- Lime wedge for garnish
- Ice

### Instructions:

1. **Prepare the Glass:**
   - If you like a salted rim, run a lime wedge around the outer edge of your glass to moisten it.
   - Dip the rim into a small plate of salt, ensuring it's evenly coated. Shake off any excess.

2. **Mix the Cocktail:**
   - Fill a cocktail shaker with ice.
   - Pour the tequila, Cointreau (or triple sec), and freshly squeezed lime juice into the shaker.
   - Shake well until the outside of the shaker feels cold—about 15 to 20 seconds.

3. **Serve:**
   - Fill your prepared glass with ice (if not serving straight up).
   - Strain the mixture from the shaker int

# **Реализация мульти-модельного роутера**
### **Назначение:** автоматический выбор модели на основе intent

In [11]:
def router(query: str):
    """
    Classify the query's difficulty, then answer it with the matching model.

    The query is first sent to the "intent" classifier; a "hard" verdict
    routes it to the "hard" configuration, anything else to "easy".

    Returns:
        A tuple ``(total_price, total_latency)`` summing the classifier
        call and the answering call.
    """

    # Step 1: ask the classifier how difficult the query is
    classify_price, classify_latency, verdict = call_model(query, "intent")

    # Step 2: choose the answering model from the verdict
    use_hard = verdict == "hard"
    target = "hard" if use_hard else "easy"
    print("[ROUTER] Using HARD model\n" if use_hard else "[ROUTER] Using EASY model\n")

    answer_price, answer_latency, _ = call_model(query, target)

    return classify_price + answer_price, classify_latency + answer_latency

# **Используем мульти-модельность**
### **Назначение:** сравнение эффективности маршрутизации

In [12]:
print("\nUsing MULTI-MODEL ROUTER:\n")

# Let the router pick a model per query; each call returns the combined
# (classifier + answer) price estimate and latency.
price_1, latency_1 = router(EASY_QUERY)
price_2, latency_2 = router(HARD_QUERY)

# Aggregate the cost estimate and the elapsed time over the two requests
total_price = sum((price_1, price_2))
total_latency = sum((latency_1, latency_2))

print(
    "\n-------------------------------",
    f"TOTAL PRICE (estimate): {total_price:.2f}",
    f"TOTAL LATENCY: {total_latency:.2f} sec",
    sep="\n",
)


Using MULTI-MODEL ROUTER:

[MODEL: intent] → simple
[ROUTER] Using EASY model

[MODEL: easy] → Certainly! Here’s a classic margarita recipe that you can easily make at home.

### Classic Margarita Recipe

#### Ingredients:
- 2 oz (60 ml) tequila (preferably 100% agave)
- 1 oz (30 ml) fresh lime juice
- 1 oz (30 ml) Cointreau or triple sec
- Salt (for rimming the glass)
- Lime wedge (for garnish)
- Ice

#### Instructions:

1. **Prepare the Glass**: 
   - Rub the lime wedge around the rim of your glass to moisten it. 
   - Dip the rim into a plate of salt to coat it evenly. Set the glass aside.

2. **Mix the Ingredients**: 
   - In a shaker, combine the tequila, fresh lime juice, and Cointreau (or triple sec).
   - Fill the shaker with ice and shake vigorously for about 15-20 seconds until well chilled.

3. **Strain**: 
   - Fill your prepared glass with ice (if desired) and strain the margarita mixture into the glass.

4. **Garnish**: 
   - Garnish with a lime wedge on the rim of the g