In [None]:
import os

# API key for the caila.io endpoint used by the ChatOpenAI clients below.
# Prefer the CAILA_API_KEY environment variable so the real key never has to be
# pasted into (and saved with) the notebook; the placeholder is kept as a
# fallback so manual substitution keeps working.
TOKEN = os.environ.get("CAILA_API_KEY", "<YOUR API_KEY>")

# База

In [11]:
from langchain_openai import ChatOpenAI


# Chat client for the Claude 3.5 Sonnet model id, reached through the
# OpenAI-style adapter URL at caila.io.
llm = ChatOpenAI(
    # Where to connect and how to authenticate.
    base_url="https://caila.io/api/adapters/openai",
    api_key=TOKEN,
    # Which model to call and how to sample from it.
    model="just-ai/claude/claude-3-5-sonnet",
    temperature=0,
    max_tokens=None,
    # Client-side request handling.
    timeout=None,
    max_retries=2,
)

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# System message: the target language is filled in through {language}.
system_template = (
    "Translate the following from English into {language}"
)
# User message: a fixed sample phrase with no template variables.
user_template = (
    "Hello everybody"
)

# Two-message chat prompt: the system instruction followed by the user text.
prompt_template = ChatPromptTemplate.from_messages([
    ("system", system_template),
    ("user", user_template),
])

In [9]:
# Render the prompt on its own to inspect the messages it produces.
prompt_template.invoke({"language": "Russian"})

ChatPromptValue(messages=[SystemMessage(content='Translate the following from English into Russian', additional_kwargs={}, response_metadata={}), HumanMessage(content='Hello everybody', additional_kwargs={}, response_metadata={})])

In [10]:
# Pipe the prompt into the model and run the pair once for Russian.
chain = prompt_template | llm
chain.invoke({"language": "Russian"})

AIMessage(content='Здравствуйте все / Привет всем\n\nNote: "Здравствуйте" is more formal, while "Привет" is informal.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 42, 'prompt_tokens': 17, 'total_tokens': 59, 'completion_tokens_details': None, 'prompt_tokens_details': None}, 'model_name': 'claude-3-5-sonnet-20241022', 'system_fingerprint': None, 'id': '5d716e11-5fea-447f-a309-f30b5e5a74a6', 'finish_reason': None, 'logprobs': None}, id='run-951a2f4f-c258-4037-a855-0155bf663a03-0', usage_metadata={'input_tokens': 17, 'output_tokens': 42, 'total_tokens': 59, 'input_token_details': {}, 'output_token_details': {}})

# Генерация кода для Export-шага

In [4]:
# Settings consumed by the ChatOpenAI client built in the following cell.
TEMPERATURE = 0
MODEL = "just-ai/openai-proxy/gpt-4.1"

In [5]:
from langchain_openai import ChatOpenAI


# Chat client for the model named in MODEL, using the same caila.io adapter URL
# and API key as the earlier client.
llm = ChatOpenAI(
    # Where to connect and how to authenticate.
    base_url="https://caila.io/api/adapters/openai",
    api_key=TOKEN,
    # Which model to call and how to sample from it.
    model=MODEL,
    temperature=TEMPERATURE,
    max_tokens=None,
    # Client-side request handling.
    timeout=None,
    max_retries=2,
)

In [14]:
from typing import List, Dict, Optional
from pydantic import BaseModel

class DataSource(BaseModel):
    """Description of one data source that is handed to the code-generation prompt.

    Only ``name``, ``description``, ``data_schema``, ``type`` and ``database``
    are required; every other field defaults to ``None``.

    All optional fields use ``typing.Optional`` so the annotations are written
    in one style and the class can also be created on Python versions that do
    not support the ``X | None`` syntax at runtime.
    """

    # Source name; the prompts in this notebook present it as the table name.
    name: str
    # Free-text description of the source.
    description: str
    # Mapping of column name -> column type.
    data_schema: Dict[str, str]
    # Kind of source: 'table', 'csv', 'api', etc.
    type: str
    # Database engine the source lives in (the notebook uses "ClickHouse" and "PostgreSQL").
    database: str
    # NOTE(review): expected values are not shown in this notebook - confirm with the author.
    access_method: Optional[str] = None
    # Free-text limitations of the source, if any.
    limitations: Optional[str] = None
    # Free-text recommendations for working with the source, if any.
    recommendations: Optional[List[str]] = None
    # NOTE(review): presumably connection settings as string key/value pairs - confirm.
    connection_params: Optional[Dict[str, str]] = None


# Sample source: the "customers" table in ClickHouse.
ds_customers = DataSource(
    name="customers",
    description="Таблица клиентов",
    type="table",
    database="ClickHouse",
    # Column name -> column type.
    data_schema={
        "customer_id": "Int64",
        "name": "Text",
        "position": "Text",
        "Age": "Int64",
        "tariff": "Json",
    },
)

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Role-setting system message; the trailing space is part of the text.
system_template = "You are an experienced Python data engineer writing code for an Airflow DAG. "

# user_template = (
#     "Implement a function def export_data_from_source(**context) -> None for an Airflow DAG.\n"
#     "- The data source has the following properties:\n"
#     "  - database: {database}\n"
#     "  - table name: {name}\n"
#     "  - data schema: {data_schema}\n"
#     "- Use the appropriate Airflow connection for the database type (for example, PostgresHook(\"<name>_source\") for PostgreSQL or ClickHouseHook(\"<name>_source)\" for Clickhouse).\n"
#     "- Use only standard and popular open-source Python libraries (such as pandas, psycopg2). \n"
#     "- Save the extracted data to a file in a suitable format (CSV, JSON, Parquet, etc). Preferably CSV.\n"
#     "- Add a docstring in Russian that describes what the function does.\n"
#     "- Do not add any comments or explanations outside the function code.\n"
#     "- Return only the function code."
# )

# Task description sent as the user message. {data_source} is the only template
# variable and is replaced with the data-source description at invoke time.
user_template = (
    "Implement a function def export_and_load_data_from_source(**context) -> None for an Airflow DAG.\n"
    "- The data source has the following properties: {data_source} \n"
    # Both hook examples are written as well-formed calls (closing quote inside
    # the parenthesis) so they can serve as a pattern for the connection name.
    "- Use the appropriate Airflow connection for the database type (for example, PostgresHook(\"<name>_source\") for PostgreSQL or ClickHouseHook(\"<name>_source\") for Clickhouse).\n"
    "- Use only standard and popular open-source Python libraries (such as pandas, psycopg2). \n"
    "- Save the extracted data to a file in a suitable format (CSV, JSON, Parquet, etc). Preferably CSV.\n"
    "- Add a docstring in Russian that describes what the function does.\n"
    "- Do not add any comments or explanations outside the function code.\n"
    "- Return only the function code."
)


# Two-message chat prompt: the role-setting system text, then the task description.
prompt_template = ChatPromptTemplate.from_messages([
    ("system", system_template),
    ("user", user_template),
])

In [16]:
# Compose the prompt template with the model so one invoke() renders the prompt
# and sends it to the LLM.
chain = prompt_template | llm
# result = chain.invoke(
#     {"database": ds_customers.database,
#      "name": ds_customers.name,
#      "data_schema": ds_customers.data_schema}
# )

# The DataSource object itself is passed as the single template variable.
# NOTE(review): presumably it reaches the prompt in its string form - confirm
# that this is the intended representation.
result = chain.invoke({"data_source": ds_customers})

In [17]:
# Show the raw text of the model reply (the saved output shows the code wrapped
# in a Markdown ```python fence).
result.content

'```python\ndef export_data_from_source(**context) -> None:\n    """\n    Экспортирует данные из источника ClickHouse, используя соответствующий Airflow Hook.\n    Данные из таблицы \'customers\' сохраняются в формате CSV.\n    """\n    from airflow.providers.clickhouse.hooks.clickhouse import ClickHouseHook\n    import pandas as pd\n\n    # Создание подключения к ClickHouse\n    hook = ClickHouseHook(\'clickhouse_default\')\n\n    # SQL запрос для выборки данных\n    sql = "SELECT * FROM customers"\n\n    # Выполнение запроса и загрузка данных в DataFrame\n    df = hook.get_pandas_df(sql)\n\n    # Сохранение данных в CSV файл\n    df.to_csv(\'/path/to/output/customers_data.csv\', index=False)\n```'

In [None]:
import re

def clean_code(code_str: str) -> str:
    """Strip Markdown code fences from an LLM reply and return only the code.

    Every fenced block (```` ``` ````, ```` ```python ````, ```` ```py ```` or any
    other info string, with LF or CRLF line endings) is extracted. Several
    blocks are joined with one blank line between them. If the reply has no
    complete fenced block, the stripped text is returned, with a leading
    unclosed fence marker removed if there is one.

    Args:
        code_str: Raw text returned by the model.

    Returns:
        The code without fence markers and without leading or trailing
        whitespace.
    """
    # Opening fence: three backticks, an optional info string, then a line break.
    fence_open = r"```[^\n`]*\r?\n"
    blocks = re.findall(fence_open + r"(.*?)```", code_str, re.DOTALL)
    if blocks:
        # Several blocks are concatenated with a blank line between them.
        return "\n\n".join(block.strip() for block in blocks)
    # No complete block. An opening fence without a closing one would otherwise
    # leave the fence marker in the result, so drop it.
    return re.sub(r"\A" + fence_open, "", code_str.strip()).strip()

# Remove the Markdown fence from the model reply so only code remains.
cleaned_result = clean_code(result.content)

In [19]:
# Show the extracted code after the fence markers have been removed.
cleaned_result

'def export_data_from_source(**context) -> None:\n    """\n    Экспортирует данные из источника ClickHouse, используя соответствующий Airflow Hook.\n    Данные из таблицы \'customers\' сохраняются в формате CSV.\n    """\n    from airflow.providers.clickhouse.hooks.clickhouse import ClickHouseHook\n    import pandas as pd\n\n    # Создание подключения к ClickHouse\n    hook = ClickHouseHook(\'clickhouse_default\')\n\n    # SQL запрос для выборки данных\n    sql = "SELECT * FROM customers"\n\n    # Выполнение запроса и загрузка данных в DataFrame\n    df = hook.get_pandas_df(sql)\n\n    # Сохранение данных в CSV файл\n    df.to_csv(\'/path/to/output/customers_data.csv\', index=False)'

In [20]:
from pathlib import Path

# The file name ends with the last segment of the model id, with dashes
# replaced by underscores.
output_path = Path(f"results/export/airflow_template_3_{MODEL.split('/')[-1].replace('-', '_')}.py")
# Create results/export/ if it is missing: open(..., "w") alone raises
# FileNotFoundError when the target directory does not exist.
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as f:
    f.write(cleaned_result)

# Собираем цепочку для генерации пайплайна

In [None]:
from langchain_openai import ChatOpenAI

# Model id and sampling temperature for the pipeline-generation chain.
TEMPERATURE = 0
MODEL = "just-ai/openai-proxy/gpt-4.1"

# Chat client for MODEL, using the caila.io adapter URL.
llm = ChatOpenAI(
    # Where to connect and how to authenticate.
    base_url="https://caila.io/api/adapters/openai",
    api_key=TOKEN,
    # Which model to call and how to sample from it.
    model=MODEL,
    temperature=TEMPERATURE,
    max_tokens=None,
    # Client-side request handling.
    timeout=None,
    max_retries=2,
)

In [None]:
from langchain_core.prompts import ChatPromptTemplate

# Role-setting system message; the trailing space is part of the text.
system_template = "You are an experienced Python data engineer writing code for an Airflow DAG. "

# Task description sent as the user message. The template variables are
# {database}, {name} and {data_schema}.
user_template = (
    "Implement a function def export_data_from_source(**context) -> None for an Airflow DAG.\n"
    "- The data source has the following properties:\n"
    "  - database: {database}\n"
    "  - table name: {name}\n"
    "  - data schema: {data_schema}\n"
    # Both hook examples are written as well-formed calls (closing quote inside
    # the parenthesis) so they can serve as a pattern for the connection name.
    "- Use the appropriate Airflow connection for the database type (for example, PostgresHook(\"<name>_source\") for PostgreSQL or ClickHouseHook(\"<name>_source\") for Clickhouse).\n"
    "- Use only standard and popular open-source Python libraries (such as pandas, psycopg2). \n"
    "- Save the extracted data to a file in a suitable format (CSV, JSON, Parquet, etc). Preferably CSV.\n"
    "- Add a docstring in Russian that describes what the function does.\n"
    "- Do not add any comments or explanations outside the function code.\n"
    "- Return only the function code."
)

# Two-message chat prompt: the role-setting system text, then the task description.
prompt_template = ChatPromptTemplate.from_messages([
    ("system", system_template),
    ("user", user_template),
])

In [None]:
from typing import List, Dict, Optional
from pydantic import BaseModel

class DataSource(BaseModel):
    """Description of one data source that is handed to the code-generation prompt.

    Only ``name``, ``description``, ``data_schema``, ``type`` and ``database``
    are required; every other field defaults to ``None``.

    All optional fields use ``typing.Optional`` so the annotations are written
    in one style and the class can also be created on Python versions that do
    not support the ``X | None`` syntax at runtime.
    """

    # Source name; the prompts in this notebook present it as the table name.
    name: str
    # Free-text description of the source.
    description: str
    # Mapping of column name -> column type.
    data_schema: Dict[str, str]
    # Kind of source: 'table', 'csv', 'api', etc.
    type: str
    # Database engine the source lives in (the notebook uses "ClickHouse" and "PostgreSQL").
    database: str
    # NOTE(review): expected values are not shown in this notebook - confirm with the author.
    access_method: Optional[str] = None
    # Free-text limitations of the source, if any.
    limitations: Optional[str] = None
    # Free-text recommendations for working with the source, if any.
    recommendations: Optional[List[str]] = None
    # NOTE(review): presumably connection settings as string key/value pairs - confirm.
    connection_params: Optional[Dict[str, str]] = None

# Sample source: the "customers" table in PostgreSQL, with one recommendation.
ds_customers = DataSource(
    name="customers",
    description="Таблица клиентов",
    type="table",
    database="PostgreSQL",
    # Column name -> column type.
    data_schema={
        "customer_id": "Int64",
        "name": "Text",
        "region_id": "Int64",
        "position": "Text",
        "Age": "Int64",
    },
    # Passed explicitly as None, as in the original call.
    access_method=None,
    limitations=None,
    recommendations=["экспортировать данные ежедневно по ночам"],
)