In [1]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so later
# os.getenv() lookups (e.g. GEMMA_MODEL_4B_PATH) can resolve.
load_dotenv()

True

In [2]:
import nest_asyncio
import uuid
import torch
from vllm import AsyncEngineArgs, AsyncLLMEngine, SamplingParams
from transformers import AutoTokenizer
from pprint import pprint
import os
import json
import re

from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct

from langgraph.graph import StateGraph
from langsmith import traceable

from typing_extensions import TypedDict 

from pydantic import BaseModel, Field
from typing import List, Dict, Optional, Annotated

from transformers import AutoTokenizer, AutoModel


# Prevent event-loop conflicts: patch asyncio so it can be re-entered from
# inside the notebook, where an event loop is already running.
nest_asyncio.apply()

INFO 05-18 15:14:42 [__init__.py:239] Automatically detected platform cuda.


In [3]:
# Local checkpoint directory of the Gemma-3 4B instruct model, supplied through
# the environment (populated from .env by load_dotenv()).
gemma = os.getenv("GEMMA_MODEL_4B_PATH")
if not gemma:
    # Fail fast with an actionable message instead of handing model=None to vLLM.
    raise RuntimeError(
        "GEMMA_MODEL_4B_PATH is not set; define it in .env or export it "
        "before starting the kernel."
    )

# Load the tokenizer from the same checkpoint the engine serves, so the two
# cannot drift apart and no hub download / gated-repo login is needed.
tokenizer = AutoTokenizer.from_pretrained(gemma)

# NOTE(review): the startup log reports a GPU KV cache of 11,024 tokens, which is
# smaller than max_model_len (16,384) -- a single request near the length limit
# will not fit entirely in cache. Consider lowering max_model_len; left unchanged
# here to preserve current behaviour.
engine_args = AsyncEngineArgs(
    model=gemma,
    tensor_parallel_size=1,        # single GPU
    gpu_memory_utilization=0.95,   # fraction of GPU memory vLLM may claim
    max_model_len=16384,           # maximum prompt + completion length
    max_num_batched_tokens=8192,   # per-step token budget (chunked prefill)
)
llm = AsyncLLMEngine.from_engine_args(engine_args)

INFO 05-18 15:14:56 [config.py:600] This model supports multiple tasks: {'reward', 'score', 'generate', 'embed', 'classify'}. Defaulting to 'generate'.
INFO 05-18 15:14:57 [config.py:1780] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 05-18 15:15:01 [core.py:61] Initializing a V1 LLM engine (v0.8.3) with config: model='/home/a01088415234/models/gemma-3-4b-it/', speculative_config=None, tokenizer='/home/a01088415234/models/gemma-3-4b-it/', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=16384, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


INFO 05-18 15:15:12 [gpu_model_runner.py:1258] Starting to load model /home/a01088415234/models/gemma-3-4b-it/...
INFO 05-18 15:15:12 [config.py:3334] cudagraph sizes specified by model runner [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, 352, 360, 368, 376, 384, 392, 400, 408, 416, 424, 432, 440, 448, 456, 464, 472, 480, 488, 496, 504, 512] is overridden by config [512, 384, 256, 128, 4, 2, 1, 392, 264, 136, 8, 400, 272, 144, 16, 408, 280, 152, 24, 416, 288, 160, 32, 424, 296, 168, 40, 432, 304, 176, 48, 440, 312, 184, 56, 448, 320, 192, 64, 456, 328, 200, 72, 464, 336, 208, 80, 472, 344, 216, 88, 120, 480, 352, 248, 224, 96, 488, 504, 360, 232, 104, 496, 368, 240, 112, 376]


Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 05-18 15:15:15 [loader.py:447] Loading weights took 2.59 seconds
INFO 05-18 15:15:15 [gpu_model_runner.py:1273] Model loading took 8.5828 GiB and 2.872397 seconds
INFO 05-18 15:15:15 [gpu_model_runner.py:1542] Encoder cache will be initialized with a budget of 8192 tokens, and profiled with 32 image items of the maximum feature size.
INFO 05-18 15:15:38 [backends.py:416] Using cache directory: /home/a01088415234/.cache/vllm/torch_compile_cache/7948162d1b/rank_0_0 for vLLM's torch.compile
INFO 05-18 15:15:38 [backends.py:426] Dynamo bytecode transform time: 16.94 s
INFO 05-18 15:15:39 [backends.py:115] Directly load the compiled graph for shape None from the cache
INFO 05-18 15:15:52 [monitor.py:33] torch.compile takes 16.94 s in total
INFO 05-18 15:15:54 [kv_cache_utils.py:578] GPU KV cache size: 11,024 tokens
INFO 05-18 15:15:54 [kv_cache_utils.py:581] Maximum concurrency for 16,384 tokens per request: 0.67x
INFO 05-18 15:16:30 [gpu_model_runner.py:1608] Graph capturing finished 

In [4]:
# Decoding settings shared by every generation request issued through llm_engine().
sampling_params = SamplingParams(
    temperature=0.6,
    top_p=0.7,
    repetition_penalty=1.1,
    max_tokens=4096,
    stop=["<eos>"],
)


# Asynchronous LLM call (non-streaming)
async def llm_engine(prompt: str) -> str:
    """Generate a completion for ``prompt`` and return only the final text.

    The request is submitted to the module-level ``llm`` engine with the shared
    ``sampling_params`` and a fresh UUID as its request id.

    Args:
        prompt: Fully formatted prompt string passed to the engine as-is.

    Returns:
        The text of the first candidate from the last output the engine
        yielded, or an empty string if the engine yielded nothing.
    """
    stream = llm.generate(
        prompt=prompt,
        sampling_params=sampling_params,
        request_id=str(uuid.uuid4()),
    )

    latest_text = None
    async for chunk in stream:
        # Each yielded output simply replaces the previous one; nothing is appended.
        latest_text = chunk.outputs[0].text

    return "" if latest_text is None else latest_text



async def llm_til(prompt: str) -> str:
    """Best-effort wrapper around ``llm_engine`` that never raises.

    Args:
        prompt: Prompt string forwarded unchanged to ``llm_engine``.

    Returns:
        The generated text on success. If ``llm_engine`` raises any
        ``Exception``, the error is printed and a fixed placeholder string is
        returned instead so callers can keep running.
    """
    try:
        return await llm_engine(prompt)
    except Exception as e:
        # Report the failure, then fall through to the placeholder result.
        print(f"❌ LLM 호출 실패: {e}")
    return "[LLM 호출 실패]"

In [6]:
# Read the JSON input file (a record with username / date / repo / files keys)
# and display it as the cell output.
DIA_JSON_PATH = "/home/a01088415234/models/data/dia_230520.json"
with open(DIA_JSON_PATH, encoding="utf-8") as file:
    data = json.loads(file.read())
data

{'username': 'kingwhangzang',
 'date': '2023-05-22',
 'repo': 'IPS',
 'files': [{'filepath': 'Map.js',
   'latest_code': 'import React, { useEffect, useState } from \'react\';\nimport { NavLink } from \'react-router-dom\';\nimport ReviewAndInfo from \'./ReviewAndInfo\'; \n\nconst { kakao } = window;\n\nfunction Map() {\n  const [selectedStore, setSelectedStore] = useState(null);\n\n  const handleLogout = () => {\n    // Logout logic implementation\n  };\n\n  useEffect(() => {\n    const container = document.getElementById(\'map\');\n    const options = {\n      center: new kakao.maps.LatLng(37.365264512305174, 127.10676860117488),\n      level: 3\n    };\n    const map = new kakao.maps.Map(container, options);\n\n    const markerPosition = new kakao.maps.LatLng(37.56000302825312, 126.97540593203321);\n    const marker = new kakao.maps.Marker({\n      position: markerPosition\n    });\n\n    kakao.maps.event.addListener(marker, \'click\', function() {\n      setSelectedStore({\n        