In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [2]:
from unsloth import FastLanguageModel
import torch

# 모델 설정 파라미터
max_seq_length = 4096 # 원하는 길이로 설정! 내부적으로 RoPE 스케일링을 자동 지원합니다!
dtype = None # 자동 감지용. Tesla T4, V100는 Float16, Ampere+는 Bfloat16 사용
load_in_4bit = True # 메모리 사용량을 줄이기 위해 4비트 양자화 사용. False로 설정 가능

# 4비트 사전 양자화된 모델 목록 - 4배 빠른 다운로드와 메모리 부족 현상 방지
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2배 더 빠름
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # 405B 모델용 4비트!
    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22B 2배 더 빠름!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2배 더 빠름!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2배 더 빠름!

    "unsloth/Llama-3.2-1B-bnb-4bit",           # 신규! Llama 3.2 모델
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",

    "unsloth/Llama-3.3-70B-Instruct-bnb-4bit" # 신규! Llama 3.3 70B!
] # 더 많은 모델은 https://huggingface.co/unsloth 에서 확인 가능

# 모델 및 토크나이저 로딩
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-3B-Instruct", # 또는 "unsloth/Llama-3.2-1B-Instruct" 선택 가능
    max_seq_length = max_seq_length,
    dtype = dtype,
    #load_in_4bit = load_in_4bit,
    load_in_4bit = False,  # ⚠️ 이게 핵심!
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 04-14 06:47:15 [__init__.py:239] Automatically detected platform cuda.
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.2. vLLM: 0.8.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/6.43G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [3]:
# PEFT(Parameter-Efficient Fine-Tuning) 모델 설정
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # 0보다 큰 숫자 선택! 추천값: 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # 모든 값 지원, 0이 최적화됨
    bias = "none",    # 모든 값 지원, "none"이 최적화됨
    # [신규] "unsloth"는 VRAM 사용량을 30% 줄이고 배치 크기를 2배로 늘립니다!
    use_gradient_checkpointing = "unsloth", # 매우 긴 컨텍스트의 경우 True 또는 "unsloth" 사용
    random_state = 3407,
    use_rslora = False,  # rank stabilized LoRA 지원
    loftq_config = None, # LoftQ 지원
)


Unsloth 2025.3.19 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [4]:

# 데이터셋 로딩 및 전처리
from datasets import load_dataset
from unsloth.chat_templates import get_chat_template, standardize_sharegpt

tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

# 데이터셋 구조 확인을 위한 디버깅 코드 추가
print("데이터셋 구조 확인 중...")
dataset = load_dataset("mlabonne/FineTome-100k", split="train")
print("첫 번째 샘플의 키:", dataset[0].keys())
print("데이터셋 구조:", dataset)

# 3. 각 데이터셋을 ShareGPT 포맷으로 정제
def convert_finetome(example):
    # 데이터셋 구조에 맞게 수정
    conversations = example.get("conversations", [])
    if not conversations:
        conversations = [
            {"from": "user", "value": example.get("instruction", example.get("input", ""))},
            {"from": "assistant", "value": example.get("output", example.get("response", ""))},
        ]
    return {"conversations": conversations}

def convert_webgpt(example):
    """WebGPT 데이터셋을 ShareGPT 형식으로 변환"""
    try:
        # 질문 처리
        question = example.get("question", {})
        if isinstance(question, dict):
            question_text = question.get("full_text", "")
        else:
            question_text = str(question)

        # 인용문 처리
        quotes_0 = example.get("quotes_0", {})
        quotes_text = ""
        if isinstance(quotes_0, dict):
            titles = quotes_0.get("title", [])
            extracts = quotes_0.get("extract", [])
            if titles and extracts:
                quotes_text = "\n".join([
                    f"Title: {title}\nExtract: {extract}"
                    for title, extract in zip(titles, extracts)
                ])

        # 질문과 인용문 결합
        full_question = f"{question_text}"
        if quotes_text:
            full_question += f"\n\n[Web Results]\n{quotes_text}"

        # 답변 처리
        answer = example.get("answer_0", "")
        if not answer:
            answer = example.get("answer_1", "")

        # 빈 대화 필터링
        if not full_question.strip() and not answer.strip():
            return None

        return {
            "conversations": [
                {"from": "user", "value": full_question},
                {"from": "assistant", "value": answer}
            ]
        }
    except Exception as e:
        print(f"변환 중 오류 발생: {e}")
        print("예시 데이터:", example)
        return None

def convert_longalign(example):
    # LongAlign-10k 데이터셋 구조에 맞게 수정
    conversations = example.get("conversations", [])
    if not conversations:
        conversations = [
            {"from": "user", "value": example.get("instruction", example.get("input", ""))},
            {"from": "assistant", "value": example.get("output", example.get("response", ""))},
        ]
    return {"conversations": conversations}

def convert_longbench(example):
    # LongBench-v2 데이터셋 구조에 맞게 수정
    question = example.get("question", "")
    context = example.get("context", "")
    if context:
        question = f"{context}\n\n{question}"

    answer = example.get("answer", "")
    if not answer and "choice" in example:
        # 객관식 답변 처리
        choices = [v for k, v in example.items() if k.startswith("choice_")]
        answer = "\n".join(f"{chr(65+i)}. {choice}" for i, choice in enumerate(choices))
        answer += f"\n정답: {example.get('answer', '')}"

    return {
        "conversations": [
            {"from": "user", "value": question},
            {"from": "assistant", "value": answer},
        ]
    }

# 4. 데이터셋 로딩 및 변환
print("데이터셋 로딩 및 변환 시작...")
try:
    ds1 = load_dataset("mlabonne/FineTome-100k", split="train").map(convert_finetome)
    print("FineTome 데이터셋 변환 완료")
except Exception as e:
    print(f"FineTome 데이터셋 변환 중 오류 발생: {e}")
    print("데이터셋 구조를 확인하세요.")
    raise

try:
    # WebGPT 데이터셋 구조 확인
    print("\nWebGPT 데이터셋 구조 확인 중...")
    webgpt_dataset = load_dataset("openai/webgpt_comparisons", split="train")
    print("첫 번째 샘플의 키:", webgpt_dataset[0].keys())
    print("첫 번째 샘플:", webgpt_dataset[0])

    ds2 = webgpt_dataset.map(convert_webgpt)
    print("WebGPT 데이터셋 변환 완료")
except Exception as e:
    print(f"WebGPT 데이터셋 변환 중 오류 발생: {e}")
    print("데이터셋 구조:", webgpt_dataset)
    ds2 = None

try:
    # LongAlign-10k 데이터셋 로딩
    ds3 = load_dataset("THUDM/LongAlign-10k", split="train").map(convert_longalign)
    print("LongAlign-10k 데이터셋 변환 완료")
except Exception as e:
    print(f"LongAlign-10k 데이터셋 변환 중 오류 발생: {e}")
    ds3 = None

try:
    # LongBench-v2 데이터셋 로딩
    ds4 = load_dataset("THUDM/LongBench-v2", split="train").map(convert_longbench)
    print("LongBench-v2 데이터셋 변환 완료")
except Exception as e:
    print(f"LongBench-v2 데이터셋 변환 중 오류 발생: {e}")
    ds4 = None

# 데이터셋 병합
from datasets import concatenate_datasets
try:
    datasets_to_merge = [ds1]
    if ds2 is not None:
        datasets_to_merge.append(ds2)
    if ds3 is not None:
        datasets_to_merge.append(ds3)
    if ds4 is not None:
        datasets_to_merge.append(ds4)

    # 각 데이터셋의 구조를 통일
    for ds in datasets_to_merge:
        ds = ds.map(lambda x: {"conversations": x["conversations"]})

    dataset = concatenate_datasets(datasets_to_merge)
    print("데이터셋 병합 완료")
    print(f"병합된 데이터셋 크기: {len(dataset)}")
except Exception as e:
    print(f"데이터셋 병합 중 오류 발생: {e}")
    dataset = ds1


데이터셋 구조 확인 중...
첫 번째 샘플의 키: dict_keys(['conversations', 'source', 'score'])
데이터셋 구조: Dataset({
    features: ['conversations', 'source', 'score'],
    num_rows: 100000
})
데이터셋 로딩 및 변환 시작...
FineTome 데이터셋 변환 완료

WebGPT 데이터셋 구조 확인 중...
첫 번째 샘플의 키: dict_keys(['question', 'quotes_0', 'answer_0', 'tokens_0', 'score_0', 'quotes_1', 'answer_1', 'tokens_1', 'score_1'])
첫 번째 샘플: {'question': {'dataset': 'triviaqa', 'id': '18c654a169eb80287f4353d33e701b1c', 'full_text': 'Voiced by Harry Shearer, what Simpsons character was modeled after Ted Koppel?'}, 'quotes_0': {'title': ['Kent Brockman (en.wikipedia.org)', 'Krusty the Clown (en.wikipedia.org)'], 'extract': ['Kent Brockman is a fictional character in the animated television series The Simpsons. He is voiced by Harry Shearer and first appeared in the episode "Krusty Gets Busted". He is a grumpy, self-centered local Springfield news anchor.', "Krusty was created by cartoonist Matt Groening and partially inspired by Rusty Nails, a television clow

In [5]:

# 5. ShareGPT 스타일 표준화
print("ShareGPT 스타일 표준화 시작...")
try:
    # 데이터셋 구조 확인
    print("데이터셋 샘플 확인:", dataset[0])

    # 빈 대화 필터링
    dataset = dataset.filter(lambda x: any(msg["value"].strip() for msg in x["conversations"]))

    dataset = standardize_sharegpt(dataset)
    print("ShareGPT 스타일 표준화 완료")
except Exception as e:
    print(f"ShareGPT 스타일 표준화 중 오류 발생: {e}")
    raise

ShareGPT 스타일 표준화 시작...
데이터셋 샘플 확인: {'conversations': [{'from': 'human', 'value': 'Explain what boolean operators are, what they do, and provide examples of how they can be used in programming. Additionally, describe the concept of operator precedence and provide examples of how it affects the evaluation of boolean expressions. Discuss the difference between short-circuit evaluation and normal evaluation in boolean expressions and demonstrate their usage in code. \n\nFurthermore, add the requirement that the code must be written in a language that does not support short-circuit evaluation natively, forcing the test taker to implement their own logic for short-circuit evaluation.\n\nFinally, delve into the concept of truthiness and falsiness in programming languages, explaining how it affects the evaluation of boolean expressions. Add the constraint that the test taker must write code that handles cases where truthiness and falsiness are implemented differently across different programmi

In [6]:
# 6. text 필드 생성 (chat template 적용)
def formatting_prompts_func(examples):
    try:
        texts = [
            tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
            for convo in examples["conversations"]
        ]
        return { "text": texts }
    except Exception as e:
        print(f"프롬프트 포맷팅 중 오류 발생: {e}")
        print("예시 데이터:", examples)
        raise

print("프롬프트 포맷팅 시작...")
dataset = dataset.map(formatting_prompts_func, batched=True)
print("데이터셋 전처리 완료")

프롬프트 포맷팅 시작...


Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

데이터셋 전처리 완료


In [7]:
# 학습 설정
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    data_collator = DataCollatorForSeq2Seq(tokenizer = tokenizer),
    dataset_num_proc = 2,
    packing = False, # 짧은 시퀀스의 경우 학습 속도를 5배까지 향상시킬 수 있음
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        # num_train_epochs = 1, # 전체 학습을 위해 이 값을 설정
        max_steps = 100,
        learning_rate = 2e-4,
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # WandB 등에 사용
    ),
)

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/100000 [00:00<?, ? examples/s]

In [8]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|start_header_id|>user<|end_header_id|>\n\n",
    response_part = "<|start_header_id|>assistant<|end_header_id|>\n\n",
)

# 입력과 레이블 디코딩 확인
tokenizer.decode(trainer.train_dataset[5]["input_ids"])
space = tokenizer(" ", add_special_tokens = False).input_ids[0]
tokenizer.decode([space if x == -100 else x for x in trainer.train_dataset[5]["labels"]])

# @title 현재 GPU 메모리 상태 출력
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"사용 중인 GPU: {gpu_stats.name}, 최대 메모리: {max_memory} GB")
print(f"학습 시작 시점에 예약된 메모리: {start_gpu_memory} GB")

# 학습 실행
trainer_stats = trainer.train()

# @title 최종 GPU 메모리 사용량 및 학습 시간 출력
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(f"총 학습 시간: {trainer_stats.metrics['train_runtime']} 초")
print(f"총 학습 시간(분 단위): {round(trainer_stats.metrics['train_runtime']/60, 2)} 분")
print(f"최대 예약 메모리 사용량: {used_memory} GB")
print(f"LoRA 학습에 사용된 추가 메모리: {used_memory_for_lora} GB")
print(f"전체 GPU 메모리 대비 최대 예약 메모리 비율: {used_percentage} %")
print(f"전체 GPU 메모리 대비 LoRA 학습 메모리 비율: {lora_percentage} %")


Map (num_proc=12):   0%|          | 0/100000 [00:00<?, ? examples/s]

사용 중인 GPU: NVIDIA A100-SXM4-40GB, 최대 메모리: 39.557 GB
학습 시작 시점에 예약된 메모리: 6.779 GB


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 100,000 | Num Epochs = 1 | Total steps = 100
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 24,313,856/3,237,063,680 (0.75% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,0.7697
2,0.8259
3,1.0793
4,0.8836
5,0.7601
6,0.9334
7,0.6156
8,0.9874
9,0.849
10,0.749


총 학습 시간: 154.8161 초
총 학습 시간(분 단위): 2.58 분
최대 예약 메모리 사용량: 7.426 GB
LoRA 학습에 사용된 추가 메모리: 0.647 GB
전체 GPU 메모리 대비 최대 예약 메모리 비율: 18.773 %
전체 GPU 메모리 대비 LoRA 학습 메모리 비율: 1.636 %


In [15]:

# 3. 저장 디렉토리 세팅
drive_dir = "/content/drive/MyDrive/unsloth_models/Llama_3.2_3B_test/"
os.makedirs(drive_dir, exist_ok=True)

from peft import PeftModel

print(type(model))
# 또는
print(model.__class__)

# 4. 모델 저장
# base_model = model.base_model
# base_model.save_pretrained(f"{drive_dir}/base_model_saved")
# tokenizer.save_pretrained(f"{drive_dir}/base_model_saved")

model.save_pretrained(f"{drive_dir}/lora_model")
tokenizer.save_pretrained(f"{drive_dir}/lora_model")

model.save_pretrained_merged(f"{drive_dir}/merged_16bit", tokenizer, save_method="merged_16bit")
model.save_pretrained_merged(f"{drive_dir}/merged_4bit", tokenizer, save_method="merged_4bit_forced")

model.save_pretrained_gguf(f"{drive_dir}/gguf_f16", tokenizer, quantization_method="f16")
model.save_pretrained_gguf(f"{drive_dir}/gguf_q4_k_m", tokenizer, quantization_method="q4_k_m")

# GGUF 모델은 vLLM이나 HuggingFace `transformers`에서 직접 로드 불가.
# GGUF용 추론은 llama.cpp 또는 koboldcpp에서 처리 필요 (ex. llama-cpp-python). 하단 참조.


<class 'transformers.models.llama.modeling_llama.LlamaForCausalLM'>
<class 'transformers.models.llama.modeling_llama.LlamaForCausalLM'>


In [10]:
!pip install -U "vllm[triton]" accelerate
!pip install -U peft transformersw

Collecting xformers==0.0.29.post2 (from vllm[triton])
  Using cached xformers-0.0.29.post2-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Using cached xformers-0.0.29.post2-cp311-cp311-manylinux_2_28_x86_64.whl (44.3 MB)
Installing collected packages: xformers
  Attempting uninstall: xformers
    Found existing installation: xformers 0.0.29.post3
    Uninstalling xformers-0.0.29.post3:
      Successfully uninstalled xformers-0.0.29.post3
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
unsloth 2025.3.19 requires tyro, which is not installed.
unsloth 2025.3.19 requires protobuf<4.0.0, but you have protobuf 5.29.4 which is incompatible.
unsloth 2025.3.19 requires trl!=0.15.0,!=0.9.0,!=0.9.1,!=0.9.2,!=0.9.3,<=0.15.2,>=0.7.9, but you have trl 0.16.1 which is incompatible.[0m[31m
[0mSuccessfully installed xformers-0.0.29.post2


[31mERROR: Could not find a version that satisfies the requirement transformersw (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for transformersw[0m[31m
[0m

In [16]:
!python3 -m vllm.entrypoints.openai.api_server \
  --model {drive_dir}lora_model \
  --dtype float16 \
  --port 8000 \
  --gpu-memory-utilization 0.9 \
  --max-model-len 4096

INFO 04-14 07:06:00 [__init__.py:239] Automatically detected platform cuda.
2025-04-14 07:06:00.451591: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744614360.473861   80585 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744614360.480660   80585 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
INFO 04-14 07:06:04 [api_server.py:1034] vLLM API server version 0.8.3
INFO 04-14 07:06:04 [api_server.py:1035] args: Namespace(host=None, port=8000, uvicorn_log_level='info', disable_uvicorn_access_log=False, allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key=None, lora_modules=None, prompt_adapters=None, ch

In [17]:
!python3 -m vllm.entrypoints.openai.api_server \
  --model {drive_dir}merged_16bit \
  --dtype float16 \
  --port 8000 \
  --gpu-memory-utilization 0.9 \
  --max-model-len 4096

INFO 04-14 07:13:32 [__init__.py:239] Automatically detected platform cuda.
2025-04-14 07:13:32.525549: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1744614812.548189   82762 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744614812.554963   82762 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
INFO 04-14 07:13:36 [api_server.py:1034] vLLM API server version 0.8.3
INFO 04-14 07:13:36 [api_server.py:1035] args: Namespace(host=None, port=8000, uvicorn_log_level='info', disable_uvicorn_access_log=False, allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key=None, lora_modules=None, prompt_adapters=None, ch

In [18]:
!wget https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O cloudflared
!chmod +x cloudflared

# cloudflared 터널 시작
!./cloudflared tunnel --url http://localhost:8000

--2025-04-14 07:17:07--  https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64
Resolving github.com (github.com)... 20.205.243.166
Connecting to github.com (github.com)|20.205.243.166|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/cloudflare/cloudflared/releases/download/2025.4.0/cloudflared-linux-amd64 [following]
--2025-04-14 07:17:07--  https://github.com/cloudflare/cloudflared/releases/download/2025.4.0/cloudflared-linux-amd64
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/106867604/f756c1d5-fdc6-4b60-9a49-bdc7883319c0?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20250414%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20250414T071708Z&X-Amz-Expires=300&X-Amz-Signature=6bae072fff3d880426edde28225b90569e5d6a82ecbce931d70842c254336b0d&X-A

In [6]:
import time, re

# 1. vLLM 서버 백그라운드 실행
!python3 -m vllm.entrypoints.openai.api_server \
  --model {drive_dir}merged_16bit \
  --dtype float16 \
  --port 8000 \
  --gpu-memory-utilization 0.9 \
  --max-model-len 4096 > vllm.log 2>&1 &

# !python3 -m vllm.entrypoints.openai.api_server \
#   --model {drive_dir}lora_model \
#   --dtype float16 \
#   --port 8000 \
#   --gpu-memory-utilization 0.9 \
#   --max-model-len 4096 > vllm.log 2>&1 &

# 2. cloudflared 다운로드 및 복사
!wget -q https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64 -O cloudflared
!cp cloudflared cloudflared_exec
!chmod +x cloudflared_exec

# 3. cloudflared 백그라운드 실행
!./cloudflared_exec tunnel --url http://localhost:8000 --no-autoupdate > cloudflared.log 2>&1 &

# 4. Cloudflared URL 확인
time.sleep(15)
with open("cloudflared.log") as f:
    log = f.read()

match = re.search("https://.*trycloudflare.com", log)
public_url = match.group(0) if match else None

print("🔗 외부에서 접근 가능한 URL:")
print(public_url if public_url else "❌ URL 찾기 실패")
# !ps -ef | grep cloudflared
# !tail -n 50 cloudflared.log

cp: cannot create regular file 'cloudflared_exec': Text file busy
🔗 외부에서 접근 가능한 URL:
https://pensions-investigator-dose-warren.trycloudflare.com


|목적|	추천|
|:-|:-|
|고속 API 서빙 (Chat, Search 등)	|✅ vLLM|
|브라우저, 노트북, IoT Edge 추론	|✅ GGUF|
|프롬프트 길거나 병렬 추론 많음	|✅ vLLM 필수|
|단일 사용자, 간단한 테스트/디버깅	|✅ GGUF 충분|

In [None]:
#GGUF 서빙은 의미 없음.