# 라이브러리 설치 및 임포트, 시드 고정

In [None]:
cd ../../..

In [None]:
!pip install -r requirements.txt \
  --extra-index-url https://download.pytorch.org/whl/cu124

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu124
Collecting transformers==4.53.2 (from -r requirements.txt (line 2))
  Downloading transformers-4.53.2-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.9/40.9 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==4.0.0 (from -r requirements.txt (line 3))
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes==0.46.1 (from -r requirements.txt (line 6))
  Downloading bitsandbytes-0.46.1-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting fsspec==2025.3.0 (from -r requirements.txt (line 7))
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0+cu124->-r requirements.txt (line 1))
  Downloading https://download.pytorch.org/whl/cu124/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (24.6 MB)
[2K     [90m━━━

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, BitsAndBytesConfig
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer, TrainerCallback
from transformers import pipeline
import torch
from peft import LoraConfig, TaskType, get_peft_model
from sklearn.metrics import roc_auc_score
import datetime as dt
import random
import re
import os
from tqdm import tqdm
from torch.utils.data import DataLoader

In [None]:
def seed_everything(seed):
    """Seed every RNG used in this notebook and request deterministic cuDNN.

    Args:
        seed: Integer seed applied to Python's `random`, NumPy, and PyTorch
            (CPU and every CUDA device); also exported as PYTHONHASHSEED.
    """
    random.seed(seed)
    # Hash randomization is fixed at interpreter start-up, so this only
    # affects processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all visible GPUs, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes kernels per input shape and may select
    # different algorithms between runs, which defeats the determinism
    # requested on the line above — it must be off for reproducibility.
    torch.backends.cudnn.benchmark = False


SEED = 42
seed_everything(SEED)

# 데이터 불러오기

In [None]:
# Index of the fold held out for validation; every other fold is used for training.
val_fold_idx = 2
# Total number of fold CSVs (fold0.csv ... fold{N_FOLDS-1}.csv).
N_FOLDS = 4

# Guard against an invalid index: a negative value would still select a
# validation file via Python's negative indexing, yet never be excluded from
# the training list below — silently leaking validation data into training.
if not 0 <= val_fold_idx < N_FOLDS:
    raise ValueError(
        f"val_fold_idx must be in [0, {N_FOLDS - 1}], got {val_fold_idx}"
    )

fold_paths = [f"./data/kfold_csv/fold{i}.csv" for i in range(N_FOLDS)]

FOLD_VAL   = fold_paths[val_fold_idx]
FOLD_TRAIN = [path for idx, path in enumerate(fold_paths) if idx != val_fold_idx]

print("▶ Train folds:", FOLD_TRAIN)
print("▶ Validation fold:", FOLD_VAL)

# Inputs for the test-set inference section further down.
TEST_CSV        = "./data/kfold_csv/test_preprocessed.csv"
SUBMISSION_CSV  = "./data/kfold_csv/sample_submission.csv"

▶ Train folds: ['/content/fold0.csv', '/content/fold1.csv', '/content/fold3.csv']
▶ Validation fold: /content/fold2.csv


In [None]:
# Read each training fold and stack them into one frame with a fresh index.
fold_frames = []
for fold_csv in FOLD_TRAIN:
    fold_frames.append(pd.read_csv(fold_csv, encoding="utf-8-sig"))
train_df = pd.concat(fold_frames, ignore_index=True)

# The held-out fold is loaded on its own.
val_df = pd.read_csv(FOLD_VAL, encoding="utf-8-sig")

# Keep only the text and target columns, renamed to text / label.
train_df = train_df[['full_text', 'generated']]
train_df = train_df.rename(columns={'full_text':'text', 'generated':'label'})
val_df = val_df[['full_text', 'generated']]
val_df = val_df.rename(columns={'full_text':'text', 'generated':'label'})

# Shuffle the training rows reproducibly, then renumber them from zero.
train_df = train_df.sample(frac=1, random_state=SEED)
train_df = train_df.reset_index(drop=True)

# Report sizes and class balance of both splits.
train_label_counts = train_df['label'].value_counts().to_dict()
val_label_counts = val_df['label'].value_counts().to_dict()
print("최종 학습 샘플 수:", len(train_df))
print("최종 학습 클래스 분포:", train_label_counts)
print("검증 샘플 수:", len(val_df))
print("검증 클래스 분포:", val_label_counts)

최종 학습 샘플 수: 91143
최종 학습 클래스 분포: {0: 45572, 1: 45571}
검증 샘플 수: 30381
검증 클래스 분포: {1: 15191, 0: 15190}


### Huggingface dataset 변환

In [None]:
# Convert both pandas frames into Hugging Face Dataset objects in one pass.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, val_df)
)

### 토큰화

In [None]:
MODEL_NAME = "kakaocorp/kanana-1.5-8b-instruct-2505"
# Upper bound on tokens per example. Without an explicit max_length the
# tokenizer warns "no maximum length is provided ... Default to no truncation",
# i.e. truncation=True alone has no effect for this model.
# NOTE(review): 2048 is a reviewer-chosen cap — tune it to the length
# distribution of the corpus and the available GPU memory.
MAX_LENGTH = 2048
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(example):
    """Tokenize a batch of rows, truncating each text to MAX_LENGTH tokens.

    Args:
        example: Batch mapping whose "text" entry holds the raw strings
            (called by `Dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output for the batch; padding is left to the collator.
    """
    return tokenizer(example["text"], truncation=True, max_length=MAX_LENGTH)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# The raw strings are no longer needed once token ids exist.
train_dataset = train_dataset.remove_columns(["text"])
val_dataset = val_dataset.remove_columns(["text"])

# Expose the target under the column name "labels", matching
# label_names=["labels"] in the training arguments.
train_dataset = train_dataset.rename_column("label", "labels")
val_dataset   = val_dataset.rename_column("label", "labels")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

Map:   0%|          | 0/91143 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Map:   0%|          | 0/30381 [00:00<?, ? examples/s]

### Data Collator

In [None]:
# Collator that pads the examples of each batch to a common length
# (padding=True), so no padding is stored at tokenization time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

In [None]:
# Use CUDA when a GPU is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# 4-bit NF4 quantization with double quantization; compute runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the checkpoint with a two-class sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
)
model.to(device)

config.json:   0%|          | 0.00/717 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at kakaocorp/kanana-1.5-8b-instruct-2505 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128259, 4096, padding_idx=128001)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
 

In [None]:
# LoRA hyper-parameters: adapter rank, scaling factor and adapter dropout.
R = 32
LORA_ALPHA = 16
LORA_DROPOUT = 0.1

# Attach adapters to the attention projections and the MLP projections.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    target_modules=attention_projections + mlp_projections,
    r=R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
)

# Wrap the base model with the adapters described by lora_config.
model = get_peft_model(model, lora_config)

In [None]:
# Print how many parameters are trainable versus the total parameter count.
model.print_trainable_parameters()

trainable params: 83,894,272 || all params: 7,588,839,424 || trainable%: 1.1055


In [None]:
def compute_metrics(eval_pred):
    """Compute ROC AUC for the positive class (index 1) from two-class logits.

    Args:
        eval_pred: Pair `(logits, labels)`. `logits` is array-like of shape
            (n_samples, 2), or a tuple whose first element is that array.

    Returns:
        dict with a single key "roc_auc".
    """
    logits, labels = eval_pred
    # Tolerate a tuple of outputs; the logits are taken to be its first entry.
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits, dtype=np.float64)
    # Rank by the margin logit_1 - logit_0, which is a monotonic function of
    # softmax P(class 1). The raw class-1 logit alone is not: a sample with a
    # large logit_1 but an even larger logit_0 would be ranked too high.
    scores = logits[:, 1] - logits[:, 0]
    roc_auc = roc_auc_score(labels, scores)
    return {"roc_auc": roc_auc}

In [None]:
# Training configuration for the one-epoch LoRA fine-tuning run.
training_args = TrainingArguments(
    output_dir="./train&inference/kanana/fold2/kanana_model2_checkpoint",
    overwrite_output_dir=True,
    learning_rate=2e-5,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    num_train_epochs=1,
    # Evaluate once at the end of each epoch. With eval_strategy="steps" and
    # no eval_steps, the evaluation interval falls back to logging_steps
    # (9999999999 below), so evaluation never ran and roc_auc was never
    # reported.
    eval_strategy="epoch",
    metric_for_best_model="roc_auc",
    greater_is_better=True,
    logging_strategy="steps",
    # Effectively disables periodic loss logging; only the first step is
    # logged because of logging_first_step=True.
    logging_steps=9999999999,
    logging_first_step=True,
    save_total_limit=2,
    seed=SEED,
    dataloader_drop_last=False,
    report_to="none",
    label_names=["labels"]
)

In [None]:
# Assemble the Trainer from the model, arguments, datasets, collator and metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    # `tokenizer=` is deprecated on Trainer (it emits a FutureWarning);
    # `processing_class=` is the replacement and receives the same object.
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [None]:
# Run fine-tuning; the returned TrainOutput (global step, training loss,
# runtime metrics) is displayed as the cell result.
trainer.train()

Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


TrainOutput(global_step=15191, training_loss=0.5479988408021197, metrics={'train_runtime': 8137.7023, 'train_samples_per_second': 11.2, 'train_steps_per_second': 1.867, 'total_flos': 8.349646379887411e+17, 'train_loss': 0.5479988408021197, 'epoch': 1.0})

In [None]:
# Write the trained model and its tokenizer into the same directory.
output_dir = "./train&inference/kanana/fold2/kanana_model2"
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained(output_dir)
print("모델이 저장되었습니다:", output_dir)

모델이 저장되었습니다: output


# TEST 데이터셋 추론

In [None]:
# Test paragraphs to score, and the submission template that will receive
# the predictions.
test_df = pd.read_csv(TEST_CSV, encoding='utf-8-sig')
submission_df = pd.read_csv(SUBMISSION_CSV, encoding='utf-8-sig')

n_test_rows = len(test_df)
print("테스트 샘플 수:", n_test_rows)

# Collects one predicted probability per test row.
pred_probs = list()

테스트 샘플 수: 1962


In [None]:
# Switch the model to evaluation mode before inference so that dropout
# layers (e.g. the LoRA dropout modules) are disabled.
trainer.model.eval()

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(128259, 4096, padding_idx=128001)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDic

In [None]:
# Wrap the fine-tuned model and tokenizer in a text-classification pipeline
# for test-set inference.
# return_all_scores=True makes a call return the score of every label (one
# {'label', 'score'} dict per class, as in the sample output below); the
# prediction loop further down relies on that nesting to pick LABEL_1.
# NOTE(review): return_all_scores appears to be deprecated in favour of top_k
# in recent transformers releases — confirm before upgrading, since switching
# changes the output nesting that downstream code indexes into.
clf = pipeline(
    "text-classification",
    model=trainer.model,
    tokenizer=tokenizer,
    return_all_scores=True,
)

Device set to use cuda:0


In [None]:
# Sanity check: classify the first test paragraph and print the raw pipeline output.
print("샘플 결과 예시:", clf(test_df['paragraph_text'][0]))

샘플 결과 예시: [[{'label': 'LABEL_0', 'score': 0.7690802216529846}, {'label': 'LABEL_1', 'score': 0.23091977834701538}]]


In [None]:
# Labels under which the "AI-generated" class may be reported by the pipeline.
AI_LABELS = ('LABEL_1', '1', 'generated')

# Start from an empty list every time this cell runs: appending to a list
# created in another cell would duplicate predictions on a re-run and break
# the later assignment into submission_df.
pred_probs = []

# Hand the whole list to the pipeline instead of calling it once per text;
# this avoids the "using the pipelines sequentially on GPU" warning. The
# batch size is left at the pipeline default.
test_texts = test_df['paragraph_text'].tolist()
for scores in clf(test_texts):
    # `scores` holds one {'label', 'score'} dict per class for a single text.
    prob_ai = None
    for s in scores:
        if s['label'] in AI_LABELS:
            prob_ai = s['score']
            break
    if prob_ai is None:
        # Unknown label names: fall back to the second entry (class index 1).
        prob_ai = scores[1]['score']
    pred_probs.append(prob_ai)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [None]:
# Store the predicted probabilities in the submission frame's 'generated' column.
# Assumes pred_probs follows the same row order as submission_df — TODO confirm
# that the test CSV and the sample submission list the IDs in the same order.
submission_df['generated'] = pred_probs

In [None]:
# Display the submission frame for a quick visual check of the predictions.
submission_df

Unnamed: 0,ID,generated
0,TEST_0000,0.230920
1,TEST_0001,0.969785
2,TEST_0002,0.198064
3,TEST_0003,0.973241
4,TEST_0004,0.752013
...,...,...
1957,TEST_1957,0.993950
1958,TEST_1958,0.997088
1959,TEST_1959,0.304042
1960,TEST_1960,0.377541


In [None]:
# Write the test-set predictions for this fold (input to the ensembling step).
test_pred_path = "./ensemble/data/test_ensemble_folding/test_kanana_fold2.csv"
# Create the target directory first so the write cannot fail on a missing
# folder after the long training/inference run.
os.makedirs(os.path.dirname(test_pred_path), exist_ok=True)
submission_df.to_csv(test_pred_path, index=False, encoding="utf-8-sig")

# Validation 데이터셋 추론

In [None]:
def tokenize_test(batch):
    """Tokenize a batch for validation inference.

    Delegates to `tokenize_function` (the function used for the training
    data) instead of repeating the tokenizer call, so inference inputs are
    always prepared with exactly the same settings as the training inputs.

    Args:
        batch: Batch mapping whose "text" entry holds the raw strings.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenize_function(batch)

val_ds = Dataset.from_pandas(val_df)

# Drop the raw text and the label: only model inputs are needed for inference.
val_ds = val_ds.map(tokenize_test, batched=True,
                      remove_columns=["text", "label"])

Map:   0%|          | 0/30381 [00:00<?, ? examples/s]

In [None]:
def collate(features):
    """Collate a list of tokenized examples into one batch.

    Thin wrapper around the notebook-level `data_collator`; the features are
    passed through unchanged and the collator's result is returned as is.
    """
    return data_collator(features)

In [None]:
BATCH_TEST = 6

# Keep the original row order (shuffle=False) so predictions line up with val_df.
loader = DataLoader(
    val_ds,
    batch_size=BATCH_TEST,
    shuffle=False,
    collate_fn=collate,
    pin_memory=True,
)

probs_list = []

# Set eval mode here rather than relying on an earlier cell having done so:
# training leaves the model in train mode, and the LoRA dropout layers would
# otherwise add noise to the predictions.
trainer.model.eval()

with torch.no_grad():
    for batch in tqdm(loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        logits = trainer.model(**batch).logits
        # Cast to float32 before the softmax: the model is loaded in
        # bfloat16, so the logits may be low precision.
        probs  = torch.softmax(logits.float(), dim=-1)[:, 1]
        probs_list.append(probs.cpu())

probs = torch.cat(probs_list).to(torch.float32).numpy()
print(f"[✓] Inference done – {len(probs)} samples")

In [None]:
# Attach the predicted probabilities and the original row ids to the
# validation frame, then write the columns needed for ensembling.
val_df['generated'] = probs
# Only the id column is needed from the fold file, so read just that column.
val_df['ID'] = pd.read_csv(FOLD_VAL, encoding="utf-8-sig", usecols=['id'])['id']
val_df = val_df[['ID', 'generated', 'label']]

val_pred_path = "./ensemble/data/val_ensemble_folding/val_kanana_fold2.csv"
# Create the target directory first so the write cannot fail on a missing folder.
os.makedirs(os.path.dirname(val_pred_path), exist_ok=True)
val_df.to_csv(val_pred_path, index=False, encoding="utf-8-sig")