# 라이브러리 설치 및 임포트, 시드 고정

In [None]:
cd ../../..

c:\2025digital\2025-digital-aigt-detection\train&inference\gemma


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [6]:
!pip install -r ./requirements.txt \
  --extra-index-url https://download.pytorch.org/whl/cu124

Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu124


In [None]:
from module import gemma3_seqcls_infonce  # 반드시 최상단에서 임포트!

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, BitsAndBytesConfig
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer, TrainerCallback
from transformers import pipeline
import torch
from peft import LoraConfig, TaskType, get_peft_model
from sklearn.metrics import roc_auc_score
import datetime as dt
import random
import re
import os
from tqdm import tqdm
from torch.utils.data import DataLoader

In [8]:
from huggingface_hub import login

login(token="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")  # Hugging Face에서 발급받은 토큰 입력

In [9]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

SEED = 42
seed_everything(SEED) # Seed 고정

# 데이터 불러오기

In [None]:
# 전체 fold 파일 경로 리스트 (0~4)
val_fold_idx = 0  # 예: 3을 넣으면 fold3.csv가 validation으로, 나머지(0,1,2,4)가 train으로 사용


fold_paths = [f"./data/kfold_csv/fold{i}.csv" for i in range(4)]

FOLD_VAL   = fold_paths[val_fold_idx]
FOLD_TRAIN = [path for idx, path in enumerate(fold_paths) if idx != val_fold_idx]

print("▶ Train folds:", FOLD_TRAIN)
print("▶ Validation fold:", FOLD_VAL)

TEST_CSV        = "./data/kfold_csv/test_preprocessed.csv"
SUBMISSION_CSV  = "./data/kfold_csv/sample_submission.csv"

▶ Train folds: ['./fold1.csv', './fold2.csv', './fold3.csv']
▶ Validation fold: ./fold0.csv


In [None]:
# ==============================================================
# 1) fold1~3 → 학습, fold0 → 검증
# ==============================================================

# 학습용 데이터프레임
train_df = pd.concat(
    [pd.read_csv(p, encoding="utf-8-sig") for p in FOLD_TRAIN],
    ignore_index=True
)

# 검증용 데이터프레임
val_df   = pd.read_csv(FOLD_VAL, encoding="utf-8-sig")

# ── 필요 없는 열 제거 & 컬럼명 통일 ──
train_df = train_df[['full_text', 'generated']].rename(
    columns={'full_text':'text', 'generated':'label'}
)
val_df   = val_df  [['full_text', 'generated']].rename(
    columns={'full_text':'text', 'generated':'label'}
)

# ── 학습 세트 셔플 ──
train_df = train_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

print("최종 학습 샘플 수:", len(train_df))
print("최종 학습 클래스 분포:", train_df['label'].value_counts().to_dict())
print("검증 샘플 수:", len(val_df))
print("검증 클래스 분포:", val_df['label'].value_counts().to_dict())

최종 학습 샘플 수: 91143
최종 학습 클래스 분포: {1: 45572, 0: 45571}
검증 샘플 수: 30381
검증 클래스 분포: {0: 15191, 1: 15190}


In [14]:
# ==============================================================
# 2) Hugging Face Dataset 변환
# ==============================================================
train_dataset = Dataset.from_pandas(train_df)
val_dataset   = Dataset.from_pandas(val_df)

In [15]:
# ==============================================================
# 3) 토큰화
# ==============================================================

MODEL_NAME = "google/gemma-3-12b-it"  # 사전학습 모델 이름 (Hugging Face 모델 허브)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# 학습/검증 데이터를 토큰화
def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

# 토크나이저가 반환한 컬럼과 원본 텍스트 컬럼 정리 (모델 입력에 필요 없는 컬럼 제거)
train_dataset = train_dataset.remove_columns(["text"])
val_dataset = val_dataset.remove_columns(["text"])

# 라벨 컬럼명 변경
train_dataset = train_dataset.rename_column("label", "labels")
val_dataset   = val_dataset.rename_column("label", "labels")

Map:   0%|          | 0/91143 [00:00<?, ? examples/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Map: 100%|██████████| 91143/91143 [00:09<00:00, 10078.22 examples/s]
Map: 100%|██████████| 30381/30381 [00:03<00:00, 9089.57 examples/s] 


In [16]:
# ==============================================================
# 4) Data Collator
# ==============================================================
data_collator = DataCollatorWithPadding(tokenizer, padding=True)

In [17]:
# 장치 설정 (GPU 사용 가능 여부)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,  # A100에서는 bfloat16 사용 권장
    bnb_4bit_quant_type="nf4",             # NF4 양자화 방식
    bnb_4bit_use_double_quant=True         # 메모리 효율 추가 향상 옵션
)

# 사전훈련 모델 로드 (시퀀스 분류용 헤드 포함) 및 GPU 이동
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2, quantization_config=bnb_config, torch_dtype=torch.bfloat16)
model.to(device)

Fetching 5 files: 100%|██████████| 5/5 [00:50<00:00, 10.05s/it]
Loading checkpoint shards: 100%|██████████| 5/5 [00:17<00:00,  3.44s/it]
Some weights of Gemma3ForSequenceClassification were not initialized from the model checkpoint at google/gemma-3-12b-it and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Gemma3ForSequenceClassification(
  (vision_tower): SiglipVisionModel(
    (vision_model): SiglipVisionTransformer(
      (embeddings): SiglipVisionEmbeddings(
        (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
        (position_embedding): Embedding(4096, 1152)
      )
      (encoder): SiglipEncoder(
        (layers): ModuleList(
          (0-26): 27 x SiglipEncoderLayer(
            (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
            (self_attn): SiglipAttention(
              (k_proj): Linear4bit(in_features=1152, out_features=1152, bias=True)
              (v_proj): Linear4bit(in_features=1152, out_features=1152, bias=True)
              (q_proj): Linear4bit(in_features=1152, out_features=1152, bias=True)
              (out_proj): Linear4bit(in_features=1152, out_features=1152, bias=True)
            )
            (layer_norm2): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
            (mlp): Siglip

In [18]:
# LoRA 설정 구성
R = 32
LORA_ALPHA = 16
LORA_DROPOUT = 0.1
lora_config = LoraConfig(
    r=R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    task_type=TaskType.SEQ_CLS,
    target_modules= ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
)

# 원본 모델에 LoRA 어댑터 추가
model = get_peft_model(model, lora_config)


In [19]:
model.print_trainable_parameters()

trainable params: 136,920,576 || all params: 12,324,253,296 || trainable%: 1.1110


In [20]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = logits[:, 1]  # 클래스 1의 확률 추정값
    roc_auc = roc_auc_score(labels, probs)
    return {"roc_auc": roc_auc}

In [None]:
class ScheduledCLTrainer(Trainer):
    """
    1 에폭 동안
    - 처음 delay_ratio 비율만큼은 lambda_cl=0
    - 이후 에폭 종료까지 선형적으로 max_lambda 까지 올림
    """
    def __init__(self, *args, delay_ratio: float = 0.3, max_lambda: float = 0.05, **kwargs):
        super().__init__(*args, **kwargs)
        self.delay_ratio = delay_ratio
        self.max_lambda  = max_lambda

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs["labels"]
        step   = self.state.global_step        # 현재 스텝 (0부터 시작)
        total  = self.state.max_steps          # 1 에폭 전체 스텝 수

        # ── delay 구간 스텝 계산 ─────────────────────
        delay_steps = int(total * self.delay_ratio)

        # ── lambda_cl 계산 ───────────────────────────
        if step < delay_steps:
            lambda_cl = 0.0
        else:
            # 남은 구간을 0→1 로 노말라이즈
            rem_steps = total - delay_steps
            rel_step  = step - delay_steps
            progress  = min(rel_step / rem_steps, 1.0)
            lambda_cl = progress * self.max_lambda

        # ── forward 호출 ─────────────────────────────
        outputs = model(
            **inputs,
            contrastive_labels=labels,
            lambda_cl=lambda_cl,
        )
        loss = outputs.loss
        return (loss, outputs) if return_outputs else loss

In [None]:
# 훈련 파라미터 설정
training_args = TrainingArguments(
    output_dir="./train&inference/gemma/fold0/gemma_model0_checkpoint",
    overwrite_output_dir=True,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    save_strategy="epoch",
    metric_for_best_model="roc_auc",
    greater_is_better=True,
    logging_strategy="steps",
    logging_steps=1000,
    logging_first_step=True,
    save_total_limit=2,
    seed=SEED,
    dataloader_drop_last=False,
    report_to="none",
    label_names=["labels"]
)

In [23]:
# Trainer 객체 생성
trainer = ScheduledCLTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    delay_ratio=0.3,    # 에폭의 30% 동안 CL 꺼둠
    max_lambda=0.05,    # 이후 선형 상승하여 최종 0.05
)

  super().__init__(*args, **kwargs)


In [24]:
# 모델 훈련 시작
trainer.train()

Step,Training Loss
1,2.375
1000,0.6685
2000,0.553
3000,0.5335
4000,0.5297
5000,0.5343
6000,0.5483


IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [None]:
# fine-tuned 모델을 로컬에 저장
output_dir = "./train&inference/gemma/fold0/gemma_model0"
trainer.model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print("모델이 저장되었습니다:", output_dir)


모델이 저장되었습니다: ./gemma_model0


# TEST 데이터셋 추론

In [27]:
# 테스트 데이터 불러오기
test_df = pd.read_csv(TEST_CSV, encoding='utf-8-sig')
submission_df = pd.read_csv(SUBMISSION_CSV, encoding='utf-8-sig')

print("테스트 샘플 수:", len(test_df))
# 각 테스트 샘플에 대해 추론
pred_probs = []

테스트 샘플 수: 1962


In [28]:
trainer.model.eval()

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): Gemma3ForSequenceClassification(
      (vision_tower): SiglipVisionModel(
        (vision_model): SiglipVisionTransformer(
          (embeddings): SiglipVisionEmbeddings(
            (patch_embedding): Conv2d(3, 1152, kernel_size=(14, 14), stride=(14, 14), padding=valid)
            (position_embedding): Embedding(4096, 1152)
          )
          (encoder): SiglipEncoder(
            (layers): ModuleList(
              (0-26): 27 x SiglipEncoderLayer(
                (layer_norm1): LayerNorm((1152,), eps=1e-06, elementwise_affine=True)
                (self_attn): SiglipAttention(
                  (k_proj): lora.Linear4bit(
                    (base_layer): Linear4bit(in_features=1152, out_features=1152, bias=True)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      

In [None]:
# 추론 파이프라인 구성 (GPU 사용, 모든 클래스 점수 출력)
clf = pipeline(
    "text-classification",
    model=trainer.model,
    tokenizer=tokenizer,
    return_all_scores=True,
)

Device set to use cuda:0


In [None]:
print("샘플 결과 예시:", clf(test_df['paragraph_text'][0]))

샘플 결과 예시: [[{'label': 'LABEL_0', 'score': 0.6584175229072571}, {'label': 'LABEL_1', 'score': 0.3415825068950653}]]


In [None]:
for text in test_df['paragraph_text']:
    scores = clf(text)[0]
    prob_ai = None
    for s in scores:
        if s['label'] in ['LABEL_1', '1', 'generated']:
            prob_ai = s['score']
            break
    if prob_ai is None:
        prob_ai = scores[1]['score']
    pred_probs.append(prob_ai)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [31]:
# 결과를 제출 데이터프레임에 기록
submission_df['generated'] = pred_probs

In [33]:
submission_df

Unnamed: 0,ID,generated
0,TEST_0000,0.341583
1,TEST_0001,0.414899
2,TEST_0002,0.222700
3,TEST_0003,0.819893
4,TEST_0004,0.893309
...,...,...
1957,TEST_1957,0.991084
1958,TEST_1958,0.992000
1959,TEST_1959,0.317426
1960,TEST_1960,0.250913


In [None]:
submission_df.to_csv("./ensemble/data/test_ensemble_folding/test_gemma_fold0.csv", index=False, encoding="utf-8-sig")

# VAL 데이터셋 배치 추론

In [36]:
def tokenize_test(batch):
    return tokenizer(batch["text"], truncation=True)

val_ds = Dataset.from_pandas(val_df)

val_ds = val_ds.map(tokenize_test, batched=True,
                      remove_columns=["text", "label"])

Map: 100%|██████████| 30381/30381 [00:02<00:00, 11528.74 examples/s]


In [37]:
def collate(features):
    """
    • 동적 padding → tensor 변환
    • tokenizer가 추가한 'length' 류 메타키 제거
    """
    batch = data_collator(features)
    return batch

In [38]:
BATCH_TEST = 8
loader = DataLoader(
    val_ds,
    batch_size=BATCH_TEST,
    shuffle=False,
    collate_fn=collate,
    pin_memory=True,
)

probs_list = []

with torch.no_grad():
    for batch in tqdm(loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        logits = trainer.model(**batch).logits
        probs  = torch.softmax(logits, dim=-1)[:, 1]
        probs_list.append(probs.cpu())

probs = torch.cat(probs_list).to(torch.float32).numpy()
print(f"[✓] Inference done – {len(probs)} samples")

 79%|███████▉  | 2999/3798 [14:59<04:12,  3.16it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [None]:
val_df['generated'] = probs
val_df['ID'] = pd.read_csv(FOLD_VAL, encoding="utf-8-sig")['id']
val_df = val_df[['ID', 'generated', 'label']]
val_df.to_csv("./ensemble/data/val_ensemble_folding/val_gemma_fold0.csv", index=False, encoding="utf-8-sig")