# SA_SLM 학습 노트북

## 필요 파일
- `data.zip`: 로컬 `data/` 폴더를 통째로 ZIP으로 압축한 파일 (압축을 풀었을 때 최상위에 `data/` 폴더가 생겨야 합니다)

## 실행 전
런타임 > 런타임 유형 변경 > **GPU (T4)** 선택

In [None]:
# 1. Environment setup
# Quietly (-q) install the packages used by the cells below.
!pip install -q torch transformers datasets accelerate peft bitsandbytes trl
# Then upgrade bitsandbytes (-U) on top of whatever version the line above installed.
!pip install -U bitsandbytes

In [None]:
# 2. 데이터 업로드
from google.colab import files
import zipfile, os, json, glob

print('data.zip 파일을 업로드하세요')
# Opens the Colab upload widget; the loop below treats each entry of the
# result as the name of a file saved in the working directory.
uploaded = files.upload()

for f in uploaded:
    if f.endswith('.zip'):
        # The context manager closes the archive handle as soon as extraction
        # finishes instead of leaving it open until garbage collection.
        with zipfile.ZipFile(f) as archive:
            archive.extractall('.')
        print(f'{f} 압축 해제 완료')

In [None]:
# 3. Load data
def load_json(path):
    """Parse the JSON file at *path* and return the decoded object.

    The file is opened explicitly as UTF-8 (the data contains Korean text, so
    the platform default encoding must not be relied on) and is closed as soon
    as parsing finishes.
    """
    with open(path, encoding='utf-8') as fh:
        return json.load(fh)


# Student records: bundled examples first, then the raw student files.
# Each glob result is sorted because glob order is unspecified; sorting keeps
# the dataset order reproducible between runs.
record_paths = (sorted(glob.glob('./data/examples/student_record*.json'))
                + sorted(glob.glob('./data/raw/students/*.json')))
records = [load_json(path) for path in record_paths]

# Optional task files, keyed by task name; a missing file is simply skipped.
tasks = {}
for k, v in [('roadmap', 'test_task1_roadmap.json'),
             ('statement', 'test_task2_statement.json'),
             ('evaluation', 'test_task3_evaluation.json')]:
    p = './data/examples/' + v
    if os.path.exists(p):
        tasks[k] = load_json(p)

print(f'{len(records)} records, {sum(len(v) for v in tasks.values())} tasks')

In [None]:
# 4. Convert records to training examples (Schema v2)
def record_to_examples(r):
    """Turn one Schema-v2 student record into a list of training examples.

    Produces one activity-recommendation example for the record, followed by
    one NEIS-statement-writing example for every activity that has a
    non-empty 'neis_statement'.

    Args:
        r: Decoded record dict. Missing or null sections are treated as empty.

    Returns:
        List of dicts with keys 'i' (instruction), 'in' (input), 'out' (target).
    """
    # `or {}` / `or []` also covers keys that are present but null in the JSON,
    # which a plain .get(key, default) would pass through as None.
    school = r.get('school_info') or {}
    grades = r.get('grades') or {}
    major = r.get('target_major') or {}
    acts_obj = r.get('activities') or {}
    notes = r.get('consultant_notes') or {}

    # Curricular and extracurricular activities are handled uniformly.
    all_acts = (acts_obj.get('curricular') or []) + (acts_obj.get('extracurricular') or [])

    # Profile string: space-separated "label:value" fields.
    pstr = ' '.join([
        f"지역:{school.get('region','')}",
        f"학교:{school.get('school_type','')}",
        f"계열:{major.get('track','')}",
        f"성적:{grades.get('overall_tier','')}",
        f"관심:{','.join(major.get('interests') or [])}",
        f"가치관:{','.join(major.get('values') or [])}",
        f"목표:{major.get('specific','')}",
    ])

    # Only distinctive activities (uniqueness score of 4 or more) are listed
    # in the recommendation target; a missing/null score counts as 0.
    astr = ''.join(
        f"\n-[{a.get('category','')}]{a.get('description','')}"
        for a in all_acts
        if (a.get('uniqueness_score') or 0) >= 4
    )

    built = [{
        'i': '활동추천',
        'in': pstr,
        'out': f"[추천]{astr}\n[서사]{notes.get('narrative_summary','')}",
    }]

    # One statement-writing example per activity that carries a NEIS statement.
    built.extend(
        {
            'i': '세특작성',
            'in': f"과목:{a.get('subject','')} 활동:{a.get('description','')}",
            'out': a.get('neis_statement', ''),
        }
        for a in all_acts
        if a.get('neis_statement')
    )
    return built


examples = []
for r in records:
    examples.extend(record_to_examples(r))

# Legacy task files (backward compatibility).
# NOTE(review): tasks['evaluation'] is loaded earlier but is not converted
# into examples here - confirm whether that is intentional.
for item in tasks.get('roadmap', []):
    inp = item.get('input', {})
    examples.append({
        'i': item.get('instruction', ''),
        'in': f"계열:{inp.get('track','')}",
        'out': item.get('output', ''),
    })

for item in tasks.get('statement', []):
    inp = item.get('input', {})
    examples.append({
        'i': item.get('instruction', ''),
        'in': f"과목:{inp.get('subject','')} 활동:{inp.get('raw_activity','')}",
        'out': item.get('output', ''),
    })

print(f'{len(examples)} examples')

In [None]:
# 5. Dataset 생성
from datasets import Dataset

def fmt(e):
    """Render one example as a single ChatML-style training string.

    Args:
        e: Mapping with keys 'i' (instruction), 'in' (input) and 'out' (target).

    Returns:
        A dict with one key, 'text', holding the system, user and assistant
        turns joined by newlines.
    """
    system_turn = "<|im_start|>system\n생기부전문가<|im_end|>"
    # The user turn carries the instruction on its first line, the input below it.
    user_turn = f"<|im_start|>user\n{e['i']}\n{e['in']}<|im_end|>"
    assistant_turn = f"<|im_start|>assistant\n{e['out']}<|im_end|>"
    return {'text': '\n'.join((system_turn, user_turn, assistant_turn))}

# Format every example, then wrap the resulting rows (one 'text' field each)
# in a Hugging Face Dataset.
formatted_rows = list(map(fmt, examples))
dataset = Dataset.from_list(formatted_rows)
print(f'{len(dataset)} samples')

In [None]:
# 6. 모델 로드 + LoRA 설정
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

MODEL = 'Qwen/Qwen2.5-3B-Instruct'

# bfloat16 needs an Ampere-or-newer GPU (compute capability >= 8.0). The
# notebook instructions select a T4, which is older, so fall back to float16
# there instead of hard-coding bfloat16.
bf16_ok = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
compute_dtype = torch.bfloat16 if bf16_ok else torch.float16

# 4-bit NF4 quantization for the frozen base weights (QLoRA-style setup).
bnb = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=compute_dtype
)

tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
# Only borrow EOS as the pad token when the tokenizer has no pad token of its
# own. DataCollatorForLanguageModeling sets the label of every pad-id position
# to -100, so an unconditional pad = EOS would exclude every end-of-turn token
# from the loss and the model would never learn to stop.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(
    MODEL, quantization_config=bnb, device_map='auto', trust_remote_code=True
)
# Prepares the quantized model for training before the adapters are attached.
model = prepare_model_for_kbit_training(model)

# LoRA adapters on the listed attention and MLP projection modules.
lora = LoraConfig(
    r=16, lora_alpha=32,
    target_modules=['q_proj','k_proj','v_proj','o_proj','gate_proj','up_proj','down_proj'],
    lora_dropout=0.05, bias='none', task_type='CAUSAL_LM'
)
model = get_peft_model(model, lora)
model.print_trainable_parameters()

In [None]:
# 7. 학습
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

import torch  # already imported by the model cell; repeated so this cell stands alone

# Pick the mixed-precision mode the GPU actually supports: bf16 on
# Ampere-or-newer cards (compute capability >= 8.0), fp16 on older CUDA cards
# such as the T4 named in the notebook instructions, neither without CUDA.
has_cuda = torch.cuda.is_available()
use_bf16 = has_cuda and torch.cuda.get_device_capability()[0] >= 8
use_fp16 = has_cuda and not use_bf16

args = TrainingArguments(
    output_dir='./out',
    num_train_epochs=10,  # raised from 3
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,  # effective batch size of 4 per device
    learning_rate=1e-4,  # lowered from 2e-4
    warmup_ratio=0.05,  # newly added; was misspelled `warmup_ration`, which raises TypeError
    bf16=use_bf16,
    fp16=use_fp16,
    logging_steps=10,
    save_strategy='epoch',
    optim='paged_adamw_8bit',
    report_to='none'
)

def tokenize(example):
    """Tokenize the 'text' field of one dataset row.

    Sequences longer than 2048 tokens are truncated. No padding is applied
    here: the data collator pads each batch to its own longest sequence, so
    padding every row to the full 2048 tokens would only add compute and
    memory on positions that are masked out anyway.

    Args:
        example: Dataset row containing a 'text' string.

    Returns:
        The output of the module-level `tokenizer` for that text.
    """
    return tokenizer(example['text'], truncation=True, max_length=2048)

# Tokenize every row and drop the raw 'text' column so only the tokenizer
# outputs remain in the training set.
tokenized_dataset = dataset.map(tokenize, remove_columns=['text'])
# mlm=False selects causal-LM collation rather than masked-LM masking.
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Wire the model, hyperparameters, data and collator together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset,
    data_collator=collator
)

print(f'Training {len(tokenized_dataset)} samples...')
trainer.train()

In [None]:
# 8. 저장 & 다운로드
import shutil

# Write the model and the tokenizer into the same folder, model first.
adapter_dir = './adapter'
for artifact in (model, tokenizer):
    artifact.save_pretrained(adapter_dir)

# Zip that folder into sa_slm_adapter.zip in the working directory.
shutil.make_archive('sa_slm_adapter', 'zip', '.', adapter_dir)

print('다운로드 시작...')
# Hand the archive to the Colab download helper.
files.download('sa_slm_adapter.zip')