# 모델 구축

1. AI-Hub 데이터로 감정분석 모델을 먼저 학습
2. 크롤링 등의 방법으로 수집한 관광 명소 리뷰로 **도메인 전이 학습**
3. 관광 리뷰와 관련된 키워드를 강화해 최종 모델 구축

# AI-Hub 데이터로 감정분석 모델 학습

In [None]:
# Hyperparameters shared by the cells below.

# -- data / batching --
max_len = 512        # tokenizer max_length handed to ABSADataset
batch_size = 64      # DataLoader batch size

# -- optimization --
num_epochs = 3       # number of passes over train_loader
learning_rate = 5e-5
warmup_ratio = 0.1   # NOTE(review): presumably the warm-up fraction of total steps; confirm where it is applied
max_grad_norm = 1    # NOTE(review): presumably a gradient-clipping threshold; confirm where it is applied

# -- logging --
log_interval = 200   # NOTE(review): presumably a step interval for logging; confirm where it is applied

# 1 데이터 처리

## 1.1 데이터 로드

In [None]:
from pathlib import Path
import os
import json
import glob
# Make CUDA kernel launches synchronous so device-side errors are reported
# at the call that caused them (a debugging aid; it slows GPU execution).
os.environ.update(CUDA_LAUNCH_BLOCKING='1')

# The project root is the parent of the current working directory.
proj_root = Path.cwd().resolve().parent
data_path = str(proj_root / 'data' / 'json')

# Collect every JSON file that sits directly inside a TL_SNS_* folder.
# The pattern itself has no '**' segment, so recursive=True does not widen it.
json_pattern = f"{data_path}/TL_SNS_*/*.json"
json_files = glob.glob(json_pattern, recursive=True)

## 1.2. 데이터셋 클래스 생성

> `relevant_aspects`에 임의로 학습시킬 키워드 지정

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

# DataSet Class

# Aspect names that are kept as training examples; all others are skipped.
relevant_aspects = ['가격', '서비스', '품질']


class ABSADataset(Dataset):
    """Aspect-based sentiment dataset built from review JSON files.

    Every (review, aspect) pair whose aspect name appears in the module-level
    ``relevant_aspects`` becomes one example. The example text is the raw
    review followed by `` [SEP] `` and the aspect name; the label is the
    aspect's ``SentimentPolarity`` shifted by +1.

    Args:
        json_files: Iterable of paths to UTF-8 JSON files. Each file must
            decode to an iterable of review dicts carrying ``"RawText"`` and
            ``"Aspects"``; each aspect dict carries ``"Aspect"`` and
            ``"SentimentPolarity"``.
        tokenizer: Object exposing a Hugging Face style ``encode_plus``.
        max_len: Sequence length every example is padded/truncated to.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, json_files, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.sentences = []  # "<review> [SEP] <aspect>" strings
        self.labels = []     # 0: Negative, 1: Neutral, 2: Positive

        for path in json_files:
            with open(path, 'r', encoding='utf-8') as handle:
                reviews = json.load(handle)

            for review in reviews:
                raw_text = review["RawText"]
                for aspect in review["Aspects"]:
                    term = aspect["Aspect"]
                    if term not in relevant_aspects:
                        continue
                    # Shift polarity into a non-negative class index.
                    # Assumes polarity is one of -1/0/1 -- TODO confirm
                    # against the dataset schema.
                    class_id = int(aspect["SentimentPolarity"]) + 1
                    self.sentences.append(f"{raw_text} [SEP] {term}")
                    self.labels.append(class_id)

    def __len__(self):
        """Return the number of (review, aspect) examples."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            scalar ``labels`` tensor of dtype ``torch.long``.
        """
        # NOTE(review): truncation=True on a single string trims from the
        # end, so a review longer than max_len could lose the trailing aspect
        # term -- confirm whether such long reviews occur.
        encoded = self.tokenizer.encode_plus(
            self.sentences[idx],
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        # The tokenizer returns a batch of one; flatten drops that batch axis.
        item = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

## 1.3 데이터 전처리(Tokenizer)
tokenizer는 Hugging Face Transformers와 호환되는 `skt/kobert-base-v1` 토크나이저를 사용함.

In [None]:
from kobert_tokenizer import KoBERTTokenizer

# SentencePiece options forwarded to the KoBERT tokenizer.
# NOTE(review): enable_sampling=True requests sampled subword segmentation,
# so the same text may tokenize differently between calls -- confirm this is
# also wanted when the tokenizer is reused for evaluation.
sp_options = {'nbest_size': -1, 'alpha': 0.6, 'enable_sampling': True}
tokenizer = KoBERTTokenizer.from_pretrained(
    "skt/kobert-base-v1",
    sp_model_kwargs=sp_options,
)

# Build the training set from every JSON file found earlier.
train_dataset = ABSADataset(
    json_files=json_files,
    tokenizer=tokenizer,
    max_len=max_len,
)

# 2. 모델 학습

## 2.1 모델 로드
`skt/kobert-base-v1` 기반의 SequenceClassification 모델 사용, label은 3개(0: 부정, 1: 중립, 2: 긍정)

In [None]:
# 모델 로드
import torch
from transformers import BertForSequenceClassification

# Use the first CUDA GPU when one is available; otherwise fall back to the
# CPU so that loading the model does not fail on a machine without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# KoBERT encoder with a 3-way classification head (one logit per sentiment
# class), moved to the selected device.
model = BertForSequenceClassification.from_pretrained(
    "skt/kobert-base-v1",
    num_labels=3,
).to(device)

In [None]:
# DataLoader 생성
from torch.utils.data import DataLoader

# Shuffled mini-batches over the training set, loaded by 4 worker processes.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
)

## 2.2 옵티마이저
- 옵티마이저: AdamW 사용

In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup
# Optimizer
# The learning rate comes from the shared `learning_rate` hyperparameter
# rather than a duplicated literal, so editing the parameter cell takes effect.
# NOTE(review): recent transformers releases deprecate their own AdamW in
# favour of torch.optim.AdamW -- confirm against the installed version.
optimizer = AdamW(model.parameters(), lr=learning_rate, weight_decay=0.01)

# Scheduler: linear decay from the initial learning rate to zero over all
# training steps (one step per batch, for every epoch).
# NOTE(review): `warmup_ratio` is defined in the parameter cell but not applied
# here. Switching num_warmup_steps to int(total_steps * warmup_ratio) is only
# safe if the training loop calls scheduler.step() every iteration; with
# warm-up enabled and no stepping, the learning rate would stay at 0.
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

## 2.3 학습 루프
epoch = 3
batch = 64

In [None]:
# Report how many (review, aspect) examples the training set contains.
total_samples = len(train_dataset)
print(f"Total samples: {total_samples}")

In [None]:
from tqdm import tqdm
from torch.amp import autocast, GradScaler

# Mixed precision is only enabled on CUDA; on any other device the scaler and
# autocast are switched off and the loop runs in full precision.
use_amp = device.type == "cuda"
scaler = GradScaler(device.type, enabled=use_amp)

# Train the model
model.train()

for epoch in range(num_epochs):
    epoch_loss = 0.0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}", leave=False)
    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # autocast takes a device *type* ("cuda"), not an indexed device
        # string such as "cuda:0", which some PyTorch versions reject.
        with autocast(device_type=device.type, enabled=use_amp):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        scaler.scale(loss).backward()

        # Gradients must be unscaled before clipping so that the threshold
        # `max_grad_norm` applies to their true magnitude.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        # scaler.step() skips the optimizer update when it finds inf/NaN
        # gradients, and update() then lowers the scale. Advance the LR
        # schedule only when the optimizer actually stepped.
        scale_before = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        if scaler.get_scale() >= scale_before:
            scheduler.step()

        # Read the loss once per step (each .item() forces a device sync).
        batch_loss = loss.item()
        epoch_loss += batch_loss
        progress_bar.set_postfix(loss=batch_loss)

    avg_loss = epoch_loss / len(train_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {avg_loss:.4f}")

    # Keep a checkpoint of the weights after every epoch.
    torch.save(model.state_dict(), f"model_epoch_{epoch + 1}.pth")

## 2.4 모델 저장

In [None]:
# Write the fine-tuned model and its tokenizer into the same directory.
output_dir = "kobert-finetuned"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)

# 3. 모델 평가

Todo : validation 셋으로 모델 성능 평가 필요

In [None]:
# 모델 평가 예시
# Example evaluation: classify one hand-written sentence/aspect pair.
# NOTE(review): the example aspect below is not one of `relevant_aspects`, and
# max_length=128 differs from the `max_len` used to build the training set --
# confirm both are intentional.
sentiment_map = {0: "부정", 1: "중립", 2: "긍정"}

test_sentence = "이 제품은 디자인이 정말 멋져요"
test_aspect = "디자인"
# Same "<sentence> [SEP] <aspect>" layout that ABSADataset produces.
combined_input = f"{test_sentence} [SEP] {test_aspect}"

model.eval()
with torch.no_grad():
    encoding = tokenizer.encode_plus(
        combined_input,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    # Index of the highest logit is the predicted sentiment class.
    prediction = logits.argmax(dim=-1).item()

    print(f"Aspect '{test_aspect}'에 대한 감성 분석 결과: {sentiment_map[prediction]}")