In [1]:
# 모델 돌린 코드
import os
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.utils.data import DataLoader, random_split
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score
from transformers import logging

# 불필요한 경고 메시지 숨기기
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
logging.set_verbosity_error()


In [2]:

# GPU 설정 및 최적화
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.benchmark = True  # 연산 최적화 활성화
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

Using device: cuda
GPU: Tesla T4


In [3]:
# 데이터 로드
df_resampled = pd.read_csv("./review_balanced_resampled.csv")

# BERT 토크나이저 로드
tokenizer = BertTokenizer.from_pretrained("klue/bert-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/248k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/495k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/425 [00:00<?, ?B/s]

In [36]:
import pandas as pd
import os

# 전체 데이터 로드
#df_full = pd.read_csv("data.csv")
print(f"전체 데이터 로드 메모리 사용량: {df_resampled.memory_usage(deep=True).sum() / 1e6:.2f} MB")

# 필요한 컬럼만 로드
df_partial = pd.read_csv("review_balanced_resampled.csv", usecols=["cleaned_review"])
print(f"필요한 컬럼만 로드 메모리 사용량: {df_partial.memory_usage(deep=True).sum() / 1e6:.2f} MB")


전체 데이터 로드 메모리 사용량: 285.44 MB
필요한 컬럼만 로드 메모리 사용량: 53.44 MB


In [5]:
df_resampled.head(5)

Unnamed: 0,review_id,product_id,rating,review_text,review_date,review_size,review_length,cleaned_review,tokenized_review,categories,가성비,내구성 및 품질,디자인,배송 및 포장 및 응대,사이즈,착용감
0,65431338,2070763,5.0,가죽이 부들부들해서 착화감이 좋아요\n\n색깔도 맘에 듭니다,2024-10-10,42(260) 구매,5,가죽이 부들부들해서 착화감이 좋아요 색깔도 맘에 듭니다,"['가죽', '부들부들해서', '좋아요', '색깔', '듭니']","['착용감', '내구성 및 품질', '디자인']",0,1,1,0,0,1
1,11913341,1092992,5.0,여름에 거의 매일 신고 다녔어요 오래 신어도 편하고 무난해요,2020-09-27,240 구매,8,여름에 거의 매일 신고 다녔어요 오래 신어도 편하고 무난해요,"['여름', '거의', '매일', '신고', '다녔어요', '오래', '신어', '...","['착용감', '내구성 및 품질']",0,1,0,0,0,1
2,20956603,1494180,5.0,신발 자체가 너무 이쁘고 키높이도 맘에 들어요! :) 자주 신고 다닐 거 같습니다 ㅎㅎ,2021-10-20,260 구매,8,신발 자체가 너무 이쁘고 키높이도 맘에 들어요 자주 신고 다닐 거 같습니다,"['신발', '자체', '이쁘고', '높이', '들어요', '자주', '신고', '...",['디자인'],0,0,1,0,0,0
3,59968738,1798273,5.0,배송이 빠르고 포장 꼼곰합니다.\n발볼 넓은 버전도 나왔으면 좋겠네요,2024-05-27,270 구매,9,배송이 빠르고 포장 꼼곰합니다 발볼 넓은 버전도 나왔으면 좋겠네요,"['배송', '빠르고', '포장', '꼼곰합니', '발볼', '넓은', '버전', ...","['사이즈', '배송 및 포장 및 응대']",0,0,0,1,1,0
4,53871258,1635193,5.0,내성발톱 있는데 발볼이 좁지 않아서 편하고 무난해서 좋아요,2023-12-30,270 구매,7,내성발톱 있는데 발볼이 좁지 않아서 편하고 무난해서 좋아요,"['내성발톱', '있는데', '발볼', '좁지', '않아서', '편하고', '무난'...","['착용감', '사이즈']",0,0,0,0,1,1


In [4]:
# 토큰화 함수
def tokenize_function(texts):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

In [6]:
#1분 소요
tokenized_data = tokenize_function(df_resampled["cleaned_review"].tolist())
tokenized_data[:5]

{'input_ids': tensor([[    2,  9363,  2052, 23701,  6396,  1633,  2267,  2434,  2052,  5723,
           2182,  7177,  2119,  1043,  2170, 11815,     3,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,    

In [7]:
#batch_size=5000으로 나눠서 처리하여 속도 개선
from transformers import BatchEncoding

def tokenize_function(texts, batch_size=5000):
    encodings = []
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i+batch_size]
        encodings.append(tokenizer(batch, padding="max_length", truncation=True, max_length=128, return_tensors="pt"))
    return BatchEncoding({k: torch.cat([e[k] for e in encodings]) for k in encodings[0]})


In [8]:
#1분54초
tokenized_data = tokenize_function(df_resampled["cleaned_review"].tolist())
tokenized_data[:5]

{'input_ids': tensor([[    2,  9363,  2052, 23701,  6396,  1633,  2267,  2434,  2052,  5723,
           2182,  7177,  2119,  1043,  2170, 11815,     3,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
              0,     0,    

In [9]:
labels = torch.tensor(df_resampled[["가성비", "내구성 및 품질", "디자인", "배송 및 포장 및 응대", "사이즈", "착용감"]].values, dtype=torch.float32)
labels[:5]

tensor([[0., 1., 1., 0., 0., 1.],
        [0., 1., 0., 0., 0., 1.],
        [0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 1., 0.],
        [0., 0., 0., 0., 1., 1.]])

In [88]:
# 클래스 가중치 계산
# class_weights = torch.tensor([
#     compute_class_weight("balanced", classes=np.array([0, 1]), y=df_resampled[label].values)[1]
#     for label in ["가성비", "내구성 및 품질", "디자인", "배송 및 포장 및 응대", "사이즈", "착용감"]
# ], dtype=torch.float32).to(device)


  class_weights = torch.sqrt(torch.tensor(class_weights, dtype=torch.float32).cuda())


tensor([1.2515, 1.0926, 1.0247, 1.1180, 1.0851, 0.9685], device='cuda:0')

In [89]:
# 클래스 가중치 정규화 (최소 1.0 이상 유지)
min_weight = 1.0
max_weight = 2.0  # 최대 가중치를 2.0으로 제한

normalized_weights = (class_weights - class_weights.min()) / (class_weights.max() - class_weights.min())
normalized_weights = normalized_weights * (max_weight - min_weight) + min_weight

print("정규화된 클래스 가중치:", normalized_weights)
#정규화된 클래스 가중치: tensor([2.0000, 1.3464, 1.1417, 1.4336, 1.3220, 1.0000], device='cuda:0')

class_weights = torch.sqrt(torch.tensor(normalized_weights, dtype=torch.float32).cuda())
class_weights

정규화된 클래스 가중치: tensor([2.0000, 1.4384, 1.1988, 1.5281, 1.4121, 1.0000], device='cuda:0')


  class_weights = torch.sqrt(torch.tensor(normalized_weights, dtype=torch.float32).cuda())


tensor([1.4142, 1.1993, 1.0949, 1.2362, 1.1883, 1.0000], device='cuda:0')

In [90]:
# 데이터셋 클래스
class ReviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = self.labels[idx]
        return item


In [91]:
dataset = ReviewDataset(tokenized_data, labels)
dataset.__len__()

287189

In [92]:
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_size, val_size, test_size

(229751, 28718, 28720)

In [93]:
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

In [94]:
# DataLoader 설정 (최적화)
gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9  # GB 단위 변환
gpu_memory

15.828320256

In [95]:
#batch_size = 32 if gpu_memory > 15 else 16 if gpu_memory > 10 else 8
#batch_size = 64
batch_size = min(128, batch_size * 2)
batch_size

128

In [96]:
gradient_accumulation_steps = 1 if batch_size >= 16 else 2
gradient_accumulation_steps

1

In [97]:
num_workers = min(4, os.cpu_count())
num_workers

2

In [98]:
#데이터 로더 (DataLoader) 설정
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, pin_memory=True, persistent_workers=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True, persistent_workers=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, pin_memory=True, persistent_workers=True)



```
BertForSequenceClassification(
  (bert): BertModel(  # 사전 학습된 BERT 모델 (Feature Extractor)
    (embeddings): BertEmbeddings(...)  # 입력 단어 임베딩
    (encoder): BertEncoder(...)  # Transformer 인코더 (12개 레이어)
    (pooler): BertPooler(...)  # [CLS] 토큰의 최종 벡터 변환
  )
  (dropout): Dropout(p=0.1, inplace=False)  # 드롭아웃 적용 (과적합 방지)
  (classifier): Linear(in_features=768, out_features=6, bias=True)  # 최종 분류 레이어
)

```



In [99]:
# BERT 모델 로드 (Dropout 최적화)
model = BertForSequenceClassification.from_pretrained("klue/bert-base", num_labels=6, ignore_mismatched_sizes=True)
model.config.hidden_dropout_prob = 0.5 #0.4 #0.3
model.config.attention_probs_dropout_prob = 0.5 #0.4 #0.3
for name, param in model.named_parameters():
    if "LayerNorm" in name:
        param.requires_grad = False
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [100]:
# 손실 함수 (가중치 적용)
loss_fn = nn.BCEWithLogitsLoss(pos_weight=class_weights)
loss_fn

BCEWithLogitsLoss()

In [101]:
# 옵티마이저 & 학습률 스케줄러
#optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=1e-2)
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=5e-2)
#lr_scheduler = get_scheduler("cosine", optimizer=optimizer, num_warmup_steps=500, num_training_steps=len(train_loader) * 5)
lr_scheduler = get_scheduler("cosine", optimizer=optimizer, num_warmup_steps=1000, num_training_steps=len(train_loader) * 5)

# Mixed Precision (AMP) 사용
scaler = torch.cuda.amp.GradScaler()
batch_size

  scaler = torch.cuda.amp.GradScaler()


128

In [102]:
# 빠른 Validation 평가 함수
def quick_evaluate(model, val_loader, num_batches=5):
    model.eval() #평가 모드로 전환하여 Dropout 비활성화.
    total_loss = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for i, batch in enumerate(val_loader):
            if i >= num_batches:
                break

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)
            total_loss += loss.item()

            preds = torch.sigmoid(outputs.logits).cpu().numpy() > 0.5 #확률을 0.5 기준으로 변환하여 예측 수행.
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    return total_loss / num_batches, accuracy_score(all_labels, all_preds), f1_score(all_labels, all_preds, average="macro")


In [103]:
# 학습 함수 (train) - Early Stopping 추가 및 출력 개선
def train(model, train_loader, val_loader, epochs=5):
    best_val_loss = float("inf")
    patience = 2 #Early Stopping을 적용하여 3번 연속으로 검증 손실이 개선되지 않으면 학습 중지.
    patience_counter = 0
    min_delta = 0.0003

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        optimizer.zero_grad()
        loop = tqdm(train_loader, leave=True, desc=f"Epoch {epoch+1}/{epochs}")

        for step, batch in enumerate(loop):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            #with torch.cuda.amp.autocast():
            #테스트
            with torch.autocast(device_type="cuda", dtype=torch.float16):
                outputs = model(input_ids, attention_mask=attention_mask)
                loss = loss_fn(outputs.logits, labels) / gradient_accumulation_steps #작은 배치 크기에서도 충분한 학습 효과를 주기 위해 Gradient Accumulation 적용

            scaler.scale(loss).backward() #AMP(자동 혼합 정밀도) 적용.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) #기울기 폭발(Gradient Explosion) 방지를 위해 최대값 제한

            if (step + 1) % gradient_accumulation_steps == 0:
                scaler.unscale_(optimizer)
                scaler.step(optimizer)
                scaler.update()
                lr_scheduler.step()
                optimizer.zero_grad()

            total_loss += loss.item()
            loop.set_postfix(loss=total_loss / (step + 1))  # 실시간 loss 출력

        val_loss, val_accuracy, val_f1 = quick_evaluate(model, val_loader)
        print(f"Epoch {epoch+1} | Train Loss: {total_loss / len(train_loader):.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_accuracy:.4f} | Val F1: {val_f1:.4f}")




```
Epoch 1/5: 100%|██████████| 3590/3590 [19:29<00:00,  3.07it/s, loss=0.261]
Epoch 1 | Train Loss: 0.2608 | Val Loss: 0.0775 | Val Acc: 0.9000 | Val F1: 0.9788
Epoch 2/5: 100%|██████████| 3590/3590 [19:28<00:00,  3.07it/s, loss=0.0669]
Epoch 2 | Train Loss: 0.0669 | Val Loss: 0.0499 | Val Acc: 0.9344 | Val F1: 0.9855
Epoch 3/5: 100%|██████████| 3590/3590 [19:27<00:00,  3.08it/s, loss=0.0461]
Epoch 3 | Train Loss: 0.0461 | Val Loss: 0.0391 | Val Acc: 0.9531 | Val F1: 0.9892
Epoch 4/5: 100%|██████████| 3590/3590 [19:28<00:00,  3.07it/s, loss=0.036]
Epoch 4 | Train Loss: 0.0360 | Val Loss: 0.0335 | Val Acc: 0.9531 | Val F1: 0.9900
Epoch 5/5: 100%|██████████| 3590/3590 [19:28<00:00,  3.07it/s, loss=0.0338]
Epoch 5 | Train Loss: 0.0338 | Val Loss: 0.0337 | Val Acc: 0.9531 | Val F1: 0.9900
```





```
Epoch 1/5: 100%|██████████| 3590/3590 [19:30<00:00,  3.07it/s, loss=0.253]
Epoch 1 | Train Loss: 0.2533 | Val Loss: 0.0735 | Val Acc: 0.8875 | Val F1: 0.9757
Epoch 2/5: 100%|██████████| 3590/3590 [19:30<00:00,  3.07it/s, loss=0.0638]
Epoch 2 | Train Loss: 0.0638 | Val Loss: 0.0441 | Val Acc: 0.9406 | Val F1: 0.9863
Epoch 3/5: 100%|██████████| 3590/3590 [19:27<00:00,  3.07it/s, loss=0.0448]
Epoch 3 | Train Loss: 0.0448 | Val Loss: 0.0369 | Val Acc: 0.9500 | Val F1: 0.9887
Epoch 4/5: 100%|██████████| 3590/3590 [19:27<00:00,  3.07it/s, loss=0.0389]
Epoch 4 | Train Loss: 0.0389 | Val Loss: 0.0352 | Val Acc: 0.9469 | Val F1: 0.9874
Epoch 5/5: 100%|██████████| 3590/3590 [19:27<00:00,  3.08it/s, loss=0.0378]
Epoch 5 | Train Loss: 0.0378 | Val Loss: 0.0342 | Val Acc: 0.9469 | Val F1: 0.9874
```



In [None]:
# 학습 시작
# 13분 25초 / 25분 * 5 = 125분
# 1시간 37분 32초
train(model, train_loader, val_loader, epochs=5)

Epoch 1/5:  64%|██████▎   | 1142/1795 [11:11<06:23,  1.70it/s, loss=0.459]

테스트 데이터에서 성능이 95% 이상 유지되면 실사용 가능!   
테스트에서 F1-score가 급락하면, 모델이 과적합일 가능성이 있음.

In [81]:
test_loss, test_acc, test_f1 = quick_evaluate(model, test_loader)
print(f"Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f} | Test F1: {test_f1:.4f}")


Test Loss: 0.0344 | Test Acc: 0.9563 | Test F1: 0.9900


In [83]:
# 모델 저장 (전체 모델 저장 추가)
torch.save(model, "model.pth")

In [None]:
import torch

# 저장된 모델 로드
model = torch.load("model.pth")
model.eval()  # 평가 모드로 변경


In [82]:
from transformers import BertTokenizer
import torch

# BERT 토크나이저 로드
tokenizer = BertTokenizer.from_pretrained("klue/bert-base")

# 새로운 리뷰 예제
#text = "이 제품은 가성비가 뛰어나고 디자인이 정말 멋져요!"
text = ""
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=128)

# GPU 사용 가능하면 모델도 GPU로 이동
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
inputs = {key: val.to(device) for key, val in inputs.items()}

# 모델 예측 실행
with torch.no_grad():
    outputs = model(**inputs)

# 로짓을 확률로 변환
probs = torch.sigmoid(outputs.logits)
predictions = (probs > 0.5).int()  # 0.5 이상이면 해당 클래스로 예측

print("예측 확률:", probs)
print("예측 결과 (0=해당 없음, 1=해당):", predictions.cpu().numpy())




예측 확률: tensor([[0.9991, 0.0062, 0.9983, 0.0038, 0.0028, 0.0039]], device='cuda:0')
예측 결과 (0=해당 없음, 1=해당): [[1 0 1 0 0 0]]


가성비, 내구성, 디자인, 배송, 사이즈, 착용감

In [None]:
class_labels = ["가성비", "내구성", "디자인", "배송", "사이즈", "착용감"]


In [86]:
from transformers import BertTokenizer
import torch

# BERT 토크나이저 로드
tokenizer = BertTokenizer.from_pretrained("klue/bert-base")

# 모델을 GPU로 이동
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # 평가 모드

# 6개 클래스 레이블 정의
class_labels = ["가성비", "내구성", "디자인", "배송", "사이즈", "착용감"]

# 10개 샘플 리뷰
test_samples = [
    "가격 대비 품질이 뛰어나고 디자인도 세련됐어요. 정말 만족합니다.",
    "신발이 튼튼하고 내구성이 좋아서 오래 신을 수 있을 것 같아요.",
    "배송이 예상보다 빨랐고, 포장도 꼼꼼하게 잘 되어 있었어요.",
    "사이즈가 딱 맞고 착화감이 편해서 장시간 신어도 문제 없어요.",
    "디자인이 너무 예쁘고 색감도 고급스러워요. 선물하기에도 좋아요.",
    "가격이 너무 비싼데 품질은 기대 이하여서 아쉬웠어요.",
    "조금만 신었는데 벌써 해지기 시작했어요. 내구성이 너무 약하네요.",
    "배송이 너무 늦고, 상자도 찌그러져서 왔어요. 기분이 별로네요.",
    "사이즈가 설명과 다르게 작게 나와서 불편합니다. 교환해야 할 듯.",
    "사진이랑 색상이 너무 다르게 나와서 실망했어요. 디자인이 별로네요."
]

# 예측 수행 함수
def predict_review(texts):
    inputs = tokenizer(texts, padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    inputs = {key: val.to(device) for key, val in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    probs = torch.sigmoid(outputs.logits)  # 확률 변환
    predictions = (probs > 0.5).int()  # 0.5 이상이면 해당 클래스로 예측

    return probs.cpu().numpy(), predictions.cpu().numpy()

# 10개 샘플 예측
probs, preds = predict_review(test_samples)

# 결과 출력
for i, (review, prob, pred) in enumerate(zip(test_samples, probs, preds)):
    predicted_labels = [class_labels[j] for j in range(len(class_labels)) if pred[j] == 1]  # 1인 레이블만 선택

    print(f"📝 리뷰 {i+1}: {review}")
    print(f"🔹 예측된 라벨: {predicted_labels if predicted_labels else ['해당 없음']}")
    print("-" * 80)


📝 리뷰 1: 가격 대비 품질이 뛰어나고 디자인도 세련됐어요. 정말 만족합니다.
🔹 예측된 라벨: ['가성비', '내구성', '디자인']
--------------------------------------------------------------------------------
📝 리뷰 2: 신발이 튼튼하고 내구성이 좋아서 오래 신을 수 있을 것 같아요.
🔹 예측된 라벨: ['내구성']
--------------------------------------------------------------------------------
📝 리뷰 3: 배송이 예상보다 빨랐고, 포장도 꼼꼼하게 잘 되어 있었어요.
🔹 예측된 라벨: ['배송']
--------------------------------------------------------------------------------
📝 리뷰 4: 사이즈가 딱 맞고 착화감이 편해서 장시간 신어도 문제 없어요.
🔹 예측된 라벨: ['사이즈', '착용감']
--------------------------------------------------------------------------------
📝 리뷰 5: 디자인이 너무 예쁘고 색감도 고급스러워요. 선물하기에도 좋아요.
🔹 예측된 라벨: ['디자인']
--------------------------------------------------------------------------------
📝 리뷰 6: 가격이 너무 비싼데 품질은 기대 이하여서 아쉬웠어요.
🔹 예측된 라벨: ['가성비', '내구성']
--------------------------------------------------------------------------------
📝 리뷰 7: 조금만 신었는데 벌써 해지기 시작했어요. 내구성이 너무 약하네요.
🔹 예측된 라벨: ['내구성']
------------------------------------------------------------

In [None]:
# 첫번째 결과
2025-02-25 20:16:31.314967: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-02-25 20:16:31.316332: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-25 20:16:31.320181: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-02-25 20:16:31.332216: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
WARNING: All log messages before absl::InitializeLog() is called are written to STDERR
E0000 00:00:1740482191.353197   16073 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1740482191.359222   16073 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-02-25 20:16:31.379992: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Using device: cuda
GPU: Tesla T4
/home/ubuntu/anaconda3/envs/pytorch_env/lib/python3.9/site-packages/huggingface_hub/file_download.py:797: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(
/home/ubuntu/anaconda3/envs/pytorch_env/lib/python3.9/site-packages/huggingface_hub/file_download.py:797: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.
  warnings.warn(
Epoch 1/5: 100%|██████████| 7180/7180 [25:37<00:00,  4.67it/s, loss=0.162]
Epoch 1 | Train Loss: 0.1615 | Val Loss: 0.0531 | Val Acc: 0.9250 | Val F1: 0.9834
Epoch 2/5: 100%|██████████| 7180/7180 [25:37<00:00,  4.67it/s, loss=0.0363]
Epoch 2 | Train Loss: 0.0363 | Val Loss: 0.0067 | Val Acc: 0.9875 | Val F1: 0.9972
Epoch 3/5: 100%|██████████| 7180/7180 [25:30<00:00,  4.69it/s, loss=0.0221]
Epoch 3 | Train Loss: 0.0221 | Val Loss: 0.0042 | Val Acc: 0.9875 | Val F1: 0.9972
Epoch 4/5: 100%|██████████| 7180/7180 [25:33<00:00,  4.68it/s, loss=0.0196]
Epoch 4 | Train Loss: 0.0196 | Val Loss: 0.0084 | Val Acc: 0.9875 | Val F1: 0.9981
Epoch 5/5: 100%|██████████| 7180/7180 [25:35<00:00,  4.68it/s, loss=0.0189]
Epoch 5 | Train Loss: 0.0189 | Val Loss: 0.0079 | Val Acc: 0.9875 | Val F1: 0.9972


In [None]:
# 수요일에 돌려볼 코드
import os
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.utils.data import DataLoader, random_split
from sklearn.utils.class_weight import compute_class_weight
from tqdm import tqdm
from sklearn.metrics import f1_score, accuracy_score

# GPU 설정
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch.backends.cudnn.benchmark = True
print(f"Using device: {device}")

# 데이터 로드
df_resampled = pd.read_csv("./review_balanced_resampled.csv")

# 배치 크기 조정 (VRAM 최적화)
gpu_memory = torch.cuda.get_device_properties(0).total_memory / 1e9
batch_size = 16  # Tesla T4 (15GB) 기준 최적화

# 토크나이저 로드 및 데이터 변환
tokenizer = BertTokenizer.from_pretrained("klue/bert-base")
tokenized_data = tokenizer(df_resampled["cleaned_review"].tolist(), padding="max_length", truncation=True, max_length=128, return_tensors="pt")
labels = torch.tensor(df_resampled[["가성비", "내구성 및 품질", "디자인", "배송 및 포장 및 응대", "사이즈", "착용감"]].values, dtype=torch.float32)

# 데이터셋 구성
dataset = torch.utils.data.TensorDataset(tokenized_data["input_ids"], tokenized_data["attention_mask"], labels)
train_size = int(0.8 * len(dataset))
val_size = int(0.1 * len(dataset))
test_size = len(dataset) - train_size - val_size
train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

# DataLoader 설정
num_workers = min(4, os.cpu_count())
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

# BERT 모델 로드 및 설정 최적화 (Dropout 0.4)
model = BertForSequenceClassification.from_pretrained("klue/bert-base", num_labels=6)
model.config.hidden_dropout_prob = 0.4
model.config.attention_probs_dropout_prob = 0.4
model.to(device)

# 손실 함수 및 옵티마이저
class_weights = torch.tensor([
    compute_class_weight("balanced", classes=np.array([0, 1]), y=df_resampled[label].values)[1]
    for label in ["가성비", "내구성 및 품질", "디자인", "배송 및 포장 및 응대", "사이즈", "착용감"]
], dtype=torch.float32).to(device)

loss_fn = nn.BCEWithLogitsLoss(pos_weight=class_weights)
optimizer = optim.AdamW(model.parameters(), lr=1.2e-5, weight_decay=5e-3)

# 학습률 스케줄러 (10% Warmup)
lr_scheduler = get_scheduler("cosine", optimizer=optimizer, num_warmup_steps=int(len(train_loader) * 0.1), num_training_steps=len(train_loader) * 7)

# Mixed Precision Training 추가
scaler = torch.cuda.amp.GradScaler()

# 학습 함수 (Gradient Accumulation Steps 최적화)
def train(model, train_loader, val_loader, epochs=7, patience=5, min_delta=0.001):
    best_val_loss = float("inf")
    patience_counter = 0
    gradient_accumulation_steps = 4  # batch_size 16에 맞춰 증가

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        optimizer.zero_grad()
        loop = tqdm(train_loader, leave=True, desc=f"Epoch {epoch+1}/{epochs}")

        for step, batch in enumerate(loop):
            input_ids, attention_mask, labels = [x.to(device) for x in batch]

            with torch.cuda.amp.autocast():  # Mixed Precision 적용
                outputs = model(input_ids, attention_mask=attention_mask)
                loss = loss_fn(outputs.logits, labels) / gradient_accumulation_steps

            scaler.scale(loss).backward()
            if (step + 1) % gradient_accumulation_steps == 0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                scaler.step(optimizer)
                scaler.update()
                lr_scheduler.step()
                optimizer.zero_grad()

            total_loss += loss.item()
            loop.set_postfix(loss=total_loss / len(train_loader))

        val_loss, val_acc, val_f1 = evaluate(model, val_loader)
        print(f"Epoch {epoch+1} | Train Loss: {total_loss / len(train_loader):.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | Val F1: {val_f1:.4f}")

        if val_loss < best_val_loss - min_delta:
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), "best_model.pth")
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping activated!")
                break

# Validation 평가 함수
def evaluate(model, val_loader, threshold=0.5):
    model.eval()
    total_loss = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]
            outputs = model(input_ids, attention_mask=attention_mask)
            loss = loss_fn(outputs.logits, labels)
            total_loss += loss.item()

            probs = torch.sigmoid(outputs.logits).cpu().numpy()
            preds = (probs > threshold).astype(int)
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

    return total_loss / len(val_loader), accuracy_score(all_labels, all_preds), f1_score(all_labels, all_preds, average="macro", zero_division=1)

# 학습 시작
train(model, train_loader, val_loader, epochs=7)

# Test 데이터 최적 평가
model.load_state_dict(torch.load("best_model.pth"))
for threshold in [0.4, 0.5, 0.6]:
    test_loss, test_acc, test_f1 = evaluate(model, test_loader, threshold=threshold)
    print(f"Threshold {threshold} | Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f} | Test F1: {test_f1:.4f}")
