### 필요한 라이브러리 설치 및 불러오기

In [1]:
from transformers import RobertaTokenizer
from transformers import RobertaForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from tqdm import tqdm
from sklearn.metrics import precision_recall_fscore_support

import matplotlib.pyplot as plt
import torch
import numpy as np
import pandas as pd
import os

# Module-level tokenizer loaded from the "roberta-base" pretrained files;
# it is handed to the MBTIDataset instances created further down.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")




### 데이터 로딩 및 전처리

In [3]:
# Load the MBTI CSV into a DataFrame.
# A raw string is used because the Windows path is full of backslashes; the
# previous literal ended in '\M', an invalid escape sequence that only worked
# by accident and triggers a warning on current Python versions.
df = pd.read_csv(r'C:\Users\david\Desktop\대학원\Individual_project\mbti_project\MBTI&BigFive_data\전처리데이터\MBTI_token_RoBERTa.csv')

In [4]:
# Select the columns that are needed: input text and target label.
texts = df['token_Reg_clean'].tolist()
labels = df['type'].tolist()

# Label encoding (convert the MBTI type strings to integer ids).
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Split into training and validation data (80/20).
# random_state pins the shuffle so that re-running the notebook reproduces the
# same split; otherwise a checkpoint saved in one run would be evaluated on a
# validation set that overlaps the data it was trained on in another run.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels_encoded, test_size=0.2, random_state=42
)

### 데이터셋 및 데이터 로더 준비

In [5]:
class MBTIDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of labels, aligned with ``texts`` by index.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding='max_length', truncation=True,
            max_length=..., return_tensors="pt")``. Its result must provide
            ``'input_ids'`` and ``'attention_mask'`` tensors with a leading
            dimension that can be squeezed away.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 512, the value that used to be hard-coded.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts held by the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return a dict of model inputs.

        Returns:
            A dict with keys ``'input_ids'``, ``'attention_mask'`` and
            ``'labels'``.
        """
        encoded = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            # return_tensors="pt" yields a leading batch dimension of size 1;
            # drop it so the DataLoader can stack items into a batch itself.
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            # Explicit int64 so the label dtype does not depend on whether the
            # label arrives as a Python int or some NumPy integer type.
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }
    
def add_padding(text, seq_length, padding_idx):
    """Truncate or right-pad a token sequence to exactly ``seq_length`` items.

    Args:
        text: Sequence of token ids (anything supporting ``len`` and slicing).
        seq_length: Target length.
        padding_idx: Value appended when ``text`` is shorter than ``seq_length``.

    Returns:
        ``text[:seq_length]`` (same type as the input) when ``text`` is already
        long enough; otherwise a NumPy array holding ``text`` followed by the
        padding values.
    """
    if seq_length <= len(text):
        return text[:seq_length]

    pad_seq = np.full(seq_length - len(text), padding_idx)
    if len(text) == 0:
        # Concatenating an empty list would go through an empty float64 array
        # and silently turn integer padding ids into floats.
        return pad_seq
    return np.concatenate([text, pad_seq])

def evaluate_model(model, data_loader, device):
    """Run the model over ``data_loader`` and return weighted precision, recall and F1.

    Args:
        model: Model that is called with the batch entries (minus ``'labels'``)
            as keyword arguments and whose output exposes ``.logits``.
        data_loader: Iterable of dict batches; each batch must contain a
            ``'labels'`` tensor next to the model inputs.
        device: Device the model inputs are moved to before the forward pass.

    Returns:
        Tuple ``(precision, recall, f1)`` computed with ``average='weighted'``.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()
    predictions, true_labels = [], []

    with torch.no_grad():
        for batch in data_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            logits = model(**inputs).logits
            predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())
            # Labels are only needed for scoring, so they are read straight
            # from the batch instead of being copied to the device and back.
            true_labels.extend(batch['labels'].tolist())

    # zero_division=0 yields the same scores as the default but without a
    # warning every time some class receives no predictions.
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predictions, average='weighted', zero_division=0
    )
    return precision, recall, f1

# Wrap the training and validation splits in datasets that share one tokenizer.
train_dataset = MBTIDataset(
    texts=train_texts,
    labels=train_labels,
    tokenizer=tokenizer,
)
val_dataset = MBTIDataset(
    texts=val_texts,
    labels=val_labels,
    tokenizer=tokenizer,
)

# Batch both datasets; only the training batches are shuffled.
train_loader = DataLoader(dataset=train_dataset, batch_size=100, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=100)

### 모델 및 토크나이저 설정

In [None]:
# Load the model: pretrained 'roberta-base' with a sequence-classification head
# whose number of labels equals the number of classes seen by label_encoder.
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))

In [12]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Optimizer: PyTorch's own AdamW implementation.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=5e-5)

In [None]:
# Lists that collect the validation metrics, one entry per finished epoch.
precisions, recalls, f1_scores = [], [], []

# Training loop.
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss, num_batches = 0.0, 0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch}'):
        inputs = {k: v.to(device) for k, v in batch.items()}
        # Clear gradients before the forward/backward pass so that gradients
        # left behind by an interrupted earlier run of this cell cannot leak
        # into the first update.
        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        # Fail with a clear message instead of a NameError on `loss` below.
        raise ValueError('train_loader produced no batches')

    # Evaluate the model on the validation set.
    precision, recall, f1 = evaluate_model(model, val_loader, device)
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)
    print(f'Epoch {epoch} - Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}')

    # Save the model/optimizer state together with the metrics of this epoch.
    checkpoint_path = f'./model_epoch_{epoch}.pt'
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        # 'loss' stays the loss of the final mini-batch (as before);
        # 'avg_loss' is the mean over all mini-batches of the epoch.
        'loss': loss.item(),
        'avg_loss': running_loss / num_batches,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }, checkpoint_path)

# Visualise the validation metrics.
# The x-axis is derived from the number of recorded epochs rather than from
# num_epochs, so the plot also works when training stopped before the last
# epoch (e.g. the cell was interrupted) and the metric lists are shorter.
epochs = range(1, len(f1_scores) + 1)
plt.figure(figsize=(10, 6))
plt.plot(epochs, precisions, 'b-o', label='Precision')
plt.plot(epochs, recalls, 'r-s', label='Recall')
plt.plot(epochs, f1_scores, 'g-^', label='F1 Score')
plt.title('Model Performance Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Score')
plt.legend()
plt.show()