In [7]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import matplotlib.pyplot as plt
import torch
import numpy as np
import pandas as pd
import os

In [2]:
# Load and preprocess data
# The CSV location can be overridden through the MBTI_DATA_PATH environment variable so the
# notebook is not tied to one machine; the default is the original hard-coded location.
DATA_PATH = os.environ.get(
    'MBTI_DATA_PATH',
    'C:\\Users\\david\\Desktop\\대학원\\Individual_project\\mbti_project\\MBTI&BigFive_data\\전처리데이터\\MBTI_prepro_sen.csv',
)

# pandas reads empty CSV fields as NaN (a float), which is neither valid tokenizer input
# nor a valid class label, so incomplete rows are dropped before building the lists.
df = pd.read_csv(DATA_PATH).dropna(subset=['cleaned_text', 'type'])
texts = df['cleaned_text'].tolist()
labels = df['type'].tolist()

# Encode labels: map each distinct value of the 'type' column to an integer class id.
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Tokenizer matching the pretrained checkpoint that is fine-tuned later in the notebook.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

In [3]:
# Dataset
class MBTIDataset(Dataset):
    """Map-style dataset that tokenizes one text per item at access time.

    Args:
        texts: Sequence of input strings.
        labels: Sequence of integer class ids, aligned index-for-index with ``texts``.
        tokenizer: Callable tokenizer; it is invoked with ``padding='max_length'``,
            ``truncation=True``, ``max_length`` and ``return_tensors="pt"`` and must
            return a mapping containing ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length every sequence is padded/truncated to (default 512,
            the value that was previously hard-coded).

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return a dict with ``input_ids``, ``attention_mask`` and ``labels`` for item ``idx``."""
        encoded = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            # The tokenizer output carries a leading batch axis of size 1; drop it so the
            # DataLoader can stack items into a batch itself.
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
            # Force int64: classification losses expect Long targets regardless of the
            # dtype of the container the labels came from (e.g. an int32 array).
            'labels': torch.tensor(self.labels[idx], dtype=torch.long),
        }
    
# Evaluation function
def evaluate_model(model, data_loader, device):
    """Run the model over ``data_loader`` and compute classification metrics.

    Args:
        model: Model whose forward pass accepts the non-label entries of a batch as
            keyword arguments and returns an object with a ``logits`` attribute.
        data_loader: Iterable of dict batches; each batch must contain a ``'labels'``
            tensor alongside the model inputs.
        device: Device the model inputs are moved to before the forward pass.

    Returns:
        Tuple ``(precision, recall, f1, accuracy)``; precision, recall and F1 are
        support-weighted averages over the classes.

    Note:
        The model is switched to eval mode and left there; callers that continue
        training must call ``model.train()`` again.
    """
    model.eval()
    predictions, true_labels = [], []
    with torch.no_grad():
        for batch in data_loader:
            inputs = {k: v.to(device) for k, v in batch.items() if k != 'labels'}
            logits = model(**inputs).logits
            predictions.extend(torch.argmax(logits, dim=1).cpu().numpy())
            # Labels are only needed on the host for the metrics, so they are not
            # copied to the device first.
            true_labels.extend(batch['labels'].cpu().numpy())
    # zero_division=0 scores a class that was never predicted as 0 -- the same value the
    # default setting produces -- without emitting a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, predictions, average='weighted', zero_division=0
    )
    accuracy = accuracy_score(true_labels, predictions)
    return precision, recall, f1, accuracy

In [None]:
# Cross-validation setup
# Hyperparameters are named here so they can be tuned in one place.
SEED = 42
N_SPLITS = 10
N_EPOCHS = 10
BATCH_SIZE = 16
LEARNING_RATE = 1e-5

# A fixed random_state makes the fold assignment identical on every run. torch is seeded
# as well (classifier-head initialisation, batch shuffling); GPU kernels may still be
# non-deterministic.
torch.manual_seed(SEED)
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# One row per (fold, epoch): (fold, epoch, precision, recall, f1, accuracy).
fold_results = []

# The device does not change between folds, so it is resolved once.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training loop with cross-validation
for fold, (train_idx, val_idx) in enumerate(skf.split(texts, labels_encoded)):
    train_texts_fold = [texts[i] for i in train_idx]
    val_texts_fold = [texts[i] for i in val_idx]
    train_labels_fold = labels_encoded[train_idx]
    val_labels_fold = labels_encoded[val_idx]

    train_dataset = MBTIDataset(train_texts_fold, train_labels_fold, tokenizer)
    val_dataset = MBTIDataset(val_texts_fold, val_labels_fold, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)

    # Model and optimizer: every fold starts again from the pretrained checkpoint so that
    # no fold benefits from weights trained on another fold's validation data.
    model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(label_encoder.classes_))
    model.to(device)
    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

    for epoch in range(N_EPOCHS):
        # evaluate_model() leaves the model in eval mode, so training mode is restored
        # at the start of every epoch.
        model.train()
        for batch in tqdm(train_loader, desc=f'Epoch {epoch}, Fold {fold}'):
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**inputs)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        precision, recall, f1, accuracy = evaluate_model(model, val_loader, device)
        # Record fold and epoch with the metrics: the results table is built with the
        # columns Fold, Epoch, Precision, Recall, F1-Score, Accuracy, so each row needs
        # all six fields.
        fold_results.append((fold, epoch, precision, recall, f1, accuracy))
        print(f'Fold {fold}, Epoch {epoch} - Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}, Accuracy: {accuracy:.4f}')

In [None]:
# Build the metrics table from the rows collected during cross-validation.
columns = ['Fold', 'Epoch', 'Precision', 'Recall', 'F1-Score', 'Accuracy']
results_df = pd.DataFrame(data=fold_results, columns=columns)

# Save the performance metrics to a CSV file.
results_df.to_csv('model_performance.csv', index=False)
print("성능 지표가 model_performance.csv 파일로 저장되었습니다.")

# Output locations for the last fold's model and for the tokenizer.
model_save_path, tokenizer_save_path = "final_roberta_model.pt", "final_roberta_tokenizer"

# Save the model (weights only, via its state_dict).
torch.save(model.state_dict(), model_save_path)
print(f"모델이 {model_save_path} 경로에 저장되었습니다.")

# Save the tokenizer.
tokenizer.save_pretrained(tokenizer_save_path)
print(f"토크나이저가 {tokenizer_save_path} 경로에 저장되었습니다.")

# Final summary: each metric averaged over every row of the results table.
mean_precision, mean_recall, mean_f1, mean_accuracy = (
    results_df[metric].mean()
    for metric in ('Precision', 'Recall', 'F1-Score', 'Accuracy')
)

print("\n".join([
    f"평균 정밀도: {mean_precision:.4f}",
    f"평균 재현율: {mean_recall:.4f}",
    f"평균 F1 점수: {mean_f1:.4f}",
    f"평균 정확도: {mean_accuracy:.4f}",
]))