### Classifying Big5 traits through BERT

In [1]:
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup, BertConfig, BertForSequenceClassification, AdamW
from sklearn.metrics import precision_score, accuracy_score, f1_score, recall_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from tqdm import tqdm
from keras.preprocessing.sequence import pad_sequences

import re
import numpy as np
import torch
import os
import pandas as pd
import random
import time
import datetime
import argparse
import warnings
warnings.filterwarnings('ignore')




In [2]:
# Make the data folder the working directory so that the relative file names used
# later in this notebook (e.g. the CSV name in Args, the "results" folder) resolve there.
# NOTE(review): this is an absolute, machine-specific Windows path; it presumably points
# at the preprocessed BigFive data and must be edited before running on another machine.
path = 'C:\\Users\\david\\Desktop\\대학원\\Individual_project\\mbti_project\\MBTI&BigFive_data\\전처리데이터\\BigFive'
os.chdir(path)

In [11]:
class Args:
    """Run configuration for the BERT fine-tuning pipeline.

    Every setting is a keyword argument whose default reproduces the values
    that used to be hard-coded, so ``Args()`` behaves exactly as before while
    ``Args(epochs=3)`` allows a quick override without editing the class.

    Attributes:
        raw_data: CSV file name handed to ``load_data``.
        max_len: Sequence length every id list is padded/truncated to.
        batch_size: Batch size used when building the data loaders.
        num_labels: Number of output classes of the classification head.
        epochs: Number of training epochs.
        seed_val: Seed used for the Python, NumPy and PyTorch RNGs.
    """

    def __init__(
        self,
        *,
        raw_data="BigFive_prepro_sen.csv",
        max_len=64,
        batch_size=16,
        num_labels=2,
        epochs=10,
        seed_val=42,
    ):
        self.raw_data = raw_data
        self.max_len = max_len
        self.batch_size = batch_size
        self.num_labels = num_labels
        self.epochs = epochs
        self.seed_val = seed_val


# Default configuration used by the notebook's run cell.
args = Args()

def load_data(file_path, text_column='cleaned_text', label_column='cCON'):
    """Load the texts and binary labels from a comma-separated CSV file.

    The previous version overwrote ``file_path`` with a hard-coded file name,
    so the argument was silently ignored; the path passed in is now honoured.

    Args:
        file_path: Path of the CSV file to read.
        text_column: Name of the column holding the input text.
        label_column: Name of the column holding the trait flag. A value of
            ``'y'`` becomes label 1; anything else becomes label 0.

    Returns:
        A ``(document, labels)`` tuple: a list with the text column's values
        and a list of 0/1 integers of the same length.

    Raises:
        KeyError: If either column is missing from the file.
    """
    frame = pd.read_csv(file_path, sep=",")
    document = frame[text_column].tolist()
    # Vectorised equivalent of mapping 'y' -> 1 and every other value -> 0.
    labels = (frame[label_column] == 'y').astype(int).tolist()
    return document, labels


def add_special_token(document):
    """Wrap each text in BERT marker tokens.

    Each text is split wherever '.', '!' or '?' is followed by whitespace.
    The pieces are joined with " [SEP] ", prefixed with "[CLS] " and
    terminated with " [SEP]".

    Args:
        document: Iterable of text strings.

    Returns:
        A list with one marked-up string per input text, in the same order.
    """
    boundary = re.compile(r'[.!?]\s+')
    return [
        "[CLS] " + " [SEP] ".join(boundary.split(text)) + " [SEP]"
        for text in document
    ]


def tokenization(document):
    """Convert each text into a list of 'bert-base-uncased' vocabulary ids.

    Args:
        document: Iterable of text strings to tokenize.

    Returns:
        A list holding, for every input text, the list of token ids produced
        by tokenizing it and mapping the tokens to ids.
    """
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    # Tokenize every text first (this is the step shown by the progress bar) ...
    token_lists = []
    for text in tqdm(document, desc="Tokenizing"):
        token_lists.append(tokenizer.tokenize(text))
    # ... then map each token list to its vocabulary ids.
    return list(map(tokenizer.convert_tokens_to_ids, token_lists))

def padding(ids, args):
    """Pad or truncate every id sequence to ``args.max_len`` entries.

    Both padding and truncation are applied at the end of each sequence
    ('post'), and the result uses the "long" dtype.

    Args:
        ids: List of token-id lists.
        args: Configuration object; only ``max_len`` is read.

    Returns:
        The value returned by ``pad_sequences`` for these settings.
    """
    return pad_sequences(
        ids,
        maxlen=args.max_len,
        dtype="long",
        padding='post',
        truncating='post',
    )


# Build attention masks so the model can skip padded positions.
def attention_mask(ids):
    """Return a mask marking which positions hold an id greater than zero.

    Args:
        ids: Iterable of id sequences.

    Returns:
        A list of lists of floats with the same layout as ``ids``: 1.0 where
        the id is greater than 0, otherwise 0.0.
    """
    return [[float(token_id > 0) for token_id in row] for row in ids]


def preprocess(args):
    """Turn the raw CSV into padded token ids, attention masks and labels.

    Steps: load the texts and labels, add the [CLS]/[SEP] markers, tokenize,
    pad to ``args.max_len`` and derive the attention masks from the padded ids.

    Args:
        args: Configuration object; ``raw_data`` and ``max_len`` are read.

    Returns:
        A ``(ids, masks, labels)`` tuple.
    """
    texts, labels = load_data(args.raw_data)
    marked_texts = add_special_token(texts)
    padded_ids = padding(tokenization(marked_texts), args)
    return padded_ids, attention_mask(padded_ids), labels


def train_test_data_split(ids, masks, labels):
    """Split ids, masks and labels into stratified 80/20 train/test parts.

    All three inputs are split in a single ``train_test_split`` call, which
    applies one set of shuffled indices to every array. This keeps each mask
    aligned with its ids and label by construction, instead of relying on two
    separate calls happening to share the same random state, and avoids
    splitting ``ids`` a second time as a throwaway argument.

    Args:
        ids: Padded token ids, one row per sample.
        masks: Attention masks, one row per sample.
        labels: Class labels, one per sample; also used for stratification.

    Returns:
        ``(train_ids, train_masks, train_labels,
        test_ids, test_masks, test_labels)``.
    """
    (train_ids, test_ids,
     train_masks, test_masks,
     train_labels, test_labels) = train_test_split(
        ids, masks, labels, random_state=42, test_size=0.2, stratify=labels)
    return train_ids, train_masks, train_labels, test_ids, test_masks, test_labels


def build_dataloader(ids, masks, label, args):
    """Wrap ids, masks and labels in a randomly sampled ``DataLoader``.

    Args:
        ids: Token ids, one row per sample.
        masks: Attention masks, one row per sample.
        label: Class labels, one per sample.
        args: Configuration object; only ``batch_size`` is read.

    Returns:
        A ``DataLoader`` whose batches are ``(ids, masks, labels)`` tensor
        triples drawn through a ``RandomSampler``.
    """
    tensors = [torch.tensor(part) for part in (ids, masks, label)]
    dataset = TensorDataset(*tensors)
    return DataLoader(
        dataset,
        sampler=RandomSampler(dataset),
        batch_size=args.batch_size,
    )


def build_model(args):
    """Load 'bert-base-uncased' for sequence classification and pick a device.

    Args:
        args: Configuration object; only ``num_labels`` is read.

    Returns:
        A ``(model, device)`` tuple, with the model already moved to the
        device: CUDA when available, otherwise CPU.
    """
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=args.num_labels)
    # Prefer the GPU and fall back to the CPU when CUDA is unavailable.
    target = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(target)
    if device.type != 'cuda':
        print("CUDA is not available. Using CPU instead.")
    else:
        print(f"{torch.cuda.get_device_name(0)} available")
    return model.to(device), device


def test(test_dataloader, model, device):
    """Evaluate the model on a data loader and print the metrics.

    Args:
        test_dataloader: Iterable of ``(ids, masks, labels)`` tensor batches.
        model: Classifier whose output exposes ``logits``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(accuracy, precision, recall, f1)``; precision, recall and F1 use
        ``average='binary'``.
    """
    model.eval()
    predicted, expected = [], []
    # No gradients are needed anywhere in the evaluation loop.
    with torch.no_grad():
        for batch in test_dataloader:
            ids, masks, labels = (tensor.to(device) for tensor in batch)
            outputs = model(ids, token_type_ids=None, attention_mask=masks)
            # The predicted class is the index of the largest logit.
            predicted.extend(torch.argmax(outputs.logits, dim=1).cpu().numpy())
            expected.extend(labels.cpu().numpy())

    accuracy = accuracy_score(expected, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(expected, predicted, average='binary')

    for line in (
        f"Test Average Accuracy: {accuracy:.2f}",
        f"Precision: {precision:.2f}",
        f"Recall: {recall:.2f}",
        f"F1 Score: {f1:.2f}",
    ):
        print(line)
    return accuracy, precision, recall, f1
   
   
# Fine-tuning loop with per-epoch evaluation and checkpointing.
def train(train_dataloader, test_dataloader, args):
    """Fine-tune the classifier, evaluating and saving a checkpoint each epoch.

    The RNGs are seeded *before* the model is built. The classification head
    is randomly initialised when the model is created, so seeding afterwards
    (as the previous version did) left that initialisation outside the seed's
    control.

    Args:
        train_dataloader: Loader yielding ``(ids, masks, labels)`` batches
            used for optimisation.
        test_dataloader: Loader passed to ``test`` after every epoch.
        args: Configuration object; ``epochs``, ``seed_val`` and (through
            ``build_model``) ``num_labels`` are read.

    Returns:
        None. Progress and metrics are printed, and one state-dict checkpoint
        per epoch is written to the ``results`` directory.
    """
    # Seed every RNG first so model initialisation and training share the seed.
    random.seed(args.seed_val)
    np.random.seed(args.seed_val)
    torch.manual_seed(args.seed_val)
    torch.cuda.manual_seed_all(args.seed_val)

    model, device = build_model(args)
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8, weight_decay=0.01)  # L2 regularisation
    # Linear decay over all optimisation steps, without warm-up.
    total_steps = len(train_dataloader) * args.epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    model.zero_grad()

    # The checkpoint directory only needs to be created once.
    os.makedirs("results", exist_ok=True)
    log_every = 139  # progress line is printed every `log_every` steps

    for epoch in range(args.epochs):
        model.train()
        total_loss, total_accuracy = 0, 0
        print("-"*30)
        for step, batch in enumerate(tqdm(train_dataloader, desc="Training", leave=False)):
            if step % log_every == 0:
                print(f"Epoch : {epoch+1} / {args.epochs}, Step : {step}")
            ids, masks, labels = (tensor.to(device) for tensor in batch)
            outputs = model(ids, token_type_ids=None, attention_mask=masks, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            # Batch accuracy computed on tensors rather than per sample in Python.
            preds = outputs.logits.argmax(dim=1)
            total_accuracy += (preds == labels).sum().item() / labels.size(0)

            loss.backward()
            # Clip the gradient norm to 1.0 before the optimiser step.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
            model.zero_grad()

        # Both averages are means over batches, not over individual samples.
        avg_loss = total_loss / len(train_dataloader)
        avg_accuracy = total_accuracy / len(train_dataloader)
        print(f"{epoch+1} Epoch Average train loss: {avg_loss}")
        print(f"{epoch+1} Epoch Average train accuracy: {avg_accuracy}")

        acc, precision, recall, f1 = test(test_dataloader, model, device)
        f = os.path.join("results", f'epoch_{epoch+1}_evalAcc_{acc*100:.0f}.pth')
        torch.save(model.state_dict(), f)
        print('Saved checkpoint:', f)


# End-to-end pipeline driver.
def run(args):
    """Preprocess the data, split it, build both loaders and train the model.

    Args:
        args: Configuration object forwarded to every pipeline stage.
    """
    ids, masks, labels = preprocess(args)
    split = train_test_data_split(ids, masks, labels)
    train_ids, train_masks, train_labels, test_ids, test_masks, test_labels = split

    # The training loader is built first, then the evaluation loader.
    loaders = [
        build_dataloader(part_ids, part_masks, part_labels, args)
        for part_ids, part_masks, part_labels in (
            (train_ids, train_masks, train_labels),
            (test_ids, test_masks, test_labels),
        )
    ]
    train(loaders[0], loaders[1], args)


In [12]:
# Entry point: execute the whole pipeline (preprocess, split, train, evaluate)
# using the module-level `args` configuration.
run(args)

Tokenizing: 100%|██████████| 2467/2467 [00:15<00:00, 161.57it/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CUDA is not available. Using CPU instead.
------------------------------


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Epoch : 1 / 10, Step : 0


                                                           

1 Epoch Average train loss: 0.6978807819466437
1 Epoch Average train accuracy: 0.5089717741935483
Test Average Accuracy: 0.54
Precision: 0.54
Recall: 1.00
F1 Score: 0.70
Saved checkpoint: results\epoch_1_evalAcc_54.pth
------------------------------


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Epoch : 2 / 10, Step : 0


                                                           

2 Epoch Average train loss: 0.6846251300265712
2 Epoch Average train accuracy: 0.5608870967741936
Test Average Accuracy: 0.56
Precision: 0.57
Recall: 0.75
F1 Score: 0.65
Saved checkpoint: results\epoch_2_evalAcc_56.pth
------------------------------


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Epoch : 3 / 10, Step : 0


                                                           

3 Epoch Average train loss: 0.6296449137791511
3 Epoch Average train accuracy: 0.651108870967742
Test Average Accuracy: 0.56
Precision: 0.67
Recall: 0.36
F1 Score: 0.47
Saved checkpoint: results\epoch_3_evalAcc_56.pth
------------------------------


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Epoch : 4 / 10, Step : 0


                                                           

4 Epoch Average train loss: 0.4465491251359063
4 Epoch Average train accuracy: 0.7932459677419355
Test Average Accuracy: 0.54
Precision: 0.56
Recall: 0.72
F1 Score: 0.63
Saved checkpoint: results\epoch_4_evalAcc_54.pth
------------------------------


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Epoch : 5 / 10, Step : 0


                                                           

5 Epoch Average train loss: 0.22489440224824414
5 Epoch Average train accuracy: 0.9163306451612904
Test Average Accuracy: 0.56
Precision: 0.56
Recall: 0.82
F1 Score: 0.67
Saved checkpoint: results\epoch_5_evalAcc_56.pth
------------------------------


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Epoch : 6 / 10, Step : 0


                                                           

6 Epoch Average train loss: 0.0994249229627331
6 Epoch Average train accuracy: 0.9647177419354839
Test Average Accuracy: 0.56
Precision: 0.58
Recall: 0.70
F1 Score: 0.63
Saved checkpoint: results\epoch_6_evalAcc_56.pth
------------------------------


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Epoch : 7 / 10, Step : 0


                                                           

7 Epoch Average train loss: 0.04845293859327813
7 Epoch Average train accuracy: 0.985383064516129
Test Average Accuracy: 0.57
Precision: 0.58
Recall: 0.73
F1 Score: 0.65
Saved checkpoint: results\epoch_7_evalAcc_57.pth
------------------------------


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Epoch : 8 / 10, Step : 0


                                                           

8 Epoch Average train loss: 0.03331322323825706
8 Epoch Average train accuracy: 0.9924395161290323
Test Average Accuracy: 0.57
Precision: 0.59
Recall: 0.72
F1 Score: 0.64
Saved checkpoint: results\epoch_8_evalAcc_57.pth
------------------------------


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Epoch : 9 / 10, Step : 0


                                                           

9 Epoch Average train loss: 0.006923062686954865
9 Epoch Average train accuracy: 0.9974798387096774
Test Average Accuracy: 0.57
Precision: 0.60
Recall: 0.63
F1 Score: 0.62
Saved checkpoint: results\epoch_9_evalAcc_57.pth
------------------------------


Training:   0%|          | 0/124 [00:00<?, ?it/s]

Epoch : 10 / 10, Step : 0


                                                           

10 Epoch Average train loss: 0.007790874487498866
10 Epoch Average train accuracy: 0.9984879032258065
Test Average Accuracy: 0.57
Precision: 0.59
Recall: 0.68
F1 Score: 0.63
Saved checkpoint: results\epoch_10_evalAcc_57.pth
