In [1]:
import torch
import numpy as np
import pandas as pd
import re
from transformers import BertTokenizer, BertModel
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import os




In [5]:
# Set the working directory so the relative file names used later
# (the input CSV and the saved .npy file) resolve against this folder.
# NOTE(review): this is a machine-specific absolute Windows path; os.chdir
# raises if the folder does not exist, so it must be edited on other machines.
path = 'C:\\Users\\david\\Desktop\\대학원\\Individual_project\\mbti_project\\MBTI&BigFive_data\\전처리데이터\\BigFive'
os.chdir(path)

In [12]:
## Configuration and function definitions

class Args:
    """Run configuration for the CLS-vector extraction pipeline.

    Every setting is a keyword argument whose default matches the value the
    pipeline was originally run with, so ``Args()`` behaves exactly as before
    while individual settings can now be overridden per run.

    Args:
        raw_data: Path of the CSV file passed to ``load_data``.
        max_len: Fixed sequence length passed to ``padding``.
        batch_size: Number of rows per batch passed to the ``DataLoader``.
    """

    def __init__(self, raw_data="BigFive_prepro_sen.csv", max_len=64, batch_size=16):
        self.raw_data = raw_data
        self.max_len = max_len
        self.batch_size = batch_size


# Default configuration instance consumed by the run cell.
args = Args()

def load_data(file_path):
    """Read the dataset CSV and return its texts and binary labels.

    Args:
        file_path: Path to a comma-separated file containing the columns
            ``cleaned_text`` and ``cNEU``.

    Returns:
        A ``(document, labels)`` tuple of equal-length lists. ``document``
        holds one ``str`` per CSV row; ``labels`` holds 1 where ``cNEU`` equals
        ``'y'`` and 0 for any other value.

    Raises:
        KeyError: If either required column is missing from the file.
    """
    frame = pd.read_csv(file_path, sep=",")
    # pandas parses empty cells as NaN (a float), which would crash the
    # regex-based sentence splitting downstream. Replace them with empty
    # strings instead of dropping the rows so texts and labels stay aligned.
    document = frame['cleaned_text'].fillna("").astype(str).tolist()
    labels = frame['cNEU'].apply(lambda x: 1 if x == 'y' else 0).tolist()
    return document, labels

def add_special_token(document):
    """Wrap each document in BERT marker tokens, with one [SEP] per sentence.

    Each text is split wherever ``.``, ``!`` or ``?`` is followed by
    whitespace (the matched punctuation and whitespace are consumed by the
    split), the pieces are joined with ``" [SEP] "``, and the result is
    wrapped as ``"[CLS] ... [SEP]"``.

    Args:
        document: Iterable of texts. ``None`` and NaN entries are treated as
            empty text; other non-string values are converted with ``str``.

    Returns:
        A list with one marked-up string per input entry, in input order.
    """
    processed_docs = []
    for doc in document:
        if not isinstance(doc, str):
            # Missing CSV cells arrive as float NaN (NaN != NaN); re.split
            # would raise TypeError on them.
            is_missing = doc is None or (isinstance(doc, float) and doc != doc)
            doc = "" if is_missing else str(doc)
        # Drop empty fragments (e.g. from text ending in "punctuation + space")
        # so the output never contains consecutive [SEP] markers.
        sentences = [s for s in re.split(r'[.!?]\s+', doc) if s.strip()]
        processed_docs.append("[CLS] " + " [SEP] ".join(sentences) + " [SEP]")
    return processed_docs


def tokenization(document):
    """Turn each prepared text into a list of BERT vocabulary ids.

    Loads the ``bert-base-uncased`` tokenizer, tokenizes every text (with a
    progress bar) and maps the resulting tokens to their ids.

    Args:
        document: Iterable of strings to tokenize.

    Returns:
        A list containing one list of ids per input text, in input order.
        Sequences are neither padded nor truncated here.
    """
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    ids = []
    for text in tqdm(document, desc="Tokenizing"):
        word_pieces = bert_tokenizer.tokenize(text)
        ids.append(bert_tokenizer.convert_tokens_to_ids(word_pieces))
    return ids

def padding(ids, max_len):
    """Right-pad / right-truncate id sequences to a fixed length.

    Sequences longer than ``max_len`` keep their first ``max_len`` ids;
    shorter ones are filled with 0 at the end.

    Implemented with NumPy so this step does not need Keras, and the result
    dtype is pinned to 64-bit integers instead of the platform-dependent C
    ``long`` (which is 32-bit on Windows).

    Args:
        ids: Sized collection of id sequences (lists of ints).
        max_len: Target length of every output row.

    Returns:
        ``numpy.ndarray`` of shape ``(len(ids), max_len)`` and dtype ``int64``.
    """
    padded = np.zeros((len(ids), max_len), dtype=np.int64)
    for row, sequence in enumerate(ids):
        kept = list(sequence)[:max_len]  # truncate at the end ("post")
        if kept:
            padded[row, :len(kept)] = kept  # remaining cells stay 0 ("post" padding)
    return padded

def attention_mask(ids):
    """Build a float mask marking which positions hold a positive id.

    Args:
        ids: Iterable of id sequences.

    Returns:
        A list of lists of floats with the same layout as ``ids``: 1.0 where
        the id is greater than 0, otherwise 0.0.
    """
    masks = []
    for sequence in ids:
        masks.append([1.0 if token_id > 0 else 0.0 for token_id in sequence])
    return masks


def preprocess(args):
    """Run the text-to-tensor-input preparation steps in order.

    Loads the CSV named by ``args.raw_data``, adds the special-token markers,
    tokenizes, pads/truncates to ``args.max_len`` and derives the attention
    masks. The labels returned by ``load_data`` are not used here.

    NOTE(review): padding truncates at the end of each sequence, so texts
    longer than ``args.max_len`` tokens lose the trailing [SEP] appended by
    ``add_special_token`` — confirm this is acceptable.

    Args:
        args: Object exposing ``raw_data`` and ``max_len``.

    Returns:
        ``(ids, masks)``: the padded id matrix and its attention masks.
    """
    texts, _unused_labels = load_data(args.raw_data)
    marked_texts = add_special_token(texts)  # sentence split + special tokens
    padded_ids = padding(tokenization(marked_texts), args.max_len)
    return padded_ids, attention_mask(padded_ids)


def build_dataloader(ids, masks, args):
    """Batch the ids and masks for feature extraction, preserving row order.

    Args:
        ids: Rectangular collection of token ids (array or nested lists).
        masks: Attention masks with the same layout as ``ids``.
        args: Object exposing ``batch_size``.

    Returns:
        A ``DataLoader`` yielding ``(ids, masks)`` tensor batches in the same
        order as the input rows.
    """
    dataset = TensorDataset(torch.tensor(ids), torch.tensor(masks))
    # This loader feeds inference, not training. The extracted vectors are
    # saved as one array and must line up row-for-row with the source CSV
    # (and its labels), so iterate sequentially instead of sampling randomly.
    dataloader = DataLoader(dataset, shuffle=False, batch_size=args.batch_size)
    return dataloader

def build_model():
    """Load pretrained BERT and place it on the available device.

    Returns:
        ``(model, device)``: the ``bert-base-uncased`` ``BertModel`` moved to
        ``device``, where ``device`` is CUDA when ``torch.cuda.is_available()``
        is true and CPU otherwise.
    """
    target = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(target)
    model = BertModel.from_pretrained("bert-base-uncased").to(device)
    return model, device

def extract_cls_vectors(dataloader, model, device):
    """Collect the position-0 hidden vector of every sequence.

    Puts the model in eval mode, runs each batch without gradient tracking and
    takes ``last_hidden_state[:, 0, :]`` — the first token of each sequence,
    which is [CLS] for inputs prepared by ``add_special_token``.

    Args:
        dataloader: Iterable yielding ``(ids, masks)`` tensor pairs.
        model: Model called with ``input_ids`` and ``attention_mask`` whose
            output exposes ``last_hidden_state``.
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        A list with one NumPy vector per sequence, in the order the
        dataloader produced them.
    """
    model.eval()
    collected = []
    for input_ids, attn_masks in tqdm(dataloader, desc="Extracting"):
        input_ids = input_ids.to(device)
        attn_masks = attn_masks.to(device)

        with torch.no_grad():
            hidden = model(input_ids=input_ids, attention_mask=attn_masks).last_hidden_state

        # Keep only the first position of each sequence and move it to host memory.
        first_position = hidden[:, 0, :].cpu().numpy()
        for vector in first_position:
            collected.append(vector)

    return collected

def run(args):
    """Execute the whole pipeline and save the extracted vectors to disk.

    Preprocesses the data, builds the dataloader and model, extracts the
    vectors and writes them to ``BERT_cls_Big5.npy`` in the current working
    directory, then prints a confirmation message.

    Args:
        args: Configuration object exposing ``raw_data``, ``max_len`` and
            ``batch_size``.
    """
    input_ids, attn_masks = preprocess(args)
    loader = build_dataloader(input_ids, attn_masks, args)
    bert, device = build_model()
    vectors = extract_cls_vectors(loader, bert, device)

    # Save the extracted CLS vectors
    stacked = np.array(vectors)
    np.save("BERT_cls_Big5.npy", stacked)
    print("CLS vectors have been saved.")

In [13]:
# Run the extraction pipeline with the `args` configuration defined above
run(args)

Tokenizing: 100%|██████████| 2467/2467 [00:40<00:00, 60.49it/s]
Extracting: 100%|██████████| 155/155 [02:56<00:00,  1.14s/it]

CLS vectors have been saved.



