In [23]:
from transformers import BertTokenizer, BertModel
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW
from tqdm import tqdm
from sklearn.metrics import accuracy_score

import numpy as np
import torch
import os
import pandas as pd
import warnings
warnings.filterwarnings('ignore')




In [3]:
# Directory that holds the dataset CSV files. When the HATESPEECH_DATA_DIR
# environment variable is set it overrides the original author's local
# default, so the notebook can run on another machine without editing this cell.
path = os.environ.get(
    'HATESPEECH_DATA_DIR',
    'C:\\Users\\david\\Desktop\\대학원\\Individual_project\\mbti_project\\Hatespeech_data\\hatespeech_sep',
)
# Switch the working directory so files can be opened by bare file name.
os.chdir(path)

In [11]:
# Read the training CSV (resolved against the current working directory)
# and echo the resulting frame as the cell output.
df = pd.read_csv(
    'train_model용.csv',
)
df

Unnamed: 0,label,text_data,prepro_text_2
0,1,The reason is sexism. MENTION1773 Female comed...,the reason is sexism female comedians just are...
1,0,"â­â­â­â­â­ review by robe gardner: ""aweso...",review by robe gardner awesome product looks g...
2,1,@user in chief- the gangs all here #hillarycl...,in chief the gangs all here hillaryclinton oba...
3,0,MENTION963 killing you how?,killing you how
4,1,@user @user the correct way it should have bee...,the correct way it should have been written do...
...,...,...,...
9508,0,got tickets to the great moscow state circus. ...,got tickets to the great moscow state circus d...
9509,0,cups of tea and fresh mags. #days @user in o...,cups of tea and fresh mags days in our stirrup...
9510,1,MENTION3489 i dont expect that from a woman. I...,i dont expect that from a woman i expected wom...
9511,1,@user #americans suppo these #zionist #occupi...,americans suppo these zionist occupiers let s ...


In [16]:
# Load the pretrained BERT tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
)

# Tokenization helper
def tokenize_and_format(sentences, max_length=64, bert_tokenizer=None):
    """Encode sentences into fixed-length BERT input tensors.

    Args:
        sentences: Iterable of strings (for example a pandas Series or a list).
        max_length: Length every sequence is padded or truncated to, counting
            the '[CLS]' and '[SEP]' special tokens. Defaults to 64.
        bert_tokenizer: Tokenizer callable to use. When None, the module-level
            `tokenizer` is used.

    Returns:
        A tuple ``(input_ids, attention_masks)``; each tensor has one row per
        sentence and ``max_length`` columns.
    """
    active_tokenizer = tokenizer if bert_tokenizer is None else bert_tokenizer

    # Materialize once: the tokenizer is given a plain list of strings rather
    # than a pandas Series or a one-shot iterator.
    texts = list(sentences)
    if not texts:
        # Nothing to encode: hand back correctly shaped empty tensors instead
        # of failing inside the tokenizer.
        empty_ids = torch.empty((0, max_length), dtype=torch.long)
        return empty_ids, empty_ids.clone()

    encoded = active_tokenizer(
        texts,
        add_special_tokens=True,      # add '[CLS]' and '[SEP]'
        max_length=max_length,        # fixed sequence length
        padding='max_length',         # replaces the deprecated pad_to_max_length=True
        truncation=True,              # explicitly cut sequences longer than max_length
        return_attention_mask=True,   # also build the attention mask
        return_tensors='pt',          # return PyTorch tensors
    )

    return encoded['input_ids'], encoded['attention_mask']

# Load BertModel from the "bert-base-uncased" checkpoint.
# NOTE(review): in the cells visible in this notebook, `model` is rebound to a
# BertForSequenceClassification instance before this object is used, so this
# load looks redundant — confirm nothing else depends on it before removing.
model = BertModel.from_pretrained("bert-base-uncased")

In [12]:
# Separate the model input text from its target label.
X, y = df['prepro_text_2'], df['label']

In [13]:
# Hold out 20% of the data as the test set, stratified on the label.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Carve the validation set out of the remainder: 0.25 * 0.8 = 0.2 of all rows.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val,
    y_train_val,
    test_size=0.25,
    stratify=y_train_val,
    random_state=42,
)

In [19]:
# Build one TensorDataset per split: tokenize the text, convert the labels to
# a tensor, then bundle (input ids, attention masks, labels) together.

# Training split
train_inputs, train_masks = tokenize_and_format(X_train)
train_labels = torch.tensor(y_train.values)
train_dataset = TensorDataset(train_inputs, train_masks, train_labels)

# Validation split
val_inputs, val_masks = tokenize_and_format(X_val)
val_labels = torch.tensor(y_val.values)
val_dataset = TensorDataset(val_inputs, val_masks, val_labels)

# Test split
test_inputs, test_masks = tokenize_and_format(X_test)
test_labels = torch.tensor(y_test.values)
test_dataset = TensorDataset(test_inputs, test_masks, test_labels)

In [22]:
# Batch size shared by all three loaders
batch_size = 16

# Training loader: samples are drawn in random order
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    sampler=RandomSampler(train_dataset),
)

# Validation loader: samples are visited sequentially
validation_dataloader = DataLoader(
    val_dataset,
    batch_size=batch_size,
    sampler=SequentialSampler(val_dataset),
)

# Test loader: samples are visited sequentially
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    sampler=SequentialSampler(test_dataset),
)

In [24]:
# Model initialization: options passed on top of the pretrained checkpoint.
classifier_options = dict(
    num_labels=2,                 # two labels for binary classification
    output_attentions=False,      # do not return attention weights
    output_hidden_states=False,   # do not return hidden states
)
# Uses the "bert-base-uncased" checkpoint.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", **classifier_options)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# Optimizer setup.
# torch.optim.AdamW is used in place of transformers.AdamW, which the
# transformers library deprecated in favour of the PyTorch implementation.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=1e-5,            # learning rate
    eps=1e-8,           # small constant for numerical stability
    weight_decay=0.0,   # transformers.AdamW defaulted to 0.0; torch's default is 0.01
)

In [27]:
# Device selection (use the GPU when one is available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Number of epochs
epochs = 10

# Training loop: one pass over the training data followed by one pass over
# the validation data per epoch.
for epoch_i in range(0, epochs):
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Switch to training mode
    model.train()

    # Wrap the training loader in `tqdm` to show a progress bar
    total_train_loss = 0
    train_iterator = tqdm(train_dataloader, desc="Iteration")

    for batch in train_iterator:
        # Move the batch to the selected device
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        # Reset gradients accumulated by the previous step
        model.zero_grad()

        # Forward pass; passing labels makes the model return the loss
        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)

        # Loss value
        loss = outputs.loss
        total_train_loss += loss.item()

        # Backward pass
        loss.backward()

        # Gradient clipping (max norm 1.0)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Weight update
        optimizer.step()

    # Average training loss over the batches of this epoch
    avg_train_loss = total_train_loss / len(train_dataloader)
    print("  Average training loss: {0:.2f}".format(avg_train_loss))

    # Validation pass
    print("\nValidation...")

    model.eval()

    # Metrics are accumulated per example rather than per batch, so a smaller
    # final batch does not get the same weight as a full one.
    total_eval_correct = 0
    total_eval_loss = 0
    total_eval_examples = 0

    for batch in validation_dataloader:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        # No gradients are needed for evaluation
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)

        loss = outputs.loss
        logits = outputs.logits

        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        n_examples = len(label_ids)

        # outputs.loss is the batch mean, so scale it back to a batch sum
        total_eval_loss += loss.item() * n_examples
        # normalize=False returns the number of correct predictions
        total_eval_correct += accuracy_score(label_ids, np.argmax(logits, axis=1), normalize=False)
        total_eval_examples += n_examples

    # max(..., 1) avoids a ZeroDivisionError when the validation loader is empty
    avg_val_accuracy = total_eval_correct / max(total_eval_examples, 1)
    avg_val_loss = total_eval_loss / max(total_eval_examples, 1)
    print("Epoch: {}, Validation Accuracy: {:.2f}, Validation Loss: {:.2f}".format(epoch_i+1, avg_val_accuracy, avg_val_loss))

Training...


Iteration:   1%|          | 3/357 [00:05<11:22,  1.93s/it]


KeyboardInterrupt: 