데이터셋별 정확도 비교

In [1]:
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset
from kobert_tokenizer import KoBERTTokenizer
import gluonnlp as nlp
import numpy as np

In [2]:
# Load the KoBERT tokenizer and the matching vocabulary used to encode sentences.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')
tok = tokenizer.tokenize

# Use the first GPU when CUDA is available and fall back to the CPU otherwise,
# instead of hard-coding "cuda:0" and failing later at the first `.to(device)`.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder's pooled output.

    Args:
        bert: Encoder callable. It is invoked with the keyword arguments
            ``input_ids``, ``token_type_ids`` and ``attention_mask`` and must
            return a 2-tuple whose second element is the pooled output.
        hidden_size: Feature size of the pooled output fed to the linear layer.
        num_classes: Number of output classes (adjust to the label set).
        dr_rate: Dropout probability applied to the pooled output. A falsy
            value (``None`` or ``0``) disables dropout.
        params: Unused; kept so existing callers keep working.
    """

    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=6,
                 dr_rate=None,
                 params=None):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids``.

        Row ``i`` holds 1.0 in its first ``valid_length[i]`` positions and 0.0
        in the remaining positions.
        """
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the classifier output for a batch of encoded sentences."""
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(
            input_ids=token_ids,
            token_type_ids=segment_ids.long(),
            attention_mask=attention_mask.float().to(token_ids.device),
        )
        # Previously `out` was only assigned inside the dropout branch, so a
        # model built with the default dr_rate=None crashed with
        # UnboundLocalError. Without dropout the pooled output is used as-is.
        out = self.dropout(pooler) if self.dr_rate else pooler
        return self.classifier(out)

class BERTDataset(Dataset):
    """Dataset that pre-encodes every sentence with ``BERTSentenceTransform``.

    ``dataset`` is iterated twice at construction time: once to encode the
    field at ``sent_idx`` and once to convert the field at ``label_idx`` to
    ``np.int32``. Indexing returns the encoded sentence with the label appended
    as a final element (the transform output is presumably a tuple, since it
    is concatenated with one).
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer,vocab, max_len,
                 pad, pair):
        to_features = nlp.data.BERTSentenceTransform(
            bert_tokenizer,
            max_seq_length=max_len,
            vocab=vocab,
            pad=pad,
            pair=pair,
        )

        # First pass: encode the sentence field of every record.
        encoded = []
        for record in dataset:
            encoded.append(to_features([record[sent_idx]]))
        self.sentences = encoded

        # Second pass: convert the label field of every record to int32.
        targets = []
        for record in dataset:
            targets.append(np.int32(record[label_idx]))
        self.labels = targets

    def __getitem__(self, i):
        features = self.sentences[i]
        target = self.labels[i]
        return features + (target,)

    def __len__(self):
        return len(self.labels)
    
# Checkpoint path, relative to the directory the notebook/command is run from.
# Earlier checkpoint: '../sentiment-analysis/model/kobert-v6.pt'
model_path = 'model/kobert-v7.pt'

# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
# map_location places the model on `device` directly, so switching `device`
# to the CPU no longer needs a separate `model.to('cpu')` step.
model = torch.load(model_path, map_location=device)

max_len = 64      # passed to BERTDataset as the maximum sequence length
batch_size = 64   # DataLoader batch size used by predict()

# Prediction helper
def predict(sentence):
    """Return the predicted class index for a single sentence.

    The sentence is wrapped in a one-row dataset (with a placeholder label),
    encoded by ``BERTDataset`` and passed through the global ``model`` in eval
    mode. The result is ``np.argmax`` over the output row for that sentence.

    Args:
        sentence: Text to classify.

    Returns:
        The index of the largest output value (a NumPy integer), or ``0`` if
        the data loader yields no batch.
    """
    dataset = [[sentence, '0']]  # '0' is a dummy label; BERTDataset needs a label column
    test = BERTDataset(dataset, 0, 1, tok, vocab, max_len, True, False)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=batch_size, num_workers=0)
    model.eval()
    answer = 0
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _label in test_dataloader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            out = model(token_ids, valid_length, segment_ids)
            # The answer is the argmax of the last row in the batch, which is
            # the only row for a single input sentence.
            answer = np.argmax(out[-1].cpu().numpy())
    return answer

In [9]:
# Smoke test: classify one sentence; the notebook displays the returned class index.
predict('배고프다')

3

In [4]:
import pandas as pd

# Load the merged dataset, then drop rows with missing values and rows whose
# sentence already appeared earlier.
df = (
    pd.read_csv('data/병합데이터셋-v5a.csv', index_col=0)
    .dropna()
    .drop_duplicates(subset='sentence')
)

# Replace each emotion name with its integer label (position in this list).
emotions = {
    name: label
    for label, name in enumerate(['행복', '불안', '놀람', '슬픔', '분노', '중립'])
}
df['emotion'] = df['emotion'].map(emotions)

df.shape

(135650, 2)

In [5]:
# Whole dataset: fraction of rows whose predicted label equals the stored label.
changed_de = 0  # denominator: rows evaluated
changed_nu = 0  # numerator: correct predictions

for row_label, sample in df.iterrows():
    changed_de += 1
    changed_nu += int(predict(sample['sentence']) == sample['emotion'])

print(changed_nu/changed_de)

0.847121267969038


In [16]:
# Modified part: accuracy over the leading rows, stopping at the first row
# whose index label exceeds 3000.
changed_de = 0  # denominator: rows evaluated
changed_nu = 0  # numerator: correct predictions

for row_label, sample in df.iterrows():
    if row_label > 3000:
        break
    changed_de += 1
    changed_nu += int(predict(sample['sentence']) == sample['emotion'])

print(changed_nu/changed_de)

0.8257133035407357


In [19]:
# Accuracy over the rows from position 130000 to the end of the frame.
subset_df = df.iloc[130000:]

de = 0  # denominator: rows evaluated
nu = 0  # numerator: correct predictions

for row_label, sample in subset_df.iterrows():
    de += 1
    nu += int(predict(sample['sentence']) == sample['emotion'])

print(nu/de)

0.8490265486725663


In [12]:
import pandas as pd

# Load the second dataset, then drop rows with missing values and rows whose
# sentence already appeared earlier.
df = (
    pd.read_csv('data/근우파트.csv', index_col=0)
    .dropna()
    .drop_duplicates(subset='sentence')
)

# Replace each emotion name with its integer label (position in this list).
emotions = {
    name: label
    for label, name in enumerate(['행복', '불안', '놀람', '슬픔', '분노', '중립'])
}
df['emotion'] = df['emotion'].map(emotions)

df.shape

(135709, 2)

In [13]:
# Accuracy over the rows at positions 10001 through 20000.
subset_df = df.iloc[10001:20001]

de = 0  # denominator: rows evaluated
nu = 0  # numerator: correct predictions

for row_label, sample in subset_df.iterrows():
    de += 1
    nu += int(predict(sample['sentence']) == sample['emotion'])

print(nu/de)

0.8432
