In [1]:
import re
import os
from tqdm.auto import tqdm
import pandas as pd
import VocabBuilder
from WordPieceTokenizer import WordPieceTokenizer

# Directory prefix for every dataset file used in this notebook.
# The trailing slash is required: paths are built by plain string
# interpolation (f'{datasetsFilePath}<name>'), not by a path-join helper.
datasetsFilePath = "datasets/"

In [2]:
# Read the raw sentiment corpus; the first CSV column is used as the row index.
df = pd.read_csv(datasetsFilePath + 'sentiment_data.csv', index_col=0)
df.head()

Unnamed: 0,발화,감정,str_len
0,언니 동생으로 부르는게 맞는 일인가요..??,불안,24
1,그냥 내 느낌일뿐겠지?,불안,12
2,아직너무초기라서 그런거죠?,불안,14
3,유치원버스 사고 낫다던데,불안,13
4,근데 원래이런거맞나요,불안,11


# 한글 전처리

In [3]:
def text_preprocess_kor_for_wordpiece(text: str) -> str:
    """Normalize one raw utterance before WordPiece tokenization.

    Steps, in order:
      1. Coerce the input to ``str``.
      2. Delete URLs (``http...`` / ``www. ...``), any whitespace-delimited
         token containing ``@`` (e-mail addresses, mentions) and HTML tags.
      3. Replace every character that is not a complete Hangul syllable
         (가-힣), an ASCII letter or digit, one of ``. , ! ? ' "`` or a space
         with a single space.
      4. Collapse a run of the same punctuation mark into one mark.
      5. Delete all digits.
      6. Collapse whitespace runs into one space and strip both ends.

    Args:
        text: The raw text. Non-string values are converted with ``str()``,
            so a missing value such as NaN becomes the literal ``"nan"``.

    Returns:
        The cleaned text; may be the empty string.
    """
    text = str(text)

    # URLs. The dot after "www" is escaped so that only a literal "www."
    # prefix counts as a URL; an unescaped dot would also delete ordinary
    # tokens such as "wwwwww".
    text = re.sub(r'http\S+|www\.\S+', '', text)
    # Any token containing "@", plus one trailing whitespace character.
    text = re.sub(r'\S*@\S*\s?', '', text)
    # HTML/XML tags.
    text = re.sub(r'<[^>]*>', '', text)

    # Whitelist filter: everything outside the allowed set becomes a space.
    # Standalone Hangul jamo (ㄱ-ㅎ, ㅏ-ㅣ, e.g. "ㅋㅋㅋ") are outside the 가-힣
    # range and are therefore removed here, so no separate jamo-collapsing
    # step is needed afterwards.
    target = r"[^가-힣0-9a-zA-Z.,!?'\" ]"
    text = re.sub(target, repl=" ", string=text)

    # "!!!" -> "!", "??" -> "?"; different adjacent marks ("..??" -> ".?")
    # are each kept once.
    text = re.sub(r'([.,!?"\'])(\1{1,})', r'\1', text)

    # Digits are dropped outright (not replaced by a space).
    text = re.sub(r'\d+', '', text)

    text = re.sub(r"\s+", repl=" ", string=text)

    return text.strip()

In [4]:
# Clean the utterance column (발화, the first data column) in one pass and
# assign the result back as a whole column.
# Writing one cell at a time through df.iloc[i, 0] is very slow over tens of
# thousands of rows and leaves the frame half-processed if the cell is
# interrupted; building the list first avoids both problems.
df["발화"] = [
    text_preprocess_kor_for_wordpiece(raw_text)
    for raw_text in tqdm(df["발화"], desc="preprocess")
]

  0%|          | 0/88110 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Export the 발화 column to a text file; this path is later passed to
# VocabBuilder.GenerateVocab as the training corpus.
# NOTE(review): the exact file layout is decided by VocabBuilder — confirm there.
file = datasetsFilePath + 'sentiment_data.txt'
VocabBuilder.SaveDataFrameTextsTo_txt(df, "발화", file)

# BertWordPieceTokenizer를 사용하여 vocab.txt 생성 후 사용
'datasets/sentiment_vocab/vocab.txt'

In [None]:
# Arguments for VocabBuilder.GenerateVocab, passed positionally in this order:
# corpus files, vocabulary size, minimum frequency, special tokens, output dir.
special_tokens = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
vocab_size = 32000
min_frequency = 5
files = [file]
output_dir = datasetsFilePath + 'sentiment_vocab'

VocabBuilder.GenerateVocab(
    files,
    vocab_size,
    min_frequency,
    special_tokens,
    output_dir,
)


## Pretrain에서 사용한 vocab.txt 사용

In [5]:
# Build the tokenizer from the vocabulary stored under saves/.
# Case and accents are left untouched; only text cleaning is enabled.
vocab_file_path = 'saves/vocab.txt'
tokenizer = WordPieceTokenizer(
    vocab_file_path,
    do_lower_case=False,
    strip_accents=False,
    clean_text=True,
)

In [6]:
max_length = 128


def _ids_to_str(ids):
    """Serialize a sequence of ids as one space-separated string."""
    return " ".join(map(str, ids))


# Encode every utterance and collect the three encoder outputs as
# space-separated strings so they can be stored in CSV cells.
input_ids_ = []
attention_masks = []
token_type_ids_ = []
for text_to_encode in tqdm(df["발화"], desc="encode"):
    encoded_result = tokenizer.encode(text_to_encode, max_length=max_length)
    input_ids_.append(_ids_to_str(encoded_result['input_ids']))
    attention_masks.append(_ids_to_str(encoded_result['attention_mask']))
    token_type_ids_.append(_ids_to_str(encoded_result['token_type_ids']))

# Assign whole columns only after the loop has finished: per-row writes via
# df.iloc[i, 0] are slow, and an interrupted run would leave the 발화 column
# holding a mix of raw text and token ids.
# The 발화 column is deliberately overwritten with the input ids, so this cell
# must not be re-run without reloading df first.
df["발화"] = input_ids_
df['attention_mask'] = attention_masks
df['token_type_ids'] = token_type_ids_
df.head()

  0%|          | 0/88110 [00:00<?, ?it/s]

Unnamed: 0,발화,감정,str_len,attention_mask,token_type_ids
0,2 17637 15450 2000 11814 1086 3628 29262 3715 ...,불안,24,1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
1,2 2916 188 5325 1233 1125 22353 1 3 0 0 0 0 0 ...,불안,12,1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
2,2 2785 1148 1373 1425 1029 6671 2020 1073 1324...,불안,14,1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
3,2 6916 3859 2747 183 1011 15973 3 0 0 0 0 0 0 ...,불안,13,1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...
4,2 5315 4113 1036 1627 1073 1762 3018 3 0 0 0 0...,불안,11,1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 ...


In [7]:
# Persist the encoded frame (row index included) as the training set.
df.to_csv(datasetsFilePath + "sentiment_train.csv")