In [None]:
import pandas as pd

# Load the tab-separated train and test splits into DataFrames.
train_data = pd.read_table('/path/train.txt', sep='\t')
test_data = pd.read_table('/path/test.txt', sep='\t')

# Preview the first five rows of the training frame.
train_data.head(n=5)

In [None]:
# Row counts of the train and test frames as loaded.
print(train_data.shape[0], test_data.shape[0])

In [None]:
# Drop every row that has a null in any column, then print whether any
# null is still present in the resulting frame.
train_data = train_data.dropna(axis=0, how='any')
print(train_data.isna().values.any())
test_data = test_data.dropna(axis=0, how='any')
print(test_data.isna().values.any())

In [None]:
# Row counts of the train and test frames after the null rows were dropped.
print(train_data.shape[0], test_data.shape[0])

In [None]:
# Remove rows whose `document` value repeats an earlier row, keeping the
# first occurrence; both frames are modified in place.
train_data.drop_duplicates(subset='document', keep='first', inplace=True)
test_data.drop_duplicates(subset='document', keep='first', inplace=True)

In [None]:
# Row counts of the train and test frames after duplicate removal.
print(train_data.shape[0], test_data.shape[0])

In [None]:
import functools
import json
import re
from collections.abc import Iterable

from konlpy.tag import Okt

# Synonym handling: parse the thesaurus JSON file into `thesaurus`.
with open('/content/thesaurus_dic.json', encoding='utf-8') as thesaurus_file:
    thesaurus = json.loads(thesaurus_file.read())

# Prepare the text content and the labels of the train and test data.
train_texts, train_labels = train_data["document"], train_data["label"]
test_texts, test_labels = test_data["document"], test_data["label"]

# Used so that replaced words do not end up wrapped in [] (nested lists).
def flatten(lis):
  """Lazily yield the leaf items of an arbitrarily nested iterable.

  Strings are treated as leaves and are yielded whole rather than being
  split into characters; every other iterable is descended into
  recursively.
  """
  for element in lis:
    is_leaf = isinstance(element, str) or not isinstance(element, Iterable)
    if is_leaf:
      yield element
    else:
      yield from flatten(element)

# Tokenizer and stop-word removal.
_STOP_WORDS_PATH = '/content/stop_words.txt'
_WHITESPACE_RE = re.compile(r'\s+')
# Matches anything other than ASCII letters, digits, and the 가-힣 range.
_SPECIAL_CHAR_RE = re.compile('[^A-Za-z0-9가-힣]')


@functools.lru_cache(maxsize=None)
def _get_okt():
  """Return one shared Okt tokenizer, constructed on first use."""
  return Okt()


@functools.lru_cache(maxsize=None)
def _load_stopwords():
  """Read the stop-word file once and return its stripped lines as a frozenset.

  The result is cached for the life of the process; call
  ``_load_stopwords.cache_clear()`` after editing the file to reload it.
  """
  with open(_STOP_WORDS_PATH, 'r', encoding='utf-8') as f:
    return frozenset(word.strip() for word in f)


def preprocess_stop_words(text):
  """Clean *text*, tokenize it, apply the thesaurus, and drop stop words.

  Steps, in order: strip and collapse runs of whitespace; replace every
  character outside ``A-Za-z0-9가-힣`` with a space; tokenize with
  ``Okt.morphs``; map each token through the module-level ``thesaurus``
  (a list value is joined with spaces, a missing key leaves the token
  unchanged); discard entries found in the stop-word file.

  Args:
    text: The raw string to preprocess.

  Returns:
    The remaining tokens joined by single spaces.

  Raises:
    FileNotFoundError: If the stop-word file does not exist.
  """
  okt = _get_okt()
  text = _WHITESPACE_RE.sub(' ', text.strip())

  # Delete using the stop-word dictionary txt file (loaded once, then cached).
  custom_stopwords = _load_stopwords()
  text = _SPECIAL_CHAR_RE.sub(' ', text)  # remove special characters

  kept_words = []
  for token in okt.morphs(text):
    # Apply the thesaurus; a multi-word replacement arrives as a list.
    replacement = thesaurus.get(token, token)
    if isinstance(replacement, list):
      replacement = ' '.join(replacement)
    # Stop words are matched against the replaced form, not the raw token.
    if replacement not in custom_stopwords:
      kept_words.append(replacement)

  return ' '.join(kept_words)

# Preprocess every document; the label series are passed through unchanged.
X_train = list(map(preprocess_stop_words, train_texts))
X_test = list(map(preprocess_stop_words, test_texts))
Y_train, Y_test = train_labels, test_labels