In [1]:
import pandas as pd
import numpy as np
from konlpy.tag import Okt
from string import punctuation
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle

In [2]:
# Load the tab-separated review file (columns: label, reviews), drop rows whose
# review text is duplicated, and cast the label column to int.
# astype() returns a new object, so the cast has to be part of the assignment;
# calling it as a bare expression leaves `data` unchanged.
data = (
    pd.read_table('ck_data.txt', names=['label', 'reviews'])
    .drop_duplicates(subset=['reviews'])
    .astype({'label': int})
)
# Display the (now actually converted) label column.
data['label']

0        0
1        0
2        1
3        1
4        1
        ..
99995    0
99996    0
99997    0
99998    1
99999    1
Name: label, Length: 99892, dtype: int32

In [3]:
# Stop-word set: Korean particles and filler words to discard after
# morphological analysis.
st = {
    '은', '는', '이', '가', '을', '를', '도', '에게', '로', '에', '요', '의',
    '우리', '저희', '나', '님', '게임', '너무', '그냥', '좀', '더', '정말', '다',
    '그리고', '하지만', '그래서',
}
# Every character of string.punctuation is treated as a stop word as well.
p = list(punctuation)
st |= set(p)

In [4]:
# Clean the review text:
#   1. remove every character that is not a Hangul jamo, a Hangul syllable or a space,
#   2. strip leading spaces,
#   3. turn reviews that became empty into NaN so they can be dropped.
# regex=True is passed explicitly: both patterns are regular expressions, and
# pandas >= 2.0 treats the pattern as a literal string by default, which would
# silently skip the cleaning.
reviews_clean = (
    data['reviews']
    .str.replace('[^ㄱ-ㅎ ㅏ-ㅣ 가-힣]', '', regex=True)
    .str.replace('^ +', '', regex=True)
    .replace('', np.nan)
)
# Drop every row that contains a missing value in any column.
data = data.assign(reviews=reviews_clean).dropna(how='any', axis=0)

  data['reviews'] = data['reviews'].str.replace('[^ㄱ-ㅎ ㅏ-ㅣ 가-힣]', '')
  data['reviews'] = data['reviews'].str.replace('^ +', '')


In [5]:
# Split every cleaned review into morphemes with the Okt analyser.
okt = Okt()
okt_l = [okt.morphs(review) for review in data['reviews']]

In [6]:
# Remove stop words from each tokenised review, keeping token order.
s_otk_l = [
    [token for token in tokens if token not in st]
    for tokens in okt_l
]
s_otk_l

[['노래', '적음'],
 ['돌겠네', '진짜', '황숙', '아', '어크', '공장', '그만', '돌려라', '죽는다'],
 ['막노동', '체험판', '막노동', '하는', '사람', '인데', '장비', '내', '사야', '돼', '뭐', '지'],
 ['차악차악차악', '이래서', '왕국', '되찾을', '수', '있는거야'],
 ['시간', '때우기에', '좋음', '도전', '과제', '시간', '이면', '깰', '수', '있어요'],
 ['역시', '재미있네요', '전작', '에서', '할수', '없었던', '자유로운', '덱', '빌딩', '좋네요'],
 ['재미있었습니다'],
 ['은근', '쉽지만', '은근', '어려운'],
 ['베',
  'ㅈ',
  '스',
  '개',
  'ㅐ',
  '끼',
  '들',
  '아',
  '시작',
  '할',
  '때',
  '체스판',
  '돌아가는거',
  '분동',
  '안',
  '번',
  '봤',
  'ㅈㄴ',
  '빡치네',
  '진짜',
  '무한',
  '로딩',
  '버그',
  '안',
  '쳐',
  '고치냐',
  '겜',
  '하지말라는',
  '거',
  '냐'],
 ['시간', '분동', '안', '스트레스', '풀림', 'ㄹㅇ'],
 ['걍', '겜임', '계정', '못', '만들어', '미친', '겜'],
 ['관람객', '호랑이', '불가'],
 ['재미', '쩡', '초반', '힘들어여', '도트', '라서', '조아'],
 ['포켓볼',
  '몰랐는데',
  '걸',
  '배워',
  '갑니다',
  '심심할',
  '때',
  '하면',
  '좋아요',
  '컴퓨터',
  '상대',
  '하는거',
  '제대로',
  '이겨',
  '보고',
  '싶은데',
  '잘',
  '안되네요'],
 ['트레이아크', '사랑', '해요', '핰핰'],
 ['좆망', '겜', '하지마', '무슨', '처음', '키', '자마자', '매칭', '왜', '렙', '하

In [7]:
# Features are the stop-word-filtered token lists; targets are the labels as an ndarray.
X, Y = s_otk_l, np.array(data['label'])

In [8]:
# Two-stage split with a fixed seed:
#   stage 1 holds out 30% of all samples (tt_x / tt_y),
#   stage 2 holds out 20% of the remainder (v_x / v_y); the rest is t_x / t_y.
x_data, tt_x, y_data, tt_y = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=22,
)
t_x, v_x, t_y, v_y = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=22,
)

In [9]:
# Fit a tokenizer with no vocabulary limit on t_x.
tk = Tokenizer()
tk.fit_on_texts(texts=t_x)

In [10]:
# Vocabulary size: number of words counted more than 4 times by the fitted
# tokenizer, plus one. Counting needs no ordering, so the counts are scanned
# once instead of sorting and materialising all (word, count) pairs.
# NOTE(review): the +1 presumably accounts for Keras Tokenizer keeping only
# indices below num_words (index 0 is reserved) — confirm against the Keras docs.
word_num = sum(1 for count in tk.word_counts.values() if count > 4) + 1

In [11]:
# Rebuild the tokenizer with the vocabulary capped at word_num and fit it on t_x again.
tk = Tokenizer(num_words=word_num)
tk.fit_on_texts(texts=t_x)

In [12]:
# Encode the three token-list splits (t_x, v_x, tt_x) as integer sequences
# using the fitted tokenizer, in that order.
t_t_x, t_v_x, t_tt_x = (
    tk.texts_to_sequences(split) for split in (t_x, v_x, tt_x)
)

In [13]:
# Positions of samples whose encoded sequence is empty, per split.
drop_t = [pos for pos, seq in enumerate(t_t_x) if not seq]
drop_v = [pos for pos, seq in enumerate(t_v_x) if not seq]
drop_tt = [pos for pos, seq in enumerate(t_tt_x) if not seq]

In [14]:
def _without_positions(seqs, drop_idx):
    """Return `seqs` as a list with the elements at positions `drop_idx` removed.

    The sequences have different lengths, so passing them to np.delete forces
    NumPy to build a ragged array: deprecated in older releases and a
    ValueError from NumPy 1.24 on. Filtering in plain Python avoids that.
    """
    skip = set(drop_idx)
    return [seq for pos, seq in enumerate(seqs) if pos not in skip]


# Remove the empty sequences and the labels at the same positions, so that
# every split keeps sequences and labels aligned. The label arrays are regular
# 1-D ndarrays, so np.delete is still fine for them.
t_t_x = _without_positions(t_t_x, drop_t)
t_y = np.delete(t_y, drop_t, axis=0)
t_v_x = _without_positions(t_v_x, drop_v)
v_y = np.delete(v_y, drop_v, axis=0)
t_tt_x = _without_positions(t_tt_x, drop_tt)
tt_y = np.delete(tt_y, drop_tt, axis=0)

  return array(a, dtype, copy=False, order=order)


In [15]:
# Common padded length: the longest encoded sequence across all three splits.
# Taken directly from the sequence lengths rather than by padding every split
# a second time just to read the width of one row; the value is the same.
p_l = max(len(seq) for split in (t_t_x, t_v_x, t_tt_x) for seq in split)

# Pad every split to that common length.
t_x = pad_sequences(t_t_x, maxlen=p_l)
v_x = pad_sequences(t_v_x, maxlen=p_l)
tt_x = pad_sequences(t_tt_x, maxlen=p_l)

In [16]:
# Bundle everything into one dict: padded splits with their labels, the fitted
# tokenizer, the vocabulary size, the padded sequence length and the stop-word set.
preprocessed_data = {
    'train_x': t_x,
    'train_y': t_y,
    'test_x': tt_x,
    'test_y': tt_y,
    'validation_x': v_x,
    'validation_y': v_y,
    'token': tk,
    'word_num': word_num,
    'sentence_len': p_l,
    'stop_words': st,
}

In [17]:
# Serialise the bundled preprocessing results to disk in binary mode.
with open('preprocessed_data.pickle', mode='wb') as fw:
    pickle.dump(preprocessed_data, fw)