In [147]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re

In [148]:
class Preprocessor_eng:
    """Clean English text and convert it into padded integer sequences.

    Intended call order: construct with a DataFrame, call ``preprocessor`` to
    build the ``clean_X`` column, then call ``data_classification`` to get the
    train / validation / test arrays.

    Attributes:
        data: Working DataFrame. Rows containing nulls and rows whose text
            column repeats an earlier row are removed on construction.
        stop_words: Iterable of words that ``preprocessor`` may filter out.
    """

    def __init__(self, data, x_data, stop_words):
        """Store the stop words and a de-nulled, de-duplicated copy of *data*.

        Args:
            data: Source DataFrame. It is not modified.
            x_data: Name of the text column used as model input; duplicates
                are detected on this column only.
            stop_words: Iterable of stop words.
        """
        self.stop_words = stop_words
        # dropna / drop_duplicates both return new frames, so the caller's
        # DataFrame is left untouched.
        self.data = data.dropna(how='any').drop_duplicates(subset=[x_data])

    def print_data(self):
        """Print the working DataFrame (debugging aid)."""
        print(self.data)

    @staticmethod
    def _clean_text(text, stops=None):
        """Strip markup and non-letters from *text*, lowercase it, and drop
        every word contained in *stops* (no filtering when *stops* is None)."""
        text = BeautifulSoup(text, 'lxml').get_text()
        words = re.sub(r"[^a-zA-Z]", " ", text).lower().split()
        if stops is not None:
            words = [w for w in words if w not in stops]
        return ' '.join(words)

    @staticmethod
    def _normalise_clean_column(cleaned):
        """Remove leftover non-alphanumerics and leading spaces from a string
        Series and turn empty strings into NaN."""
        # regex=True is required: since pandas 2.0 str.replace treats the
        # pattern as a literal string by default.
        cleaned = cleaned.str.replace(r"[^a-zA-Z0-9 ]", "", regex=True)
        cleaned = cleaned.str.replace(r"^ +", "", regex=True)
        return cleaned.mask(cleaned == "", np.nan)

    @staticmethod
    def _drop_empty_sequences(sequences, labels):
        """Return (*sequences* without empty entries, matching rows of *labels*)."""
        # The sequences are ragged, so they must stay a list of lists:
        # converting them to an array raises on NumPy >= 1.24.
        keep = np.array([len(seq) > 0 for seq in sequences], dtype=bool)
        kept = [seq for seq in sequences if len(seq) > 0]
        return kept, np.asarray(labels)[keep]

    def preprocessor(self, x_data, remove_stopwords=True):
        """Build the ``clean_X`` column from the raw text column.

        Rows whose cleaned text is empty are removed from ``self.data``
        (together with any other row that contains a null).

        Args:
            x_data: Name of the text column to clean.
            remove_stopwords: When true, words in ``self.stop_words`` are
                removed from the cleaned text.
        """
        # Build the lookup set once instead of once per row.
        stops = set(self.stop_words) if remove_stopwords else None
        cleaned = self.data[x_data].apply(lambda text: self._clean_text(text, stops))
        cleaned = self._normalise_clean_column(cleaned)
        # assign() returns a new frame; this avoids chained in-place edits,
        # which are silently ignored under pandas Copy-on-Write.
        self.data = self.data.assign(clean_X=cleaned).dropna(how='any')

    def data_classification(self, y_data_column, isstr=False, isbin=True,
                            test_size=0.3, val_size=0.2, min_word_count=5,
                            random_state=0):
        """Split, tokenise and pad the cleaned text; return model-ready arrays.

        Args:
            y_data_column: Name of the target column.
            isstr: True when the targets are strings; they are label-encoded
                and the codes are stored in an ``encoder_y`` column.
            isbin: True for binary classification (targets returned as a flat
                array); False to one-hot encode them with ``to_categorical``.
            test_size: Fraction of all rows held out as the test set.
            val_size: Fraction of the remaining rows held out for validation.
            min_word_count: A word must occur at least this many times in the
                training texts to be counted towards the vocabulary size.
            random_state: Seed passed to both ``train_test_split`` calls.

        Returns:
            Tuple ``(train_inputs, val_inputs, test_inputs, train_outputs,
            val_outputs, test_outputs, n)`` where ``n`` is the ``num_words``
            value given to the tokenizer.

        Raises:
            KeyError: If ``preprocessor`` has not created ``clean_X`` yet.
            ValueError: If no training sample survives tokenisation.
        """
        if 'clean_X' not in self.data.columns:
            raise KeyError("'clean_X' column not found; call preprocessor() first")

        labels = self.data[y_data_column]
        if isstr:
            self.data = self.data.assign(
                encoder_y=LabelEncoder().fit_transform(labels))
            labels = self.data['encoder_y']
        Y = np.array(labels) if isbin else to_categorical(labels)
        X = self.data['clean_X']

        x_rest, tt_x, y_rest, tt_y = train_test_split(
            X, Y, test_size=test_size, random_state=random_state)
        t_x, v_x, t_y, v_y = train_test_split(
            x_rest, y_rest, test_size=val_size, random_state=random_state)

        # First pass only counts word frequencies on the training texts.
        counter = Tokenizer()
        counter.fit_on_texts(t_x)
        frequent = sum(1 for count in counter.word_counts.values()
                       if count >= min_word_count)
        # +1 because Keras keeps only the num_words - 1 most common words.
        n = frequent + 1

        token = Tokenizer(num_words=n)
        token.fit_on_texts(t_x)

        # Texts made up solely of out-of-vocabulary words tokenise to [] and
        # are dropped together with their labels.
        token_t_x, t_y = self._drop_empty_sequences(token.texts_to_sequences(t_x), t_y)
        token_tt_x, tt_y = self._drop_empty_sequences(token.texts_to_sequences(tt_x), tt_y)
        token_v_x, v_y = self._drop_empty_sequences(token.texts_to_sequences(v_x), v_y)

        if not token_t_x:
            raise ValueError("no training samples left after tokenisation")

        # Every split is padded to the longest training sequence.
        w_l = max(len(seq) for seq in token_t_x)
        train_inputs = pad_sequences(token_t_x, maxlen=w_l)
        test_inputs = pad_sequences(token_tt_x, maxlen=w_l)
        val_inputs = pad_sequences(token_v_x, maxlen=w_l)

        return train_inputs, val_inputs, test_inputs, t_y, v_y, tt_y, n