### 맨 밑으로 가시면 한 번에 전처리를 할 수 있도록 해놨습니다.


(처음 실행시에는, 모든 cell을 실행시켜야 합니다)

[자동화 처리](#자동화-처리)

### 라이브러리 import

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt
import re

import nltk

import contractions

from typing import List, Optional, Union, Callable
from nltk.stem import PorterStemmer, SnowballStemmer, LancasterStemmer, WordNetLemmatizer

### DataFrame Import

In [2]:
# Load the raw dataset. Later cells read its "sentence" column plus the
# "trust", "control mutuality", "commitment" and "satisfaction" columns.
data = pd.read_csv("data.csv")

## text cleaning function

Remove punctuations

Convert text to tokens

Remove tokens of length less than or equal to 3

Remove stopwords using NLTK corpus stopwords list to match

Apply stemming

Apply lemmatization

Convert words to feature vectors

### Convert text to lowercase

In [3]:
def lower_case_convertion(input_text: str) -> str:
    """Return the result of calling ``.lower()`` on ``input_text``.

    Works for a plain string; any other object that exposes a compatible
    ``lower()`` method is handled the same way, because the call is simply
    delegated to it.
    """
    lowered = input_text.lower()
    return lowered

### Remove URLs

In [4]:
import re

url_pattern = r'https?://\S+|www\.\S+'


def remove_urls(input_text):
    """Replace every URL in each element of ``input_text`` with one space.

    A URL is anything matched by the module-level ``url_pattern``
    (``http://`` / ``https://`` links, or text starting with ``www.``).

    The container is modified in place through integer indexing and the
    same object is returned.
    """
    url_regex = re.compile(url_pattern)
    position = 0
    while position < len(input_text):
        input_text[position] = url_regex.sub(' ', input_text[position])
        position += 1
    return input_text

### Remove numbers

In [5]:
def remove_number(input_text: List[str]) -> List[str]:
    """Delete every run of digits from each element of ``input_text``.

    The previous ``str -> str`` annotation was misleading: the body assigns
    by index, which a ``str`` does not support. The function really expects
    a mutable, integer-indexable container of texts.

    Args:
        input_text: List of texts. Elements are passed through ``str()``
            first, so non-string elements are accepted and end up as
            strings. Other mutable containers that support integer indexing
            presumably work too (e.g. a pandas Series with a default
            index) - verify against the caller.

    Returns:
        The same ``input_text`` object, modified in place.
    """
    for i in range(len(input_text)):
        input_text[i] = re.sub(r'\d+', '', str(input_text[i]))
    return input_text

### Remove whitespaces

In [6]:
def remove_whitespaces(input_text: List[str]) -> List[str]:
    """Strip leading and trailing whitespace from each element of ``input_text``.

    The previous ``str -> str`` annotation was misleading: the body assigns
    by index, which a ``str`` does not support. The function really expects
    a mutable, integer-indexable container of strings.

    Args:
        input_text: List of strings. Other mutable containers that support
            integer indexing presumably work too (e.g. a pandas Series with
            a default index) - verify against the caller.

    Returns:
        The same ``input_text`` object, modified in place. Whitespace inside
        a text is left untouched.
    """
    for i in range(len(input_text)):
        input_text[i] = input_text[i].strip()
    return input_text

### 악센트 텍스트 ASCII 변환기로 구현

In [7]:
# Implementation of accented text to ASCII converter in python

import unidecode

def accented_to_ascii(input_text):
    """Run ``unidecode.unidecode`` over every element of ``input_text``.

    Each element is replaced, in place, by its transliteration. The same
    container object is returned.
    """
    position = 0
    total = len(input_text)
    while position < total:
        # Transliterate accented characters to their ASCII counterparts.
        input_text[position] = unidecode.unidecode(input_text[position])
        position += 1
    return input_text

### Converting chat abbreviations (slang) to normal words

In [8]:
# Load the chat-slang dictionary used by short_to_original.
# Each non-empty line of slang.txt is expected to look like
# "ABBREVIATION=expanded form" (cf. the example "omg => oh my god").
# The file is read with the platform default encoding, as before.
# The `with` block closes the handle; `short_form_list` stays bound to the
# (now closed) file object.
with open('slang.txt', 'r') as short_form_list:
    chat_words_str = short_form_list.read()

chat_words_map_dict = {}
for line in chat_words_str.split("\n"):
    # partition() splits on the first "=" only, so an "=" inside the
    # expanded form is preserved.
    cw, separator, cw_expanded = line.partition("=")
    cw = cw.strip()
    cw_expanded = cw_expanded.strip()
    if not separator or not cw:
        # Blank line, or a line with no "abbreviation=" part: nothing to map.
        continue
    # Fix: the expansion is the text AFTER "=". The old code took index 0
    # for both sides, so every abbreviation was mapped to itself.
    chat_words_map_dict[cw] = cw_expanded

# Set of known abbreviations for fast membership tests.
chat_words_list = set(chat_words_map_dict)

In [9]:
# ex) omg => oh my god

def short_to_original(input_text):
    """Swap chat abbreviations in each element for their dictionary entry.

    Every element is split on whitespace. A word whose upper-cased form is
    in ``chat_words_list`` is replaced by ``chat_words_map_dict[WORD]``; all
    other words are kept. The words are re-joined with single spaces and
    written back in place. The same container object is returned.
    """
    for idx in range(len(input_text)):
        words = input_text[idx].split()
        input_text[idx] = " ".join(
            chat_words_map_dict[word.upper()]
            if word.upper() in chat_words_list
            else word
            for word in words
        )
    return input_text

### Expanding Contractions

In [10]:
from contractions import contractions_dict

def expand_contractions(text, contraction_mapping=contractions_dict):
    """Expand contractions in ``text`` and drop any remaining apostrophes.

    Changes from the previous version:

    * Keys are regex-escaped and tried longest-first, so a short key can no
      longer shadow a longer one that starts with it, and characters such
      as "." in a key are matched literally.
    * Lookups fall back to a lower-cased copy of the mapping. Previously a
      match whose exact and lower-cased spelling were both absent from the
      mapping raised inside a bare ``except:``, and the whole text came
      back unexpanded.
    * The bare ``except:`` is gone; the one case it legitimately covered
      (non-string input) is handled explicitly.

    Args:
        text: The text to process. Non-string values are returned unchanged,
            matching the old best-effort behaviour.
        contraction_mapping: Mapping from contraction to expansion. Defaults
            to ``contractions_dict``.

    Returns:
        ``text`` with each contraction replaced by its expansion and all
        remaining ``'`` characters removed. The first character of each
        match is kept as written, so its capitalisation is preserved.
    """
    if not isinstance(text, str):
        return text

    # Longest keys first: regex alternation takes the first alternative that
    # matches, not the longest. Empty keys would match everywhere, so skip.
    ordered_keys = sorted(
        (key for key in contraction_mapping if key), key=len, reverse=True)
    lowered_mapping = {
        key.lower(): value for key, value in contraction_mapping.items()
    }

    def expand_match(contraction):
        match = contraction.group(0)
        expanded_contraction = (
            contraction_mapping.get(match)
            or lowered_mapping.get(match.lower())
        )
        if not expanded_contraction:
            # No usable expansion: leave this occurrence as it is.
            return match
        # Keep the first character exactly as the author typed it.
        return match[0] + expanded_contraction[1:]

    expanded_text = text
    if ordered_keys:
        contractions_pattern = re.compile(
            '({})'.format('|'.join(re.escape(key) for key in ordered_keys)),
            flags=re.IGNORECASE | re.DOTALL)
        expanded_text = contractions_pattern.sub(expand_match, text)
    return expanded_text.replace("'", "")

In [11]:
def expand_contractions_function(input_text):
    """Apply ``expand_contractions`` to every element of ``input_text``.

    Elements are overwritten in place and the same container object is
    returned.
    """
    cursor = 0
    while cursor < len(input_text):
        input_text[cursor] = expand_contractions(input_text[cursor])
        cursor += 1
    return input_text

### NER(named entity recognition), 개체명 인식

In [12]:
# New York -> New-York

import spacy

import en_core_web_sm
nlp = spacy.load('en_core_web_sm')  # load the small ("sm") English spaCy pipeline; used by convert_NER

def convert_NER(input_text):
    """Join multi-word named entities with hyphens, e.g. "New York" -> "New-York".

    For each element, ``nlp(...).ents`` is computed on the text as it was
    before any replacement. Every entity whose text contains more than one
    whitespace-separated word then has all of its occurrences replaced by
    the same text with spaces turned into "-". Elements are updated in
    place and the same container object is returned.
    """
    for idx in range(len(input_text)):
        sentence = input_text[idx]
        # (surface form, hyphenated form) for entities made of 2+ words.
        replacements = [
            (ent.text, str(ent.text).replace(' ', '-'))
            for ent in nlp(sentence).ents
            if len(str(ent.text).split()) > 1
        ]
        for surface, hyphenated in replacements:
            sentence = sentence.replace(surface, hyphenated)
        input_text[idx] = sentence
    return input_text



### Remove stop words

In [13]:
from nltk.tokenize import word_tokenize

def remove_stopword(input_text, stop_words):
    """Tokenize each element and drop the tokens found in ``stop_words``.

    Returns a new list with one entry per element of ``input_text``. Each
    entry is the list of ``word_tokenize`` tokens that are not in
    ``stop_words``. The result holds token lists, not strings. The input
    container is not modified.
    """
    blocked = set(stop_words)
    return [
        [token for token in word_tokenize(input_text[pos]) if token not in blocked]
        for pos in range(len(input_text))
    ]

### Stemming

Stemming using PorterStemming from nltk library

In [14]:
# Implementation of Stemming using PorterStemming from nltk library

from nltk.stem import PorterStemmer

def porter_stemmer(input_text, stemmer):
    """Stem every token of every element and store the result as one string.

    Each element may be a raw string, which is tokenized with
    ``word_tokenize``, or an iterable of tokens such as the lists returned
    by ``remove_stopword``.

    The previous version told the two cases apart with a bare ``except:``
    around ``word_tokenize``. That also swallowed real tokenizer errors,
    and it stemmed token lists in place, modifying the caller's inner
    lists. The type is now checked explicitly and token lists are never
    mutated.

    Args:
        input_text: Mutable, integer-indexable container of strings or
            token lists.
        stemmer: Object exposing ``stem(word) -> str``.

    Returns:
        The same ``input_text`` object, with every element replaced by its
        stemmed tokens joined with single spaces.
    """
    for i in range(len(input_text)):
        item = input_text[i]
        # Strings need tokenizing; anything else is treated as tokens.
        tokens = word_tokenize(item) if isinstance(item, str) else item
        input_text[i] = ' '.join(stemmer.stem(token) for token in tokens)
    return input_text

### lemmatization

lemmatization using nltk

In [15]:
## Implementation of lemmatization using nltk

from nltk.stem import WordNetLemmatizer

def lemmatization(input_text, lemma):
    """Lemmatize every token of every element and store the result as one string.

    Each element may be a raw string, which is tokenized with
    ``word_tokenize``, or an iterable of tokens such as the lists returned
    by ``remove_stopword``.

    The previous version told the two cases apart with a bare ``except:``
    around ``word_tokenize``. That also swallowed real tokenizer errors,
    and it lemmatized token lists in place, modifying the caller's inner
    lists. The type is now checked explicitly and token lists are never
    mutated.

    Args:
        input_text: Mutable, integer-indexable container of strings or
            token lists.
        lemma: Object exposing ``lemmatize(word) -> str``.

    Returns:
        The same ``input_text`` object, with every element replaced by its
        lemmatized tokens joined with single spaces.
    """
    for i in range(len(input_text)):
        item = input_text[i]
        # Strings need tokenizing; anything else is treated as tokens.
        tokens = word_tokenize(item) if isinstance(item, str) else item
        input_text[i] = ' '.join(lemma.lemmatize(token) for token in tokens)
    return input_text

## initialize lemmatizer object
# lemma = WordNetLemmatizer()
# lemma_result = lemmatization(data)

## CSV 파일로 저장

첫번째 자리 - "개체명 인식" 처리를 했는지(0: 안했음, 1: 했음)

두번째 자리 - "줄임 기호 늘리기" 처리를 했는지(0: 안했음, 1: 했음)

세번째 자리 - "줄임말 늘리기" 처리를 했는지(0: 안했음, 1: 했음)

-

네번째 자리 - "stopword 제거" 처리를 했는지(0: 안했음, 1: 했음)

-

다섯번째 자리 - "Stemming" 처리를 했는지(0: 안했음, 1: 했음)

여섯번째 자리 - "lemmatization" 처리를 했는지(0: 안했음, 1: 했음)


ex) 

1. 000-0-00

-> 아무런 전처리 과정이 일어나지 않았다는 것

2. 111-1-00

-> 개체명 인식 + 줄임 기호 늘리기 + 줄임말 늘리기 O, stopword 처리 O, stemming + lemmatization 처리 X

In [16]:
# Shared NLTK instances; data_preprocessing passes them to porter_stemmer
# and lemmatization respectively.
stemmer = PorterStemmer()
lemma = WordNetLemmatizer()

# data_preprocessing runs the full text-cleaning pipeline on one text column.
# It takes seven parameters: the text column itself, five 0/1 (False/True)
# switches, and the stop-word collection to remove.

def data_preprocessing(input_text, is_NER, is_expand, is_sortToOriginal, m_stopword, is_stem, is_lem):
    """Clean ``input_text`` with the mandatory steps plus the selected optional ones.

    Args:
        input_text: Object with a ``.str`` accessor (the caller passes a
            DataFrame column). ``.str.lower()`` produces the working copy
            that the later steps modify.
        is_NER: Truthy to hyphenate multi-word named entities.
        is_expand: Truthy to expand contractions (e.g. don't -> do not).
        is_sortToOriginal: Truthy to expand chat abbreviations
            (e.g. ASAP -> As Soon As Possible).
        m_stopword: Collection of stop words to drop. Tokenization and
            stop-word filtering always run, even when this is empty.
        is_stem: Truthy to apply stemming with the module-level ``stemmer``.
        is_lem: Truthy to apply lemmatization with the module-level ``lemma``.

    Returns:
        The processed texts. ``remove_stopword`` yields a list of token
        lists. When stemming or lemmatization runs, each entry becomes a
        single space-joined string.
    """
    # ---------------------------- always applied ----------------------------
    cleaned = lower_case_convertion(input_text.str)
    cleaned = remove_urls(cleaned)
    cleaned = remove_whitespaces(cleaned)
    # Transliterate accented characters to ASCII.
    cleaned = accented_to_ascii(cleaned)

    # ---------------------------- optional ----------------------------
    # Named-entity recognition.
    # NOTE(review): this runs on already lower-cased text, which may reduce
    # how many entities the model recognises - confirm this is intended.
    if is_NER:
        cleaned = convert_NER(cleaned)

    # Expand contractions, e.g. don't -> do not.
    if is_expand:
        cleaned = expand_contractions_function(cleaned)

    # Expand abbreviations, e.g. ASAP -> As Soon As Possible.
    if is_sortToOriginal:
        cleaned = short_to_original(cleaned)

    # Tokenize and drop stop words (always executed).
    cleaned = remove_stopword(cleaned, m_stopword)

    # Stemming: reduce each word to its stem to simplify morphological
    # analysis, e.g. am -> am, the going -> the go, having -> hav.
    if is_stem:
        cleaned = porter_stemmer(cleaned, stemmer)

    # Lemmatization: map each word to its dictionary base form,
    # e.g. am -> be, the going -> the going, having -> have.
    if is_lem:
        cleaned = lemmatization(cleaned, lemma)

    return cleaned

### 전처리 처리한 dataframe을 csv로 저장

In [17]:
## Parameter grids consumed by the nested for-loops in the automation cell
## below.

## Switches for the text-expansion steps.
# 0 = step not applied (False)
# 1 = step applied (True)
is_NER = [0, 1]
is_expand = [0, 1]
is_sortToOriginal = [0, 1]

# Stop-word options.
# index 0 = no stop-word removal (empty list)
# index 1 = stop-word removal using the "word" column of stopword.txt
no_stopword = []
stopword = pd.read_csv("./stopword.txt", encoding='cp949')['word']
stop_words = [no_stopword, stopword]

## Switches for the normalisation steps.
# 0 = step not applied (False)
# 1 = step applied (True)
is_stem = [0, 1]  # stemming
is_lem = [0, 1]  # lemmatization

### 자동화 처리

초기에 전처리의 경우의 수는 2x2x2x2x2x2 = 64 개였습니다.

그러나 너무 많은 경우의 수로,

000-0-00
000-0-11
000-1-00
000-1-11
111-0-00
111-0-11
111-1-00
111-1-11

총 8가지의 경우만 확인해보기로 했습니다. (X-X-X의 form이 아닌, XXX-X-XX의 form으로 남겨둔 이유는 확장성 때문에 혹시나 남겨뒀습니다.)

In [18]:
# Build one preprocessed DataFrame per selected combination and print its
# label. A single flag drives all three expansion steps (NER, contractions,
# abbreviations) and a single flag drives both stemming and lemmatization.
for ner in is_NER:
    for stopwordIndex in range(len(stop_words)):
        for stem in is_stem:
            df = pd.DataFrame({
                'sentence': data_preprocessing(
                    data['sentence'], ner, ner, ner,
                    stop_words[stopwordIndex], stem, stem),
                'trust': data['trust'],
                'control mutuality': data['control mutuality'],
                'commitment': data['commitment'],
                'satisfaction': data['satisfaction'],
            })

            # Label layout: XXX-X-XX (expansion flags - stop words - stem/lemma).
            label = f"{ner}{ner}{ner}-{stopwordIndex}-{stem}{stem}"
            print(label)
            # Saving is currently disabled; uncomment to write one CSV per label.
            #df.to_csv('textPreprocessingDataSet/text_preprocessing('+label+').csv',index = False)

000-0-00
000-0-11
000-1-00
000-1-11
111-0-00
111-0-11
111-1-00
111-1-11
