# Text Preprocessing - 텍스트 전처리

해결하고자 하는 문제의 용도에 맞게 텍스트를 사전에 처리하는 작업

In [1]:
import nltk   # Package for natural language processing
from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence
nltk.download('punkt')    # Punkt: a kind of model that has learned sentence structure

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# 토큰화 (Tokenization)

어떤 문장을 단어로 잘라내서 정제하고, 정규화를 시키는 과정

- 구두점(Punctuation)
  * 마침표, 쉼표, 물음표, 느낌표, 세미콜론, ...
  

In [6]:
# Sample English sentence used to compare the tokenizers.
# NOTE: "I`ll" is written with a backtick (`) rather than an apostrophe ('),
# and "Do not" is spelled out, so this text contains no apostrophe contractions.
text = 'No pain No gain. You scratch my back and I`ll scratch yours. Do not count your chickens before they hatch'

In [7]:
# Run the same text through three word tokenizers and print each result,
# separated by a blank line.
tokenizers = (
    # NLTK word_tokenize: splits contractions,
    # e.g. Don't => Do + n't / ain't => ai + n't / you're => you + 're
    word_tokenize,
    # WordPunctTokenizer: emits punctuation marks as separate tokens
    WordPunctTokenizer().tokenize,
    # Keras text_to_word_sequence: lowercases every letter,
    # removes punctuation,
    # and keeps forms such as you're, don't, ain't intact
    text_to_word_sequence,
)
for position, tokenize in enumerate(tokenizers):
    if position:
        print()
    print(tokenize(text))

['No', 'pain', 'No', 'gain', '.', 'You', 'scratch', 'my', 'back', 'and', 'I', '`', 'll', 'scratch', 'yours', '.', 'Do', 'not', 'count', 'your', 'chickens', 'before', 'they', 'hatch']

['No', 'pain', 'No', 'gain', '.', 'You', 'scratch', 'my', 'back', 'and', 'I', '`', 'll', 'scratch', 'yours', '.', 'Do', 'not', 'count', 'your', 'chickens', 'before', 'they', 'hatch']

['no', 'pain', 'no', 'gain', 'you', 'scratch', 'my', 'back', 'and', 'i', 'll', 'scratch', 'yours', 'do', 'not', 'count', 'your', 'chickens', 'before', 'they', 'hatch']


# 문장 토큰화 (Sentence Tokenization)

In [9]:
# Multi-sentence English passage used as input for sentence tokenization.
sentence = """But here is an artist. He desires to paint you the dreamiest, shadiest, quietest, most enchanting bit of romantic landscape in all the valley of the Saco. What is the chief element he employs? There stand his trees, each with a hollow trunk, as if a hermit and a crucifix were within; and here sleeps his meadow, and there sleep his cattle; and up from yonder cottage goes a sleepy smoke. Deep into distant woodlands winds a mazy way, reaching to overlapping spurs of mountains bathed in their hill-side blue. But though the picture lies thus tranced, and though this pine-tree shakes down its sighs like leaves upon this shepherd’s head, yet all were vain, unless the shepherd’s eye were fixed upon the magic stream before him."""
# Bare name as the last expression: the notebook displays the value.
sentence

'But here is an artist. He desires to paint you the dreamiest, shadiest, quietest, most enchanting bit of romantic landscape in all the valley of the Saco. What is the chief element he employs? There stand his trees, each with a hollow trunk, as if a hermit and a crucifix were within; and here sleeps his meadow, and there sleep his cattle; and up from yonder cottage goes a sleepy smoke. Deep into distant woodlands winds a mazy way, reaching to overlapping spurs of mountains bathed in their hill-side blue. But though the picture lies thus tranced, and though this pine-tree shakes down its sighs like leaves upon this shepherd’s head, yet all were vain, unless the shepherd’s eye were fixed upon the magic stream before him.'

In [10]:
from nltk.tokenize import sent_tokenize

In [11]:
# Split the passage into a list of sentences.
sent_tokenize(sentence)
# NLTK does not simply split sentences on every period.
# Words such as "Dr.", "Mrs." and "Mr." are not split at their period => works well!
# (The sample passage itself contains no such abbreviations.)

['But here is an artist.',
 'He desires to paint you the dreamiest, shadiest, quietest, most enchanting bit of romantic landscape in all the valley of the Saco.',
 'What is the chief element he employs?',
 'There stand his trees, each with a hollow trunk, as if a hermit and a crucifix were within; and here sleeps his meadow, and there sleep his cattle; and up from yonder cottage goes a sleepy smoke.',
 'Deep into distant woodlands winds a mazy way, reaching to overlapping spurs of mountains bathed in their hill-side blue.',
 'But though the picture lies thus tranced, and though this pine-tree shakes down its sighs like leaves upon this shepherd’s head, yet all were vain, unless the shepherd’s eye were fixed upon the magic stream before him.']

In [12]:
# KSS (Korean Sentence Splitter)
!pip install kss5

Collecting kss
  Downloading kss-6.0.4.tar.gz (1.1 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.1 MB[0m [31m5.6 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.1/1.1 MB[0m [31m18.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting emoji==1.2.0 (from kss)
  Downloading emoji-1.2.0-py3-none-any.whl.metadata (4.3 kB)
Collecting pecab (from kss)
  Downloading pecab-1.0.8.tar.gz (26.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.4/26.4 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting jamo (from kss)
  Downloading ja

In [13]:
import kss

In [14]:
# Korean sample text made of three sentences, used as input for the KSS sentence splitter.
kor = "오늘부터 AI 시작이에요. 텍스트 전처리는 한국어가 영어보다 훨씬 난이도가 높아요. 한번 경험해봅시다."
# Bare name as the last expression: the notebook displays the value.
kor

'오늘부터 AI 시작이에요. 텍스트 전처리는 한국어가 영어보다 훨씬 난이도가 높아요. 한번 경험해봅시다.'

In [15]:
# Split the Korean text into sentences with KSS and print the resulting list.
print(kss.split_sentences(kor))




['오늘부터 AI 시작이에요.', '텍스트 전처리는 한국어가 영어보다 훨씬 난이도가 높아요.', '한번 경험해봅시다.']


# 한국어 = 교착어(어근 + 접사)

한국어에는 [조사]가 존재

- 글자 뒤에 띄어쓰기 없이 존재
- 형태소 (morpheme)
  - 말의 가장 작은 단위
    - 자립형태소 : 명사, 대명사, 수사, 관형사, 부사, ...
    - 의존형태소 : 다른 형태소와 결합을 해야만하는... 어간, 어미, 접사, 조사, ...

# 품사 태깅(Part-of-speech tagging) : 단어 토큰화를 거친 토큰들(단어들)에게 품사를 붙여주는 작업

동음이의어

mean : 동사] 의미하다 / 명사] 평균 / 형용사] 비열한, 못된

연패 : 연속해서 패하다 / 연속해서 이기다

# NLTK / KoNLPy

In [16]:
nltk.download('averaged_perceptron_tagger') # Tagger resource needed for POS tagging

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [17]:
from nltk.tag import pos_tag

In [19]:
# Tokenize an English sample (with real apostrophe contractions) into words,
# then attach a part-of-speech tag to every token.
text = (
    "No pain No gain. "
    "You scratch my back and I'll scratch yours. "
    "Don't count your chickens before they hatch"
)
tokenized_sentence = word_tokenize(text)
print(tokenized_sentence)

# pos_tag returns (token, tag) pairs for the token list.
tagged_sentence = pos_tag(tokenized_sentence)
print(tagged_sentence)

['No', 'pain', 'No', 'gain', '.', 'You', 'scratch', 'my', 'back', 'and', 'I', "'ll", 'scratch', 'yours', '.', 'Do', "n't", 'count', 'your', 'chickens', 'before', 'they', 'hatch']
[('No', 'DT'), ('pain', 'NN'), ('No', 'RB'), ('gain', 'NN'), ('.', '.'), ('You', 'PRP'), ('scratch', 'VBP'), ('my', 'PRP$'), ('back', 'NN'), ('and', 'CC'), ('I', 'PRP'), ("'ll", 'MD'), ('scratch', 'VB'), ('yours', 'NNS'), ('.', '.'), ('Do', 'VBP'), ("n't", 'RB'), ('count', 'VB'), ('your', 'PRP$'), ('chickens', 'NNS'), ('before', 'IN'), ('they', 'PRP'), ('hatch', 'VBP')]


# PRP : 인칭대명사
# RB : 부사
# DT : 한정사 (관사 등)
# VBP : 동사 현재형 (3인칭 단수가 아닌 경우) / VBZ : 동사 현재형 (3인칭 단수)
# W ~ : wh~
# JJ : 형용사
# NN : 단수명사
# NNS : 복수명사
# MD : 조동사
# VB : 동사 기본형
# VBD : 동사 과거시제
# VBG : 동명사 또는 현재분사

# 한국어 자연어처리 : KoNLPY라는 파이썬 패키지

KoNLPy에서 사용할 수 있는 형태소 분석기
- Okt(Open Korean Text)
- Komoran
- kkma(꼬꼬마)
- Mecab
- Hannanum

In [20]:
# Install KoNLPy, the Python package for Korean NLP whose taggers are imported in the next cell.
!pip install konlpy

Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading JPype1-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (488 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m488.6/488.6 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.0 konlpy-0.6.0


In [21]:
from konlpy.tag import Okt
from konlpy.tag import Kkma

In [22]:
okt = Okt()

# Analyze one Korean sample sentence three ways with the Okt analyzer.
sample = "오늘은 화요일이고요. 내일은 수요일입니다!"
analyzers = (
    # morphs: morphological analysis - breaks each word unit down into
    # morphemes, the smallest units of meaning
    okt.morphs,
    # pos: part-of-speech tagging
    okt.pos,
    # nouns: noun extraction
    okt.nouns,
)
for analyze in analyzers:
    print(analyze(sample))

['오늘', '은', '화요일', '이', '고요', '.', '내일', '은', '수요일', '입니다', '!']
[('오늘', 'Noun'), ('은', 'Josa'), ('화요일', 'Noun'), ('이', 'Josa'), ('고요', 'Noun'), ('.', 'Punctuation'), ('내일', 'Noun'), ('은', 'Josa'), ('수요일', 'Noun'), ('입니다', 'Adjective'), ('!', 'Punctuation')]
['오늘', '화요일', '고요', '내일', '수요일']
