# Text_Preprocessing

## Tokenization

In [11]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import WordPunctTokenizer
from torchtext.vocab import vocab

In [6]:
# Sample sentence with a contraction ("Don't"), an abbreviation ("Mr.") and a
# possessive ("Jone's"), used to compare the tokenizers in the following cells.
text = "Don't be fooled by the dark sounding name, Mr. Jone's Orphanage is a cheery as cheery goes for a pastry shop"

In [7]:
# word_tokenize splits the contraction into "Do" + "n't" and the possessive into
# "Jone" + "'s", while keeping "Mr." as a single token (see the recorded output).
word_tokenize(text)

['Do',
 "n't",
 'be',
 'fooled',
 'by',
 'the',
 'dark',
 'sounding',
 'name',
 ',',
 'Mr.',
 'Jone',
 "'s",
 'Orphanage',
 'is',
 'a',
 'cheery',
 'as',
 'cheery',
 'goes',
 'for',
 'a',
 'pastry',
 'shop']

In [9]:
# WordPunctTokenizer separates punctuation into its own tokens:
# "Don't" -> "Don", "'", "t" and "Mr." -> "Mr", "." (see the recorded output).
WordPunctTokenizer().tokenize(text)

['Don',
 "'",
 't',
 'be',
 'fooled',
 'by',
 'the',
 'dark',
 'sounding',
 'name',
 ',',
 'Mr',
 '.',
 'Jone',
 "'",
 's',
 'Orphanage',
 'is',
 'a',
 'cheery',
 'as',
 'cheery',
 'goes',
 'for',
 'a',
 'pastry',
 'shop']

In [15]:
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()

# In the recorded output the hyphenated word stays whole ("home-based"),
# "doesn't" becomes "does" + "n't", and only the final period is split off:
# the mid-text "ideal." keeps its trailing period.
text = "Starting a home-based restaurant may be an ideal. it doesn't have a food chain or restaurant of their own."
print(tokenizer.tokenize(text))

['Starting', 'a', 'home-based', 'restaurant', 'may', 'be', 'an', 'ideal.', 'it', 'does', "n't", 'have', 'a', 'food', 'chain', 'or', 'restaurant', 'of', 'their', 'own', '.']


In [17]:
from nltk.tokenize import sent_tokenize

# Sentence-level tokenization: the paragraph is returned as a list with one
# string per sentence (five sentences in the recorded output).
text = "His barber kept his word. But keeping such a huge secret to himself was driving him crazy. Finally, the barber went up a mountain and almost to the edge of a cliff. He dug a hole in the midst of some reeds. He looked about, to make sure no one was near."
print(sent_tokenize(text))

['His barber kept his word.', 'But keeping such a huge secret to himself was driving him crazy.', 'Finally, the barber went up a mountain and almost to the edge of a cliff.', 'He dug a hole in the midst of some reeds.', 'He looked about, to make sure no one was near.']


In [18]:
# The periods inside "Ph.D." are not treated as sentence boundaries here:
# the recorded output contains exactly two sentences.
text = "I am actively looking for Ph.D. students. and you are a Ph.D student."
print(sent_tokenize(text))

['I am actively looking for Ph.D. students.', 'and you are a Ph.D student.']


In [22]:
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

# Tokenize the text, then tag every token with its part of speech.
# Note that the input contains the misspelling "activly", which is tagged
# as JJ in the recorded output.
text = "I am activly looking for Ph.D. students. and you are a Ph.D. student"
tokenized_sentence = word_tokenize(text)
# The cell displays a tuple: (token list, list of (token, tag) pairs).
tokenized_sentence, pos_tag(tokenized_sentence)

(['I',
  'am',
  'activly',
  'looking',
  'for',
  'Ph.D.',
  'students',
  '.',
  'and',
  'you',
  'are',
  'a',
  'Ph.D.',
  'student'],
 [('I', 'PRP'),
  ('am', 'VBP'),
  ('activly', 'JJ'),
  ('looking', 'VBG'),
  ('for', 'IN'),
  ('Ph.D.', 'NNP'),
  ('students', 'NNS'),
  ('.', '.'),
  ('and', 'CC'),
  ('you', 'PRP'),
  ('are', 'VBP'),
  ('a', 'DT'),
  ('Ph.D.', 'NNP'),
  ('student', 'NN')])

In [4]:
from konlpy.tag import Okt
from konlpy.tag import Kkma

# Two KoNLPy morphological analyzers are compared on the same Korean sentence.
okt = Okt()
kkma = Kkma()

text = "열심히 코딩한 당신, 연휴에는 여행을 가봐요"
# Okt: morpheme segmentation, (morpheme, tag) pairs, and noun extraction.
print(f"OKT 형태소 분석 {okt.morphs(text)}")
print(f"OKT 품사 태깅 {okt.pos(text)}")
print(f"OKT 명사 추출 {okt.nouns(text)}")

# Visual separator between the two analyzers' results.
print("-" * 50)

# Kkma: same three operations. In the recorded output it segments more finely
# than Okt (e.g. "한" -> "하" + "ㄴ") and uses a different tag set.
print(f"꼬꼬마 형태소 분석 {kkma.morphs(text)}")
print(f"꼬꼬마 품사 태깅 {kkma.pos(text)}")
print(f"꼬꼬마 명사 추출 {kkma.nouns(text)}")

OKT 형태소 분석 ['열심히', '코딩', '한', '당신', ',', '연휴', '에는', '여행', '을', '가봐요']
OKT 품사 태깅 [('열심히', 'Adverb'), ('코딩', 'Noun'), ('한', 'Josa'), ('당신', 'Noun'), (',', 'Punctuation'), ('연휴', 'Noun'), ('에는', 'Josa'), ('여행', 'Noun'), ('을', 'Josa'), ('가봐요', 'Verb')]
OKT 명사 추출 ['코딩', '당신', '연휴', '여행']
--------------------------------------------------
꼬꼬마 형태소 분석 ['열심히', '코딩', '하', 'ㄴ', '당신', ',', '연휴', '에', '는', '여행', '을', '가보', '아요']
꼬꼬마 품사 태깅 [('열심히', 'MAG'), ('코딩', 'NNG'), ('하', 'XSV'), ('ㄴ', 'ETD'), ('당신', 'NP'), (',', 'SP'), ('연휴', 'NNG'), ('에', 'JKM'), ('는', 'JX'), ('여행', 'NNG'), ('을', 'JKO'), ('가보', 'VV'), ('아요', 'EFN')]
꼬꼬마 명사 추출 ['코딩', '당신', '연휴', '여행']


## Cleaning & Normalization

### Lemmatization (표제어 추출) - Normalization

In [6]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

words = ['policy', 'doing', 'organization', 'have', 'going', 'love', 'lives', 'fly', 'dies', 'watched', 'has', 'starting']

# Case 1: WordNetLemmatizer is NOT told the part of speech of each word.
# Without that hint some results in the recorded output are wrong,
# e.g. 'dies' -> 'dy' and 'has' -> 'ha'.

print(f"Before abstracting lemmatizer : {words}")
print(f"After abstracting lemmatizer : {[lemmatizer.lemmatize(word) for word in words]}")

Before abstracting lemmatizer : ['policy', 'doing', 'organization', 'have', 'going', 'love', 'lives', 'fly', 'dies', 'watched', 'has', 'starting']
After abstracting lemmatizer : ['policy', 'doing', 'organization', 'have', 'going', 'love', 'life', 'fly', 'dy', 'watched', 'ha', 'starting']


In [7]:
# Case 2: WordNetLemmatizer IS told the part of speech.
# Passing 'v' (verb) as the second argument yields the expected lemmas
# ('die', 'watch', 'have') for the words that were mangled without it.

print(lemmatizer.lemmatize('dies', 'v'))
print(lemmatizer.lemmatize('watched', 'v'))
print(lemmatizer.lemmatize('has', 'v'))

die
watch
have


### Stemming (어간 추출) - Normalization

In [9]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

stemmer = PorterStemmer()

sentence = "This was not the map we found in Billy Bones's chest, but an accurate copy, complete in all things--names and heights and soundings--with the single exception of the red crosses and the written notes."
tokenized_sentence = word_tokenize(sentence)

# Stem every token. As the recorded output shows, stems are lowercased and are
# not necessarily dictionary words (e.g. 'This' -> 'thi', 'Billy' -> 'billi').
print(f"Before abstracting Stem : {tokenized_sentence}")
print(f"After abstracting Stem : {[stemmer.stem(word) for word in tokenized_sentence]}")

Before abstracting Stem : ['This', 'was', 'not', 'the', 'map', 'we', 'found', 'in', 'Billy', 'Bones', "'s", 'chest', ',', 'but', 'an', 'accurate', 'copy', ',', 'complete', 'in', 'all', 'things', '--', 'names', 'and', 'heights', 'and', 'soundings', '--', 'with', 'the', 'single', 'exception', 'of', 'the', 'red', 'crosses', 'and', 'the', 'written', 'notes', '.']
After abstracting Stem : ['thi', 'wa', 'not', 'the', 'map', 'we', 'found', 'in', 'billi', 'bone', "'s", 'chest', ',', 'but', 'an', 'accur', 'copi', ',', 'complet', 'in', 'all', 'thing', '--', 'name', 'and', 'height', 'and', 'sound', '--', 'with', 'the', 'singl', 'except', 'of', 'the', 'red', 'cross', 'and', 'the', 'written', 'note', '.']


In [11]:
# Suffix stripping on isolated words; 'electricical' is a non-word that still
# reduces to 'electric' in the recorded output.
words = ['formalize', 'allowance', 'electricical']

print(f'Before abstracting Stem : {words}')
print(f'After abstracting Stem : {[stemmer.stem(word) for word in words]}')

Before abstracting Stem : ['formalize', 'allowance', 'electricical']
After abstracting Stem : ['formal', 'allow', 'electric']


### Removing Stopword (불용어) - Cleaning

In [14]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from konlpy.tag import Okt

In [17]:
# Inspect NLTK's English stop-word list: its size and the first ten entries.
# All entries shown in the recorded output are lowercase.
stop_word_list = stopwords.words('english')
print(f"Count of Stopword : {len(stop_word_list)}")
print(f"10 Stopwords : {stop_word_list[:10]}")

Count of Stopword : 179
10 Stopwords : ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [20]:
example = "Family is not an important thing. It's everything."
# A set gives constant-time membership checks for the filter below.
stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(example)

# Keep only the tokens that are not stop words. The comparison is
# case-sensitive, so a capitalized token such as "It" is kept.
result = [token for token in word_tokens if token not in stop_words]

print(f"Before Removing Stopword : {word_tokens}")
print(f"After Removing Stopword : {result}")

Before Removing Stopword : ['Family', 'is', 'not', 'an', 'important', 'thing', '.', 'It', "'s", 'everything', '.']
After Removing Stopword : ['Family', 'important', 'thing', '.', 'It', "'s", 'everything', '.']


### Integer Encoding

In [21]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Toy corpus for the integer-encoding walkthrough. It mixes casing
# ("Secret" / "secret", "Huge" / "huge"); the preprocessing cell lowercases
# every token before counting.
raw_text = "A barber is a person. a barber is good person. a barber is huge person. he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. a barber kept his word. His barber kept his secret. But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain."
raw_text

'A barber is a person. a barber is good person. a barber is huge person. he Knew A Secret! The Secret He Kept is huge secret. Huge secret. His barber kept his word. a barber kept his word. His barber kept his secret. But keeping and keeping such a huge secret to himself was driving the barber crazy. the barber went up a huge mountain.'

In [22]:
# Split the corpus into sentences (eleven in the recorded output).
sentences = sent_tokenize(raw_text)
sentences

['A barber is a person.',
 'a barber is good person.',
 'a barber is huge person.',
 'he Knew A Secret!',
 'The Secret He Kept is huge secret.',
 'Huge secret.',
 'His barber kept his word.',
 'a barber kept his word.',
 'His barber kept his secret.',
 'But keeping and keeping such a huge secret to himself was driving the barber crazy.',
 'the barber went up a huge mountain.']

In [26]:
# Word -> frequency table, filled while the sentences are cleaned.
vocab = {}
# One list of kept (lowercased) tokens per input sentence.
preprocessed_sentences = []
stop_words = set(stopwords.words('english'))

for sentence in sentences:
    tokenized_sentence = word_tokenize(sentence)
    result = []

    for token in tokenized_sentence:
        word = token.lower()
        # Drop stop words and tokens of two characters or fewer.
        if word in stop_words or len(word) <= 2:
            continue
        result.append(word)
        vocab[word] = vocab.get(word, 0) + 1

    preprocessed_sentences.append(result)
print(preprocessed_sentences)

[['barber', 'person'], ['barber', 'good', 'person'], ['barber', 'huge', 'person'], ['knew', 'secret'], ['secret', 'kept', 'huge', 'secret'], ['huge', 'secret'], ['barber', 'kept', 'word'], ['barber', 'kept', 'word'], ['barber', 'kept', 'secret'], ['keeping', 'keeping', 'huge', 'secret', 'driving', 'barber', 'crazy'], ['barber', 'went', 'huge', 'mountain']]


In [27]:
# Show the word -> frequency mapping, in first-seen order of the words.
print(f"Word Zip : {vocab}")

Word Zip : {'barber': 8, 'person': 3, 'good': 1, 'huge': 5, 'knew': 1, 'secret': 6, 'kept': 4, 'word': 2, 'keeping': 2, 'driving': 1, 'crazy': 1, 'went': 1, 'mountain': 1}


In [28]:
# Order the (word, count) pairs from most to least frequent. The sort is
# stable, so words with equal counts keep their first-seen order.
vocab_sorted = sorted(vocab.items(), key=lambda pair: -pair[1])
print(vocab_sorted)

[('barber', 8), ('secret', 6), ('huge', 5), ('kept', 4), ('person', 3), ('word', 2), ('keeping', 2), ('good', 1), ('knew', 1), ('driving', 1), ('crazy', 1), ('went', 1), ('mountain', 1)]


### Padding

In [30]:
import numpy as np
from torchtext.data import get_tokenizer 

In [34]:
# torchtext's 'basic_english' tokenizer: in the recorded output every token is
# lowercased and punctuation ('.', '!') is emitted as separate tokens.
tokenizer = get_tokenizer('basic_english')
tokenizer(raw_text)

['a',
 'barber',
 'is',
 'a',
 'person',
 '.',
 'a',
 'barber',
 'is',
 'good',
 'person',
 '.',
 'a',
 'barber',
 'is',
 'huge',
 'person',
 '.',
 'he',
 'knew',
 'a',
 'secret',
 '!',
 'the',
 'secret',
 'he',
 'kept',
 'is',
 'huge',
 'secret',
 '.',
 'huge',
 'secret',
 '.',
 'his',
 'barber',
 'kept',
 'his',
 'word',
 '.',
 'a',
 'barber',
 'kept',
 'his',
 'word',
 '.',
 'his',
 'barber',
 'kept',
 'his',
 'secret',
 '.',
 'but',
 'keeping',
 'and',
 'keeping',
 'such',
 'a',
 'huge',
 'secret',
 'to',
 'himself',
 'was',
 'driving',
 'the',
 'barber',
 'crazy',
 '.',
 'the',
 'barber',
 'went',
 'up',
 'a',
 'huge',
 'mountain',
 '.']

## One-Hot Encoding

In [35]:
from konlpy.tag import Okt

# Segment a Korean sentence into morphemes; these tokens form the vocabulary
# for the one-hot encoding example.
okt = Okt()
tokens = okt.morphs("나는 자연어 처리를 배운다")
print(tokens)

['나', '는', '자연어', '처리', '를', '배운다']


In [36]:
# Map each distinct token to a contiguous index (0, 1, 2, ...).
# dict.fromkeys() drops repeated tokens while keeping first-occurrence order.
# Enumerating the raw token list instead would leave gaps in the indices
# whenever a token repeats, so the largest index could exceed
# len(word_to_index) - 1 and one_hot_encoding() would raise IndexError.
word_to_index = {word: index for index, word in enumerate(dict.fromkeys(tokens))}
print(f"단어 집합 : {word_to_index}")

단어 집합 : {'나': 0, '는': 1, '자연어': 2, '처리': 3, '를': 4, '배운다': 5}


In [41]:
def one_hot_encoding(word, word_to_index):
    """Return the one-hot vector for *word*.

    The vector is a list of ints with one slot per entry of *word_to_index*;
    the slot at ``word_to_index[word]`` is 1 and all others are 0.

    Raises:
        KeyError: if *word* is not a key of *word_to_index*.
    """
    position = word_to_index[word]
    vector = [0] * len(word_to_index)
    vector[position] = 1
    return vector

In [42]:
# "자연어" has index 2 in word_to_index, so only position 2 of the vector is 1.
one_hot_encoding("자연어", word_to_index)

[0, 0, 1, 0, 0, 0]

#### One-Hot Encoding의 한계 <br> 1. 단어의 개수가 증가할수록 요구되는 벡터 차원 수도 증가하여 메모리 측면에서 비효율적 <br> 2. 단어 간 유사도 표현 불가

#### One-Hot Encoding의 한계 해결 -> LSA, HAL (Count Based) | NNLM, RNNLM, Word2Vec, FastText (Prediction Based)  