## Preprocessing

In [1]:
import pandas as pd
import nltk
from bs4 import BeautifulSoup
import string
from nltk.corpus import stopwords

### 1. Define the sample text (an HTML-wrapped string) used throughout the notebook

In [2]:
# Sample document for the whole notebook: an HTML-wrapped heading followed by four
# plain-text lines. It contains tags, punctuation and mixed case for the cleaning steps.
str_data = """<html><h2>What is nlp??? </h2></html>
Natural Language Processing, or NLP for short, is broadly defined as the automatic manipulation of natural language, like speech and text, by software.
The study of natural language processing has been around for more than 50 years and grew out of the field of linguistics with the rise of computers.
(In this post), you will discover what natural language processing is and why it is so important.
After reading this post, you will know => What natural language is and how it is different from other types of data."""
str_data

'<html><h2>What is nlp??? </h2></html>\nNatural Language Processing, or NLP for short, is broadly defined as the automatic manipulation of natural language, like speech and text, by software.\nThe study of natural language processing has been around for more than 50 years and grew out of the field of linguistics with the rise of computers.\n(In this post), you will discover what natural language processing is and why it is so important.\nAfter reading this post, you will know => What natural language is and how it is different from other types of data.'

### 2-1. Cleaning - Remove HTML

In [10]:
def remove_html(text_data):
    """
    Strip HTML markup from ``text_data`` and return only its text content.

    The input is parsed with BeautifulSoup using the "lxml" parser, and the
    text of the parsed document is returned.
    """
    return BeautifulSoup(text_data, "lxml").get_text()

# Strip the HTML tags from the sample text and show the result.
processed_text = remove_html(str_data)
print(processed_text)

What is nlp??? 
Natural Language Processing, or NLP for short, is broadly defined as the automatic manipulation of natural language, like speech and text, by software.
The study of natural language processing has been around for more than 50 years and grew out of the field of linguistics with the rise of computers.
(In this post), you will discover what natural language processing is and why it is so important.
After reading this post, you will know => What natural language is and how it is different from other types of data.


### 2-2. Cleaning - Remove punctuation(구두점) & Lower case

In [11]:
## Show the punctuation characters listed in the standard library's `string.punctuation`
print('Punctuation: ', string.punctuation)

Punctuation:  !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [21]:
def remove_punctuation(text):
    """
    Return ``text`` with every character listed in ``string.punctuation`` deleted.

    All other characters, including spaces and newlines, are kept unchanged, so a
    word made up only of punctuation leaves its surrounding spaces behind.
    """
    strip_table = str.maketrans("", "", string.punctuation)
    return text.translate(strip_table)

In [23]:
# Remove punctuation from the HTML-free text and show the result.
rmv_punc_sentence = remove_punctuation(processed_text)
print(rmv_punc_sentence)

What is nlp 
Natural Language Processing or NLP for short is broadly defined as the automatic manipulation of natural language like speech and text by software
The study of natural language processing has been around for more than 50 years and grew out of the field of linguistics with the rise of computers
In this post you will discover what natural language processing is and why it is so important
After reading this post you will know  What natural language is and how it is different from other types of data


In [24]:
# Convert the punctuation-free text to lower case and show the result.
lowwer_sentence = rmv_punc_sentence.lower()
print(lowwer_sentence)

what is nlp 
natural language processing or nlp for short is broadly defined as the automatic manipulation of natural language like speech and text by software
the study of natural language processing has been around for more than 50 years and grew out of the field of linguistics with the rise of computers
in this post you will discover what natural language processing is and why it is so important
after reading this post you will know  what natural language is and how it is different from other types of data


### 3. Lemmatization & Tokenization with the spaCy library

In [39]:
## Use the spaCy library for tokenization and lemmatization
import spacy

## Load the installed pipeline "en_core_web_sm" and bind it to "nlp"
## (the model package has to be installed before this cell is run)
nlp = spacy.load('en_core_web_sm')

In [44]:
## 'doc' is a sequence of Token objects;
## it holds all information about the tokens, their linguistic features and their relationships.
## Leading and trailing whitespace is stripped from the text before it is passed to the pipeline.
doc = nlp(lowwer_sentence.strip())

In [None]:
# Exercise: build `tok_lem_sentence`, the list of lemmas of the tokens in `doc`.
#
# The list is created here so that the preview below works on a fresh
# "Restart & Run All"; previously this cell read `tok_lem_sentence` before any
# cell had defined it and raised a NameError.
tok_lem_sentence = [token.lemma_ for token in doc]
tok_lem_sentence[:15]

In [45]:
# Collect the lemma of every token in `doc`, then preview the first 15 entries.
tok_lem_sentence = [token.lemma_ for token in doc]
tok_lem_sentence[:15]

['what',
 'be',
 'nlp',
 '\n',
 'natural',
 'language',
 'processing',
 'or',
 'nlp',
 'for',
 'short',
 'be',
 'broadly',
 'define',
 'as']

### 4. Remove stop words(불용어: 큰 의미가 없는 단어)

In [29]:
# Fetch the NLTK 'stopwords' corpus; run this if the corpus is not yet available locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /Users/woo-
[nltk_data]     hyun/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [31]:
from nltk.corpus import stopwords

# Preview the first ten English stop words, then report how many there are in total.
print(stopwords.words('english')[:10])
print(len(stopwords.words('english')))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an']
198


In [46]:
from nltk.corpus import stopwords

# Use a set so that the membership tests below are fast.
stop_words = set(stopwords.words('english'))

print(tok_lem_sentence, '\n')
# Keep only the lemmas that are not English stop words.
rmv_sw_sentence = [w for w in tok_lem_sentence if w not in stop_words]
print(rmv_sw_sentence)
# The removed words are exactly the lemmas that appear in the stop-word set.
# (The earlier version tested `word in rmv_punc_sentence`, which is a substring
# search inside the punctuation-free *string*, so it reported unrelated words.)
removed_word = [word for word in tok_lem_sentence if word in stop_words]

print("\nRemoved word: ", set(removed_word))

['what', 'be', 'nlp', '\n', 'natural', 'language', 'processing', 'or', 'nlp', 'for', 'short', 'be', 'broadly', 'define', 'as', 'the', 'automatic', 'manipulation', 'of', 'natural', 'language', 'like', 'speech', 'and', 'text', 'by', 'software', '\n', 'the', 'study', 'of', 'natural', 'language', 'processing', 'have', 'be', 'around', 'for', 'more', 'than', '50', 'year', 'and', 'grow', 'out', 'of', 'the', 'field', 'of', 'linguistic', 'with', 'the', 'rise', 'of', 'computer', '\n', 'in', 'this', 'post', 'you', 'will', 'discover', 'what', 'natural', 'language', 'processing', 'be', 'and', 'why', 'it', 'be', 'so', 'important', '\n', 'after', 'read', 'this', 'post', 'you', 'will', 'know', ' ', 'what', 'natural', 'language', 'be', 'and', 'how', 'it', 'be', 'different', 'from', 'other', 'type', 'of', 'datum'] 

['nlp', '\n', 'natural', 'language', 'processing', 'nlp', 'short', 'broadly', 'define', 'automatic', 'manipulation', 'natural', 'language', 'like', 'speech', 'text', 'software', '\n', 'study

### 5. Make dictionary

In [51]:
import numpy as np

# Module-level frequency table, filled in place by make_frequency_dict().
dictionary = {}

def make_frequency_dict(text):
    """
    Count the items of ``text`` into the module-level ``dictionary``.

    ``text`` is iterated once and each item's count in ``dictionary`` is raised
    by one (unseen items start from zero). Nothing is returned; counts accumulate
    across calls because ``dictionary`` is shared.
    """
    for token in text:
        dictionary[token] = dictionary.get(token, 0) + 1

# Count the stop-word-free lemmas into `dictionary`.
make_frequency_dict(rmv_sw_sentence)

In [52]:
# Number of distinct tokens that were counted.
len(dictionary)

32

In [53]:
# Display the token -> count mapping.
dictionary

{'nlp': 2,
 '\n': 4,
 'natural': 5,
 'language': 5,
 'processing': 3,
 'short': 1,
 'broadly': 1,
 'define': 1,
 'automatic': 1,
 'manipulation': 1,
 'like': 1,
 'speech': 1,
 'text': 1,
 'software': 1,
 'study': 1,
 'around': 1,
 '50': 1,
 'year': 1,
 'grow': 1,
 'field': 1,
 'linguistic': 1,
 'rise': 1,
 'computer': 1,
 'post': 2,
 'discover': 1,
 'important': 1,
 'read': 1,
 'know': 1,
 ' ': 1,
 'different': 1,
 'type': 1,
 'datum': 1}

In [54]:
# Order the (word, count) pairs from most to least frequent. sorted() is stable,
# so words with equal counts keep the order they have in `dictionary`.
vocab_sorted = sorted(dictionary.items(), key=lambda pair: pair[1], reverse=True)
vocab_sorted

[('natural', 5),
 ('language', 5),
 ('\n', 4),
 ('processing', 3),
 ('nlp', 2),
 ('post', 2),
 ('short', 1),
 ('broadly', 1),
 ('define', 1),
 ('automatic', 1),
 ('manipulation', 1),
 ('like', 1),
 ('speech', 1),
 ('text', 1),
 ('software', 1),
 ('study', 1),
 ('around', 1),
 ('50', 1),
 ('year', 1),
 ('grow', 1),
 ('field', 1),
 ('linguistic', 1),
 ('rise', 1),
 ('computer', 1),
 ('discover', 1),
 ('important', 1),
 ('read', 1),
 ('know', 1),
 (' ', 1),
 ('different', 1),
 ('type', 1),
 ('datum', 1)]

In [55]:
# Give every word that occurs more than once an integer index, in order of
# decreasing frequency. Indices start at 1; words seen only once get no entry.
word_to_index = {}
i = 0

for word, frequency in vocab_sorted:
    if frequency <= 1:
        continue
    i += 1
    word_to_index[word] = i

print(word_to_index)

{'natural': 1, 'language': 2, '\n': 3, 'processing': 4, 'nlp': 5, 'post': 6}


In [56]:
# Reserve the next free index for out-of-vocabulary words. setdefault() keeps the
# existing value when this cell is re-run; a plain assignment would recompute the
# index from a dict that already contains 'OOV' and shift it by one.
word_to_index.setdefault('OOV', len(word_to_index) + 1)

print(word_to_index)

{'natural': 1, 'language': 2, '\n': 3, 'processing': 4, 'nlp': 5, 'post': 6, 'OOV': 7}


### 6. Encoding

In [60]:
print(rmv_sw_sentence)

# Replace each token by its integer index; tokens without an entry in
# `word_to_index` are mapped to the index stored under 'OOV'.
encoded = []
for w in rmv_sw_sentence:
    if w in word_to_index:
        encoded.append(word_to_index[w])
    else:
        encoded.append(word_to_index['OOV'])

print(encoded)

['nlp', '\n', 'natural', 'language', 'processing', 'nlp', 'short', 'broadly', 'define', 'automatic', 'manipulation', 'natural', 'language', 'like', 'speech', 'text', 'software', '\n', 'study', 'natural', 'language', 'processing', 'around', '50', 'year', 'grow', 'field', 'linguistic', 'rise', 'computer', '\n', 'post', 'discover', 'natural', 'language', 'processing', 'important', '\n', 'read', 'post', 'know', ' ', 'natural', 'language', 'different', 'type', 'datum']
[5, 3, 1, 2, 4, 5, 7, 7, 7, 7, 7, 1, 2, 7, 7, 7, 7, 3, 7, 1, 2, 4, 7, 7, 7, 7, 7, 7, 7, 7, 3, 6, 7, 1, 2, 4, 7, 3, 7, 6, 7, 7, 1, 2, 7, 7, 7]


## THE END 🌟