In [7]:
# In any machine learning task, cleaning or preprocessing the data is as important as model building if not more. And when it comes to unstructured data like text, this process is even more important.

# Objective of this code is to understand the various text preprocessing steps with examples.

# Some of the common text preprocessing / cleaning steps are:

# * Sentence Tokenizer
# * Word Tokenizer
# * Porter Stemming 
# * Snowball Stemming
# * Lemmatization
# * Removal of Punctuations
# * Removal of Stopwords
# * Removal of Frequent words
# * Removal of Rare words
# * Bag Of Words (Word Embeddings)

# So these are the different types of text preprocessing steps which we can do on text data. But we need not do all of these all the times. We need to carefully choose the preprocessing steps based on our use case since that also play an important role.

# For example, in sentiment analysis use case, we need not remove the emojis or emoticons as it will convey some important information about the sentiment. Similarly we need to decide based on our use cases.

In [4]:
#Sentence Tokenizer
import nltk
input_str = "OpenAI is an artificial intelligence (AI) research laboratory consisting of the for-profit corporation OpenAI LP and its parent company, the non-profit OpenAI Inc. The company, considered a competitor to DeepMind, conducts research in the field of AI with the stated goal of promoting and developing friendly AI in a way that benefits humanity as a whole."
from nltk.tokenize import sent_tokenize
tokens = sent_tokenize(input_str)
print(tokens)

['OpenAI is an artificial intelligence (AI) research laboratory consisting of the for-profit corporation OpenAI LP and its parent company, the non-profit OpenAI Inc.', 'The company, considered a competitor to DeepMind, conducts research in the field of AI with the stated goal of promoting and developing friendly AI in a way that benefits humanity as a whole.']


In [5]:
# Word Tokenizer
import nltk
input_str = "OpenAI is an artificial intelligence (AI) research laboratory consisting of the for-profit corporation OpenAI LP and its parent company, the non-profit OpenAI Inc. The company, considered a competitor to DeepMind, conducts research in the field of AI with the stated goal of promoting and developing friendly AI in a way that benefits humanity as a whole."
from nltk.tokenize import word_tokenize
tokens = word_tokenize(input_str)
print(tokens)

['OpenAI', 'is', 'an', 'artificial', 'intelligence', '(', 'AI', ')', 'research', 'laboratory', 'consisting', 'of', 'the', 'for-profit', 'corporation', 'OpenAI', 'LP', 'and', 'its', 'parent', 'company', ',', 'the', 'non-profit', 'OpenAI', 'Inc', '.', 'The', 'company', ',', 'considered', 'a', 'competitor', 'to', 'DeepMind', ',', 'conducts', 'research', 'in', 'the', 'field', 'of', 'AI', 'with', 'the', 'stated', 'goal', 'of', 'promoting', 'and', 'developing', 'friendly', 'AI', 'in', 'a', 'way', 'that', 'benefits', 'humanity', 'as', 'a', 'whole', '.']


In [3]:
# Porter Stemming 
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
stemmer_1 = PorterStemmer()
input_str = "Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data."
input_str = word_tokenize(input_str)
for word in input_str:
    porter_stemmed_token = stemmer_1.stem(word)
    if word != porter_stemmed_token:
        print(f"Original : {word}, New: {porter_stemmed_token}")
    

Original : Natural, New: natur
Original : language, New: languag
Original : processing, New: process
Original : NLP, New: nlp
Original : linguistics, New: linguist
Original : computer, New: comput
Original : science, New: scienc
Original : artificial, New: artifici
Original : intelligence, New: intellig
Original : concerned, New: concern
Original : interactions, New: interact
Original : computers, New: comput
Original : language, New: languag
Original : computers, New: comput
Original : analyze, New: analyz
Original : large, New: larg
Original : amounts, New: amount
Original : natural, New: natur
Original : language, New: languag


In [2]:
# Snowball Stemming
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
stemmer_2 = SnowballStemmer("english")
input_str = "Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data."
input_str = word_tokenize(input_str)
for word in input_str:
    snowball_stemmed_token = stemmer_2.stem(word)
    if word != snowball_stemmed_token:
        print(f"Original : {word}, New: {snowball_stemmed_token}")

Original : Natural, New: natur
Original : language, New: languag
Original : processing, New: process
Original : NLP, New: nlp
Original : linguistics, New: linguist
Original : computer, New: comput
Original : science, New: scienc
Original : artificial, New: artifici
Original : intelligence, New: intellig
Original : concerned, New: concern
Original : interactions, New: interact
Original : computers, New: comput
Original : language, New: languag
Original : computers, New: comput
Original : analyze, New: analyz
Original : large, New: larg
Original : amounts, New: amount
Original : natural, New: natur
Original : language, New: languag


In [22]:
# Lemmatization with Part of Speech Tags
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
lemma = nltk.wordnet.WordNetLemmatizer()
pos_verb = lemma.lemmatize(word='played',pos=wordnet.VERB)
pos_noun = lemma.lemmatize(word='played',pos=wordnet.NOUN)
print(f"Lemmatization with pos Verb for Played: {pos_verb}")
print(f"Lemmatization with pos Noun for Played: {pos_noun}")


def lemmatize_words(text):
    return [lemma.lemmatize(word) for word in text.split()]

sentence = "The ideal characteristic of artificial intelligence is its ability to rationalize and take actions that have the best chance of achieving a specific goal."
print(lemmatize_words(sentence))


Lemmatization with pos Verb for Played: play
Lemmatization with pos Noun for Played: played
['The', 'ideal', 'characteristic', 'of', 'artificial', 'intelligence', 'is', 'it', 'ability', 'to', 'rationalize', 'and', 'take', 'action', 'that', 'have', 'the', 'best', 'chance', 'of', 'achieving', 'a', 'specific', 'goal.']


In [28]:
# Punctuations 
import string
Punctuation = string.punctuation
print (Punctuation)
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', Punctuation))

text_without_punctuation = remove_punctuation("My friends caught the bus without me; I was left standing around on my own.")
print("Text before removing puntuation: My friends caught the bus without me; I was left standing around on my own.")
print(f"Text after removing puntuation: {text_without_punctuation}")


!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
Text before removing puntuation: My friends caught the bus without me; I was left standing around on my own.
Text after removing puntuation: My friends caught the bus without me I was left standing around on my own


In [37]:
# Stop Words
from nltk.corpus import stopwords
Stop_Words = set(stopwords.words('english'))
print(f"List of Stop Words: {Stop_Words}")


sentence = "My friends caught the bus without me; I was left standing around on my own."

words = nltk.word_tokenize(sentence)
without_stop_words = [word for word in words if not word in Stop_Words]
print(f"Sentence after removing Stop Words: {without_stop_words}")

List of Stop Words: {"she's", 'had', 'during', 'other', 'over', 'hadn', 'am', 'have', 'yourself', 'were', 'with', 's', 'once', "should've", 'wasn', 'but', 'few', 't', 'didn', 'then', 'on', 'more', 'mustn', 'he', 'y', 'd', "hadn't", 'than', "wasn't", 'by', 'now', 'same', 'ain', "aren't", 'into', 'does', 'until', 'needn', 'too', 'been', 'itself', 'each', 'ourselves', 'are', 'if', 'for', 'it', 'couldn', 'hers', 'those', 'here', 'my', 'that', 'some', 'me', 'an', "you'll", 'between', 'yourselves', 'doing', 'there', 'not', 'mightn', 'm', "shouldn't", 'after', 'won', 'very', 'shan', 'from', 'all', 'our', 'do', 'yours', "shan't", 'ours', 'when', 'wouldn', 'shouldn', 'where', 'which', "hasn't", 'both', 'having', 'can', "don't", 'such', 'down', 'at', 'most', 'the', 'below', 'own', 'isn', 'you', "isn't", 'ma', "needn't", 'because', 'they', 'these', 'up', 'nor', 'before', 'further', 'through', 'has', 'while', 'who', 'don', "mightn't", "won't", "weren't", "mustn't", "doesn't", 'so', 'in', 'your', '

In [52]:
# Frequent words

from collections import Counter
Sentence = "I felt happy because I saw the others were happy and because I knew I should feel happy, but I wasn’t really happy."
Count = Counter()
for text in Sentence.split():
    Count[text] += 1
        
Count.most_common()

[('I', 5),
 ('happy', 2),
 ('because', 2),
 ('felt', 1),
 ('saw', 1),
 ('the', 1),
 ('others', 1),
 ('were', 1),
 ('and', 1),
 ('knew', 1),
 ('should', 1),
 ('feel', 1),
 ('happy,', 1),
 ('but', 1),
 ('wasn’t', 1),
 ('really', 1),
 ('happy.', 1)]

In [51]:
# Rare words
# Rare words can be found by the least frequent words. We can use the previous code. 
no_of_rare_words = 10
RAREWORDS = set([w for (w, wc) in Count.most_common()[:-no_of_rare_words-1:-1]])
print(RAREWORDS)

{'should', 'really', 'happy.', 'wasn’t', 'feel', 'knew', 'were', 'and', 'happy,', 'but'}


In [6]:
# Bag Of Words (Word Embeddings)
Sentences = ["Welcome to my blog. Use my blog to start learning NLP", "Everyday learning NLP is a good practice"]

{"welcome":1, "to":2, "my":2, "blog":2, "Use":1, "start":1, "learning":1 , "NLP":1} # Sentence 1

{"Everyday":1, "learning":1, "NLP":1, "is":1, "a":1, "good":1, "practice":1} # Sentence 2

{'Everyday': 1,
 'learning': 1,
 'NLP': 1,
 'is': 1,
 'a': 1,
 'good': 1,
 'practice': 1}

In [3]:
#Count of words in both the sentence
{"welcome":1, "to":2, "my":2, "blog":2, "Use":1, "start":1, "learning":2, "NLP":2, "Everyday":1, "is":1, "a":1, "good":1, "practice":1}

{'welcome': 1,
 'to': 2,
 'my': 2,
 'blog': 2,
 'Use': 1,
 'start': 1,
 'learning': 2,
 'NLP': 2,
 'Everyday': 1,
 'is': 1,
 'a': 1,
 'good': 1,
 'practice': 1}

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
Vectorized_Sentence = vectorizer.fit_transform(Sentences)
print(Vectorized_Sentence.toarray())

[[2 0 0 0 1 2 1 0 1 2 1 1]
 [0 1 1 1 1 0 1 1 0 0 0 0]]
