In [1]:
# Three tiny example documents used throughout the notebook. The third one
# deliberately contains markup characters (_ and *) for the cleaning steps.
raw_docs = [
    "Here are some very simple basic sentences.",
    "They won't be very interesting, I'm afraid.",
    "The point of these examples is to _learn how basic text cleaning works_ on *very simple* data.",
]

In [2]:
# Echo the list so the untouched input text is visible before any cleaning.
raw_docs

['Here are some very simple basic sentences.',
 "They won't be very interesting, I'm afraid.",
 'The point of these examples is to _learn how basic text cleaning works_ on *very simple* data.']

In [6]:
# Tokenize each document into a list of word tokens with the help of NLTK
from nltk.tokenize import word_tokenize
# Run word_tokenize over every raw document: one token list per document.
tokenized_docs = list(map(word_tokenize, raw_docs))
print(tokenized_docs)

[['Here', 'are', 'some', 'very', 'simple', 'basic', 'sentences', '.'], ['They', 'wo', "n't", 'be', 'very', 'interesting', ',', 'I', "'m", 'afraid', '.'], ['The', 'point', 'of', 'these', 'examples', 'is', 'to', '_learn', 'how', 'basic', 'text', 'cleaning', 'works_', 'on', '*very', 'simple*', 'data', '.']]


In [16]:
# The tokens still contain punctuation characters such as . , * _ which are not important here, so let's remove them
import re
import string
# Character class matching any single character from string.punctuation.
regex = re.compile('[%s]' % re.escape(string.punctuation))

# For every document: delete punctuation characters from each token, then
# keep only the tokens that still have some text left (a token such as '.'
# becomes the empty string and is dropped).
tokenized_docs_withput_punctuations = [
    [stripped for stripped in (regex.sub(u'', token) for token in review) if stripped]
    for review in tokenized_docs
]

print(tokenized_docs_withput_punctuations)

[['Here', 'are', 'some', 'very', 'simple', 'basic', 'sentences'], ['They', 'wo', 'nt', 'be', 'very', 'interesting', 'I', 'm', 'afraid'], ['The', 'point', 'of', 'these', 'examples', 'is', 'to', 'learn', 'how', 'basic', 'text', 'cleaning', 'works', 'on', 'very', 'simple', 'data']]


In [18]:
# Cleaning Stop words
from nltk.corpus import stopwords
# Evaluate the English stop-word list once and keep it as a set, instead of
# calling stopwords.words('english') again for every single token and
# scanning the returned list each time.
english_stopwords = set(stopwords.words('english'))

# Keep every token that is not an exact match for a stop word.
# NOTE(review): the comparison is case-sensitive, so capitalised tokens such
# as 'The' or 'I' are kept (see the printed result). Lower-case the token
# before the membership test if those should be removed as well.
tokenized_docs_without_stopword_punctuations = [
    [token for token in review if token not in english_stopwords]
    for review in tokenized_docs_withput_punctuations
]
print(tokenized_docs_without_stopword_punctuations)

[['Here', 'simple', 'basic', 'sentences'], ['They', 'wo', 'nt', 'interesting', 'I', 'afraid'], ['The', 'point', 'examples', 'learn', 'basic', 'text', 'cleaning', 'works', 'simple', 'data']]


In [21]:
# Stemming and lemmatizing
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# Stemmer / lemmatizer instances. Only `snowball` and `wordnet` are used in
# this cell; `porter` is created but not applied below.
porter = PorterStemmer()
snowball = SnowballStemmer('english')
wordnet = WordNetLemmatizer()

# Each token is lemmatized with WordNet first, and the resulting lemma is
# then passed through the Snowball stemmer.
preprocessed_docs = [
    [snowball.stem(wordnet.lemmatize(token)) for token in doc]
    for doc in tokenized_docs_without_stopword_punctuations
]
print(preprocessed_docs)

[['here', 'simpl', 'basic', 'sentenc'], ['they', 'wo', 'nt', 'interest', 'i', 'afraid'], ['the', 'point', 'exampl', 'learn', 'basic', 'text', 'clean', 'work', 'simpl', 'data']]


##### Final: the complete cleaning pipeline applied to a pandas DataFrame

In [23]:
# Create Pandas Dataframe
import pandas as pd
# One row per raw document, stored in a single 'document' column.
df = pd.DataFrame(raw_docs, columns=['document'])

In [24]:
# Preview the frame: at this point it only holds the raw 'document' text.
df.head()

Unnamed: 0,document
0,Here are some very simple basic sentences.
1,"They won't be very interesting, I'm afraid."
2,The point of these examples is to _learn how b...


In [64]:
from nltk.tokenize import word_tokenize
import re
import string
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

def clean_tokenized_column(x):
    """Turn one raw document string into a list of cleaned, stemmed tokens.

    Steps, applied in order:
      1. split ``x`` into tokens with ``word_tokenize``;
      2. delete every ``string.punctuation`` character from each token and
         discard tokens that end up empty;
      3. discard tokens that appear in NLTK's English stop-word list;
      4. lemmatize with ``WordNetLemmatizer``, then stem the lemma with the
         English ``SnowballStemmer``.

    Parameters
    ----------
    x : str
        Raw text of a single document.

    Returns
    -------
    list of str
        The surviving tokens after stemming, in their original order.

    Notes
    -----
    NOTE(review): the stop-word test is case-sensitive, so capitalised tokens
    such as 'The' or 'I' are not filtered and come out of the stemmer as
    'the' / 'i'. Lower-case the token before the membership test if they
    should be dropped too.
    """
    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
    snowball = SnowballStemmer('english')
    wordnet = WordNetLemmatizer()
    # Evaluate the stop-word list once per call and keep it as a set, rather
    # than calling stopwords.words('english') again for every token.
    stop_words = set(stopwords.words('english'))

    new_review = []
    for token in word_tokenize(x):
        new_token = punctuation_re.sub(u'', token)
        # Skip tokens that were pure punctuation or that are stop words.
        if new_token == u'' or new_token in stop_words:
            continue
        new_review.append(snowball.stem(wordnet.lemmatize(new_token)))
    return new_review

In [65]:
# Run the cleaning pipeline on every document and store the resulting token
# lists in a new 'document_tokens' column.
df['document_tokens'] = df['document'].map(clean_tokenized_column)

In [66]:
# Preview the frame again: raw text next to its cleaned token list.
df.head()

Unnamed: 0,document,document_tokens
0,Here are some very simple basic sentences.,"[here, simpl, basic, sentenc]"
1,"They won't be very interesting, I'm afraid.","[they, wo, nt, interest, i, afraid]"
2,The point of these examples is to _learn how b...,"[the, point, exampl, learn, basic, text, clean..."
