# Text Cleaning Manually

In [4]:
import pandas as pd

## load file

In [5]:
# Read the whole input file into memory as a single string.
file = "data.txt"
with open(file, mode="rt") as handle:
    text = handle.read()

## Tokenization

### split words by spaces
- Doing this splits the text into words, but punctuation stays attached to them, as in **armour-like**, **vermin.**, **me?**, **couldn't**

In [26]:
# str.split() with no arguments splits on runs of whitespace, so any
# punctuation stays attached to the neighbouring token.
words = text.split()

In [27]:
# Comparison table seeded with the first 100 whitespace-separated tokens.
df = pd.DataFrame(words[:100], columns=['original_word'])
df.head(10)

Unnamed: 0,original_word
0,One
1,"morning,"
2,when
3,Gregor
4,Samsa
5,woke
6,from
7,troubled
8,"dreams,"
9,he


### using regex
- Doing this eliminates the problems we had, but it splits contractions at the apostrophe: **couldn't** becomes **couldn** and **t**, which is not what we want.

In [28]:
import re

In [29]:
# Collect every maximal run of word characters (letters, digits, underscore).
# This yields the same tokens as re.split(r'\W+', text) but never produces
# the empty strings that split() emits when the text starts or ends with
# punctuation or whitespace.
words = re.findall(r'\w+', text)

In [30]:
# Add the first 100 regex tokens as a new column for side-by-side comparison.
# The list is assigned by position, so its length must equal len(df).
df['using regex'] = words[:100]
df.head(10)

Unnamed: 0,original_word,using regex
0,One,One
1,"morning,",morning
2,when,when
3,Gregor,Gregor
4,Samsa,Samsa
5,woke,woke
6,from,from
7,troubled,troubled
8,"dreams,",dreams
9,he,he


### split by whitespace and remove punctuation
- Words like **What's** become **Whats**, and **armour-like** becomes **armourlike**

In [31]:
import string
import re

In [10]:
# Start again from the whitespace-separated tokens.
words = text.split()
# Character class matching any single character listed in string.punctuation.
re_punc = re.compile('[%s]' % re.escape(string.punctuation))
# Delete every punctuation character from each token.
stripped = [re_punc.sub('', token) for token in words]

In [38]:
# Show the punctuation-stripped tokens (`stripped`), not the raw `words`
# list, which still carries the attached punctuation.
# The list is assigned by position, so its length must equal len(df).
df['punctuation removed'] = stripped[:100]
df.head(10)

Unnamed: 0,original_word,using regex,punctuation removed
0,One,One,One
1,"morning,",morning,morning
2,when,when,when
3,Gregor,Gregor,Gregor
4,Samsa,Samsa,Samsa
5,woke,woke,woke
6,from,from,from
7,troubled,troubled,troubled
8,"dreams,",dreams,dreams
9,he,he,he


## Normalize case
- normalize the case by converting words to lowercase

In [39]:
# Re-tokenize on whitespace, then fold every token to lowercase.
words = text.split()
lowercased = list(map(str.lower, words))

In [40]:
# Add the lowercased whitespace tokens as a new column; punctuation is still
# attached because these tokens come straight from text.split().
df['lowered'] =  lowercased[:100]
df.head(10)

Unnamed: 0,original_word,using regex,punctuation removed,lowered
0,One,One,One,one
1,"morning,",morning,morning,"morning,"
2,when,when,when,when
3,Gregor,Gregor,Gregor,gregor
4,Samsa,Samsa,Samsa,samsa
5,woke,woke,woke,woke
6,from,from,from,from
7,troubled,troubled,troubled,troubled
8,"dreams,",dreams,dreams,"dreams,"
9,he,he,he,he


# Cleaning with NLTK

In [41]:
import nltk

## Tokenization

In [42]:
from nltk import sent_tokenize
from nltk import word_tokenize

In [43]:
# Split the raw text into a list of sentences with NLTK's sentence tokenizer.
# NOTE(review): sent_tokenize presumably needs NLTK's tokenizer data
# downloaded beforehand (nltk.download) - confirm on a fresh environment.
sentences = sent_tokenize(text)

In [44]:
# Preview the first ten sentences.
sentences[:10]

['One morning, when Gregor Samsa woke from troubled dreams, he found\nhimself transformed in his bed into a horrible vermin.',
 'He lay on\nhis armour-like back, and if he lifted his head a little he could\nsee his brown belly, slightly domed and divided by arches into stiff\nsections.',
 'The bedding was hardly able to cover it and seemed ready\nto slide off any moment.',
 'His many legs, pitifully thin compared\nwith the size of the rest of him, waved about helplessly as he\nlooked.',
 '"What\'s happened to me?"',
 'he thought.',
 "It wasn't a dream.",
 'His room,\na proper human room although a little too small, lay peacefully\nbetween its four familiar walls.',
 'A collection of textile samples\nlay spread out on the table - Samsa was a travelling salesman - and\nabove it there hung a picture that he had recently cut out of an\nillustrated magazine and housed in a nice, gilded frame.',
 'It showed\na lady fitted out with a fur hat and fur boa who sat upright,\nraising a heavy fur m

In [55]:
# Break the raw text into word-level tokens with NLTK; punctuation marks
# come out as separate tokens rather than staying glued to words.
words = word_tokenize(text)

In [56]:
# Start a fresh comparison table from the first 100 NLTK tokens.
df = pd.DataFrame(words[:100], columns=['original_word'])
df.head(10)

Unnamed: 0,original_word
0,One
1,morning
2,","
3,when
4,Gregor
5,Samsa
6,woke
7,from
8,troubled
9,dreams


## Filtering punctuation and stop words

- Stop words are words that do not contribute to the meaning of a phrase.

In [57]:
# Keep only purely alphabetic tokens; this drops standalone punctuation
# tokens such as ",".
words = list(filter(str.isalpha, words))

In [58]:
# Add the first 100 surviving tokens as a new column. Values are assigned by
# position, so after filtering the rows no longer line up with
# 'original_word' (e.g. row 2 pairs "," with "when").
df["punctuation filtered"] = words[:100]
df.head(10)

Unnamed: 0,original_word,punctuation filtered
0,One,One
1,morning,morning
2,",",when
3,when,Gregor
4,Gregor,Samsa
5,Samsa,woke
6,woke,from
7,from,troubled
8,troubled,dreams
9,dreams,he


In [59]:
from nltk.corpus import stopwords
# Load NLTK's English stop-word list as a plain Python list of strings.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# downloaded beforehand (nltk.download('stopwords')) - confirm on a fresh
# environment.
stops = stopwords.words("english")

In [60]:
# Display the stop-word list; every entry is lowercase.
stops

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [61]:
# The stop-word list is all lowercase, so compare case-insensitively;
# otherwise capitalised stop words such as "He" or "The" slip through.
# A set gives constant-time membership tests instead of scanning the list
# once per token.
stop_set = set(stops)
words = [w for w in words if w.lower() not in stop_set]

In [62]:
# Add the first 100 tokens that survived stop-word removal as a new column
# (assigned by position, so the list length must equal len(df)).
df["removed stopwords"] = words[:100]

In [63]:
# Preview the first ten rows of the comparison table.
df.head(10)

Unnamed: 0,original_word,punctuation filtered,removed stopwords
0,One,One,One
1,morning,morning,morning
2,",",when,Gregor
3,when,Gregor,Samsa
4,Gregor,Samsa,woke
5,Samsa,woke,troubled
6,woke,from,dreams
7,from,troubled,found
8,troubled,dreams,transformed
9,dreams,he,bed


## Stemming 
- Stemming is the process of reducing each word to its root or base form by removing suffixes such as *ing*, *ly*, *s*, etc. with a rule-based approach.

In [64]:
from nltk.stem import PorterStemmer

# Build a Porter stemmer and reduce every remaining token to its stem.
# The resulting stems may not be dictionary words (e.g. "troubled" -> "troubl").
porter_stemmer = PorterStemmer()
stemmed_words = list(map(porter_stemmer.stem, words))

In [65]:
# Add the first 100 stems as a new column, positionally aligned with the
# 'removed stopwords' column they were derived from.
df['stemmed_word'] =  stemmed_words[:100]
df.head(10)

Unnamed: 0,original_word,punctuation filtered,removed stopwords,stemmed_word
0,One,One,One,one
1,morning,morning,morning,morn
2,",",when,Gregor,gregor
3,when,Gregor,Samsa,samsa
4,Gregor,Samsa,woke,woke
5,Samsa,woke,troubled,troubl
6,woke,from,dreams,dream
7,from,troubled,found,found
8,troubled,dreams,transformed,transform
9,dreams,he,bed,bed


## Lemmatization
- Stemming and lemmatization both reduce inflected words to a root form. The difference is that a stem might not be an actual word, whereas a lemma is always a real word of the language.

In [67]:
from nltk.stem import WordNetLemmatizer

# Create the WordNet-based lemmatizer.
lemmatizer = WordNetLemmatizer()

# pos='v' asks the lemmatizer to treat every token as a verb
# (e.g. "woke" -> "wake", "found" -> "find").
lemmatized_words = [lemmatizer.lemmatize(token, pos='v') for token in words]

In [68]:
# Add the first 100 lemmas as a new column next to the stems for comparison.
df['lemmatized_word'] =  lemmatized_words[:100]
df.head(10)

Unnamed: 0,original_word,punctuation filtered,removed stopwords,stemmed_word,lemmatized_word
0,One,One,One,one,One
1,morning,morning,morning,morn,morning
2,",",when,Gregor,gregor,Gregor
3,when,Gregor,Samsa,samsa,Samsa
4,Gregor,Samsa,woke,woke,wake
5,Samsa,woke,troubled,troubl,trouble
6,woke,from,dreams,dream,dream
7,from,troubled,found,found,find
8,troubled,dreams,transformed,transform,transform
9,dreams,he,bed,bed,bed
