In [2]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

# Parsing Text

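Parsing text means breaking it down into its parts. If we diagram the nouns, verbs, and tense in language, we're parsing using grammatical rules. Because natural language has so many rules and edge cases, when we approach things with NLP, we're looking to apply a small set of simple, repeatable transformations that bring the text into a consistent form, rather than trying to model every grammatical rule.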

## Big Idea 
- We want to reduce the variability between words. 
- Both "math" and "Math" mean the same thing, so we lowercase things to reduce the variability of the same exact term.
- `Erdős`, `Erdös`, and `Erdos` refer to the same person. Again, we're looking to reduce variability before we start searching for relationships between values.

### Workflow:
1. Convert all text to lowercase
2. Remove accented characters and non-ASCII characters
3. Remove special characters
4. Stem or lemmatize the words (Prefer Lemmatizing since it captures more meaning)
5. Remove stopwords
6. Store the transformed text as well as the original text for future use.

### Terms, libraries, and concepts
- Normalizing text - Text normalization is the process of transforming text into a single canonical form that it might not have had before. Text normalization requires being aware of what type of text is to be normalized and how it is to be processed afterwards; there is no all-purpose normalization procedure
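- tokenization - splitting text into smaller units (tokens), such as words and punctuation marks
- corpus - a body of text
- stem - the base of a word left after chopping off its endings (e.g. `called` and `calling` both become `call`); a stem is not always a real word
- lemmatize - reduce a word to its dictionary form (its lemma), which is a real word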
- stopwords (a, an, the, for, of, etc...)
- ASCII and non-ASCII characters
- .encode to ASCII - ASCII is a small set of 128 characters (unaccented letters, digits, punctuation, and control codes)
- .decode to utf-8 - converting back to a normal Python string

In [7]:
# Sample text for the walkthrough: mixed case, accented letters (ő, ó, ö), and punctuation.
original = "Paul Erdős and George Pólya are influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

In [12]:
# Lowercase the text, then strip accents and any other non-ASCII characters:
#   1. NFKD normalization splits an accented letter into its base letter
#      followed by separate combining marks.
#   2. Encoding to ASCII with 'ignore' drops everything that has no ASCII
#      form (the combining marks and any remaining non-ASCII characters).
#   3. Decoding turns the resulting bytes back into a regular Python str.
string = (
    unicodedata.normalize('NFKD', original.lower())
    .encode('ascii', 'ignore')
    .decode('utf-8', 'ignore')
)
string

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field. erdos's name contains the hungarian letter 'o' ('o' with double acute accent), but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [16]:
# Keep only lowercase letters, digits, apostrophes, and whitespace;
# every other character is deleted (replaced with the empty string).
string = re.compile(r"[^a-z0-9'\s]").sub('', string)
string

"paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos's name contains the hungarian letter 'o' 'o' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

In [18]:
# Tokenize the cleaned text. The tokenizer separates punctuation from the
# words it is attached to (in the output below each apostrophe becomes a
# token of its own); it does not lowercase or strip accents.
# return_str=True gives the tokens back joined into one space-separated
# string. The result is only printed, not assigned, so the following cells
# keep working with `string`.
tokenizer = ToktokTokenizer()
print(tokenizer.tokenize(string, return_str=True))

paul erdos and george polya are influential hungarian mathematicians who contributed a lot to the field erdos ' s name contains the hungarian letter ' o ' ' o ' with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity


In [20]:
# Stemming is a crude, rule-based way to cut a word down to its "stem", so
# that different inflections of the same word collapse to a single form.
# The stemmer is reached through the nltk.stem package (the same package the
# WordNetLemmatizer is taken from) instead of the `nltk.porter` shortcut.
ps = nltk.stem.PorterStemmer()
ps.stem('call'), ps.stem('called'), ps.stem('calling')

('call', 'call', 'call')

In [24]:
# Stem every whitespace-separated word of the cleaned text.
# Stems are not necessarily real words (see 'georg' and 'influenti' below).
stems = list(map(ps.stem, string.split()))
stems

['paul',
 'erdo',
 'and',
 'georg',
 'polya',
 'are',
 'influenti',
 'hungarian',
 'mathematician',
 'who',
 'contribut',
 'a',
 'lot',
 'to',
 'the',
 'field',
 "erdos'",
 'name',
 'contain',
 'the',
 'hungarian',
 'letter',
 "'o'",
 "'o'",
 'with',
 'doubl',
 'acut',
 'accent',
 'but',
 'is',
 'often',
 'incorrectli',
 'written',
 'as',
 'erdo',
 'or',
 'erdo',
 'either',
 'by',
 'mistak',
 'or',
 'out',
 'of',
 'typograph',
 'necess']

In [26]:
# WordNet-based lemmatizer, created once and reused when building `lemmas`.
# NOTE(review): presumably needs the WordNet corpus to be available locally
# (e.g. via nltk.download('wordnet')) - confirm on a fresh environment.
wnl = nltk.stem.WordNetLemmatizer()

In [27]:
# Lemmatize every whitespace-separated word of the cleaned text.
lemmas = list(map(wnl.lemmatize, string.split()))

In [28]:
# Display the lemmas. Two things to notice in the output: 'as' comes back as
# 'a', and verb forms such as 'contributed' and 'contains' are left unchanged.
# NOTE(review): likely because lemmatize() is called without a part of speech
# and treats every word as a noun by default - confirm against the NLTK docs.
lemmas

['paul',
 'erdos',
 'and',
 'george',
 'polya',
 'are',
 'influential',
 'hungarian',
 'mathematician',
 'who',
 'contributed',
 'a',
 'lot',
 'to',
 'the',
 'field',
 "erdos's",
 'name',
 'contains',
 'the',
 'hungarian',
 'letter',
 "'o'",
 "'o'",
 'with',
 'double',
 'acute',
 'accent',
 'but',
 'is',
 'often',
 'incorrectly',
 'written',
 'a',
 'erdos',
 'or',
 'erdos',
 'either',
 'by',
 'mistake',
 'or',
 'out',
 'of',
 'typographical',
 'necessity']

In [33]:
# NLTK's English stopwords; used below to filter both the stems and the lemmas.
# NOTE(review): presumably needs the stopwords corpus to be available locally
# (e.g. via nltk.download('stopwords')) - confirm on a fresh environment.
stopword_list = stopwords.words('english')



['paul',
 'erdos',
 'and',
 'george',
 'polya',
 'are',
 'influential',
 'hungarian',
 'mathematician',
 'who',
 'contributed',
 'a',
 'lot',
 'to',
 'the',
 'field',
 "erdos's",
 'name',
 'contains',
 'the',
 'hungarian',
 'letter',
 "'o'",
 "'o'",
 'with',
 'double',
 'acute',
 'accent',
 'but',
 'is',
 'often',
 'incorrectly',
 'written',
 'a',
 'erdos',
 'or',
 'erdos',
 'either',
 'by',
 'mistake',
 'or',
 'out',
 'of',
 'typographical',
 'necessity']

In [36]:
# Drop every stem that appears in the stopword list, keeping the original order.
# NOTE(review): the stopword list holds unstemmed words, so a stopword whose
# stem differs from its surface form would presumably slip through - confirm.
clean_stems = list(filter(lambda stem: stem not in stopword_list, stems))
clean_stems

['paul',
 'erdo',
 'georg',
 'polya',
 'influenti',
 'hungarian',
 'mathematician',
 'contribut',
 'lot',
 'field',
 "erdos'",
 'name',
 'contain',
 'hungarian',
 'letter',
 "'o'",
 "'o'",
 'doubl',
 'acut',
 'accent',
 'often',
 'incorrectli',
 'written',
 'erdo',
 'erdo',
 'either',
 'mistak',
 'typograph',
 'necess']

In [37]:
# Drop every lemma that appears in the stopword list, keeping the original order.
clean_lemmas = list(filter(lambda lemma: lemma not in stopword_list, lemmas))
clean_lemmas

['paul',
 'erdos',
 'george',
 'polya',
 'influential',
 'hungarian',
 'mathematician',
 'contributed',
 'lot',
 'field',
 "erdos's",
 'name',
 'contains',
 'hungarian',
 'letter',
 "'o'",
 "'o'",
 'double',
 'acute',
 'accent',
 'often',
 'incorrectly',
 'written',
 'erdos',
 'erdos',
 'either',
 'mistake',
 'typographical',
 'necessity']