# NLP Preparation Lesson

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Sample text containing accented letters (ő, ó, ö) and punctuation, used as the
# input for the cleaning steps in this notebook. Adjacent string literals are
# joined by the parser, so the value is a single line with no embedded newlines.
original = (
    "Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed "
    "a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), "
    "but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"
)

In [3]:
# Show the raw text before any normalization is applied.
original

"Paul Erdős and George Pólya were influential Hungarian mathematicians who contributed a lot to the field. Erdős's name contains the Hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as Erdos or Erdös either by mistake or out of typographical necessity"

## 1. Lowercase everything

In [4]:
# Lowercase the text so differently-cased spellings of a word compare equal.
# This rebinds `original`, so the mixed-case text is no longer available afterwards.
original = original.lower()
original

"paul erdős and george pólya were influential hungarian mathematicians who contributed a lot to the field. erdős's name contains the hungarian letter 'ő' ('o' with double acute accent), but is often incorrectly written as erdos or erdös either by mistake or out of typographical necessity"

## 2. Remove accented characters and non-ASCII characters

In [5]:
import unicodedata

# Step 1: NFKD decomposition separates an accented letter into its base letter
#         followed by combining marks (e.g. 'ő' -> 'o' + combining double acute).
decomposed = unicodedata.normalize('NFKD', original)
# Step 2: encoding to ASCII with 'ignore' silently drops everything that is not
#         ASCII, which removes the combining marks and leaves the base letters.
ascii_bytes = decomposed.encode('ascii', 'ignore')
# Step 3: turn the bytes back into a str for the following steps.
original = ascii_bytes.decode('utf-8')
original

"paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field. erdos's name contains the hungarian letter 'o' ('o' with double acute accent), but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity"

## 3. Remove special characters

In [6]:
import re

In [7]:
# Keep only lowercase letters, digits and whitespace; every other character is deleted.
# Apostrophes are deleted as well, which is why "erdos's" comes out as "erdoss".
non_alnum = re.compile(r'[^a-z0-9\s]')
original = non_alnum.sub('', original)
original

'paul erdos and george polya were influential hungarian mathematicians who contributed a lot to the field erdoss name contains the hungarian letter o o with double acute accent but is often incorrectly written as erdos or erdos either by mistake or out of typographical necessity'

## 4. Tokenize

In [8]:
import nltk

In [9]:
# Build a Toktok tokenizer instance; the next cell uses it to split the cleaned text.
# Note the variable shares its name with the instance's `.tokenize()` method.
tokenize = nltk.tokenize.ToktokTokenizer()
tokenize

<nltk.tokenize.toktok.ToktokTokenizer at 0x12e853a90>

In [10]:
# Split the cleaned string into a list of word tokens.
# From here on `original` is a list of str rather than a single str, so
# re-running this cell would pass the token list (not the text) to the tokenizer.
original = tokenize.tokenize(original)
original

['paul',
 'erdos',
 'and',
 'george',
 'polya',
 'were',
 'influential',
 'hungarian',
 'mathematicians',
 'who',
 'contributed',
 'a',
 'lot',
 'to',
 'the',
 'field',
 'erdoss',
 'name',
 'contains',
 'the',
 'hungarian',
 'letter',
 'o',
 'o',
 'with',
 'double',
 'acute',
 'accent',
 'but',
 'is',
 'often',
 'incorrectly',
 'written',
 'as',
 'erdos',
 'or',
 'erdos',
 'either',
 'by',
 'mistake',
 'or',
 'out',
 'of',
 'typographical',
 'necessity']

## 5. Stemming or Lemmatizing

### Stemming

In [11]:
# Porter stemmer: trims suffixes by rule, so its results need not be dictionary words.
# Referenced through nltk.stem, the same package path this notebook uses for
# WordNetLemmatizer, rather than through the re-exported `nltk.porter` submodule name.
ps = nltk.stem.PorterStemmer()
ps

<PorterStemmer>

In [12]:
# Four inflections of "call": the stemmer maps all of them to the same stem.
tuple(ps.stem(form) for form in ('calling', 'calls', 'called', 'call'))

('call', 'call', 'call', 'call')

In [13]:
# Both forms reduce to 'hous', a stem that is not itself an English word.
tuple(ps.stem(form) for form in ('house', 'housing'))

('hous', 'hous')

In [14]:
# Stem every token of the sample text; several results are truncated forms
# such as 'georg' or 'influenti' rather than real words.
list(map(ps.stem, original))

['paul',
 'erdo',
 'and',
 'georg',
 'polya',
 'were',
 'influenti',
 'hungarian',
 'mathematician',
 'who',
 'contribut',
 'a',
 'lot',
 'to',
 'the',
 'field',
 'erdoss',
 'name',
 'contain',
 'the',
 'hungarian',
 'letter',
 'o',
 'o',
 'with',
 'doubl',
 'acut',
 'accent',
 'but',
 'is',
 'often',
 'incorrectli',
 'written',
 'as',
 'erdo',
 'or',
 'erdo',
 'either',
 'by',
 'mistak',
 'or',
 'out',
 'of',
 'typograph',
 'necess']

In [15]:
# Stem each token, then glue the stems back into one space-separated string.
stems = list(map(ps.stem, original))
' '.join(stems)

'paul erdo and georg polya were influenti hungarian mathematician who contribut a lot to the field erdoss name contain the hungarian letter o o with doubl acut accent but is often incorrectli written as erdo or erdo either by mistak or out of typograph necess'

### Lemmatize

In [16]:
# Run once per environment to fetch the NLTK data used by the cells below.
# 'all' downloads every NLTK corpus and model, which is a large download.
#nltk.download('all')
# NOTE(review): the lemmatizer cells below presumably only need WordNet, so
# nltk.download('wordnet') is likely sufficient — confirm for your NLTK version.

In [17]:
# WordNet-based lemmatizer; used in the following cells to compare against the stemmer.
wnl = nltk.stem.WordNetLemmatizer()
wnl

<WordNetLemmatizer>

In [18]:
# Same four forms as in the stemming demo. Called without a part-of-speech
# argument, only 'calls' is reduced; 'calling' and 'called' come back unchanged.
tuple(wnl.lemmatize(form) for form in ('calling', 'calls', 'called', 'call'))

('calling', 'call', 'called', 'call')

In [19]:
# Unlike the stemmer, the lemmatizer leaves both words as they are.
tuple(wnl.lemmatize(form) for form in ('house', 'housing'))

('house', 'housing')

In [20]:
# Irregular plural: rule-based stemming does not connect 'mice' to 'mouse'.
tuple(ps.stem(form) for form in ('mouse', 'mice'))

('mous', 'mice')

In [21]:
# The lemmatizer resolves the irregular plural: both forms yield 'mouse'.
tuple(wnl.lemmatize(form) for form in ('mouse', 'mice'))

('mouse', 'mouse')

In [22]:
# 'basses' is reduced to 'bass', while 'base' stays a separate lemma.
tuple(wnl.lemmatize(form) for form in ('bass', 'basses', 'base'))

('bass', 'bass', 'base')

In [23]:
# Lemmatize each token and rejoin into one space-separated string.
# Watch the result for 'as', which comes back as 'a' ("written a erdos"):
# lemmatizing function words without part-of-speech information can distort them.
lemmas = list(map(wnl.lemmatize, original))
' '.join(lemmas)

'paul erdos and george polya were influential hungarian mathematician who contributed a lot to the field erdoss name contains the hungarian letter o o with double acute accent but is often incorrectly written a erdos or erdos either by mistake or out of typographical necessity'

## 6. Remove Stopwords

In [24]:
from nltk.corpus import stopwords

In [25]:
# Run once per environment: downloads the stopword lists that
# `stopwords.words(...)` reads in the next cell.
#nltk.download('stopwords')

In [26]:
# Load NLTK's English stopword list and preview its first ten entries.
stopwords_english = stopwords.words('english')
stopwords_english[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [27]:
# Size of the stopword list as loaded, before any custom additions.
len(stopwords_english)

179

In [28]:
# Re-display the token list to spot leftover noise tokens worth treating as stopwords.
original

['paul',
 'erdos',
 'and',
 'george',
 'polya',
 'were',
 'influential',
 'hungarian',
 'mathematicians',
 'who',
 'contributed',
 'a',
 'lot',
 'to',
 'the',
 'field',
 'erdoss',
 'name',
 'contains',
 'the',
 'hungarian',
 'letter',
 'o',
 'o',
 'with',
 'double',
 'acute',
 'accent',
 'but',
 'is',
 'often',
 'incorrectly',
 'written',
 'as',
 'erdos',
 'or',
 'erdos',
 'either',
 'by',
 'mistake',
 'or',
 'out',
 'of',
 'typographical',
 'necessity']

In [29]:
# Add the stray single-letter token 'o' (left over from the quoted letters in the text)
# to the stopword list. This mutates the list in place, so re-running the cell
# appends another copy each time.
# NOTE(review): the stock English list may already contain 'o' — confirm before relying on this.
stopwords_english.append('o')

In [30]:
# Confirm the new entry landed at the end of the list.
stopwords_english[-1]

'o'

In [31]:
# The list should now be one entry longer than before the append.
len(stopwords_english)

180

In [32]:
# Drop every token that appears in the stopword list and rejoin the rest.
# The set is built once up front: membership tests on a set are hash lookups,
# whereas `word not in <list>` rescans the whole stopword list for every token.
# The filtered result is identical; only the lookup cost changes.
stopword_set = set(stopwords_english)
original_with_stopwords_removed = [word for word in original if word not in stopword_set]
' '.join(original_with_stopwords_removed)

'paul erdos george polya influential hungarian mathematicians contributed lot field erdoss name contains hungarian letter double acute accent often incorrectly written erdos erdos either mistake typographical necessity'