# Understanding NLP techniques
This is a draft notebook where I will play around with different NLP techniques to improve my understanding of certain functions that we will be using from the nltk package.

In [5]:
# Two short sentences containing stopwords, a contraction and punctuation,
# used as the running example for tokenizing and filtering.
example1 = (
    "This is an example sentence. "
    "I'll try to remove stopwords and I'll try to remove punctuation!"
)

In [2]:
import nltk

We can use the functions `sent_tokenize` and `word_tokenize` to split the text into sentences or into words, respectively. `word_tokenize` will be more useful to us.

In [6]:
from nltk.tokenize import sent_tokenize, word_tokenize
# Show the sentence-level split first, then the word-level split.
for tokenizer in (sent_tokenize, word_tokenize):
    print(tokenizer(example1))

['This is an example sentence.', "I'll try to remove stopwords and I'll try to remove punctuation!"]
['This', 'is', 'an', 'example', 'sentence', '.', 'I', "'ll", 'try', 'to', 'remove', 'stopwords', 'and', 'I', "'ll", 'try', 'to', 'remove', 'punctuation', '!']


In [8]:
# Keep the word-level tokens in a variable; later cells filter this list.
example1_words = word_tokenize(example1)
example1_words

['This',
 'is',
 'an',
 'example',
 'sentence',
 '.',
 'I',
 "'ll",
 'try',
 'to',
 'remove',
 'stopwords',
 'and',
 'I',
 "'ll",
 'try',
 'to',
 'remove',
 'punctuation',
 '!']

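We can remove punctuation by keeping only alphabetic tokens. The string method `isalpha()` returns `True` only if the string is non-empty and every character in it is a letter. Note that this filter drops more than punctuation marks: any token containing a non-letter character is removed too, such as the contraction piece `'ll` in the output below.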

In [14]:
# Keep only the purely alphabetic tokens; every other token is discarded.
remove_punctuation = list(filter(str.isalpha, example1_words))
remove_punctuation

['This',
 'is',
 'an',
 'example',
 'sentence',
 'I',
 'try',
 'to',
 'remove',
 'stopwords',
 'and',
 'I',
 'try',
 'to',
 'remove',
 'punctuation']

In [20]:
# Lower-case every token so it can be compared against lower-case word lists.
remove_punctuation = list(map(str.lower, remove_punctuation))
remove_punctuation

['this',
 'is',
 'an',
 'example',
 'sentence',
 'i',
 'try',
 'to',
 'remove',
 'stopwords',
 'and',
 'i',
 'try',
 'to',
 'remove',
 'punctuation']

Now let's try to remove stopwords, i.e. very common words such as "the", "is" and "and" that carry little meaning on their own.

In [38]:
from nltk.corpus import stopwords

In [40]:
# Load NLTK's English stopword list; as the output below shows, it is a list
# of lower-case strings.
list_stopwords = stopwords.words('english')
list_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [41]:
# Build a set once: membership tests on a set are O(1) on average, whereas
# testing against the list would scan it again for every token.
stopword_set = set(list_stopwords)
# Keep, in their original order, the tokens that are not stopwords.
[word for word in remove_punctuation if word not in stopword_set]

['example',
 'sentence',
 'try',
 'remove',
 'stopwords',
 'try',
 'remove',
 'punctuation']

In [42]:
# A longer sample text, written one sentence per line for readability; the
# pieces are joined by implicit string concatenation into a single string.
example2 = (
    "Sarah and Ira drove to the store. "
    "Jenny and I opened all the gifts. "
    "The cat and dog ate. "
    "My parents and I went to a movie. "
    "Mrs. Juarez and Mr. Smith are dancing gracefully. "
    "Samantha, Elizabeth, and Joan are on the committee. "
    "The ham, green beans, mashed potatoes, and corn are gluten-free. "
    "The paper and pencil sat idle on the desk."
)

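`WordNetLemmatizer` is a tool for lemmatizing words, i.e. reducing a word to its dictionary form (its lemma), for example a plural noun to its singular. Unless a part of speech is passed, `lemmatize()` treats the word as a noun, so verb forms are left unchanged.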

In [46]:
from nltk.stem import WordNetLemmatizer

In [56]:
# Create one lemmatizer instance; the cells below reuse it for every call.
WNL = WordNetLemmatizer()

In [73]:
# Irregular plural: the output below shows it mapped to the singular noun.
WNL.lemmatize('feet')

'foot'

In [58]:
# Another irregular plural ('-ves' ending), also mapped to its singular.
WNL.lemmatize('wolves')

'wolf'

In [71]:
# Tokenize the longer text into words; punctuation marks become separate
# tokens, while 'Mrs.', 'Mr.' and 'gluten-free' stay whole (see output below).
example2_words = word_tokenize(example2)
example2_words

['Sarah',
 'and',
 'Ira',
 'drove',
 'to',
 'the',
 'store',
 '.',
 'Jenny',
 'and',
 'I',
 'opened',
 'all',
 'the',
 'gifts',
 '.',
 'The',
 'cat',
 'and',
 'dog',
 'ate',
 '.',
 'My',
 'parents',
 'and',
 'I',
 'went',
 'to',
 'a',
 'movie',
 '.',
 'Mrs.',
 'Juarez',
 'and',
 'Mr.',
 'Smith',
 'are',
 'dancing',
 'gracefully',
 '.',
 'Samantha',
 ',',
 'Elizabeth',
 ',',
 'and',
 'Joan',
 'are',
 'on',
 'the',
 'committee',
 '.',
 'The',
 'ham',
 ',',
 'green',
 'beans',
 ',',
 'mashed',
 'potatoes',
 ',',
 'and',
 'corn',
 'are',
 'gluten-free',
 '.',
 'The',
 'paper',
 'and',
 'pencil',
 'sat',
 'idle',
 'on',
 'the',
 'desk',
 '.']

In [72]:
# Lemmatize every token using lemmatize()'s default part of speech. In the
# output below only plural nouns change (e.g. 'gifts' -> 'gift'); verb forms
# such as 'drove' and 'went' come back unchanged.
list(map(WNL.lemmatize, example2_words))

['Sarah',
 'and',
 'Ira',
 'drove',
 'to',
 'the',
 'store',
 '.',
 'Jenny',
 'and',
 'I',
 'opened',
 'all',
 'the',
 'gift',
 '.',
 'The',
 'cat',
 'and',
 'dog',
 'ate',
 '.',
 'My',
 'parent',
 'and',
 'I',
 'went',
 'to',
 'a',
 'movie',
 '.',
 'Mrs.',
 'Juarez',
 'and',
 'Mr.',
 'Smith',
 'are',
 'dancing',
 'gracefully',
 '.',
 'Samantha',
 ',',
 'Elizabeth',
 ',',
 'and',
 'Joan',
 'are',
 'on',
 'the',
 'committee',
 '.',
 'The',
 'ham',
 ',',
 'green',
 'bean',
 ',',
 'mashed',
 'potato',
 ',',
 'and',
 'corn',
 'are',
 'gluten-free',
 '.',
 'The',
 'paper',
 'and',
 'pencil',
 'sat',
 'idle',
 'on',
 'the',
 'desk',
 '.']