In [13]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

lemmatizer = WordNetLemmatizer() # used to lemmatize words.

text = "One morning I shot an elephant in my pajamas. How he got into my pajamas I'll never know." # by Groucho Marx

sentences = sent_tokenize(text)
print(sentences)

words = word_tokenize(sentences[0])
print(words)

pos = pos_tag(words)
print(pos)

print([lemmatizer.lemmatize(w) for w in ['elephants', 'go', 'goes', 'going', 'went', 'gone']])

stopWords = set(stopwords.words('english'))
print(stopWords)



['One morning I shot an elephant in my pajamas.', "How he got into my pajamas I'll never know."]
['One', 'morning', 'I', 'shot', 'an', 'elephant', 'in', 'my', 'pajamas', '.']
[('One', 'CD'), ('morning', 'NN'), ('I', 'PRP'), ('shot', 'VBP'), ('an', 'DT'), ('elephant', 'NN'), ('in', 'IN'), ('my', 'PRP$'), ('pajamas', 'NN'), ('.', '.')]
['elephant', 'go', 'go', 'going', 'went', 'gone']
{'ve', 'after', 'your', "haven't", 'ours', 'shouldn', "you're", 'as', 'just', "you've", 'because', "shouldn't", 'there', 'herself', 'won', 'until', 'for', 'shan', 'themselves', 'before', 'y', 'about', "aren't", 'above', 'd', 'once', 'when', 'more', 'very', 'yourself', 'out', 'below', 'those', "didn't", "hadn't", 'weren', 'at', 'll', 'with', 'ain', 'between', 'where', 'by', 'nor', 't', 'myself', 'their', 'both', "wasn't", 'these', 'few', 'some', 'doesn', 'needn', 'while', 'what', 'against', 'all', 'don', 'that', "shan't", 'you', 'not', 'am', 'o', "hasn't", "that'll", 'here', "doesn't", 'been', 'off', 'each',

1. Sentence tokenization is splitting multiple sentences into separate sentence entities.
2. Splitting splitting a sentence into separate token entities i.e. words, commas etc.
3. It classifies words tokens as the parts of speech that they represent. It can be sometimes helpful to find some specific parts of a sentence and it can help in other NLP tasks.
4. Its goal is to join separate types of a single word into the same token, i.e. dogs, dog's and dog are the same dog lemma. Now if you want to find all references to the dog token it can be done easily as all of the references of dog are in the same lemmatized form.
5. They are common words that do not really convey much meaning of the sentence and as such can sometimes be ignored when analyzing texts.

Build your own NLP pipeline (a function named process_text(text)) that takes a paragraph as input, and splits the paragraph into sentences, applies word tokenization, POS tagging and lemmatization on all words. The function should return a list containing the processed sentences.

In [39]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

def process_text(text):
    sents = sent_tokenize(text)
    words = [word_tokenize(sent) for sent in sents]
    pos = [pos_tag(sent) for sent in words]
    lem = [[lemmatizer.lemmatize(word) for word in sent] for sent in words]
    
    assert(len(words) == len(pos) and len(words) == len(lem))
    assert(len(words[0]) == len(pos[0]) and len(words[0]) == len(lem[0]))
    
    return (words, pos, lem)
    
text = 'Build your own NLP pipeline (a function named process_text(text)) that takes a paragraph as input, and splits the paragraph into sentences, applies word tokenization, POS tagging and lemmatization on all words. The function should return a list containing the processed sentences.'

process = process_text(text)

print(process)


([['Build', 'your', 'own', 'NLP', 'pipeline', '(', 'a', 'function', 'named', 'process_text', '(', 'text', ')', ')', 'that', 'takes', 'a', 'paragraph', 'as', 'input', ',', 'and', 'splits', 'the', 'paragraph', 'into', 'sentences', ',', 'applies', 'word', 'tokenization', ',', 'POS', 'tagging', 'and', 'lemmatization', 'on', 'all', 'words', '.'], ['The', 'function', 'should', 'return', 'a', 'list', 'containing', 'the', 'processed', 'sentences', '.']], [[('Build', 'VB'), ('your', 'PRP$'), ('own', 'JJ'), ('NLP', 'NNP'), ('pipeline', 'NN'), ('(', '('), ('a', 'DT'), ('function', 'NN'), ('named', 'VBN'), ('process_text', 'NN'), ('(', '('), ('text', 'NN'), (')', ')'), (')', ')'), ('that', 'WDT'), ('takes', 'VBZ'), ('a', 'DT'), ('paragraph', 'NN'), ('as', 'IN'), ('input', 'NN'), (',', ','), ('and', 'CC'), ('splits', 'VBZ'), ('the', 'DT'), ('paragraph', 'NN'), ('into', 'IN'), ('sentences', 'NNS'), (',', ','), ('applies', 'NNS'), ('word', 'NN'), ('tokenization', 'NN'), (',', ','), ('POS', 'NNP'), ('

Implement a function (filter_text(text)) that uses process_text(text) to process a paragraph and then removes stop words and words that are not verbs, adjectives or nouns (for descriptions of POS tags, read this).

In [49]:
import re

stopWords = set(stopwords.words('english'))
accepted_pos = re.compile('(NN.?.?|VB.?|JJ.?)')

def filter_text(text):
    words, pos, lem = process_text(text)
    filtered = []
    
    for sind, sent in enumerate(lem):
        for wind, word in enumerate(sent):
            if word not in stopWords and accepted_pos.match(pos[sind][wind][1]):
                filtered += [words[sind][wind]]
                
    return filtered

text = 'Implement a function (filter_text(text)) that uses process_text(text) to process a paragraph and then removes stop words and words that are not verbs, adjectives or nouns (for descriptions of POS tags, read this). Here is a second sentence to wow everyone how well this thing is truly working. What a masterpiece of software.' 
print(filter_text(text))
                

['Implement', 'function', 'filter_text', 'text', 'uses', 'process_text', 'text', 'process', 'paragraph', 'removes', 'stop', 'words', 'words', 'verbs', 'adjectives', 'nouns', 'descriptions', 'POS', 'tags', 'read', 'second', 'sentence', 'wow', 'everyone', 'thing', 'working', 'masterpiece', 'software']


In [61]:
import spacy 
nlp = spacy.load("en") 
    
def spacy_parser(text):
    doc = nlp(text)
    for sent in doc.sents:
        for token in sent:
            print(token.text, token.pos_, token.tag_, token.dep_)

VERB VB
DET DT
NOUN NN
PUNCT -LRB-
NOUN NN
PUNCT -RRB-
PUNCT -RRB-
DET WDT
VERB VBZ



1. Dependency parsing attempts to find the dependency relations between the words in a sentence. For example, that in  "I am eating" I and am are dependent on eating. POS tagging only wants to find out the tag for a single word.
2. The pos_ attribute is a simplified version of the tag_ attribute, both describe the words part-of-speech role.
3. A chunk is a collection of words that describe a single entity for example in the form the -adjective- -adjective- noun.
4. NER is a process whose goal is to recognize different entities in the text, for example organizations, products, money etc.

In [62]:
def parse_compare(text):
    print(spacy_parser(text))
    print(process_text(text))

parse_compare('I have a dog')

I PRON PRP nsubj
have VERB VBP ROOT
a DET DT det
dog NOUN NN dobj
None
([['I', 'have', 'a', 'dog']], [[('I', 'PRP'), ('have', 'VBP'), ('a', 'DT'), ('dog', 'NN')]], [['I', 'have', 'a', 'dog']])


In [63]:
parse_compare('Finger licking good.')

Finger NOUN NN nsubj
licking VERB VBG ROOT
good ADJ JJ acomp
. PUNCT . punct
None
([['Finger', 'licking', 'good', '.']], [[('Finger', 'NNP'), ('licking', 'VBG'), ('good', 'JJ'), ('.', '.')]], [['Finger', 'licking', 'good', '.']])


In [70]:
parse_compare('Finger Lickin\' Good')

Finger PROPN NNP compound
Lickin PROPN NNP ROOT
' PUNCT '' case
Good PROPN NNP amod
None
([['Finger', 'Lickin', "'", 'Good']], [[('Finger', 'NNP'), ('Lickin', 'NNP'), ("'", 'POS'), ('Good', 'JJ')]], [['Finger', 'Lickin', "'", 'Good']])


In [None]:
parse_compare('Think different.')

NLTK and Stanford give adjective tag to 'different' in think different when it is in lower case and consider it to be a noun when it is uppercase. Spacy thinks it is an adjective in both cases. Spacy thinks that Finger Lickin' Good is NNP NNP NNP, NLTK gives NNP NNP JJ and Stanford tool gives NN VBG JJ, which is the same as the original Finger licking good for all.

In [166]:
import requests, xmltodict, pickle, os
import numpy as np

def totally_pun_word(word):
    res = requests.get(f'https://api.datamuse.com/words?sl={word}').json()
    rand = np.random.randint(0, len(res))
    return res[rand]['word']

In [159]:
from nltk.corpus import cmudict
arpabet = cmudict.dict()
def pronounce(word):
    return arpabet[word.lower()][0] if word.lower() in arpabet else None

In [160]:
import editdistance
distance = editdistance.eval(pronounce('pi'), pronounce('pie'))

In [177]:
def make_punny(text):
    fil = filter_text(text)
    rand = np.random.randint(0, len(fil))
    replace = fil[rand]
    replacer = totally_pun_word(fil[rand])
    text = text.replace(replace, replacer)
    
    return text
    
print(make_punny('Jurassic park'))
print(make_punny('Jurassic park'))
print(make_punny('Jurassic park'))
print(make_punny('Jurassic park'))
print(make_punny('Life of Pi'))
print(make_punny('Life of Pi'))
print(make_punny('Life of Pi'))
print(make_punny('Life of Pi'))
print(make_punny('Life of Pi'))
print(make_punny('Game of Thrones'))
print(make_punny('Game of Thrones'))
print(make_punny('Game of Thrones'))
print(make_punny('Game of Thrones'))
print(make_punny('Lord of the Rings'))
print(make_punny('Lord of the Rings'))
print(make_punny('Lord of the Rings'))
print(make_punny('Lord of the Rings'))

juris park
Jurassic poar
Jurassic purk
gierig park
Life of paye
fluff of Pi
Life of piy
lerf of Pi
Life of pye
Game of thomes
Game of arens
Game of careens
Game of crohns
laury of the Rings
lardy of the Rings
Lord of the rez
loud of the Rings
