In [107]:
import nltk
nltk.download('gutenberg')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_eng')

moby_raw = nltk.corpus.gutenberg.raw('melville-moby_dick.txt')

from nltk.tokenize import word_tokenize
tockens = word_tokenize(moby_raw)

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\Hello\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hello\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Hello\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


<font color = green >

### Question 1

</font>


What is the lexical diversity of the given text input? (i.e. ratio of unique tokens to the total number of tokens)


In [90]:
def answer_one():
    return len(set(tockens)) / len(tockens)

answer_one()

0.08086685317096488

<font color = green >

### Question 2

</font>

What percentage of tokens is 'whale'or 'Whale'?

In [91]:
def answer_two():
    count = tockens.count("whale") + tockens.count("Whale")
    return  count / len((tockens)) * 100

answer_two()

0.41571651346670746

<font color = green >

### Question 3

</font>

What are the 20 most frequently occurring (unique) tokens in the text? What is their frequency?

In [92]:
from nltk import FreqDist

def answer_three():
    frequencies = FreqDist(tockens)
    return frequencies.most_common(10)

answer_three()

[(',', 19204),
 ('the', 13715),
 ('.', 7306),
 ('of', 6513),
 ('and', 6010),
 ('a', 4545),
 ('to', 4515),
 (';', 4173),
 ('in', 3908),
 ('that', 2981)]

<font color = green >

### Question 4

</font>

What tokens have a length of greater than 5 and frequency of more than 150?
<br>*This function should return a sorted list of the tokens that match the above constraints. To sort your list, use `sorted()`*

In [93]:
def answer_four():
    frequencies = FreqDist(tockens)
    ret = [w for w in frequencies.keys() if len(w) > 5 
           and frequencies[str(w)] > 150]
    return sorted(ret)


answer_four()

['Captain',
 'Pequod',
 'Queequeg',
 'Starbuck',
 'almost',
 'before',
 'himself',
 'little',
 'seemed',
 'should',
 'though',
 'through',
 'whales',
 'without']

<font color = green >

### Question 5

</font>

Find the longest word in text1 and that word's length.
<br>
*This function should return a tuple `(longest_word, length)`.*


In [94]:
def answer_five():
    tocken = max(tockens, key=len)
    length = len(tocken)
    return (tocken, length)

answer_five()

("twelve-o'clock-at-night", 23)

<font color = green >

### Question 6

</font>

What unique words have a frequency of more than 2000? What is their frequency?
<br>*This function should return a list of tuples of the form `(frequency, word)` sorted in descending order of frequency.*


In [95]:
def answer_six():
    no_punct = [t for t in tockens if t.isalpha()]
    frequencies = FreqDist(no_punct)
    ret = [(freq, word) for word, freq in frequencies.items()
           if freq > 2000]
    return sorted(ret, reverse=True)

answer_six()

[(13715, 'the'),
 (6513, 'of'),
 (6010, 'and'),
 (4545, 'a'),
 (4515, 'to'),
 (3908, 'in'),
 (2981, 'that'),
 (2459, 'his'),
 (2206, 'it'),
 (2121, 'I')]

<font color = green >

### Question 7

</font>

What is the average number of tokens per sentence?
<br>*This function should return a float.*

In [96]:
from nltk.tokenize import sent_tokenize
import numpy as np


def answer_seven():
    tokens_per_sent = [len(word_tokenize(sent)) for sent in sent_tokenize(moby_raw)]
    return np.average(tokens_per_sent)

answer_seven()

np.float64(25.90560292326431)

<font color = green >

### Question 8

</font>

What are the 5 most frequent parts of speech in this text? What is their frequency?
<br>*This function should return a list of tuples of the form `(part_of_speech, frequency)` sorted in descending order of frequency.*

In [108]:
from nltk import pos_tag

def answer_eight():
    tagged = pos_tag(tockens)
    pos_counts = FreqDist(tag for (word, tag) in tagged)
    return pos_counts.most_common(5)


answer_eight()

[('NN', 32722), ('IN', 28659), ('DT', 25885), (',', 19204), ('JJ', 17598)]

<font color = green >

### Question 9

</font>

Create spelling recommender, that take a list of misspelled words and recommends a correctly spelled word for every word in the list.

For every misspelled word, the recommender should find find the word in `correct_spellings` that has the shortest `edit distance` (you may need  to use `nltk.edit_distance(word_1, word_2, transpositions=True)`), and starts with the same letter as the misspelled word, and return that word as a recommendation.

Recommender should provide recommendations for the three words: `['cormulent', 'incendenece', 'validrate']`.
<br>*This function should return a list of length three:
`['cormulent_reccomendation', 'incendenece_reccomendation', 'validrate_reccomendation']`.*

In [25]:
from nltk.corpus import words
from nltk import edit_distance


def answer_nine(default_words= ['cormulent', 'incendenece', 'validrate']):
    word_list = words.words()
    
    ret = []
    for word in default_words:
        closest_word = None
        min_dist = 100

        for correct_word in word_list:
            dist = edit_distance(word, correct_word)
            if dist <= min_dist:
                min_dist = dist
                closest_word = correct_word
            if dist == 1:
                break
        ret.append(closest_word)

    return ret

answer_nine()

['corpulent', 'intendence', 'validate']