In [20]:
def count_words(text):
    '''
    Count the number of times each word occurs in text (str).
    Return dictionary where keys are unique words and values are
    word counts. Skips punctuation
    '''
    
    #lower case letters
    text = text.lower()
    
    #skip punctuation
    skips = ['.', ',',':', ';', "'", '"']
    for ch in skips:
        text = text.replace(ch, "")
    
    word_counts = {}
    for word in text.split(" "):
        #known word
        if word in word_counts:
            word_counts[word] +=1 
        #unknown word
        else:
            word_counts[word] = 1
    return word_counts

In [32]:
def read_book(title_path):
    '''
    Read a book and retirn it as a string
    '''
    with open(title_path, 'r', encoding = 'UTF-8') as current_file:
        text = current_file.read()
        text = text.replace('\n', '').replace('\r', '')
    return text

In [38]:
def word_stats(word_counts):
    '''
    Return number of unique words and
    word frequences
    '''
    num_unique = len(word_counts)
    counts = word_counts.values()
    return (num_unique, counts)

In [23]:
text = read_book('books/English/shakespeare/Romeo and Juliet.txt')

In [24]:
word_counts = count_words(text)

In [25]:
(num_unique, counts) = word_stats(word_counts)

In [28]:
num_unique

7527

In [29]:
counts

dict_values([735, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 7, 1, 1, 4, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 5, 1, 1, 1, 1, 1, 1, 3, 1, 36, 1, 1, 1, 1, 1, 3, 1, 5, 1, 17, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 4, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 6, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 3, 1, 1, 2, 1, 11, 4, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 2, 1, 3, 5, 1, 1, 1, 1, 1, 1, 1, 1, 5, 3, 1, 1, 1, 1, 170, 2, 1, 4, 3, 1, 1, 1, 1, 1, 21, 1, 1, 1, 5, 1, 1, 1, 1, 1, 2, 1, 3, 1, 1, 19, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 5, 1, 32, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 44, 3, 7, 1, 7, 1, 111, 2, 1, 1, 1, 1, 2, 1, 3, 1, 1, 1, 1, 1, 1, 2, 1, 1, 4, 1, 1, 15, 1, 1, 1, 1, 1, 1, 2, 10, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 29, 1, 1, 1, 2, 1, 9, 1, 3, 1, 1, 1, 1, 14, 1, 1, 58, 1, 1, 3, 2, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 2, 1, 8, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 1, 6, 2, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 2, 1, 1, 1, 

In [11]:
sum(counts)

40776

In [27]:
#English edition of Romeo And Juliet

text = read_book('books/English/shakespeare/Romeo and Juliet.txt')
word_counts = count_words(text)
(num_unique, counts) = word_stats(word_counts)
print(num_unique, sum(counts))

#German edition of Romeo And Juliet

text = read_book('books/German/shakespeare/Romeo und Julia.txt')
word_counts = count_words(text)
(num_unique, counts) = word_stats(word_counts)
print(num_unique, sum(counts))

5118 40776
7527 20311


#### Computing Word Frequency Statistics: Question 1

As defined in Video 3.2.4, which of the following does the function ```word_stats``` return?

- The total number of words
- **The number of unique words**
- **A list of word counts**
- A dictionary of word counts

#### Computing Word Frequency Statistics: Question 2

Which of the two versions of Romeo and Juliet from Project Gutenberg contains more unique words, the original or the German translation?

- The original
- **The German translation**
- The two contain the same number of unique words