# Parse Martin Luther King's "I have a dream" speech and count the most commonly used words, skipping the so-called "stop words" (like "I", "an", "the", etc.)

### We'll go step by step through this

In [None]:
# importing some libraries - we will discuss this more later
from pathlib import Path # this library is helpful for navigating file locations
datapath = Path('../data') # setting a path for location of data files
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS # here is a list of the English stop words
# unpack the stop-word collection into a plain list so we can print and inspect it
stopwords = [*ENGLISH_STOP_WORDS]
print(stopwords)

### Now we will read in the speech and do some cleaning of the data (usually called "data munging" -- see the glossary in the `more_resources` folder). Don't sweat the details of the next few lines... they are just here for future you to come back to.

In [None]:
# read the speech in as a list of lines; the "with" block makes sure the file
# is closed again as soon as we are done reading it
with open(datapath / 'dream.txt') as speech_file:
    speech_raw = speech_file.readlines()
# strip leading/trailing whitespace (including the newline) from every line
speech = [line.strip() for line in speech_raw]

In [None]:
# notebook shorthand (IPython's %pwd magic) that shows the current working directory --
# useful for checking where the relative path stored in `datapath` is resolved from
pwd

In [None]:
# build a separate list holding every cleaned-up line, leaving `speech` itself untouched
words = list(speech)
words

In [None]:
# glue all of the lines together into one long string, with a single space between lines
line_separator = ' '
words = line_separator.join(words)
words

In [None]:
# convert everything to lower case so that e.g. "Dream" and "dream" are treated as the same word
words = str.lower(words)

In [None]:
# get rid of punctuation using replace, swapping each mark for the empty string (i.e. '')
# NOTE: '?', '"', '(' and ')' are in the list too; otherwise a token such as 'satisfied?'
#       would be counted separately from 'satisfied'
# NOTE: removing '-' joins hyphenated words together (e.g. 'self-evident' -> 'selfevident');
#       apostrophes are left alone so contractions and possessives stay intact
punks = ['.', ',', ':', ';', '!', '?', '-', '"', '(', ')']
for cp in punks:
    words = words.replace(cp, '')

In [None]:
# finally break the string apart wherever there is "whitespace", which gives a list of words
# (sep=None is the default: any run of spaces, tabs or newlines counts as one separator)
words = words.split(sep=None)
words

# NOW - it's your turn 

In [None]:
# loop through the words, check whether each one is in the stopwords list and,
# if it is NOT, keep it in a new list called "kept_words"

In [None]:
# hint: look at the following line and see what it returns (it evaluates to True or False).
# try "not in" as well
'the' in stopwords

In [None]:
# hint: remember the difference between append and extend for lists

In [None]:
# collect every word that is not a stop word
kept_words = []  # start with an empty list
for word in words:
    if word in stopwords:
        continue  # stop word: skip it and move on to the next word
    kept_words.append(word)

In [None]:
# display the words that were kept (i.e. the ones that are not stop words)
kept_words

### cool - got rid of stop words

In [None]:
# start with an empty dictionary: each key will be a unique word from the speech
# and each value will be the number of times that word appears
counts = {}

In [None]:
# hint: recall dictionaries have properties like .keys(), .values(), and .items()

In [None]:
# tally every kept word; .get() hands back 0 the first time a word is seen,
# so new words start at 1 and repeated words are bumped up by one
for word in kept_words:
    counts[word] = counts.get(word, 0) + 1

In [None]:
# display the dictionary of word -> number of occurrences
counts

### can we organize these by sorting them? 
hint: look at the function `sorted()` and its options  
hint: we can get the values from the dictionary using `counts.values()`

In [None]:
# pull out just the counts and order them from largest to smallest
top_counts = list(counts.values())
top_counts.sort(reverse=True)
len(top_counts)

In [None]:
# it would be really nice to reduce this to a unique set of values rather than repeated numbers
# dict.fromkeys() keeps only the first occurrence of each value, in the order it was seen,
# so the result stays in the same (descending) order as top_counts
top_counts_unique = list(dict.fromkeys(top_counts))
len(top_counts_unique)

### and finally, let's print out the words with their counts in descending order
hint: a common way to iterate over a dictionary is like the following:

In [None]:
# .items() yields (key, value) pairs, which are unpacked into two loop variables:
# here `key` is a word and `val` is how many times it was counted
for key, val in counts.items():
    print(key, val)

In [None]:
# walk through the distinct counts (top_counts_unique, largest first) and, for each one,
# gather the words from the dictionary that occur exactly that many times, then print them
for ccount in top_counts_unique:
    matching_words = [key for key, val in counts.items() if val == ccount]
    for key in matching_words:
        print(key, ccount)