# Parse Martin Luther King's "I have a dream" speech and count the most commonly used words, skipping the so-called "stop words" (like "I", "an", "the", etc.)

### We'll go step by step through this

In [1]:
# importing some libraries - we will discuss this more later
from pathlib import Path # this library is helpful for navigating file locations
datapath = Path('../data') # setting a path for location of data files
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS # here is a list of the English stop words
# convert the stop words to a plain list so we can practice list operations on it
stopwords = list(ENGLISH_STOP_WORDS)
# print the whole list to see which words count as "stop words" (note: the printed order is not alphabetical)
print(stopwords)

['same', 'since', 'noone', 'eight', 'fifteen', 'his', 'latter', 'both', 'they', 'other', 'for', 'amoungst', 'several', 'something', 'had', 'less', 'me', 'nobody', 'any', 'these', 'whereafter', 'always', 'hereafter', 'until', 'give', 'neither', 'thru', 'others', 'con', 'been', 'toward', 'eleven', 'itself', 'also', 'herein', 'interest', 'nothing', 'their', 'amount', 'fifty', 'why', 'the', 'your', 'four', 'should', 'what', 'on', 'latterly', 'among', 'is', 'might', 'nine', 'otherwise', 'ie', 'often', 'elsewhere', 'few', 'everywhere', 'even', 'move', 'sincere', 'himself', 'fill', 'almost', 'here', 'hasnt', 'anything', 'mill', 'onto', 'still', 'we', 'somewhere', 'above', 'many', 'someone', 'formerly', 'keep', 'nevertheless', 'and', 'hence', 'hereby', 'never', 'together', 'were', 'its', 'therefore', 'that', 'in', 'first', 'under', 'last', 'wherever', 'much', 'he', 'ever', 'whatever', 'else', 'could', 'from', 'out', 'an', 'him', 'alone', 'whether', 'none', 'by', 'seems', 'across', 'ourselves',

### Now we will read in the speech and do some cleaning of the data (usually called "data munging"--see glossary in `more_resources` folder). Don't sweat the details of the next few lines... they are just here for future you to come back to.

In [2]:
# read the speech one line at a time; the `with` block guarantees the file
# is closed as soon as we are done reading, even if an error occurs
with open(datapath / 'dream.txt') as speech_file:
    speech_raw = speech_file.readlines()
# remove leading/trailing whitespace (including the newline) from every line
speech = [line.strip() for line in speech_raw]

In [3]:
pwd

'/Users/mnfienen/Documents/GIT/python-for-hydrology/notebooks/part0_python_intro/solutions'

In [4]:
# start from an empty list and add every line of the speech to it in one call
words = []
words.extend(speech)
words

['I am happy to join with you today in what will go down in history as the greatest demonstration for freedom in the history of our nation.',
 '',
 'Five score years ago, a great American, in whose symbolic shadow we stand today, signed the Emancipation Proclamation. This momentous decree came as a great beacon light of hope to millions of Negro slaves who had been seared in the flames of withering injustice. It came as a joyous daybreak to end the long night of their captivity.',
 '',
 "But one hundred years later, the Negro still is not free. One hundred years later, the life of the Negro is still sadly crippled by the manacles of segregation and the chains of discrimination. One hundred years later, the Negro lives on a lonely island of poverty in the midst of a vast ocean of material prosperity. One hundred years later, the Negro is still languished in the corners of American society and finds himself an exile in his own land. And so we've come here today to dramatize a shameful co

In [5]:
# join the lines of the speech into one long string, separated by single spaces.
# Building the string from `speech` (rather than from `words` itself) means this
# cell gives the same result no matter how many times it is run.
words = ' '.join(speech)
words

"I am happy to join with you today in what will go down in history as the greatest demonstration for freedom in the history of our nation.  Five score years ago, a great American, in whose symbolic shadow we stand today, signed the Emancipation Proclamation. This momentous decree came as a great beacon light of hope to millions of Negro slaves who had been seared in the flames of withering injustice. It came as a joyous daybreak to end the long night of their captivity.  But one hundred years later, the Negro still is not free. One hundred years later, the life of the Negro is still sadly crippled by the manacles of segregation and the chains of discrimination. One hundred years later, the Negro lives on a lonely island of poverty in the midst of a vast ocean of material prosperity. One hundred years later, the Negro is still languished in the corners of American society and finds himself an exile in his own land. And so we've come here today to dramatize a shameful condition.  In a se

In [6]:
# put this all in lower case so that, for example, "The" and "the" count as the same word
words = words.lower()

In [7]:
# get rid of punctuation: every character listed in `punks` is deleted from the text
punks = ['.',',',':',';','!','-']
# build a translation table whose third argument lists the characters to drop
drop_table = str.maketrans('', '', ''.join(punks))
words = words.translate(drop_table)

In [8]:
# finally split on "whitespace" resulting in a list of words
# (split() with no arguments treats any run of whitespace as a single separator,
# so the double spaces left by blank lines do not produce empty words)
words = words.split()
words

['i',
 'am',
 'happy',
 'to',
 'join',
 'with',
 'you',
 'today',
 'in',
 'what',
 'will',
 'go',
 'down',
 'in',
 'history',
 'as',
 'the',
 'greatest',
 'demonstration',
 'for',
 'freedom',
 'in',
 'the',
 'history',
 'of',
 'our',
 'nation',
 'five',
 'score',
 'years',
 'ago',
 'a',
 'great',
 'american',
 'in',
 'whose',
 'symbolic',
 'shadow',
 'we',
 'stand',
 'today',
 'signed',
 'the',
 'emancipation',
 'proclamation',
 'this',
 'momentous',
 'decree',
 'came',
 'as',
 'a',
 'great',
 'beacon',
 'light',
 'of',
 'hope',
 'to',
 'millions',
 'of',
 'negro',
 'slaves',
 'who',
 'had',
 'been',
 'seared',
 'in',
 'the',
 'flames',
 'of',
 'withering',
 'injustice',
 'it',
 'came',
 'as',
 'a',
 'joyous',
 'daybreak',
 'to',
 'end',
 'the',
 'long',
 'night',
 'of',
 'their',
 'captivity',
 'but',
 'one',
 'hundred',
 'years',
 'later',
 'the',
 'negro',
 'still',
 'is',
 'not',
 'free',
 'one',
 'hundred',
 'years',
 'later',
 'the',
 'life',
 'of',
 'the',
 'negro',
 'is',
 'sti

# NOW - it's your turn 

In [9]:
# loop through the words, check whether each one is in the stopwords list and,
# if it is NOT, keep it in a new list called "kept_words"

In [10]:
# hint: look at the following line and see what it returns. try "not in" as well
'the' in stopwords

True

In [11]:
# hint: remember the difference between append and extend for lists

In [12]:
# build the filtered list in one pass, keeping only words absent from the stop-word list
kept_words = [word for word in words if word not in stopwords]

In [13]:
kept_words

['happy',
 'join',
 'today',
 'history',
 'greatest',
 'demonstration',
 'freedom',
 'history',
 'nation',
 'score',
 'years',
 'ago',
 'great',
 'american',
 'symbolic',
 'shadow',
 'stand',
 'today',
 'signed',
 'emancipation',
 'proclamation',
 'momentous',
 'decree',
 'came',
 'great',
 'beacon',
 'light',
 'hope',
 'millions',
 'negro',
 'slaves',
 'seared',
 'flames',
 'withering',
 'injustice',
 'came',
 'joyous',
 'daybreak',
 'end',
 'long',
 'night',
 'captivity',
 'years',
 'later',
 'negro',
 'free',
 'years',
 'later',
 'life',
 'negro',
 'sadly',
 'crippled',
 'manacles',
 'segregation',
 'chains',
 'discrimination',
 'years',
 'later',
 'negro',
 'lives',
 'lonely',
 'island',
 'poverty',
 'midst',
 'vast',
 'ocean',
 'material',
 'prosperity',
 'years',
 'later',
 'negro',
 'languished',
 'corners',
 'american',
 'society',
 'finds',
 'exile',
 'land',
 "we've",
 'come',
 'today',
 'dramatize',
 'shameful',
 'condition',
 'sense',
 "we've",
 'come',
 "nation's",
 'capit

### cool - got rid of stop words

In [14]:
# start with an empty dictionary; later its keys will be the unique words in the speech
# and its values will be the number of times each word occurs
counts = {}

In [15]:
# hint: recall dictionaries have properties like .keys(), .values(), and .items()

In [16]:
# tally each kept word: start from 0 the first time a word is seen, then add one
for word in kept_words:
    counts[word] = counts.get(word, 0) + 1

In [17]:
counts

{'happy': 1,
 'join': 3,
 'today': 9,
 'history': 2,
 'greatest': 1,
 'demonstration': 1,
 'freedom': 20,
 'nation': 10,
 'score': 1,
 'years': 5,
 'ago': 1,
 'great': 5,
 'american': 4,
 'symbolic': 1,
 'shadow': 1,
 'stand': 3,
 'signed': 1,
 'emancipation': 1,
 'proclamation': 1,
 'momentous': 1,
 'decree': 1,
 'came': 2,
 'beacon': 1,
 'light': 1,
 'hope': 4,
 'millions': 1,
 'negro': 13,
 'slaves': 2,
 'seared': 1,
 'flames': 1,
 'withering': 1,
 'injustice': 3,
 'joyous': 1,
 'daybreak': 1,
 'end': 2,
 'long': 6,
 'night': 1,
 'captivity': 1,
 'later': 4,
 'free': 5,
 'life': 2,
 'sadly': 1,
 'crippled': 1,
 'manacles': 1,
 'segregation': 2,
 'chains': 1,
 'discrimination': 1,
 'lives': 1,
 'lonely': 1,
 'island': 1,
 'poverty': 1,
 'midst': 1,
 'vast': 1,
 'ocean': 1,
 'material': 1,
 'prosperity': 1,
 'languished': 1,
 'corners': 1,
 'society': 1,
 'finds': 1,
 'exile': 1,
 'land': 4,
 "we've": 3,
 'come': 10,
 'dramatize': 1,
 'shameful': 1,
 'condition': 1,
 'sense': 1,
 "nat

### can we organize these by sorting them? 
hint: look at the function `sorted()` and its options  
hint: we can get the values from the dictionary using `counts.values()`

In [18]:
# collect every count from the dictionary, then order them from largest to smallest
top_counts = list(counts.values())
top_counts.sort(reverse=True)
len(top_counts)

432

In [19]:
# reduce the sorted counts to unique values only: dictionary keys cannot repeat and
# they keep the order in which they were first seen, so the existing order is preserved
top_counts_unique = list(dict.fromkeys(top_counts))
len(top_counts_unique)

14

### and finally, let's print out the words with their counts in descending order
hint: a common way to iterate over a dictionary is like the following:

In [20]:
# .items() yields (key, value) pairs, which are unpacked here into `key` and `val`;
# this prints every word with its count, in the order the words were added to the dictionary
for key, val in counts.items():
    print(key, val)

happy 1
join 3
today 9
history 2
greatest 1
demonstration 1
freedom 20
nation 10
score 1
years 5
ago 1
great 5
american 4
symbolic 1
shadow 1
stand 3
signed 1
emancipation 1
proclamation 1
momentous 1
decree 1
came 2
beacon 1
light 1
hope 4
millions 1
negro 13
slaves 2
seared 1
flames 1
withering 1
injustice 3
joyous 1
daybreak 1
end 2
long 6
night 1
captivity 1
later 4
free 5
life 2
sadly 1
crippled 1
manacles 1
segregation 2
chains 1
discrimination 1
lives 1
lonely 1
island 1
poverty 1
midst 1
vast 1
ocean 1
material 1
prosperity 1
languished 1
corners 1
society 1
finds 1
exile 1
land 4
we've 3
come 10
dramatize 1
shameful 1
condition 1
sense 1
nation's 1
capital 1
cash 2
check 5
architects 1
republic 1
wrote 1
magnificent 1
words 3
constitution 1
declaration 1
independence 1
signing 1
promissory 2
note 3
fall 1
heir 1
promise 1
men 6
yes 1
black 4
white 6
guaranteed 1
unalienable 1
rights 3
liberty 2
pursuit 1
happiness 1
obvious 1
america 5
defaulted 1
insofar 1
citizens 1
color 2


In [21]:
# walk through the unique counts (top_counts_unique) one at a time and,
# for each of them, pull out the dictionary entries having exactly that count
for target in top_counts_unique:
    matches = [(word, n) for word, n in counts.items() if n == target]
    # print the matching words in the order they appear in the dictionary
    for word, n in matches:
        print(word, n)

freedom 20
negro 13
let 13
day 12
ring 12
dream 11
nation 10
come 10
today 9
justice 8
able 8
satisfied 7
long 6
men 6
white 6
years 5
great 5
free 5
check 5
america 5
time 5
children 5
new 5
shall 5
faith 5
american 4
hope 4
later 4
land 4
black 4
mississippi 4
mountain 4
join 3
stand 3
injustice 3
we've 3
words 3
note 3
rights 3
people 3
make 3
rise 3
valley 3
brotherhood 3
god's 3
sweltering 3
alabama 3
georgia 3
state 3
little 3
sing 3
history 2
came 2
slaves 2
end 2
life 2
segregation 2
cash 2
promissory 2
liberty 2
color 2
insufficient 2
funds 2
refuse 2
believe 2
urgency 2
racial 2
negro's 2
content 2
continue 2
say 2
struggle 2
dignity 2
allow 2
creative 2
physical 2
force 2
brothers 2
realize 2
destiny 2
walk 2
police 2
brutality 2
cities 2
vote 2
york 2
like 2
mighty 2
jail 2
quest 2
suffering 2
work 2
south 2
knowing 2
despair 2
live 2
true 2
meaning 2
sons 2
heat 2
boys 2
girls 2
hands 2
hill 2
places 2
stone 2
thee 2
mountainside 2
happy 1
greatest 1
demonstration 1
score 