# Local example of word counts with Map-Reduce

## Mapper

In [None]:
#!/usr/bin/env python2
"""Example mapper module for counting words via map-reduce.

This file is saved as wc_mapper.py with execute permission 
(chmod +x wc_mapper.py)"""

import sys


def main():
    """Read lines from stdin and emit ``word<TAB>1`` for every word.

    This is for illustration purposes: a 'word' is any run of characters
    delimited by whitespace, so punctuation and letter case are kept as-is.
    Additional cleaning (e.g., removing punctuation) could be added if
    necessary.

    Output is written with ``sys.stdout.write`` rather than the ``print``
    statement so the script runs unchanged on both Python 2 and Python 3.
    """
    for line in sys.stdin:
        # split() with no argument already discards leading/trailing
        # whitespace and collapses runs of it; blank lines yield no words.
        for word in line.split():
            sys.stdout.write(word + '\t' + '1' + '\n')


if __name__ == "__main__":
    main()

## Reducer

In [None]:
#!/usr/bin/env python2
"""Example reducer module for counting words via map-reduce.

This file is saved as wc_reducer.py with execute permission 
(chmod +x wc_reducer.py)"""

from itertools import groupby
from operator import itemgetter
import sys


def read_mapper_output(lines):
    """Lazily turn each line of ``lines`` into a list of tab-separated fields.

    Trailing whitespace (including the newline) is removed first, and the
    line is split on the first tab only, so each item has at most two
    elements: the key and the remainder of the line. A line without a tab
    produces a single-element list.
    """
    for raw_line in lines:
        trimmed = raw_line.rstrip()
        fields = trimmed.split('\t', 1)
        yield fields


def main():
    """Read ``word<TAB>count`` lines from stdin and print each word's total.

    ``groupby`` only merges *adjacent* records that share a key, so the
    input must already be sorted by word (the shuffle stage, e.g. piping the
    mapper output through ``sort``). One ``word<TAB>total`` line is written
    for each run of identical words.

    Output is written with ``sys.stdout.write`` rather than the ``print``
    statement so the script runs unchanged on both Python 2 and Python 3.

    Raises:
        ValueError: if a record has no tab-separated count, or the count is
            not an integer.
    """
    data = read_mapper_output(sys.stdin)
    for word, group in groupby(data, itemgetter(0)):
        # A generator expression avoids building a temporary list per word.
        total_count = sum(int(count) for _, count in group)
        sys.stdout.write(word + '\t' + str(total_count) + '\n')


if __name__ == "__main__":
    main()

## Example

Count words in the following example text:

In [1]:
%%bash
cat seuss

one fish two fish
red fish blue fish
black fish blue fish
old fish new fish


For illustrative purposes, the `sort` command stands in for the shuffle stage: sorting puts identical words on adjacent lines, which the reducer needs in order to group them.

In [2]:
%%bash
cat seuss | ./wc_mapper.py | sort | ./wc_reducer.py

black	1
blue	2
fish	8
new	1
old	1
one	1
red	1
two	1


Another example, using a longer text (the full text of *Alice in Wonderland*), with the output sorted by total word count in descending order:

In [3]:
%%bash
cat alice.txt | ./wc_mapper.py | sort | ./wc_reducer.py | sort --key 2 -n --reverse

the	1664
and	780
to	773
a	662
of	596
she	484
said	416
in	401
it	356
was	329
you	301
I	260
as	246
that	226
Alice	221
with	213
at	211
her	203
had	175
all	168
be	154
on	148
for	146
or	137
very	127
this	122
not	122
'I	121
little	117
they	109
but	105
so	104
The	100
out	97
he	96
his	93
about	91
is	89
what	86
up	83
were	82
went	79
Project	78
one	78
have	77
down	77
Alice,	76
like	74
if	73
no	72
by	72
would	68
when	67
into	67
any	67
thought	63
could	63
your	62
its	60
do	60
*	60
Mock	56
an	56
my	55
are	54
Alice.	54
quite	53
Gutenberg-tm	53
who	51
then	50
their	50
did	50
them	49
And	49
see	48
must	48
don't	48
some	47
began	47
time	46
me	46
know	46
such	45
only	45
looked	45
got	45
from	45
which	44
there	44
just	43
get	43
much	41
it,	41
how	41
herself	40
work	39
way	39
other	39
great	39
off	38
more	38
go	38
came	38
'and	38
think	37
never	37
can	37
I'm	36
been	36
thing	35
say	35
after	35
Queen	34
Turtle	33
She	33
without	32
'What	32
large	32
put	31
over	31
March	31
herself,	31
found	31
again,	31
'Yo