# Local example of word counts with Map-Reduce

## Mapper

In [None]:
#!/usr/bin/env python2
"""Example mapper module for counting words via map-reduce.

This file is saved as wc_mapper.py with execute permission 
(chmod +x wc_mapper.py)"""

import sys


def main():
    """Read lines from stdin and emit one ``word<TAB>1`` record per word.

    This is for illustration purposes, treating any run of characters
    separated by whitespace as a 'word'. Additional cleaning (e.g., removing
    punctuation or normalizing case) could be added if necessary.

    Each record is written to stdout on its own line. Blank or
    whitespace-only input lines produce no output.
    """
    emit = sys.stdout.write
    for line in sys.stdin:
        # split() with no arguments already ignores leading/trailing
        # whitespace (including the newline), so no strip() is needed.
        for word in line.split():
            # Write through sys.stdout rather than the Python 2 print
            # statement so the script is valid on Python 2 and Python 3.
            emit(word + '\t' + '1' + '\n')


if __name__ == "__main__":
    main()

## Reducer

In [None]:
#!/usr/bin/env python2
"""Example reducer module for counting words via map-reduce.

This file is saved as wc_reducer.py with execute permission 
(chmod +x wc_reducer.py)"""

from itertools import groupby
from operator import itemgetter
import sys


def read_mapper_output(lines):
    """Lazily parse mapper records into ``[key, value]`` lists.

    Each element of ``lines`` has its trailing whitespace removed and is then
    split on the first tab only, so any further tabs stay inside the value.
    A line that contains no tab yields a one-element list.
    """
    for raw_line in lines:
        record = raw_line.rstrip()
        fields = record.split('\t', 1)
        yield fields


def main():
    """Read ``word<TAB>count`` lines from stdin and print one total per word.

    The input must already be sorted (or at least grouped) by word:
    ``groupby`` only merges *consecutive* records with the same key, so a
    word that appears in two separate runs would be reported twice.

    One ``word<TAB>total`` line is written to stdout for each run of equal
    words.

    Raises:
        ValueError: if a line has no tab-separated count, or the count is
            not an integer.
    """
    data = read_mapper_output(sys.stdin)
    for word, group in groupby(data, itemgetter(0)):
        # A generator expression avoids building a throwaway list per word.
        total_count = sum(int(count) for _, count in group)
        # Write through sys.stdout rather than the Python 2 print statement
        # so the script is valid on Python 2 and Python 3.
        sys.stdout.write(word + '\t' + str(total_count) + '\n')


if __name__ == "__main__":
    main()

## Example

Count words in the following example text:

In [1]:
%%bash
cat seuss

one fish two fish
red fish blue fish
black fish blue fish
old fish new fish


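For illustrative purposes, use the `sort` command in `bash` to stand in for the shuffle stage: sorting places identical words on consecutive lines, which the reducer relies on because `groupby` only merges adjacent records with the same key.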

In [1]:
%%bash
cat seuss | ./wc_mapper.py | sort | ./wc_reducer.py | head

black	1
blue	2
fish	8
new	1
old	1
one	1
red	1
two	1


Another example uses a longer text (the full text of Alice in Wonderland), with the output sorted by total word count in descending order:

In [8]:
%%bash
cat alice.txt | ./wc_mapper.py | sort | ./wc_reducer.py | sort --key 2 -n --reverse > word_counts.txt

In [9]:
%%bash
cat word_counts.txt | head

the	1664
and	780
to	773
a	662
of	596
she	484
said	416
in	401
it	356
was	329
