<?xml version="1.0" encoding="UTF-8"?>
<commit>
  <added type="array"/>
  <modified type="array">
    <modified>
      <diff>@@ -25,11 +25,47 @@ THE SOFTWARE.
 
 &quot;&quot;&quot;
   A set of classes which make writing map/reduce tasks for Hadoop easy.
+  
+  An example job, which reads countries from a Tab-Separated Value file and
+  outputs the number of times each country appears:
+    
+  country_count.py:
+  
+    from collections import defaultdict
+    from hadoop import Job
+    from hadoop.parsers import TSVParser
+    
+    COUNTRY_COLUMN = 3
+    
+    class CountryCount(Job):
+      def __init__(self):
+        super(CountryCount, self).__init__()
+        self.map_parser = TSVParser
+      
+      def map(self, key, values, collector):
+        collector.collect(values[COUNTRY_COLUMN], 1)
+      
+      def reduce(self, keys_and_values, collector):
+        countries = defaultdict(int)
+        for country, count in keys_and_values:
+          countries[country] += int(count)
+        for country, count in countries.iteritems():
+          collector.collect(country, count)
+  
+  To run locally:
+    
+    cat data.tsv | python country_count.py --map | python country_count.py --reduce
+  
+  To run via Hadoop Streaming:
+    
+    bin/hadoop jar contrib/streaming/hadoop-streaming-0.16.0.jar \\
+      -input my_countries.tsv -output country_counts \\
+      -mapper &quot;country_count.py --map&quot; -reducer &quot;country_count.py --reduce&quot;
 &quot;&quot;&quot;
 
 import sys
 
-from hadoop.collectors import Collector
+from hadoop.collectors import KeyValueCollector
 from hadoop.parsers import LineParser, KeyValueParser
 from hadoop.runner import Runner
 
@@ -39,13 +75,24 @@ class Job(object):
     implement map() and reduce().
   &quot;&quot;&quot;
   def __init__(self):
+    &quot;&quot;&quot;
+      Creates a new job instance.
+      
+      Override this to change the parser and collector types for your map()
+      and reduce() methods. They default to:
+        
+        map_parser = LineParser
+        map_collector = KeyValueCollector
+        reduce_parser = KeyValueParser
+        reduce_collector = KeyValueCollector
+    &quot;&quot;&quot;
     super(Job, self).__init__()
-    self.map_parser, self.map_collector = LineParser, Collector
-    self.reduce_parser, self.reduce_collector = KeyValueParser, Collector
+    self.map_parser, self.map_collector = LineParser, KeyValueCollector
+    self.reduce_parser, self.reduce_collector = KeyValueParser, KeyValueCollector
   
   def start_map(self, parser_stream=sys.stdin, collector_stream=sys.stdout):
     &quot;&quot;&quot;
-      Starts the mapping process.
+      Starts the mapping process. Should only be called by the Runner.
     &quot;&quot;&quot;
     parser = self.map_parser(parser_stream)
     collector = self.map_collector(collector_stream)
@@ -57,7 +104,7 @@ class Job(object):
   
   def start_reduce(self, parser_stream=sys.stdin, collector_stream=sys.stdout):
     &quot;&quot;&quot;
-      Starts the reducing process.
+      Starts the reducing process. Should only be called by the Runner.
     &quot;&quot;&quot;
     parser = self.reduce_parser(parser_stream)
     collector = self.reduce_collector(collector_stream)</diff>
      <filename>python/lib/hadoop/__init__.py</filename>
    </modified>
    <modified>
      <diff>@@ -1,7 +1,7 @@
 &quot;&quot;&quot;
   Output collectors for Hadoop tasks.
 &quot;&quot;&quot;
-class Collector(object):
+class KeyValueCollector(object):
   &quot;&quot;&quot;
     A basic string/string collector for key/value pairs.
     
@@ -13,7 +13,7 @@ class Collector(object):
       Creates a new Collector instance which outputs data to the provided
       stream.
     &quot;&quot;&quot;
-    super(Collector, self).__init__()
+    super(KeyValueCollector, self).__init__()
     self.stream = stream
   
   def collect(self, key, value):
@@ -23,7 +23,7 @@ class Collector(object):
     self.stream.write('%s\t%s\n' % (key, value))
   
 
-class TSVCollector(Collector):
+class TSVCollector(KeyValueCollector):
   &quot;&quot;&quot;
     A collector which outputs multiple, tab-separated values.
     </diff>
      <filename>python/lib/hadoop/collectors.py</filename>
    </modified>
    <modified>
      <diff>@@ -11,6 +11,11 @@ class LineParser(object):
     &gt;&gt;&gt; p = LineParser(sys.stdin)
     &gt;&gt;&gt; lines = [line for line in p]
     ['blah', 'blee', 'blorg']
+    
+    Your map() or reduce() method should have the following profile:
+      
+      f(self, line, collector)
+      
   &quot;&quot;&quot;
   def __init__(self, stream):
     &quot;&quot;&quot;
@@ -37,6 +42,10 @@ class KeyValueParser(LineParser):
   &quot;&quot;&quot;
     A key/value parser. Each key and value are separated by a tab, as per
     Hadoop Streaming.
+    
+    Your map() or reduce() method should have the following profile:
+      
+      f(self, key, value, collector)
   &quot;&quot;&quot;
   def parse_line(self, line):
     &quot;&quot;&quot;
@@ -58,6 +67,10 @@ class TSVParser(KeyValueParser):
     &gt;&gt;&gt; p = TSVParser(sys.stdin)
     &gt;&gt;&gt; lines = [line for line in p]
     [('key', ('1', '2', '3')), ('another', ('4', '5', '6'))]
+    
+    Your map() or reduce() method should have the following profile:
+      
+      f(self, key, values, collector)
   &quot;&quot;&quot;
   def parse_line(self, line):
     &quot;&quot;&quot;</diff>
      <filename>python/lib/hadoop/parsers.py</filename>
    </modified>
    <modified>
      <diff>@@ -3,7 +3,7 @@
 import unittest
 from helpers import test
 
-from hadoop.collectors import Collector, TSVCollector
+from hadoop.collectors import KeyValueCollector, TSVCollector
 
 class MockStdOut(object):
   def __init__(self):
@@ -24,7 +24,7 @@ class CollectorTests(unittest.TestCase):
   
   @test
   def collector_should_output_key_and_value_to_stdout(self):
-    collector = Collector(stream=self.stdout)
+    collector = KeyValueCollector(stream=self.stdout)
     collector.collect('key', 'value')
     self.assertEqual(['key\tvalue\n'], self.stdout.read_lines())
   </diff>
      <filename>python/tests/test_collectors.py</filename>
    </modified>
    <modified>
      <diff>@@ -6,7 +6,7 @@ from examples.word_count import WordCountJob
 
 from hadoop import Job
 from hadoop.parsers import LineParser, KeyValueParser
-from hadoop.collectors import Collector
+from hadoop.collectors import KeyValueCollector
 import hadoop.runner
 
 
@@ -47,11 +47,11 @@ class JobTests(unittest.TestCase):
   
   @test
   def job_should_have_a_map_collector(self):
-    self.assertEqual(Collector, self.job.map_collector)
+    self.assertEqual(KeyValueCollector, self.job.map_collector)
   
   @test
   def job_should_have_a_reduce_collector(self):
-    self.assertEqual(Collector, self.job.reduce_collector)
+    self.assertEqual(KeyValueCollector, self.job.reduce_collector)
   
   @test
   def job_should_map_parser_output_to_collector_input(self):</diff>
      <filename>python/tests/test_job.py</filename>
    </modified>
  </modified>
  <removed type="array"/>
  <parents type="array">
    <parent>
      <id>d1b3bc3dffb8ee3690dce306b55b67ddff82159c</id>
    </parent>
  </parents>
  <author>
    <name>Coda Hale</name>
    <email>coda.hale@gmail.com</email>
  </author>
  <url>http://github.com/codahale/hadoop-streaming/commit/53c462ecee0da108f330fdbe2791641078081f36</url>
  <id>53c462ecee0da108f330fdbe2791641078081f36</id>
  <committed-date>2008-03-25T12:09:15-07:00</committed-date>
  <authored-date>2008-03-25T12:09:15-07:00</authored-date>
  <message>Changed Collector to KeyValueCollector to better describe it, and added some documentation to stuff.</message>
  <tree>9c2ceda7823be92a81d56b3b6e931a778caa752e</tree>
  <committer>
    <name>Coda Hale</name>
    <email>coda.hale@gmail.com</email>
  </committer>
</commit>
