public
Description: Support libraries for writing Hadoop Streaming-compatible map/reduce tasks.
Clone URL: git://github.com/codahale/hadoop-streaming.git
Search Repo:
Extracted out an example job.
codahale (author)
Mon Mar 24 15:16:09 -0700 2008
commit  df084ac7be3205ed039f50dabbc598ed8160ffe4
tree    1fa0e78c8905a22114fb16f3ba61fcf542ea7db2
parent  dec0d947c3fc852039361c3a8c4b4aca75ad3c1e
...
1
2
3
4
5
 
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
...
1
2
 
3
4
5
6
7
8
9
10
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
13
14
0
@@ -1,30 +1,14 @@
0
 #!/usr/bin/env python
0
 # encoding: utf-8
0
-from collections import defaultdict
0
 import unittest
0
 from helpers import test
0
+from examples.word_count import WordCountJob
0
 
0
 from hadoop import Job
0
 from hadoop.parsers import LineParser, KeyValueParser
0
 from hadoop.collectors import Collector
0
 import hadoop.runner
0
 
0
-class WordCountJob(Job):
0
- """A sample word-count Hadoop job."""
0
-
0
- def map(self, line, collector):
0
- """Splits line into words, emits counts."""
0
- for word in line.split(' '):
0
- collector.collect(word.strip(), 1)
0
-
0
- def reduce(self, iterator, collector):
0
- """Reduces word counts, collects totals."""
0
- accumulator = defaultdict(int)
0
- for word, count in iterator:
0
- accumulator[word] += int(count)
0
- for word, total in accumulator.iteritems():
0
- collector.collect(word, total)
0
-
0
 
0
 class MockStream(object):
0
   def __init__(self):

Comments

    No one has commented yet.