0
A set of classes which make writing map/reduce tasks for Hadoop easy.
0
+ An example job, which reads countries from a Tab-Separated Value file and
0
+ outputs the number of times each country appears:
0
+ from collections import defaultdict
0
+ from hadoop import Job
0
+ from hadoop.parsers import TSVParser
0
+ class CountryCount(Job):
0
+ def __init__(self):
0
+ super(CountryCount, self).__init__()
0
+ self.map_parser = TSVParser
0
+ def map(self, key, values, collector):
0
+ collector.collect(values[COUNTRY_COLUMN], 1)
0
+ def reduce(self, keys_and_values, collector):
0
+ countries = defaultdict(int)
0
+ for country, count in keys_and_values:
0
+ countries[country] += int(count)
0
+ for country, count in countries.iteritems():
0
+ collector.collect(country, count)
0
+ cat data.tsv | python country_count.py --map | python country_count.py --reduce
0
+ To run via Hadoop Streaming:
0
+ bin/hadoop jar contrib/streaming/hadoop-streaming-0.16.0.jar \\
0
+ -input my_countries.tsv -output country_counts \\
0
+ -mapper "country_count.py --map" -reducer "country_count.py --reduce"
0
-from hadoop.collectors import Collector
0
+from hadoop.collectors import KeyValueCollector
0
from hadoop.parsers import LineParser, KeyValueParser
0
from hadoop.runner import Runner
0
implement map() and reduce().
0
+ Creates a new job instance.
0
+ Override this to change the parser and collector types for your map()
0
+ and reduce() methods. They default to:
0
+ map_parser = LineParser
0
+ map_collector = KeyValueCollector
0
+ reduce_parser = KeyValueParser
0
+ reduce_collector = KeyValueCollector
0
super(Job, self).__init__()
0
- self.map_parser, self.map_collector = LineParser, Collector
0
- self.reduce_parser, self.reduce_collector = KeyValueParser, Collector
0
+ self.map_parser, self.map_collector = LineParser, KeyValueCollector
0
+ self.reduce_parser, self.reduce_collector = KeyValueParser, KeyValueCollector
0
def start_map(self, parser_stream=sys.stdin, collector_stream=sys.stdout):
0
- Starts the mapping process.
0
+ Starts the mapping process. Should only be called by the Runner.
0
parser = self.map_parser(parser_stream)
0
collector = self.map_collector(collector_stream)
0
def start_reduce(self, parser_stream=sys.stdin, collector_stream=sys.stdout):
0
- Starts the reducing process.
0
+ Starts the reducing process. Should only be called by the Runner.
0
parser = self.reduce_parser(parser_stream)
0
collector = self.reduce_collector(collector_stream)
Comments
No one has commented yet.