This repository is private.
All pages are served over SSL and all pushing and pulling is done over SSH.
No one may fork, clone, or view it unless they are added as a member.
Every repository marked with the private-repository icon is private.
This repository is public.
Anyone may fork, clone, or view it.
Every repository marked with the public-repository icon is public.
Example programs
The example programs below don’t illustrate all of Dumbo’s nifty features, but you should be able to get the basics from them (read dumbo.py to get familiar with the rest!). Some of these examples can also be found here, with additional information in the form of docstrings.
simplewordcount.py
def mapper(key, value):
    """Emit a ``(word, 1)`` pair for each whitespace-separated word in ``value``.

    ``key`` is accepted but not used.
    """
    for token in value.split():
        pair = (token, 1)
        yield pair
def reducer(key, values):
    """Yield one ``(key, total)`` pair, where ``total`` is ``sum(values)``."""
    total = sum(values)
    yield key, total
if __name__ == "__main__":
    # Only start the job when this file is executed as a script.
    import dumbo

    # Hand the map and reduce callables defined above to Dumbo.
    dumbo.run(mapper, reducer)
wordcount.py
def loadexcludes():
    """Load the words to skip from ``excludes.txt`` into the global ``excludes``.

    Each line of the file, stripped of surrounding whitespace, becomes one
    entry of the resulting set. The file is opened relative to the current
    working directory.

    Raises whatever ``open`` raises if ``excludes.txt`` cannot be read.
    """
    global excludes
    # The context manager closes the handle even if reading fails part-way,
    # and the local name no longer shadows the ``file`` builtin.
    with open("excludes.txt", "r") as handle:
        excludes = set(line.strip() for line in handle)
def mapper(key, value):
    """Emit ``(word, 1)`` for each word of ``value`` not in the global ``excludes``.

    ``key`` is accepted but not used. ``excludes`` must already be bound at
    module level (``loadexcludes`` assigns it) before the result is iterated.
    """
    for token in value.split():
        if token in excludes:
            continue
        yield token, 1
def reducer(key, values):
    """Yield a single ``(key, count)`` pair with ``count`` equal to ``sum(values)``."""
    count = sum(values)
    yield (key, count)
if __name__ == "__main__":
    # Only start the job when this file is executed as a script.
    import dumbo

    # The reducer doubles as the combiner; loadexcludes is passed as mapconf
    # (presumably so Dumbo calls it before mapping starts -- confirm in dumbo.py).
    dumbo.run(
        mapper,
        reducer,
        combiner=reducer,
        mapconf=loadexcludes,
    )
oowordcount.py
class Mapper:
    """Callable mapper that counts words, skipping those listed in ``excludes.txt``."""

    def __init__(self):
        """Read ``excludes.txt`` into ``self.excludes``, one stripped line per entry.

        The file is opened relative to the current working directory; any
        error raised by ``open`` propagates to the caller.
        """
        # The context manager closes the handle even if reading fails part-way,
        # and the local name no longer shadows the ``file`` builtin.
        with open("excludes.txt", "r") as handle:
            self.excludes = set(line.strip() for line in handle)

    def __call__(self, key, value):
        """Emit ``(word, 1)`` for each word of ``value`` not in ``self.excludes``.

        ``key`` is accepted but not used.
        """
        for word in value.split():
            if word not in self.excludes:
                yield word, 1
def reducer(key, values):
    """Yield ``key`` paired with the sum of ``values``, as one ``(key, sum)`` tuple."""
    summed = sum(values)
    yield key, summed
if __name__ == "__main__":
    # Only start the job when this file is executed as a script.
    import dumbo

    # The Mapper class itself (not an instance) is passed; reducer is given
    # twice, positionally, as the second and third arguments.
    dumbo.run(Mapper, reducer, reducer)
itertwice.py
def mapper1(key, value):
    """Emit a ``(word, 1)`` pair for each whitespace-separated word in ``value``.

    ``key`` is accepted but not used.
    """
    words = value.split()
    for current in words:
        yield current, 1
def mapper2(key, value):
    """Emit a ``(letter, 1)`` pair for each element obtained by iterating ``key``.

    ``value`` is accepted but not used.
    """
    for char in key:
        pair = (char, 1)
        yield pair
def reducer1(key, values):
    """Yield ``(key, total)`` only when the sum of ``values`` is greater than 1.

    Nothing is yielded when the total is 1 or less.
    """
    total = sum(values)
    if total <= 1:
        return
    yield key, total
def reducer2(key, values):
    """Yield one ``(key, total)`` pair, where ``total`` is ``sum(values)``."""
    total = sum(values)
    yield (key, total)
if __name__ == "__main__":
    # Only start the job when this file is executed as a script.
    import dumbo

    job = dumbo.Job()
    # Two iterations are registered in order. Each tuple is passed
    # positionally to additer (presumably mapper, reducer, combiner --
    # confirm against dumbo.py).
    iterations = [
        (mapper1, reducer1, reducer2),
        (mapper2, reducer2, reducer2),
    ]
    for stage in iterations:
        job.additer(*stage)
    job.run()
greplogs.py
def mapper(key, value):
    """Pass ``(key, value)`` through only when ``value`` contains "playground.last.fm".

    Nothing is yielded for values that do not contain that substring.
    """
    if "playground.last.fm" not in value:
        return
    yield key, value
if __name__ == "__main__":
    # Only start the job when this file is executed as a script.
    import dumbo

    # Only a mapper is supplied; no reducer is passed to Dumbo.
    dumbo.run(mapper)
sumnorm.py
def mapper(key, value):
    """Split ``value`` on whitespace and re-key the record on its first field.

    Yields ``(first, (middle, weight))`` where ``first`` is the first field,
    ``middle`` is the list of fields between the first and the last, and
    ``weight`` is the last field converted to ``float``. ``key`` is not used.

    Raises IndexError when ``value`` has no fields, and ValueError when the
    last field is not a valid float.
    """
    fields = value.split()
    first = fields[0]
    middle = fields[1:-1]
    weight = float(fields[-1])
    yield first, (middle, weight)
def reducer(key, values):
    """Express each value's weight as an integer percentage of the key's total.

    ``values`` is consumed into a list so it can be traversed twice: once to
    total the second element of every item, and once to emit the results.
    For each item, yields ``([key] + item[0], int(item[1] * 100 / total))``.

    Raises ZeroDivisionError if ``values`` is non-empty and its weights sum
    to zero.
    """
    rows = list(values)
    total = sum(row[1] for row in rows)
    for row in rows:
        fields = [key] + row[0]
        percentage = int(row[1]*100/total)
        yield fields, percentage
if __name__ == "__main__":
    # Only start the job when this file is executed as a script.
    import dumbo

    # Hand the map and reduce callables defined above to Dumbo.
    dumbo.run(mapper, reducer)
Last edited by klbostee, 6 days ago
Versions:





