GitHub Sale: sign up for any paid plan this week and pay nothing until January 1, 2009!  [ hide ]

public
Description: Python API that allows you to easily write and run MapReduce programs.

Example programs

These examples don’t illustrate all of Dumbo’s nifty features, but you should be able to get the basics from them (read dumbo.py to get familiar with the rest!). Some of them can also be found here, with additional info in the form of docstrings.

simplewordcount.py

def mapper(key, value):
    """Emit a ``(word, 1)`` pair for each whitespace-separated word in value.

    The incoming key is ignored.
    """
    tokens = value.split()
    for token in tokens:
        yield token, 1

def reducer(key, values):
    """Emit the key once, paired with the sum of all its values."""
    total = sum(values)
    yield key, total

# Start the job only when this file is executed as a script.
if __name__ == "__main__":
    import dumbo
    # Hand the map and reduce functions defined above to Dumbo.
    dumbo.run(mapper,reducer)

wordcount.py

def loadexcludes():
    """Load the words to skip from ``excludes.txt`` into the global ``excludes``.

    Each line of the file, stripped of surrounding whitespace, becomes one
    entry of the resulting set. The file is read from the current working
    directory.

    Raises:
        IOError/OSError: if ``excludes.txt`` cannot be opened or read.
    """
    global excludes
    handle = open("excludes.txt","r")
    try:
        excludes = set(line.strip() for line in handle)
    finally:
        # Close the file even if reading fails part-way through.
        handle.close()

def mapper(key, value):
    """Emit ``(word, 1)`` for each word in value that is not excluded.

    Reads the module-level ``excludes`` collection, which must be populated
    before this function is iterated. The incoming key is ignored.
    """
    for token in value.split():
        if token in excludes:
            continue
        yield token, 1

def reducer(key, values):
    """Emit the key once, paired with the sum of all its values."""
    total = sum(values)
    yield key, total

# Start the job only when this file is executed as a script.
if __name__ == "__main__":
    import dumbo
    # The reducer is passed both as the reducer and as combiner=; loadexcludes
    # is passed as mapconf=.
    # NOTE(review): presumably Dumbo calls mapconf before any mapper call so
    # that the global `excludes` exists -- confirm against dumbo.py.
    dumbo.run(mapper,reducer,combiner=reducer,mapconf=loadexcludes)

oowordcount.py

class Mapper:
    """Callable word-count mapper that skips words listed in ``excludes.txt``."""

    def __init__(self):
        """Load the exclusion set from ``excludes.txt`` in the working directory.

        Each line of the file, stripped of surrounding whitespace, becomes one
        entry of ``self.excludes``.

        Raises:
            IOError/OSError: if ``excludes.txt`` cannot be opened or read.
        """
        handle = open("excludes.txt","r")
        try:
            self.excludes = set(line.strip() for line in handle)
        finally:
            # Close the file even if reading fails part-way through.
            handle.close()

    def __call__(self,key,value):
        """Emit ``(word, 1)`` for each word in value that is not excluded.

        The incoming key is ignored.
        """
        for word in value.split():
            if word not in self.excludes:
                yield word,1

def reducer(key, values):
    """Emit the key once, paired with the sum of all its values."""
    total = sum(values)
    yield key, total

# Start the job only when this file is executed as a script.
if __name__ == "__main__":
    import dumbo
    # The Mapper class itself (not an instance) is passed, and reducer is
    # passed twice.
    # NOTE(review): presumably Dumbo instantiates Mapper itself and treats the
    # third positional argument as the combiner -- confirm against dumbo.py.
    dumbo.run(Mapper,reducer,reducer)

itertwice.py

def mapper1(key, value):
    """Emit a ``(word, 1)`` pair for each whitespace-separated word in value.

    The incoming key is ignored.
    """
    tokens = value.split()
    for token in tokens:
        yield token, 1

def mapper2(key, value):
    """Emit an ``(item, 1)`` pair for each element obtained by iterating key.

    For a string key this is one pair per character. The incoming value is
    ignored.
    """
    for char in key:
        yield char, 1

def reducer1(key, values):
    """Emit ``(key, total)`` only when the summed values exceed 1.

    Keys whose total is 1 or less produce no output.
    """
    total = sum(values)
    if not (total > 1):
        return
    yield key, total

def reducer2(key, values):
    """Emit the key once, paired with the sum of all its values."""
    total = sum(values)
    yield key, total

# Build and run a two-iteration job only when executed as a script.
if __name__ == "__main__":
    import dumbo
    job = dumbo.Job()
    # First iteration: mapper1 with reducer1, plus reducer2 as a third argument.
    # NOTE(review): the third argument is presumably the combiner. If so, the
    # plain-sum reducer2 is the right choice, because reducer1 drops totals
    # of 1 and would discard partial counts -- confirm against dumbo.py.
    job.additer(mapper1,reducer1,reducer2)
    # Second iteration: mapper2 with reducer2 in both remaining positions.
    job.additer(mapper2,reducer2,reducer2)
    job.run()

greplogs.py

def mapper(key, value):
    """Pass through ``(key, value)`` only if value contains "playground.last.fm"."""
    if "playground.last.fm" not in value:
        return
    yield key, value

# Start the job only when this file is executed as a script.
if __name__ == "__main__":
    import dumbo
    # Only a mapper is supplied; no reducer is passed for this job.
    dumbo.run(mapper)

sumnorm.py

def mapper(key,value):
    """Split a record into ``(first_field, (middle_fields, weight))``.

    The value is split on whitespace. The first field becomes the output key,
    the last field is parsed as a float weight, and the fields in between are
    kept as a list. Blank records produce no output instead of raising
    IndexError. The incoming key is ignored.

    Raises:
        ValueError: if the last field cannot be parsed as a float.
    """
    parts = value.split()
    if not parts:
        # Nothing to emit for an empty or whitespace-only line.
        return
    yield parts[0],(parts[1:-1],float(parts[-1]))

def reducer(key, values):
    """Emit each value's weight as an integer percentage of the key's total.

    Every value is indexed as ``(fields, weight)``. For each one the output
    key is ``[key] + fields`` and the output value is
    ``int(weight * 100 / total)``, where ``total`` is the sum of all weights
    seen for this key.
    """
    # Materialize the values because they are traversed twice.
    records = list(values)
    total = sum(record[1] for record in records)
    for record in records:
        share = int(record[1] * 100 / total)
        yield [key] + record[0], share

# Start the job only when this file is executed as a script.
if __name__ == "__main__":
    import dumbo
    # Hand the map and reduce functions defined above to Dumbo.
    dumbo.run(mapper,reducer)
Last edited by klbostee, 6 days ago
Versions: