Skip to content

Commit

Permalink
Added a pyspark example using tdigest to calculate percentiles for different keys.
Browse files Browse the repository at this point in the history
  • Loading branch information
Dave Matthews committed Jun 10, 2016
1 parent 364b150 commit 96e3767
Showing 1 changed file with 25 additions and 0 deletions.
25 changes: 25 additions & 0 deletions pyspark_by_key_example.py
@@ -0,0 +1,25 @@
from __future__ import print_function

from random import random
from operator import add
from tdigest import TDigest

# Build 3000 (key, value) samples: 1000 for each of the keys 0, 1 and 2.
# Each value is random() shifted by its key, so every key has its own
# value range and therefore a visibly different percentile.
keyed_samples = [
    (key, random() + key)
    for key in range(3)
    for _ in range(1000)
]

# `sc` is not defined in this file -- presumably the SparkContext supplied by
# the pyspark shell. The second argument (10) is passed through unchanged.
data = sc.parallelize(keyed_samples, 10)

def initialise_digest(v):
    """Create a fresh TDigest containing the single value ``v``.

    Passed as the first (create-combiner) argument to ``combineByKey``, so it
    is the function that turns the first value seen for a key into a digest.

    Args:
        v: The first value observed for a key.

    Returns:
        A new ``TDigest`` that has been updated with ``v``.
    """
    digest = TDigest()
    digest.update(v)
    return digest

def update_digest(d, v):
    """Fold the value ``v`` into the existing digest ``d`` and return it.

    Passed as the second (merge-value) argument to ``combineByKey``. The
    digest is mutated in place via ``d.update(v)``; the same object is
    returned so the result can be used as the running combiner.

    Args:
        d: The digest accumulated so far for a key (any object exposing
            ``update``).
        v: The next value to add.

    Returns:
        The same object ``d``, after it has been updated with ``v``.
    """
    d.update(v)
    return d

# Build one digest per key: initialise_digest starts a digest from the first
# value, update_digest folds in further values, and `add` merges two digests
# (this relies on TDigest supporting the `+` operator -- confirm against the
# tdigest library).
digests_by_key = data.combineByKey(initialise_digest, update_digest, add)

# Replace each key's digest with its percentile(95) estimate.
p95_by_key = digests_by_key.map(
    lambda pair: (pair[0], pair[1].percentile(95))
)

# Bring the (key, percentile) pairs back as a local list and show them.
percentiles = p95_by_key.collect()

print(percentiles)

0 comments on commit 96e3767

Please sign in to comment.