From 96e37678b10007470e8237136df2f5aeb4333bfe Mon Sep 17 00:00:00 2001 From: Dave Matthews Date: Fri, 10 Jun 2016 09:42:45 +0100 Subject: [PATCH] Added a pyspark example using tdigest to calculate percentiles for different keys. --- pyspark_by_key_example.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 pyspark_by_key_example.py diff --git a/pyspark_by_key_example.py b/pyspark_by_key_example.py new file mode 100644 index 0000000..cd9e73f --- /dev/null +++ b/pyspark_by_key_example.py @@ -0,0 +1,25 @@ +from __future__ import print_function + +from random import random +from operator import add +from tdigest import TDigest + +data = sc.parallelize([(0, random()) for _ in range(1000)] + + [(1, random() + 1) for _ in range(1000)] + + [(2, random() + 2) for _ in range(1000)], 10) + +def initialise_digest(v): + d = TDigest() + d.update(v) + return d + +def update_digest(d, v): + d.update(v) + return d + +percentiles = data\ + .combineByKey(initialise_digest, update_digest, add)\ + .map(lambda kv: (kv[0], kv[1].percentile(95)))\ + .collect() + +print(percentiles)