In [1]:
import pyspark
from random import randint, random, choice
import matplotlib.pyplot as plt

# Reuse the SparkContext that is already running when this cell is re-executed:
# calling the SparkContext constructor a second time in the same kernel raises
# ValueError because only one active context is allowed. On first execution this
# creates a local context using all available cores ('local[*]'), as before.
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [20]:
from dataGenerator.rowGenerator import *
from dataGenerator.datasetGenerator import *
from datetime import datetime

# Describe the synthetic rows: a "lineId" field produced by an int generator
# configured with min=1 / max=100, and a "ts" field produced by a date generator.
dataset = datasetGenerator()
dataset.addGenerator(intGenerator(), min=1, max=100, keyName="lineId")
dataset.addGenerator(dateGenerator(), keyName="ts")

# The rows are generated on the driver first, then distributed; the resulting RDD
# is marked for caching so later cells do not regenerate it.
rdd = sc.parallelize(dataset.generateDataset(1_000_000))
rdd = rdd.persist()

In [21]:
def _to_line_ts(row):
    """Project a generated row onto a (lineId, ts) pair."""
    return (row.get('lineId'), row.get('ts'))


# NOTE: this rebinds `rdd`, so the cell cannot be re-run on its own -- the second
# time the elements are already tuples, which have no .get(). Re-run the previous
# cell first.
rdd = rdd.map(_to_line_ts)
rdd.take(5)

[(57, datetime.datetime(2017, 5, 13, 17, 37, 41)),
 (25, datetime.datetime(2013, 4, 11, 15, 22, 17)),
 (45, datetime.datetime(2012, 7, 1, 12, 4, 40)),
 (53, datetime.datetime(2020, 9, 11, 6, 52, 21)),
 (34, datetime.datetime(2011, 7, 20, 23, 41, 22))]

In [22]:
# Self-join on lineId: every (lineId, ts) record is paired with every record that
# shares its lineId (itself included), giving (lineId, (ts_left, ts_right)).
# NOTE(review): the number of joined records grows with the square of the number of
# rows per lineId, so any action that needs the whole of `boh` is expensive; the
# take(5) below only asks for the first few joined records.
boh = rdd.join(rdd)
boh.take(5)

[(96,
  (datetime.datetime(2015, 9, 16, 19, 47, 31),
   datetime.datetime(2015, 9, 16, 19, 47, 31))),
 (96,
  (datetime.datetime(2015, 9, 16, 19, 47, 31),
   datetime.datetime(2010, 9, 4, 14, 14, 32))),
 (96,
  (datetime.datetime(2015, 9, 16, 19, 47, 31),
   datetime.datetime(2016, 9, 29, 20, 25, 11))),
 (96,
  (datetime.datetime(2015, 9, 16, 19, 47, 31),
   datetime.datetime(2017, 2, 24, 4, 54, 33))),
 (96,
  (datetime.datetime(2015, 9, 16, 19, 47, 31),
   datetime.datetime(2018, 9, 20, 4, 25, 27)))]

In [25]:
def _is_forward_pair(record):
    """Return True when the left timestamp is strictly earlier than the right one."""
    _, (left_ts, right_ts) = record
    return left_ts < right_ts


# Keep only the pairs that look forward in time; the strict comparison also drops
# pairs whose two timestamps are equal (including each record joined with itself).
filtered = boh.filter(_is_forward_pair)
filtered.take(5)

[(96,
  (datetime.datetime(2015, 9, 16, 19, 47, 31),
   datetime.datetime(2016, 9, 29, 20, 25, 11))),
 (96,
  (datetime.datetime(2015, 9, 16, 19, 47, 31),
   datetime.datetime(2017, 2, 24, 4, 54, 33))),
 (96,
  (datetime.datetime(2015, 9, 16, 19, 47, 31),
   datetime.datetime(2018, 9, 20, 4, 25, 27))),
 (96,
  (datetime.datetime(2015, 9, 16, 19, 47, 31),
   datetime.datetime(2016, 7, 21, 6, 31, 22))),
 (96,
  (datetime.datetime(2015, 9, 16, 19, 47, 31),
   datetime.datetime(2019, 12, 31, 11, 3, 29)))]

In [27]:
def _rekey_by_line_and_start(record):
    """Turn (lineId, (start_ts, end_ts)) into ((lineId, start_ts), end_ts)."""
    line_id, (start_ts, end_ts) = record
    return ((line_id, start_ts), end_ts)


# Move the earlier timestamp into the key so that each (lineId, start_ts) can be
# aggregated separately.
split = filtered.map(_rekey_by_line_and_start)
split.take(5)

[((96, datetime.datetime(2015, 9, 16, 19, 47, 31)),
  datetime.datetime(2016, 9, 29, 20, 25, 11)),
 ((96, datetime.datetime(2015, 9, 16, 19, 47, 31)),
  datetime.datetime(2017, 2, 24, 4, 54, 33)),
 ((96, datetime.datetime(2015, 9, 16, 19, 47, 31)),
  datetime.datetime(2018, 9, 20, 4, 25, 27)),
 ((96, datetime.datetime(2015, 9, 16, 19, 47, 31)),
  datetime.datetime(2016, 7, 21, 6, 31, 22)),
 ((96, datetime.datetime(2015, 9, 16, 19, 47, 31)),
  datetime.datetime(2019, 12, 31, 11, 3, 29))]

In [43]:
def _attach_gap(record):
    """Turn ((lineId, start_ts), end_ts) into ((lineId, start_ts), (end_ts, end_ts - start_ts))."""
    key, end_ts = record
    start_ts = key[1]
    return (key, (end_ts, end_ts - start_ts))


# Carry the elapsed time alongside each candidate end timestamp.
diff = split.map(_attach_gap)
diff.take(5)

[((96, datetime.datetime(2015, 9, 16, 19, 47, 31)),
  (datetime.datetime(2016, 9, 29, 20, 25, 11),
   datetime.timedelta(days=379, seconds=2260))),
 ((96, datetime.datetime(2015, 9, 16, 19, 47, 31)),
  (datetime.datetime(2017, 2, 24, 4, 54, 33),
   datetime.timedelta(days=526, seconds=32822))),
 ((96, datetime.datetime(2015, 9, 16, 19, 47, 31)),
  (datetime.datetime(2018, 9, 20, 4, 25, 27),
   datetime.timedelta(days=1099, seconds=31076))),
 ((96, datetime.datetime(2015, 9, 16, 19, 47, 31)),
  (datetime.datetime(2016, 7, 21, 6, 31, 22),
   datetime.timedelta(days=308, seconds=38631))),
 ((96, datetime.datetime(2015, 9, 16, 19, 47, 31)),
  (datetime.datetime(2019, 12, 31, 11, 3, 29),
   datetime.timedelta(days=1566, seconds=54958)))]

In [44]:
def _next_timestamp_records(line_id, timestamps):
    """Pair every timestamp of one line with the closest strictly later timestamp.

    Args:
        line_id: key shared by all ``timestamps``.
        timestamps: iterable of hashable, sortable values that support subtraction
            (datetimes in this notebook).

    Returns:
        A list of ``((line_id, ts), (next_ts, next_ts - ts))`` records, one for each
        distinct ``ts`` that has a strictly later timestamp on the same line.
    """
    # Duplicates are collapsed first: the closest *strictly* later timestamp of a
    # value is simply its successor among the sorted distinct values.
    ordered = sorted(set(timestamps))
    return [((line_id, current), (following, following - current))
            for current, following in zip(ordered, ordered[1:])]


# For each (lineId, ts), find the closest later timestamp and the gap to it.
# Reducing `diff` by key gives the same records, but it has to shuffle every ordered
# timestamp pair produced by the self-join (quadratic in the rows per lineId) and the
# recorded run of this cell was interrupted before finishing. Grouping the
# (lineId, ts) pairs in `rdd` and pairing adjacent sorted timestamps yields the same
# ((lineId, ts), (next_ts, delta)) records without materialising all pairs.
# Relies on `rdd` holding (lineId, ts) tuples, as produced by the earlier map cell.
res = rdd.groupByKey().flatMap(lambda kv: _next_timestamp_records(kv[0], kv[1]))
res.take(5)

KeyboardInterrupt: 