In [1]:
import pyspark
from random import randint, random, choice
import matplotlib.pyplot as plt

# Reuse the active SparkContext when this cell is re-run in the same kernel;
# constructing a second SparkContext directly raises
# "Cannot run multiple SparkContexts at once". On a fresh kernel this still
# creates a local context that uses all available cores ('local[*]').
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [4]:
from dataGenerator.rowGenerator import *
from dataGenerator.datasetGenerator import *
from datetime import datetime

# Describe the synthetic rows: two integer columns ("lineId", "itemId") and a
# date column ("ts").
dataset = datasetGenerator()
for key_name, upper_bound in (("lineId", 10), ("itemId", 100)):
    # Each integer column gets its own generator instance; both use min=1.
    dataset.addGenerator(intGenerator(), min=1, max=upper_bound, keyName=key_name)
dataset.addGenerator(dateGenerator(), keyName="ts")

# Generate 25000 rows, hand them to Spark and mark the RDD as persisted so
# the later cells reuse it.
rdd = sc.parallelize(
    dataset.generateDataset(25000)
).persist()

In [5]:
# Peek at the first five generated rows to check their structure
# (dicts with the keys 'lineId', 'itemId' and 'ts').
rdd.take(5)

[{'lineId': 10,
  'itemId': 18,
  'ts': datetime.datetime(2010, 11, 19, 23, 37, 4)},
 {'lineId': 8,
  'itemId': 43,
  'ts': datetime.datetime(2013, 12, 17, 20, 48, 30)},
 {'lineId': 7, 'itemId': 84, 'ts': datetime.datetime(2010, 6, 3, 19, 5, 3)},
 {'lineId': 2, 'itemId': 98, 'ts': datetime.datetime(2011, 3, 28, 12, 39, 59)},
 {'lineId': 7, 'itemId': 75, 'ts': datetime.datetime(2019, 12, 7, 6, 13, 27)}]

In [7]:
def _key_by_line_and_item(row):
    """Turn a row dict into a ``((lineId, itemId), ts)`` pair."""
    return (row.get('lineId'), row.get('itemId')), row.get('ts')


# Key every row by (lineId, itemId) so rows sharing both ids can be joined.
tojoin = rdd.map(_key_by_line_and_item)
tojoin.take(5)

[((10, 18), datetime.datetime(2010, 11, 19, 23, 37, 4)),
 ((8, 43), datetime.datetime(2013, 12, 17, 20, 48, 30)),
 ((7, 84), datetime.datetime(2010, 6, 3, 19, 5, 3)),
 ((2, 98), datetime.datetime(2011, 3, 28, 12, 39, 59)),
 ((7, 75), datetime.datetime(2019, 12, 7, 6, 13, 27))]

In [8]:
# Self-join on the (lineId, itemId) key: each timestamp is paired with every
# timestamp recorded under the same key, which includes a pair of the
# timestamp with itself. Resulting records are (key, (ts_left, ts_right)).
# NOTE(review): the pair count grows with the square of the rows per key.
joined = tojoin.join(tojoin)
joined.take(5)

[((5, 25),
  (datetime.datetime(2018, 4, 15, 15, 42, 39),
   datetime.datetime(2018, 4, 15, 15, 42, 39))),
 ((5, 25),
  (datetime.datetime(2018, 4, 15, 15, 42, 39),
   datetime.datetime(2012, 11, 5, 23, 58, 42))),
 ((5, 25),
  (datetime.datetime(2018, 4, 15, 15, 42, 39),
   datetime.datetime(2011, 5, 18, 10, 27, 49))),
 ((5, 25),
  (datetime.datetime(2018, 4, 15, 15, 42, 39),
   datetime.datetime(2011, 7, 15, 11, 40, 11))),
 ((5, 25),
  (datetime.datetime(2018, 4, 15, 15, 42, 39),
   datetime.datetime(2015, 8, 20, 21, 57, 19)))]

In [10]:
def _second_is_later(record):
    """Return True when the second timestamp is strictly after the first."""
    _, (first_ts, second_ts) = record
    return first_ts < second_ts


# Keep only forward-looking pairs; pairs with equal timestamps are dropped.
filtered = joined.filter(_second_is_later)
filtered.take(5)

[((5, 25),
  (datetime.datetime(2018, 4, 15, 15, 42, 39),
   datetime.datetime(2018, 5, 19, 18, 12, 27))),
 ((5, 25),
  (datetime.datetime(2018, 4, 15, 15, 42, 39),
   datetime.datetime(2019, 3, 27, 6, 39, 4))),
 ((5, 25),
  (datetime.datetime(2018, 4, 15, 15, 42, 39),
   datetime.datetime(2019, 6, 14, 16, 13, 38))),
 ((5, 25),
  (datetime.datetime(2018, 4, 15, 15, 42, 39),
   datetime.datetime(2020, 1, 20, 19, 54, 15))),
 ((5, 25),
  (datetime.datetime(2018, 4, 15, 15, 42, 39),
   datetime.datetime(2021, 4, 13, 17, 56, 3)))]

In [14]:
def _with_gap(record):
    """Re-key a pair by (lineId, itemId, first ts) and attach the time gap.

    The value becomes ``(first ts, second ts, second ts - first ts)``.
    """
    (line_id, item_id), (first_ts, second_ts) = record
    return (line_id, item_id, first_ts), (first_ts, second_ts, second_ts - first_ts)


diff = filtered.map(_with_gap)
diff.take(5)

[((5, 25, datetime.datetime(2018, 4, 15, 15, 42, 39)),
  (datetime.datetime(2018, 4, 15, 15, 42, 39),
   datetime.datetime(2018, 5, 19, 18, 12, 27),
   datetime.timedelta(days=34, seconds=8988))),
 ((5, 25, datetime.datetime(2018, 4, 15, 15, 42, 39)),
  (datetime.datetime(2018, 4, 15, 15, 42, 39),
   datetime.datetime(2019, 3, 27, 6, 39, 4),
   datetime.timedelta(days=345, seconds=53785))),
 ((5, 25, datetime.datetime(2018, 4, 15, 15, 42, 39)),
  (datetime.datetime(2018, 4, 15, 15, 42, 39),
   datetime.datetime(2019, 6, 14, 16, 13, 38),
   datetime.timedelta(days=425, seconds=1859))),
 ((5, 25, datetime.datetime(2018, 4, 15, 15, 42, 39)),
  (datetime.datetime(2018, 4, 15, 15, 42, 39),
   datetime.datetime(2020, 1, 20, 19, 54, 15),
   datetime.timedelta(days=645, seconds=15096))),
 ((5, 25, datetime.datetime(2018, 4, 15, 15, 42, 39)),
  (datetime.datetime(2018, 4, 15, 15, 42, 39),
   datetime.datetime(2021, 4, 13, 17, 56, 3),
   datetime.timedelta(days=1094, seconds=8004)))]

In [15]:
def _smaller_gap(left, right):
    """Pick the value whose gap (index 2) is smaller; ties go to ``right``."""
    if left[2] < right[2]:
        return left
    return right


# For every (lineId, itemId, first ts) key keep the pair with the smallest
# time gap.
reduced = diff.reduceByKey(_smaller_gap)
reduced.take(5)

[((5, 25, datetime.datetime(2010, 11, 23, 20, 0, 4)),
  (datetime.datetime(2010, 11, 23, 20, 0, 4),
   datetime.datetime(2010, 11, 25, 10, 59, 56),
   datetime.timedelta(days=1, seconds=53992))),
 ((3, 55, datetime.datetime(2014, 9, 29, 23, 43, 29)),
  (datetime.datetime(2014, 9, 29, 23, 43, 29),
   datetime.datetime(2014, 12, 1, 2, 38, 17),
   datetime.timedelta(days=62, seconds=10488))),
 ((6, 36, datetime.datetime(2018, 11, 24, 20, 40, 41)),
  (datetime.datetime(2018, 11, 24, 20, 40, 41),
   datetime.datetime(2018, 12, 20, 1, 48, 42),
   datetime.timedelta(days=25, seconds=18481))),
 ((6, 36, datetime.datetime(2016, 2, 18, 9, 20, 33)),
  (datetime.datetime(2016, 2, 18, 9, 20, 33),
   datetime.datetime(2017, 3, 8, 0, 53, 8),
   datetime.timedelta(days=383, seconds=55955))),
 ((6, 36, datetime.datetime(2013, 8, 31, 14, 27, 20)),
  (datetime.datetime(2013, 8, 31, 14, 27, 20),
   datetime.datetime(2013, 9, 16, 19, 57, 18),
   datetime.timedelta(days=16, seconds=19798)))]