In [1]:
import pyspark
from random import randint, random, choice
import matplotlib.pyplot as plt

# Reuse the SparkContext that is already running in this kernel, if any; otherwise
# start a local one on all available cores ('local[*]'). Calling the bare
# SparkContext(...) constructor a second time (e.g. when this cell is re-executed)
# fails with "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[*]'))

In [2]:
from dataGenerator.rowGenerator import *
from dataGenerator.datasetGenerator import *
from datetime import datetime

# Describe the synthetic rows: each one carries a "lineId" field (idGenerator
# configured with prefix "line", min=1, max=100) and a "ts" field (dateGenerator).
dataset = datasetGenerator()
dataset.addGenerator(idGenerator(), prefix="line", min=1, max=100, keyName="lineId")
dataset.addGenerator(dateGenerator(), keyName="ts")

# Build 5000 rows on the driver, then distribute them as a cached RDD so the
# later cells can reuse it without regenerating the data.
generated_rows = dataset.generateDataset(5000)
rdd = sc.parallelize(generated_rows).persist()
rdd.take(5)

[{'lineId': 'line_0006', 'ts': datetime.datetime(2016, 11, 12, 17, 44, 37)},
 {'lineId': 'line_0068', 'ts': datetime.datetime(2014, 7, 5, 17, 26, 26)},
 {'lineId': 'line_0074', 'ts': datetime.datetime(2015, 6, 20, 14, 44, 40)},
 {'lineId': 'line_0030', 'ts': datetime.datetime(2010, 3, 15, 1, 15, 15)},
 {'lineId': 'line_0015', 'ts': datetime.datetime(2010, 8, 23, 2, 45, 22)}]

In [3]:
# Turn every row into a (lineId, row) pair so the RDD can be joined by lineId.
# keyBy(f) builds exactly the (f(row), row) tuples the explicit map() produced.
toJoin = rdd.keyBy(lambda row: row.get('lineId')).persist()
toJoin.count()

5000

In [4]:
# Peek at the first five (lineId, row) pairs.
toJoin.take(5)

[('line_0006',
  {'lineId': 'line_0006', 'ts': datetime.datetime(2016, 11, 12, 17, 44, 37)}),
 ('line_0068',
  {'lineId': 'line_0068', 'ts': datetime.datetime(2014, 7, 5, 17, 26, 26)}),
 ('line_0074',
  {'lineId': 'line_0074', 'ts': datetime.datetime(2015, 6, 20, 14, 44, 40)}),
 ('line_0030',
  {'lineId': 'line_0030', 'ts': datetime.datetime(2010, 3, 15, 1, 15, 15)}),
 ('line_0015',
  {'lineId': 'line_0015', 'ts': datetime.datetime(2010, 8, 23, 2, 45, 22)})]

In [5]:
# Self-join on lineId: produces one (lineId, (left_row, right_row)) record for every
# ordered pair of rows sharing a lineId, including each row paired with itself.
# The result therefore grows with the square of the number of rows per lineId.
lol = toJoin.join(toJoin).persist()
lol.count()

255200

In [6]:
# Peek at five joined records: (lineId, (left_row, right_row)).
lol.take(5)

[('line_0001',
  ({'lineId': 'line_0001', 'ts': datetime.datetime(2017, 4, 6, 4, 49, 1)},
   {'lineId': 'line_0001', 'ts': datetime.datetime(2017, 4, 6, 4, 49, 1)})),
 ('line_0001',
  ({'lineId': 'line_0001', 'ts': datetime.datetime(2017, 4, 6, 4, 49, 1)},
   {'lineId': 'line_0001', 'ts': datetime.datetime(2016, 3, 2, 1, 1, 32)})),
 ('line_0001',
  ({'lineId': 'line_0001', 'ts': datetime.datetime(2017, 4, 6, 4, 49, 1)},
   {'lineId': 'line_0001', 'ts': datetime.datetime(2017, 3, 11, 19, 24, 20)})),
 ('line_0001',
  ({'lineId': 'line_0001', 'ts': datetime.datetime(2017, 4, 6, 4, 49, 1)},
   {'lineId': 'line_0001', 'ts': datetime.datetime(2011, 2, 12, 22, 6, 4)})),
 ('line_0001',
  ({'lineId': 'line_0001', 'ts': datetime.datetime(2017, 4, 6, 4, 49, 1)},
   {'lineId': 'line_0001', 'ts': datetime.datetime(2010, 5, 13, 8, 59, 16)}))]

In [7]:
def _left_not_after_right(record):
    """Return True when the left row's 'ts' is not later than the right row's 'ts'.

    `record` is a joined (lineId, (left_row, right_row)) tuple.
    """
    _, (left_row, right_row) = record
    return left_row.get('ts') <= right_row.get('ts')


# Keep only the pairs whose left timestamp does not come after the right one.
# NOTE(review): `<=` also keeps every row paired with itself (zero time
# difference) - confirm that this is intended.
filtered = lol.filter(_left_not_after_right).persist()
filtered.count()

130100

In [8]:
# Peek at five of the pairs that passed the timestamp filter.
filtered.take(5)

[('line_0001',
  ({'lineId': 'line_0001', 'ts': datetime.datetime(2017, 4, 6, 4, 49, 1)},
   {'lineId': 'line_0001', 'ts': datetime.datetime(2017, 4, 6, 4, 49, 1)})),
 ('line_0001',
  ({'lineId': 'line_0001', 'ts': datetime.datetime(2017, 4, 6, 4, 49, 1)},
   {'lineId': 'line_0001', 'ts': datetime.datetime(2020, 4, 16, 20, 52, 49)})),
 ('line_0001',
  ({'lineId': 'line_0001', 'ts': datetime.datetime(2017, 4, 6, 4, 49, 1)},
   {'lineId': 'line_0001', 'ts': datetime.datetime(2018, 6, 19, 12, 54, 21)})),
 ('line_0001',
  ({'lineId': 'line_0001', 'ts': datetime.datetime(2017, 4, 6, 4, 49, 1)},
   {'lineId': 'line_0001', 'ts': datetime.datetime(2019, 12, 21, 13, 7, 51)})),
 ('line_0001',
  ({'lineId': 'line_0001', 'ts': datetime.datetime(2017, 4, 6, 4, 49, 1)},
   {'lineId': 'line_0001', 'ts': datetime.datetime(2018, 9, 4, 6, 53, 4)}))]

In [11]:
def _rekey_by_line_and_left_ts(record):
    """Re-key a joined record by (lineId, left timestamp).

    `record` is a (lineId, (left_row, right_row)) tuple. The returned pair is
    ((lineId, left 'ts'), (right 'ts' - left 'ts', record)), i.e. the value keeps
    both the time difference and the untouched original record.
    """
    line_id, (left_row, right_row) = record
    left_ts = left_row.get('ts')
    delta = right_row.get('ts') - left_ts
    return (line_id, left_ts), (delta, record)


newkey = filtered.map(_rekey_by_line_and_left_ts).persist()
newkey.count()

130100

In [12]:
# Peek at five re-keyed records:
# ((lineId, left ts), (right ts - left ts, original joined record)).
newkey.take(5)

[(('line_0001', datetime.datetime(2017, 4, 6, 4, 49, 1)),
  (datetime.timedelta(0),
   ('line_0001',
    ({'lineId': 'line_0001', 'ts': datetime.datetime(2017, 4, 6, 4, 49, 1)},
     {'lineId': 'line_0001',
      'ts': datetime.datetime(2017, 4, 6, 4, 49, 1)})))),
 (('line_0001', datetime.datetime(2017, 4, 6, 4, 49, 1)),
  (datetime.timedelta(days=1106, seconds=57828),
   ('line_0001',
    ({'lineId': 'line_0001', 'ts': datetime.datetime(2017, 4, 6, 4, 49, 1)},
     {'lineId': 'line_0001',
      'ts': datetime.datetime(2020, 4, 16, 20, 52, 49)})))),
 (('line_0001', datetime.datetime(2017, 4, 6, 4, 49, 1)),
  (datetime.timedelta(days=439, seconds=29120),
   ('line_0001',
    ({'lineId': 'line_0001', 'ts': datetime.datetime(2017, 4, 6, 4, 49, 1)},
     {'lineId': 'line_0001',
      'ts': datetime.datetime(2018, 6, 19, 12, 54, 21)})))),
 (('line_0001', datetime.datetime(2017, 4, 6, 4, 49, 1)),
  (datetime.timedelta(days=989, seconds=29930),
   ('line_0001',
    ({'lineId': 'line_0001', 't

In [None]:
# FIXME: the rest (everything after this point) is a mess.