In [1]:
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
from pyspark.sql.types import *
import seaborn as sns
import pandas as pd
import numpy as np
import geopy.distance
import matplotlib
import datetime
import pickle
import os

import fastplot
%matplotlib inline

<h2>1-Reading the segment table</h2>

In [2]:
# Column layout of the segment table, expressed as a Spark DDL string.
# `dateTime` is read as plain text here.
schema_segments ='deviceId String,\
                  type Integer,\
                  dateTime string,\
                  startLatitude double,\
                  startLongitude double,\
                  startEngineStatus Integer,\
                  startAccuracyDrop Integer,\
                  endAccuracyDrop Integer,\
                  endEngineStatus Integer,\
                  endLatitude double,\
                  endLongitude double,\
                  segmentDistance double,\
                  segmentDuration double,\
                  segmentSpeedKmH double,\
                  xtmp double,\
                  ytmp double,\
                  start_x integer,\
                  start_y integer,\
                  end_x integer,\
                  end_y integer,\
                  cell_id_start string,\
                  cell_id_end string'

# The CSV data is read from `segment_table` under the current working
# directory, addressed through an explicit local-filesystem URI.
path = os.path.abspath(os.getcwd())
df_segments = spark.read.csv(
    'file:///%s/segment_table' % path,
    schema=schema_segments,
    sep=",",
)

In [3]:
# Replace the textual `dateTime` column with a parsed Spark timestamp.
df_segments = df_segments.withColumn(
    'dateTime',
    F.to_timestamp(F.col('dateTime')),
)
# Show ten rows as a quick sanity check of the parsed values.
df_segments.limit(10).show()

+--------+----+-------------------+-------------+--------------+-----------------+-----------------+---------------+---------------+-----------+------------+---------------+---------------+------------------+------------------+------------------+-------+-------+-----+-----+-------------+-----------+
|deviceId|type|           dateTime|startLatitude|startLongitude|startEngineStatus|startAccuracyDrop|endAccuracyDrop|endEngineStatus|endLatitude|endLongitude|segmentDistance|segmentDuration|   segmentSpeedKmH|              xtmp|              ytmp|start_x|start_y|end_x|end_y|cell_id_start|cell_id_end|
+--------+----+-------------------+-------------+--------------+-----------------+-----------------+---------------+---------------+-----------+------------+---------------+---------------+------------------+------------------+------------------+-------+-------+-----+-----+-------------+-----------+
| 2507794|   2|2019-09-11 00:33:16|     45.01898|        7.5937|                1|               

## Converting timestamps to hour of day

In [4]:
def hour_extraction(dateTime):
    """Return a Column holding the hour of day (0-23) of a timestamp column.

    Parameters
    ----------
    dateTime : str or pyspark.sql.Column
        Name of, or reference to, a timestamp column.

    Returns
    -------
    pyspark.sql.Column
        Integer hour of day; null where the timestamp is null.

    Notes
    -----
    Uses the built-in ``F.hour`` rather than a Python UDF. The result is a
    real integer column (a bare ``@F.udf()`` yields strings), null
    timestamps give null instead of raising, and no rows are shipped to
    Python workers.
    """
    return F.hour(dateTime)

# NOTE: from here on `dateTime` holds only the hour of day, not a timestamp.
df_segments = df_segments.withColumn('dateTime', hour_extraction('dateTime'))

In [5]:
# Preview ten rows to check that `dateTime` now holds the hour of day.
df_segments.limit(10).show()

+--------+----+--------+-------------+--------------+-----------------+-----------------+---------------+---------------+-----------+------------+---------------+---------------+------------------+------------------+------------------+-------+-------+-----+-----+-------------+-----------+
|deviceId|type|dateTime|startLatitude|startLongitude|startEngineStatus|startAccuracyDrop|endAccuracyDrop|endEngineStatus|endLatitude|endLongitude|segmentDistance|segmentDuration|   segmentSpeedKmH|              xtmp|              ytmp|start_x|start_y|end_x|end_y|cell_id_start|cell_id_end|
+--------+----+--------+-------------+--------------+-----------------+-----------------+---------------+---------------+-----------+------------+---------------+---------------+------------------+------------------+------------------+-------+-------+-----+-----+-------------+-----------+
| 4305513|   1|      19|     45.09819|      7.661505|                1|               19|             18|              1|   45.098

<span style="font-size: 14pt">Time-slot division over the 24-hour period (hours of day, both ends inclusive):<br>
off_peak: 1-6<br>
peak_1: 9-11<br>
peak_2: 18-20</span>

<h2>2-Creating segment tables for peak and off-peak hours</h2>

In [6]:
# Peak 1: hours 9, 10 and 11.
def peak_1(hour):
    """Build a boolean Column that is true when the hour is 9, 10 or 11.

    Parameters
    ----------
    hour : str or pyspark.sql.Column
        Name of, or reference to, the column holding the hour of day.

    Returns
    -------
    pyspark.sql.Column
        Boolean expression usable in ``DataFrame.filter``; null hours yield
        null, which ``filter`` treats as "drop the row".

    Notes
    -----
    The hour is cast to int before the membership test so the filter also
    works when the hour column is stored as a string. A native ``isin``
    also avoids sending every row through a Python UDF.
    """
    hour_col = F.col(hour) if isinstance(hour, str) else hour
    return hour_col.cast('int').isin(9, 10, 11)

df_peak_1 = df_segments.filter(peak_1('dateTime'))

In [7]:
# Number of segments in the peak-1 subset; count() is an action and runs the filter job.
df_peak_1.count()

57100332

In [8]:
# Peak 2: hours 18, 19 and 20.
def peak_2(hour):
    """Build a boolean Column that is true when the hour is 18, 19 or 20.

    Parameters
    ----------
    hour : str or pyspark.sql.Column
        Name of, or reference to, the column holding the hour of day.

    Returns
    -------
    pyspark.sql.Column
        Boolean expression usable in ``DataFrame.filter``; null hours yield
        null, which ``filter`` treats as "drop the row".

    Notes
    -----
    The hour is cast to int before the membership test so the filter also
    works when the hour column is stored as a string. A native ``isin``
    also avoids sending every row through a Python UDF.
    """
    hour_col = F.col(hour) if isinstance(hour, str) else hour
    return hour_col.cast('int').isin(18, 19, 20)

df_peak_2 = df_segments.filter(peak_2('dateTime'))

In [9]:
# Number of segments in the peak-2 subset; count() is an action and runs the filter job.
df_peak_2.count()

70264520

In [10]:
# Off-peak: hours 1 through 6 inclusive.
def off_peak(hour):
    """Build a boolean Column that is true when the hour is between 1 and 6.

    Parameters
    ----------
    hour : str or pyspark.sql.Column
        Name of, or reference to, the column holding the hour of day.

    Returns
    -------
    pyspark.sql.Column
        Boolean expression usable in ``DataFrame.filter``; null hours yield
        null, which ``filter`` treats as "drop the row".

    Notes
    -----
    The hour is cast to int before the membership test so the filter also
    works when the hour column is stored as a string. A native ``isin``
    also avoids sending every row through a Python UDF.
    """
    hour_col = F.col(hour) if isinstance(hour, str) else hour
    return hour_col.cast('int').isin(1, 2, 3, 4, 5, 6)

df_off_peak = df_segments.filter(off_peak('dateTime'))

In [11]:
# Number of segments in the off-peak subset; count() is an action and runs the filter job.
df_off_peak.count()

14723637

<h2>3-Creating grid tables</h2>

In [5]:
# Output layout of the per-cell aggregation: one row per grid cell with the
# segment count plus min/avg/max of speed and of accuracy drop.
# All fields are nullable.
schema_grid_calculations = (
    StructType()
    .add("cell_id", StringType(), True)
    .add("num_segments", IntegerType(), True)
    # speed statistics
    .add("minSpeed", DoubleType(), True)
    .add("avgSpeed", DoubleType(), True)
    .add("maxSpeed", DoubleType(), True)
    # accuracy-drop statistics
    .add("minAccuracyDrop", IntegerType(), True)
    .add("avgAccuracyDrop", DoubleType(), True)
    .add("maxAccuracyDrop", IntegerType(), True)
)



In [6]:
@F.pandas_udf(schema_grid_calculations, functionType=F.PandasUDFType.GROUPED_MAP)
def cell_data_calculator(df_segments):
    """Summarise all segments of one group into a single grid-cell row.

    Parameters
    ----------
    df_segments : pandas.DataFrame
        The rows of one group. Must contain ``cell_id_start``,
        ``segmentSpeedKmH``, ``startAccuracyDrop`` and ``endAccuracyDrop``.

    Returns
    -------
    pandas.DataFrame
        Exactly one row with the columns of ``schema_grid_calculations``:
        the group's ``cell_id_start`` (taken from its first row), the number
        of segments, min/avg/max speed, and min/avg/max accuracy drop.
        Accuracy-drop statistics pool the start and end values, so each
        segment contributes two observations. Averages and speeds are
        rounded to two decimals.

    Notes
    -----
    All statistics use pandas reductions, which skip missing values.
    Python's builtin ``min``/``max`` give order-dependent results when a
    NaN is present.
    """
    # NOTE(review): looks like the pyarrow >= 0.15 / Spark 2.x IPC
    # compatibility switch, set here so it is present in the worker
    # process - confirm it is still needed on the cluster in use.
    os.environ["ARROW_PRE_0_15_IPC_FORMAT"] = "1"

    speeds = df_segments['segmentSpeedKmH']
    accuracy_drops = pd.concat(
        [df_segments['startAccuracyDrop'], df_segments['endAccuracyDrop']],
        ignore_index=True,
    )

    # Column order is given explicitly so it always matches the declared schema.
    output_columns = [
        'cell_id', 'num_segments',
        'minSpeed', 'avgSpeed', 'maxSpeed',
        'minAccuracyDrop', 'avgAccuracyDrop', 'maxAccuracyDrop',
    ]
    return pd.DataFrame(
        {
            # .iloc[0] is positional, so the incoming index does not matter
            # and the input frame is left untouched.
            'cell_id': [df_segments['cell_id_start'].iloc[0]],
            'num_segments': [len(df_segments)],
            'minSpeed': [round(speeds.min(), 2)],
            'avgSpeed': [round(speeds.mean(), 2)],
            'maxSpeed': [round(speeds.max(), 2)],
            'minAccuracyDrop': [accuracy_drops.min()],
            'avgAccuracyDrop': [round(accuracy_drops.mean(), 2)],
            'maxAccuracyDrop': [accuracy_drops.max()],
        },
        columns=output_columns,
    )

<strong>Peak 1 grid table</strong>

In [17]:
# One output row per start cell of the peak-1 segments.
# The grouping key is `cell_id_start` only: a Python `or` between two
# non-empty strings just yields the first one, so it cannot group by
# "start or end". Segments are therefore not attributed to their end cell.
# TODO(review): confirm that ignoring `cell_id_end` is intended.
df_grid_peak_1 = df_peak_1.groupby("cell_id_start").apply(cell_data_calculator)

In [18]:
# Number of grid cells produced for peak 1; count() triggers the grouped aggregation.
df_grid_peak_1.count()

43216

In [19]:
# Collect the peak-1 grid to the driver and save it as CSV.
# The output directory is created first so the export also works on a fresh checkout.
os.makedirs('grids', exist_ok=True)
df_grid_peak_1.toPandas().to_csv('grids/grid_table_peak_1.csv',index = False)

<strong>Peak 2 grid table</strong>

In [20]:
# One output row per start cell of the peak-2 segments.
# The grouping key is `cell_id_start` only: a Python `or` between two
# non-empty strings just yields the first one, so it cannot group by
# "start or end". Segments are therefore not attributed to their end cell.
# TODO(review): confirm that ignoring `cell_id_end` is intended.
df_grid_peak_2 = df_peak_2.groupby("cell_id_start").apply(cell_data_calculator)

In [21]:
# Number of grid cells produced for peak 2; count() triggers the grouped aggregation.
df_grid_peak_2.count()

41595

In [22]:
# Collect the peak-2 grid to the driver and save it as CSV.
# The output directory is created first so the export also works on a fresh checkout.
os.makedirs('grids', exist_ok=True)
df_grid_peak_2.toPandas().to_csv('grids/grid_table_peak_2.csv',index = False)

<strong>Off-peak grid table</strong>

In [23]:
# One output row per start cell of the off-peak segments.
# The grouping key is `cell_id_start` only: a Python `or` between two
# non-empty strings just yields the first one, so it cannot group by
# "start or end". Segments are therefore not attributed to their end cell.
# TODO(review): confirm that ignoring `cell_id_end` is intended.
df_grid_off_peak = df_off_peak.groupby("cell_id_start").apply(cell_data_calculator)

In [24]:
# Number of grid cells produced for the off-peak window; count() triggers the grouped aggregation.
df_grid_off_peak.count()

32549

In [25]:
# Collect the off-peak grid to the driver and save it as CSV.
# The output directory is created first so the export also works on a fresh checkout.
os.makedirs('grids', exist_ok=True)
df_grid_off_peak.toPandas().to_csv('grids/grid_table_off_peak.csv',index = False)

<strong>Grid table for the whole day</strong>

In [9]:
# One output row per start cell over all segments (no hour filter).
# The grouping key is `cell_id_start` only: a Python `or` between two
# non-empty strings just yields the first one, so it cannot group by
# "start or end". Segments are therefore not attributed to their end cell.
# TODO(review): confirm that ignoring `cell_id_end` is intended.
df_grid_whole = df_segments.groupby("cell_id_start").apply(cell_data_calculator)

In [10]:
# Number of grid cells produced over all segments; count() triggers the grouped aggregation.
df_grid_whole.count()

50138

In [11]:
# Collect the whole-day grid to the driver and save it as CSV.
# The output directory is created first so the export also works on a fresh checkout.
os.makedirs('grids', exist_ok=True)
df_grid_whole.toPandas().to_csv('grids/grid_table_whole.csv',index = False)