In [1]:
# SETUP CODE
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [2]:
# SETUP CODE
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"

In [3]:
# SETUP CODE
import findspark
findspark.init()
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark.conf.set("spark.sql.repl.eagerEval.enabled", True) # Property used to format output tables better
spark
sc = spark.sparkContext

In [4]:
# MOUNT Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [17]:
# LOAD data
from pyspark import SparkContext, SQLContext, Row
from pyspark.sql.types import *
from pyspark.sql.functions import lit
sqlContext = SQLContext(sc)

#put data in a folder call csvdata
data1 = "/content/drive/MyDrive/csvdata/detail_record_2017_01_02_08_00_00"
data2 = "/content/drive/MyDrive/csvdata/detail_record_2017_01_03_08_00_00"
data3 = "/content/drive/MyDrive/csvdata/detail_record_2017_01_04_08_00_00"
data4 = "/content/drive/MyDrive/csvdata/detail_record_2017_01_05_08_00_00"
data5 = "/content/drive/MyDrive/csvdata/detail_record_2017_01_06_08_00_00"
data6 = "/content/drive/MyDrive/csvdata/detail_record_2017_01_07_08_00_00"
data7 = "/content/drive/MyDrive/csvdata/detail_record_2017_01_08_08_00_00"
data8 = "/content/drive/MyDrive/csvdata/detail_record_2017_01_09_08_00_00"
data9 = "/content/drive/MyDrive/csvdata/detail_record_2017_01_10_08_00_00"
data10 = "/content/drive/MyDrive/csvdata/detail_record_2017_01_11_08_00_00"

out = "/content/drive/MyDrive/csvdata/output"

text_file = sc.textFile(data1 +","+ data2+","+ data3+","+ data4+","+ data5+","+ data6+","+ data7+","+ data8+","+ data9+","+data10)
print(text_file)

/content/drive/MyDrive/csvdata/detail_record_2017_01_02_08_00_00,/content/drive/MyDrive/csvdata/detail_record_2017_01_03_08_00_00,/content/drive/MyDrive/csvdata/detail_record_2017_01_04_08_00_00,/content/drive/MyDrive/csvdata/detail_record_2017_01_05_08_00_00,/content/drive/MyDrive/csvdata/detail_record_2017_01_06_08_00_00,/content/drive/MyDrive/csvdata/detail_record_2017_01_07_08_00_00,/content/drive/MyDrive/csvdata/detail_record_2017_01_08_08_00_00,/content/drive/MyDrive/csvdata/detail_record_2017_01_09_08_00_00,/content/drive/MyDrive/csvdata/detail_record_2017_01_10_08_00_00,/content/drive/MyDrive/csvdata/detail_record_2017_01_11_08_00_00 MapPartitionsRDD[41] at textFile at NativeMethodAccessorImpl.java:0


In [13]:
counts = text_file.map(lambda line: line.split(",")).filter(lambda line: len(line)>8)
column_data = counts.map(lambda p: Row(p[0], p[1], p[2], p[3], p[4], \
                      p[5], p[6], p[7], p[8], p[9] , \
                      p[10], p[11], p[12], p[13], p[14], \
                      p[15], p[16], p[17], p[18]))

column_name = "driverID,carPlateNumber,Latitude,Longitude,Speed,Direction,siteName,Time,isRapidlySpeedup,isRapidlySlowdown,isNeutralSlide,isNeutralSlideFinished,neutralSlideTime,isOverspeed,isOverspeedFinished,overspeedTime,isFatigueDriving,isHthrottleStop,isOilLeak"
sql = "SELECT first(driverID),first(carPlateNumber),first(Time) \
                              as recordDAY,HOUR(Time) as recordHOUR,\
                              sum(isRapidlySpeedup),sum(isRapidlySlowdown),sum(isNeutralSlide),sum(isNeutralSlideFinished),\
                              sum(neutralSlideTime),sum(isOverspeed),sum(isOverspeedFinished),sum(overspeedTime),sum(isFatigueDriving),\
                              sum(isHthrottleStop),sum(isOilLeak) \
                              FROM summary \
                              GROUP BY driverID,DAY(Time),HOUR(Time)"
fields = [StructField(field_name, StringType(), True) for field_name in column_name.split(",")]
schema = StructType(fields)


In [14]:
# apply the schema to the RDD
dataframe = sqlContext.createDataFrame(column_data,schema)
dataframe

driverID,carPlateNumber,Latitude,Longitude,Speed,Direction,siteName,Time,isRapidlySpeedup,isRapidlySlowdown,isNeutralSlide,isNeutralSlideFinished,neutralSlideTime,isOverspeed,isOverspeedFinished,overspeedTime,isFatigueDriving,isHthrottleStop,isOilLeak
likun1000003,华AVM936,32.056444,118.777589,72,211,,2017-01-01 08:00:05,,,,,,,,,1.0,,
haowei1000008,华A709GB,30.6786,104.070835,143,115,,2017-01-01 08:00:08,,,,,,1.0,,,,,
haowei1000008,华A709GB,30.6786,104.070835,120,115,,2017-01-01 08:00:08,,,,,,,1.0,0.0,,,
zouan1000007,华A58M83,28.210856,112.979521,139,177,,2017-01-01 08:00:12,,,,,,1.0,,,,,
duxu1000009,华AT75H8,38.500677,106.210269,137,118,,2017-01-01 08:00:13,,,,,,1.0,,,,,
duxu1000009,华AT75H8,38.500517,106.210581,120,118,,2017-01-01 08:00:14,,,,,,,1.0,1.0,,,
zouan1000007,华A58M83,28.207153,112.97976,131,177,,2017-01-01 08:00:24,,1.0,,,,,,,,,
haowei1000008,华A709GB,30.675895,104.076702,130,115,,2017-01-01 08:00:29,,,,,,1.0,,,,,
haowei1000008,华A709GB,30.675895,104.076702,120,115,,2017-01-01 08:00:29,,,,,,,1.0,0.0,,,
duxu1000009,华AT75H8,38.498514,106.214488,105,118,,2017-01-01 08:00:29,,,,,,,,,,1.0,


In [15]:
# register the DataFrame as a table. (summary)
dataframe.registerTempTable("summary")
dataframe

driverID,carPlateNumber,Latitude,Longitude,Speed,Direction,siteName,Time,isRapidlySpeedup,isRapidlySlowdown,isNeutralSlide,isNeutralSlideFinished,neutralSlideTime,isOverspeed,isOverspeedFinished,overspeedTime,isFatigueDriving,isHthrottleStop,isOilLeak
likun1000003,华AVM936,32.056444,118.777589,72,211,,2017-01-01 08:00:05,,,,,,,,,1.0,,
haowei1000008,华A709GB,30.6786,104.070835,143,115,,2017-01-01 08:00:08,,,,,,1.0,,,,,
haowei1000008,华A709GB,30.6786,104.070835,120,115,,2017-01-01 08:00:08,,,,,,,1.0,0.0,,,
zouan1000007,华A58M83,28.210856,112.979521,139,177,,2017-01-01 08:00:12,,,,,,1.0,,,,,
duxu1000009,华AT75H8,38.500677,106.210269,137,118,,2017-01-01 08:00:13,,,,,,1.0,,,,,
duxu1000009,华AT75H8,38.500517,106.210581,120,118,,2017-01-01 08:00:14,,,,,,,1.0,1.0,,,
zouan1000007,华A58M83,28.207153,112.97976,131,177,,2017-01-01 08:00:24,,1.0,,,,,,,,,
haowei1000008,华A709GB,30.675895,104.076702,130,115,,2017-01-01 08:00:29,,,,,,1.0,,,,,
haowei1000008,华A709GB,30.675895,104.076702,120,115,,2017-01-01 08:00:29,,,,,,,1.0,0.0,,,
duxu1000009,华AT75H8,38.498514,106.214488,105,118,,2017-01-01 08:00:29,,,,,,,,,,1.0,


In [16]:
# execute the sql
group_data = sqlContext.sql(sql)
group_data

first(driverID),first(carPlateNumber),recordDAY,recordHOUR,sum(CAST(isRapidlySpeedup AS DOUBLE)),sum(CAST(isRapidlySlowdown AS DOUBLE)),sum(CAST(isNeutralSlide AS DOUBLE)),sum(CAST(isNeutralSlideFinished AS DOUBLE)),sum(CAST(neutralSlideTime AS DOUBLE)),sum(CAST(isOverspeed AS DOUBLE)),sum(CAST(isOverspeedFinished AS DOUBLE)),sum(CAST(overspeedTime AS DOUBLE)),sum(CAST(isFatigueDriving AS DOUBLE)),sum(CAST(isHthrottleStop AS DOUBLE)),sum(CAST(isOilLeak AS DOUBLE))
likun1000003,华AVM936,2017-01-02 23:00:28,23,3.0,3.0,1.0,1.0,32.0,36.0,36.0,441.0,41.0,3.0,9.0
shenxian1000004,华ADJ750,2017-01-06 08:01:38,8,4.0,1.0,7.0,7.0,72.0,33.0,33.0,314.0,37.0,1.0,7.0
xiexiao1000001,华AEB132,2017-01-09 03:01:09,3,1.0,4.0,5.0,5.0,28.0,33.0,33.0,275.0,41.0,2.0,4.0
xiezhi1000006,华A6CU11,2017-01-07 06:00:35,6,4.0,10.0,2.0,2.0,36.0,34.0,35.0,428.0,35.0,8.0,4.0
shenxian1000004,华ADJ750,2017-01-03 18:04:16,18,3.0,6.0,4.0,4.0,41.0,31.0,31.0,265.0,24.0,4.0,8.0
xiexiao1000001,华AEB132,2017-01-09 02:00:36,2,5.0,4.0,3.0,3.0,21.0,33.0,33.0,311.0,42.0,2.0,2.0
zengpeng1000000,华AZQ110,2017-01-08 21:01:08,21,4.0,2.0,4.0,4.0,57.0,34.0,34.0,304.0,35.0,4.0,2.0
zouan1000007,华A58M83,2017-01-10 05:01:25,5,2.0,5.0,3.0,3.0,13.0,41.0,41.0,475.0,34.0,3.0,3.0
shenxian1000004,华ADJ750,2017-01-08 20:00:36,20,6.0,4.0,1.0,1.0,0.0,30.0,31.0,405.0,32.0,6.0,5.0
xiexiao1000001,华AEB132,2017-01-05 16:00:30,16,3.0,7.0,3.0,3.0,24.0,31.0,31.0,270.0,42.0,3.0,4.0


In [18]:
# save the data
group_data.coalesce(1).write.csv(out, mode="overwrite")