In [3]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import os
import time

MASTER_URL = "spark://10.4.0.20:7077"

os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'

conf = (SparkConf()
        .set("spark.eventLog.enabled", "true")
        .set("spark.driver.host", "10.4.0.20")
        .set("spark.history.fs.logDirectory", "/tmp/spark-events")
        .set("spark.app.name", "step2-datacollector-extraction")
        .set("spark.driver.memory", "6G")
        .set("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.11:2.4.0") # mongo connector
        .setMaster(MASTER_URL))

# Analysis 2 - Classification

## Raw-Spark solution

In [6]:
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

In [11]:
from pyspark.sql.functions import lead, col, hour, minute
from pyspark.sql import Window
from pyspark.sql.types import StructType, StructField, ArrayType, StringType, DoubleType, IntegerType, DateType, LongType

w = Window.partitionBy("uuid").orderBy("tick")

capability = "city_traffic"
default_uri = "mongodb://10.4.0.20:27017/data_collector_development"
default_collection = "sensor_values"
pipeline = "{'$match': {'capability': '"+capability+"'}}"


sch = (StructType()
    .add("nodeID", LongType())
    .add("tick", LongType())
    .add("uuid", StringType()))

df = (spark
    .read
    .format("com.mongodb.spark.sql.DefaultSource")
    .option("spark.mongodb.input.uri", "{0}.{1}".format(default_uri, default_collection))
    .option("pipeline", pipeline)
    .schema(sch)
    .load())

df.createOrReplaceGlobalTempView("raw-spark-entry")

AnalysisException: 'Invalid view name: raw-spark-entry;'

In [None]:

leadedDf = (df
    .withColumn("T1", lead(col("T0"), 1).over(w))
    .withColumn("(T1-T0)", col("T1") - col("T0"))
    .withColumn("round(T0)", (col("T0")/1800).cast("int")*1800)
    .withColumn("V", lead(col("U"), 1).over(w))
    .filter("`T1` is not null")
    .withColumn("TWindow", col("T0").cast("timestamp"))
    .withColumn("H", hour("TWindow"))
    .withColumn("M", ((minute("TWindow")/20).cast("int")*20)))

leadedDf.createOrReplaceGlobalTempView("traffic")

trafficDf = (spark
             .sql("select round(T0, 0) from global_temp.traffic")
             .groupby("round(T0, 0)")
             .count())