In [1]:
import pyspark
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
import os
import time

# Spark standalone master this notebook connects to.
MASTER_URL = "spark://172.24.0.71:7077"
# MASTER_URL = "spark://10.4.0.20:7077"

# Tell PySpark which Python interpreter to use.
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'

# Application configuration. It has to be complete before the SparkContext is
# created, because the settings are handed over when the context starts.
conf = (SparkConf()
        .set("spark.eventLog.enabled", "true")
        # spark.eventLog.dir is the property the *application* uses when it
        # writes event logs; set it explicitly instead of relying on the
        # default happening to match the History Server directory below.
        .set("spark.eventLog.dir", "file:///tmp/spark-events")
        .set("spark.driver.host", "172.24.0.71")
        # Read by the Spark History Server, not by the application; kept so
        # the intended log location stays documented in one place.
        .set("spark.history.fs.logDirectory", "/tmp/spark-events")
        .set("spark.app.name", "step2-datacollector-extraction")
        .set("spark.driver.memory", "6G")
        .set("spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.11:2.4.0") # mongo connector
        .setMaster(MASTER_URL))

# Analysis 2 - Classification

## Raw-Spark solution

In [2]:
# Create the session through the builder so that re-running this cell reuses
# the live context instead of failing because a SparkContext already exists.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
# Keep `sc` available for cells that use the low-level context directly.
sc = spark.sparkContext

In [5]:
from pyspark.sql.functions import lead, col, hour, minute
from pyspark.sql import Window
from pyspark.sql.types import *

# MongoDB source settings: only documents with this capability are read.
capability = "city_traffic"
default_uri = "mongodb://172.24.0.71:27017/data_collector_development"
default_collection = "sensor_values"
# Aggregation stage passed to the connector's "pipeline" option.
pipeline = "{{'$match': {{'capability': '{0}'}}}}".format(capability)

# Explicit schema: only these three fields are loaded into the DataFrame.
sch = StructType([
    StructField("nodeID", LongType()),
    StructField("tick", LongType()),
    StructField("uuid", StringType()),
])

# Configure the reader step by step, then load with the schema above.
mongo_reader = spark.read.format("com.mongodb.spark.sql.DefaultSource")
mongo_reader = mongo_reader.option("spark.mongodb.input.uri", default_uri + "." + default_collection)
mongo_reader = mongo_reader.option("pipeline", pipeline)
df = mongo_reader.schema(sch).load()

# Expose the data to Spark SQL as global_temp.rawspark.
df.createOrReplaceGlobalTempView("rawspark")

In [7]:
# Sanity check: query the global temp view registered above; the notebook
# output is the resulting DataFrame's schema repr.
spark.sql("select * from global_temp.rawspark")

DataFrame[nodeID: bigint, tick: bigint, uuid: string]

In [86]:
# Width, in minutes, of the time-of-day bucket used when grouping.
BUCKET_MINUTES = 20

# Spark SQL statements executed in order. Each of the first four creates a
# temporary view consumed by the next one; only the last statement returns the
# aggregated result.
queries = [
    # q1: pair every record with the next record of the same uuid (ordered by
    # tick), giving (T0, U) -> (T1, V).
    # NOTE(review): the last record of each uuid has no following row, so T1
    # and V are NULL there and those rows end up in groups with V = NULL;
    # filter them out if that is not wanted.
    """
        create or replace temporary view q1 as
            select
                lead(tick, 1) over (partition by uuid order by tick) as T1,
                tick as T0,
                lead(nodeID, 1) over (partition by uuid order by tick) as V,
                nodeID as U
            from global_temp.rawspark
    """,
    # q2: turn the starting tick into a timestamp.
    # NOTE(review): presumably tick is in epoch seconds - confirm.
    """
        create or replace temporary view q2 as
            select
                timestamp(T0) as TWindow,
                T1,
                T0,
                U,
                V
            from q1
    """,
    # q3: extract hour and minute of the starting timestamp.
    """
        create or replace temporary view q3 as
            select
                hour(TWindow) as H,
                minute(TWindow) as M,
                T1, T0, U, V, TWindow
            from q2 
    """,
    # q4: snap the minute down to the start of its bucket and compute the
    # tick difference. `/` is floating-point division in Spark SQL, so the
    # quotient must be floored BEFORE multiplying back; otherwise
    # (M / 20) * 20 simply yields M again and no bucketing happens.
    """
        create or replace temporary view q4 as
            select
                CAST(FLOOR(M / {bucket}) * {bucket} AS INT) as M,
                H,
                U,
                V,
                (T1 - T0) as delta
            from q3
    """.format(bucket=BUCKET_MINUTES),
    # Final aggregate: mean and population std-dev of delta per (U, V, H, M).
    """
        select 
            mean(delta) as meanT,
            stddev_pop(delta) as stddevT,
            U,
            V,
            H,
            M
        from q4
            group by U, V, H, M
    """
]

# Run the view-building statements, then keep the DataFrame of the final
# select. `dataframe` stays a one-element list because later cells read
# dataframe[0].
for view_sql in queries[:-1]:
    spark.sql(view_sql)
dataframe = [spark.sql(queries[-1])]

In [88]:
# Show the final aggregated DataFrame (the notebook output is its schema repr).
dataframe[0]

DataFrame[meanT: double, stddevT: double, U: bigint, V: bigint, H: int, M: int]

In [90]:
# Persist the aggregated result as Parquet, replacing any previous output.
report_writer = dataframe[0].write.mode("overwrite")
report_writer.parquet("/tmp/dataprocessor-report/analysis2.parquet")

In [96]:
# sch = StructType()
# sch.add("meanT", DoubleType())
# sch.add("stddevT", DoubleType())
# sch.add("U", StringType())
# sch.add("V", StringType())
# sch.add("H", LongType())
# sch.add("M", LongType())

# df = spark.read.format("parquet").schema(sch).load("/tmp/dataprocessor-report/step2.parquet")

In [105]:
# df.createOrReplaceGlobalTempView("mymodel")