In [1]:
# Load the points metadata CSV: the header row supplies the column names and
# the reader infers each column's type.
raw_points_df = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferSchema='true')
    .load('timeseries/input/points.csv')
    # "_c0" is presumably an unnamed leading index column -- TODO confirm.
    .drop("_c0")
)
# Preview the first five rows.
raw_points_df.show(5)

+--------------------+------+----+---------+-----------+---------+------+------+-----------+-------+-------+----+------+---------+----+------------+------+-----+---------+------------------+---------+----+-------+---------+----+--------+--------------------+------+--------+----------+----------+-----------+----------+-----------+-----+------------------+-------+----+-----+------+------------+---------------+--------------------+--------------+----+-------+----+---+--------------------+---------+------+-------+------+---------+-------+--------------------+---------+--------------+----+--------+----+--------+------+-------+----------+----+---------+----+---------+-------+--------------+--------+-------+----+--------+-------+-----+--------+--------------+----+----------------+------+----+--------+------+---------+-------+----+-----+-----+------+----------+----+-----------+------+----+-----+----+----+-----+--------------+-------+----+--------------------+
|                  id|ahuRef| air|

In [2]:
# Build the vertex table: keep a fixed subset of the points columns and expose
# the "dis" column under the name "id".
vertices = (
    raw_points_df
    .select("dis", "siteRef", "levelRef", "equipRef", "unit", "kind")
    .withColumnRenamed("dis", "id")
)
vertices.show()

+------------+-------+----------+------------------+----+------+
|          id|siteRef|  levelRef|          equipRef|unit|  kind|
+------------+-------+----------+------------------+----+------+
|         OAT|   Site|Site Plant|Site Building Info|  °C|Number|
|         OAH|   Site|Site Plant|Site Building Info|   %|Number|
|   ACU-2_ENB|   Site|Site Plant|        Site ACU 2|null|  Bool|
|   ACU-2_STS|   Site|Site Plant|        Site ACU 2|null|  Bool|
|   ACU-2_SPD|   Site|Site Plant|        Site ACU 2|   %|Number|
|  ACU-2_SAPR|   Site|Site Plant|        Site ACU 2|  Pa|Number|
|ACU-2_SAPRSP|   Site|Site Plant|        Site ACU 2|  Pa|Number|
|   ACU-2_SAT|   Site|Site Plant|        Site ACU 2|  °C|Number|
| ACU-2_SATSP|   Site|Site Plant|        Site ACU 2|  °C|Number|
|   ACU-2_RAT|   Site|Site Plant|        Site ACU 2|  °C|Number|
|   ACU-2_HCV|   Site|Site Plant|        Site ACU 2|   %|Number|
|   ACU-2_CCV|   Site|Site Plant|        Site ACU 2|   %|Number|
|  ACU-2_CCV2|   Site|Sit

In [3]:
# Build the edges DataFrame with columns src, dst, relationship
from pyspark.sql import Row

# Edge column names, in display order.
schema = [
    "src",
    "dst",
    "relationship",
]
# Column names of the points table; loopEdges reads this module-level list.
points_cols = raw_points_df.columns

def loopEdges(row):
    """Return one self-loop edge for each column of ``row`` holding "✓".

    Every name in the module-level ``points_cols`` list is looked up in
    ``row``; each column whose value equals "✓" produces a ``Row`` whose
    ``src`` and ``dst`` are both the row's 'dis' value and whose
    ``relationship`` is that column's name.

    Args:
        row: A mapping-like record indexable by column name; it must contain
            'dis' and every name in ``points_cols``.

    Returns:
        A list of ``Row`` objects in ``points_cols`` order; empty when no
        column holds "✓".
    """
    # Looked up first so a row without 'dis' fails even if nothing matches.
    point_name = row['dis']
    return [
        Row(src=point_name, dst=point_name, relationship=column_name)
        for column_name in points_cols
        if row[column_name] == "✓"
    ]

# Expand every points row into its edge rows (zero or more per row).
edges_rdd = raw_points_df.rdd.flatMap(loopEdges)
# Build a DataFrame from the edge rows and fix the column order.
edges = sqlContext.createDataFrame(edges_rdd).select("src", "dst", "relationship")
edges.show()

+---------+---------+------------+
|      src|      dst|relationship|
+---------+---------+------------+
|      OAT|      OAT|         air|
|      OAT|      OAT|   analytics|
|      OAT|      OAT|         his|
|      OAT|      OAT|    imported|
|      OAT|      OAT|     outside|
|      OAT|      OAT|       point|
|      OAT|      OAT|      sensor|
|      OAT|      OAT|        temp|
|      OAH|      OAH|         air|
|      OAH|      OAH|   analytics|
|      OAH|      OAH|         his|
|      OAH|      OAH|    humidity|
|      OAH|      OAH|    imported|
|      OAH|      OAH|     outside|
|      OAH|      OAH|       point|
|      OAH|      OAH|      sensor|
|ACU-2_ENB|ACU-2_ENB|   analytics|
|ACU-2_ENB|ACU-2_ENB|         cmd|
|ACU-2_ENB|ACU-2_ENB|   discharge|
|ACU-2_ENB|ACU-2_ENB|         fan|
+---------+---------+------------+
only showing top 20 rows



In [4]:
# Persist both graph tables as CSV with a header row, replacing any output
# already present at the target paths.
vertices.write.mode("overwrite").option("header", True).csv("timeseries/graph/metadata/vertices.csv")
edges.write.mode("overwrite").option("header", True).csv("timeseries/graph/metadata/edges.csv")