In [1]:
# Load the points CSV (header row, inferred column types), discard the "_c0"
# column and preview the first five rows.
raw_points_df = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true', inferSchema='true')
    .load('timeseries/input/points.csv')
    .drop("_c0")
)
raw_points_df.show(5)

+--------------------+------+----+---------+-----------+---------+------+------+-----------+-------+-------+----+------+---------+----+------------+------+-----+---------+------------------+---------+----+-------+---------+----+--------+--------------------+------+--------+----------+----------+-----------+----------+-----------+-----+------------------+-------+----+-----+------+------------+---------------+--------------------+--------------+----+-------+----+---+--------------------+---------+------+-------+------+---------+-------+--------------------+---------+--------------+----+--------+----+--------+------+-------+----------+----+---------+----+---------+-------+--------------+--------+-------+----+--------+-------+-----+--------+--------------+----+----------------+------+----+--------+------+---------+-------+----+-----+-----+------+----------+----+-----------+------+----+-----+----+----+-----+--------------+-------+----+--------------------+
|                  id|ahuRef| air|

In [2]:
from pyspark.sql import Row
# Keep the schema and the column names of the raw points frame; changeToBool
# iterates over the names and the schema is reused to rebuild the DataFrame.
points_schema = raw_points_df.schema
points_cols = raw_points_df.columns
print(points_cols)
def changeToBool(row):
    """Return a new ``Row`` in which every "✓" cell of ``row`` becomes 1.

    Only the columns named in the module-level ``points_cols`` are read. Any
    cell whose value is not "✓" is carried over unchanged.
    """
    cells = ((name, row[name]) for name in points_cols)
    return Row(**{name: (1 if cell == "✓" else cell) for name, cell in cells})
            
# Run changeToBool over every row, then rebuild a DataFrame with the schema
# captured from the raw points frame.
points_rdd = raw_points_df.rdd.map(changeToBool)
points_df = sqlContext.createDataFrame(points_rdd, schema=points_schema)
points_df.show()

['id', 'ahuRef', 'air', 'analytics', 'armsAssetId', 'averazing', 'boiler', 'bypass', 'calendarRef', 'chilled', 'chiller', 'cmd', 'common', 'condenser', 'cool', 'coolingTower', 'damper', 'delta', 'dis', 'disMacro', 'discharge', 'dk', 'economy', 'effective', 'elec', 'elecHeat', 'elecMeterLoad', 'enable', 'entering', 'enum', 'envizi_CCV', 'envizi_CCV2', 'envizi_HCV', 'envizi_HCV2', 'equip', 'equipRef', 'exhaust', 'fan', 'fault', 'filter', 'gasMeterLoad', 'haystackConnRef', 'haystackHis', 'haystackHisOld', 'heat', 'heating', 'help', 'his', 'hisEnd', 'hisEndVal', 'hisErr', 'hisFunc', 'hisId', 'hisRollup', 'hisSize', 'hisStart', 'hisStatus', 'hisTsPrecision', 'hot', 'humidity', 'hvac', 'imported', 'kind', 'leaving', 'levelRef', 'low', 'maxFlowSp', 'min', 'minFlowSp', 'minimum', 'navName', 'nextTime', 'nextVal', 'occ', 'occSched', 'outside', 'point', 'pressure', 'projectAssetId', 'pump', 'regionRef', 'return', 'run', 'schedule', 'sensor', 'sitePoint', 'siteRef', 'sp', 'speed', 'stage', 'suppl

In [3]:
from au.com.gegroup.ts.writer import Writer
# Writer signature: Writer(dataframe, dataset, row_keys)
writer = Writer(points_df, "points_metadata_v0", "dis")
# Select "overwrite" mode first, then trigger the write.
overwrite_writer = writer.mode("overwrite")
overwrite_writer.write()

In [4]:
# Read the "points_metadata_v0" dataset back through the filodb.spark source
# and preview it.
metadata = (
    sqlContext.read
    .format("filodb.spark")
    .option("dataset", "points_metadata_v0")
    .load()
)
metadata.show()

+------+--------------------+-------+-----+-----+----------+---------------+-----+---+----------------+---------+--------+-------+----+--------+-------+--------------------+-------+--------+----+------+------------+---------+----+-------+----------+-------+--------+----+----------+---------+-----------+---------+------+--------------+----+--------------------+------+---------+---------+-------+-------+-------+---------+----+----+----------+--------------+-------+----+------+-----+--------------+------------------+------+------+----+---------+------------+--------------------+----+--------------------+--------+------+----------+--------+-----------+----+---------+--------+------+-------+--------------------+----+--------------------+----------+------+----+-----------+--------+-------+--------------+-----+------+------+----+----+----+------------+---------+-----------+----+-----+-----+------+-------+-----------+----------+----+----+------+-----+----+----+
|enable|            hisStart|eco

In [5]:
# Read the history CSV with a header row (inferSchema is not requested here),
# mark the frame as cached and preview the first five rows.
history_df = (
    sqlContext.read
    .format('com.databricks.spark.csv')
    .options(header='true')
    .load('timeseries/input/histories.csv')
    .cache()
)
history_df.show(5)

+--------------------+----+----+---------+---------+---------+----------+------------+---------+-----------+---------+---------+---------+----------+---------+----------+----------+----------+------------+------------+------------+------------+----------------+----------------+----------+----------+----------+-------+-------+-------+-------+-------+---------+----------+----------+----------+-----------+-----------+-----------+------------+------------+------------+------------+---------------+---------------+---------------+---------------+---------------+---------+---------+---------+---------+-----------+---------+---------+----------+---------+---------+----------+----------+----------+----------+----------+----------+-------+-------+-------+-------+-------+---------+---------+---------+---------+----------------+----------------+---------------+---------------+---------------+---------------+---------+---------+----------+----------+----------+-------------+-------------+----------

In [8]:
import datetime, time
from pyspark.sql.types import *
# Column names of the history frame with the 'Timestamp' entry taken out;
# the remaining names are the ones clean_as_schema iterates over.
columns = history_df.columns
try:
    columns.remove('Timestamp')
except ValueError:
    # The frame has no 'Timestamp' column, so there is nothing to remove.
    pass

def clean_as_schema(row, columns):
    """Expand one history row into a list of per-column ``Row`` records.

    The row's 'Timestamp' string is converted twice: its first 19 characters
    are parsed into a ``datetime`` object, and the full string is passed to
    ``convert_to_timestamp`` to obtain an integer. One
    ``Row(datetime_object, integer, column_name, cell_value)`` is produced for
    each name in ``columns`` whose cell is not None, 'null' or ''.
    """
    raw_stamp = row['Timestamp']
    parsed_stamp = datetime.datetime.strptime(raw_stamp[:19], "%Y-%m-%dT%H:%M:%S")
    epoch_value = convert_to_timestamp(raw_stamp)
    cells = ((name, row[name]) for name in columns)
    return [
        Row(parsed_stamp, epoch_value, name, cell)
        for name, cell in cells
        if cell is not None and cell != 'null' and cell != ''
    ]


def convert_to_timestamp(date_string):
    """Convert a "YYYY-MM-DDTHH:MM:SS..." string into an integer.

    Only the first 19 characters are parsed; anything after them is ignored.
    The result is the number of seconds between the parsed value and
    ``utcfromtimestamp(0)``, multiplied by 1000*1000*1000 and truncated to
    ``int``.
    """
    parsed = datetime.datetime.strptime(date_string[:19], "%Y-%m-%dT%H:%M:%S")
    epoch = datetime.datetime.utcfromtimestamp(0)
    elapsed_seconds = (parsed - epoch).total_seconds()
    return int(elapsed_seconds * 1000*1000*1000)
        
# Explode every history row into (timestamp, datetime, pointName, raw_value)
# records via clean_as_schema.
cleaned_his_rdd = history_df.rdd.flatMap(lambda his_row: clean_as_schema(his_row, columns))

# Target schema for the exploded records; every field is declared non-nullable.
schema = StructType([
    StructField(field_name, field_type, False)
    for field_name, field_type in (
        ("timestamp", TimestampType()),
        ("datetime", LongType()),
        ("pointName", StringType()),
        ("raw_value", StringType()),
    )
])

cleaned_his_df = sqlContext.createDataFrame(cleaned_his_rdd, schema)
# Make the frame queryable from SQL under the name "cleanedHisDf".
cleaned_his_df.registerTempTable("cleanedHisDf")
cleaned_his_df.show()

+--------------------+-------------------+----------------+---------+
|           timestamp|           datetime|       pointName|raw_value|
+--------------------+-------------------+----------------+---------+
|2016-11-05 01:43:...|1478310223000000000|OAF-1-Tenant_ENB|    false|
|2016-11-05 01:43:...|1478310223000000000|RAF-1-Tenant_ENB|    false|
|2016-11-05 01:44:...|1478310246000000000|OAF-1-Tenant_STS|    false|
|2016-11-05 01:44:...|1478310246000000000|RAF-1-Tenant_STS|    false|
|2017-02-03 07:00:...|1486105205000000000|OAF-2-Tenant_ENB|     true|
|2017-02-03 07:00:...|1486105205000000000|RAF-2-Tenant_ENB|     true|
|2017-02-03 07:00:...|1486105210000000000|OAF-2-Tenant_STS|     true|
|2017-02-03 07:00:...|1486105211000000000|RAF-2-Tenant_STS|     true|
|2017-08-07 14:57:...|1502117823000000000|     STCWP-1_STS|     true|
|2017-08-07 14:57:...|1502117823000000000|     PTCWP-1_ENB|     true|
|2017-09-21 08:24:...|1505982247000000000|    Boiler-1_ENB|    false|
|2017-09-21 08:24:..

In [9]:
# Both frames are registered as temp tables so the join can be written in SQL
# (the DataFrame API could express the same join).
points_df.registerTempTable("points")
join_query = "SELECT * from cleanedHisDf as h left join points as p on h.pointName = p.dis"
# "dis" is the join key on the points side, so it is dropped after the join.
joined_his_df = sqlContext.sql(join_query).drop("dis")
joined_his_df.show()

+--------------------+-------------------+----------+--------------------+--------------------+------+---+---------+-----------+---------+------+------+-----------+-------+-------+----+------+---------+----+------------+------+-----+--------+---------+----+-------+---------+----+--------+-------------+------+--------+----+----------+-----------+----------+-----------+-----+----------+-------+----+-----+------+------------+---------------+--------------------+--------------+----+-------+----+---+--------------------+----------+------+-------+-----+---------+-------+--------------------+---------+--------------+----+--------+----+--------+------+-------+----------+----+---------+----+---------+-------+--------------------+--------+-------+----+--------+-------+-----+--------+--------------+----+----------------+------+----+--------+------+---------+-------+----+-----+-----+------+----------+----+-----------+------+----+-----+----+----+-----+--------------+-------+----+-------------------

In [10]:
from pyspark.sql.functions import udf, col
def clean_raw_value(value, unit, kind):
    """Convert a raw history reading into a float.

    Args:
        value: Raw reading as a string.
        unit: Text to remove from ``value`` before numeric parsing. May be
            None or empty, in which case nothing is removed.
        kind: Point kind. "Bool" selects the boolean mapping; any other value
            (including None) selects numeric parsing.

    Returns:
        For "Bool" points, 1.0 when ``value`` is "true" and 0.0 otherwise.
        For every other kind, the float parsed from ``value`` with ``unit``
        removed and surrounding whitespace stripped, or None when ``value``
        is None.

    Raises:
        ValueError: if a non-Bool value is not numeric once the unit has been
            removed.
    """
    if "Bool" == kind:
        return 1.0 if "true" == value else 0.0
    if value is None:
        return None
    # The history rows are LEFT-joined to the points table, so ``unit`` can be
    # NULL (None here); str.replace(None, "") would raise TypeError.
    text = value.replace(unit, "") if unit else value
    return float(text.strip())

# Expose clean_raw_value as a Spark UDF returning a double and store its
# result, computed from raw_value / unit / kind, in a "value" column.
clean_udf = udf(clean_raw_value, DoubleType())
joined_his_df = joined_his_df.withColumn(
    "value",
    clean_udf(col("raw_value"), col("unit"), col("kind")),
)
joined_his_df.show()

+--------------------+-------------------+----------+--------------------+--------------------+------+---+---------+-----------+---------+------+------+-----------+-------+-------+----+------+---------+----+------------+------+-----+--------+---------+----+-------+---------+----+--------+-------------+------+--------+----+----------+-----------+----------+-----------+-----+----------+-------+----+-----+------+------------+---------------+--------------------+--------------+----+-------+----+---+--------------------+----------+------+-------+-----+---------+-------+--------------------+---------+--------------+----+--------+----+--------+------+-------+----------+----+---------+----+---------+-------+--------------------+--------+-------+----+--------+-------+-----+--------+--------------+----+----------------+------+----+--------+------+---------+-------+----+-----+-----+------+----------+----+-----------+------+----+-----+----+----+-----+--------------+-------+----+-------------------

In [11]:
def get_year_month(val):
    """Return the "YYYY-MM" label for a nanosecond epoch value.

    ``val`` is measured from 1970-01-01T00:00:00, the same origin that
    ``convert_to_timestamp`` subtracts when it produces these values. The
    conversion is plain date arithmetic, so the result does not depend on the
    time zone of the machine running it (``datetime.fromtimestamp`` would
    shift the value into local time and could pick the neighbouring month
    near a month boundary).

    Args:
        val: Integer number of nanoseconds since the epoch.

    Returns:
        The year and month of that instant formatted as "%Y-%m".
    """
    epoch = datetime.datetime(1970, 1, 1)
    # Floor division drops the sub-second part and keeps the arithmetic exact.
    moment = epoch + datetime.timedelta(seconds=val // (1000*1000*1000))
    return moment.strftime("%Y-%m")
    
# yearMonth is generated as an extra partition key: partitioning on siteRef
# alone may produce partitions that are too large, and a Cassandra partition
# should stay below 1 GB.
year_month_udf = udf(get_year_month, StringType())
final_df = (
    joined_his_df
    .withColumn("yearMonth", year_month_udf(col("datetime")))
    .select("timestamp", "datetime", "yearMonth", "pointName", "siteRef", "levelRef", "equipRef", "value")
)
final_df.printSchema()
final_df.show()

root
 |-- timestamp: timestamp (nullable = false)
 |-- datetime: long (nullable = false)
 |-- yearMonth: string (nullable = true)
 |-- pointName: string (nullable = false)
 |-- siteRef: string (nullable = true)
 |-- levelRef: string (nullable = true)
 |-- equipRef: string (nullable = true)
 |-- value: double (nullable = true)

+--------------------+-------------------+---------+----------+-------+----------+----------+------------------+
|           timestamp|           datetime|yearMonth| pointName|siteRef|  levelRef|  equipRef|             value|
+--------------------+-------------------+---------+----------+-------+----------+----------+------------------+
|2017-10-10 23:45:...|1507679101000000000|  2017-10|ACU-2_SAPR|   Site|Site Plant|Site ACU 2|1.2893999814987183|
|2017-10-11 00:00:...|1507680001000000000|  2017-10|ACU-2_SAPR|   Site|Site Plant|Site ACU 2|1.3430999517440796|
|2017-10-11 00:15:...|1507680901000000000|  2017-10|ACU-2_SAPR|   Site|Site Plant|Site ACU 2| 2.1408998966

In [12]:
# Save the final frame to the "iot_history_hybrid" dataset through the
# filodb.spark source in overwrite mode.
(
    final_df.write
    .format("filodb.spark")
    .option("dataset", "iot_history_hybrid")
    .option("partition_keys", "siteRef,yearMonth")
    .option("row_keys", "datetime,pointName,equipRef,levelRef")
    .option("chunk_size", "500")
    .mode("overwrite")
    .save()
)