In [2]:
import tensorflow.keras as keras
from petastorm.tf_utils import tf_tensors
from petastorm import make_reader, make_batch_reader

In [3]:
def prep_data(source, window_size, sampling_factor, sample_strategy):
    """Read a petastorm time series dataset and slice it into sliding windows.

    Parameters
    ----------
    source: path (URL) of the petastorm time series dataset, handed to ``make_reader``
    window_size: length of each window (forwarded as the sequence length)
    sampling_factor: factor to downsample by (forwarded as ``sampling_rate``)
    sample_strategy: strategy to use for sampling. One of 'mean' or 'boolean'.
        Accepted for interface compatibility; this function does not use it yet.

    Returns
    -------
    windowed_ds: the windowed dataset built by
        ``keras.utils.timeseries_dataset_from_array`` from the values (X)
        and the targets (y)
    """
    # numpy is used below but is not imported by the notebook's import cells;
    # importing it here keeps the function usable on a fresh kernel.
    import numpy as np

    with make_reader(source) as reader:
        dataset = tf_tensors(reader)
        # Walk the reader output exactly once and keep both fields, so the
        # targets are not lost if the iterable can only be consumed a single time.
        # NOTE(review): assumes every item is indexable as (value, label) and
        # that both entries are scalars convertible to float32 - confirm
        # against what tf_tensors actually yields for this dataset.
        pairs = [(item[0], item[1]) for item in dataset]
        X = np.fromiter((value for value, _ in pairs), dtype=np.float32)
        y = np.fromiter((label for _, label in pairs), dtype=np.float32)
        windowed_ds = keras.utils.timeseries_dataset_from_array(X, y, window_size, sampling_rate = sampling_factor)

    return windowed_ds

In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from petastorm.spark import SparkDatasetConverter, make_spark_converter

In [5]:
# Obtain the Spark session used by every cell below (get-or-create).
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [6]:
# Load the raw training CSV; the first line is a header and column types are inferred.
df = spark.read.csv(
    "./data/dataTraining.txt",
    header=True,
    inferSchema=True,
)

In [7]:
# Inspect the inferred column names and types of the freshly loaded frame.
df.printSchema()

root
 |-- index: integer (nullable = true)
 |-- date: string (nullable = true)
 |-- Temperature: double (nullable = true)
 |-- Humidity: double (nullable = true)
 |-- Light: double (nullable = true)
 |-- CO2: double (nullable = true)
 |-- HumidityRatio: double (nullable = true)
 |-- Occupancy: integer (nullable = true)



In [8]:
# "date" was read as a string; replace it in place with a parsed timestamp column.
df = df.withColumn("date", F.to_timestamp("date"))

# Preview the first five rows to confirm the conversion.
df.show(n=5)

+-----+-------------------+-----------+--------+-----+------+-------------------+---------+
|index|               date|Temperature|Humidity|Light|   CO2|      HumidityRatio|Occupancy|
+-----+-------------------+-----------+--------+-----+------+-------------------+---------+
|    1|2015-02-04 17:51:00|      23.18|  27.272|426.0|721.25|0.00479298817650529|        1|
|    2|2015-02-04 17:51:59|      23.15| 27.2675|429.5| 714.0|0.00478344094931065|        1|
|    3|2015-02-04 17:53:00|      23.15|  27.245|426.0| 713.5|0.00477946352442199|        1|
|    4|2015-02-04 17:54:00|      23.15|    27.2|426.0|708.25|0.00477150882608175|        1|
|    5|2015-02-04 17:55:00|       23.1|    27.2|426.0| 704.5|0.00475699293331518|        1|
+-----+-------------------+-----------+--------+-----+------+-------------------+---------+
only showing top 5 rows



In [25]:
# Persist the frame as Parquet. The default save mode raises if the target
# already exists, which breaks this cell on every re-run of the notebook;
# overwrite mode keeps the cell idempotent.
df.write.mode("overwrite").parquet("./data/dataTraining.parquet")

In [26]:
# Set the Spark conf entry that petastorm's SparkDatasetConverter reads for its
# parent cache directory URL.
# NOTE(review): 'hdfs://./data/temp' combines the hdfs scheme with what looks
# like a relative local path; presumably an absolute URL is required
# (e.g. 'file:///<abs path>/data/temp' for local runs) - confirm before use.
spark.conf.set(SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF, 'hdfs://./data/temp')

In [9]:
from pyspark.sql.window import Window

In [13]:
# Single unpartitioned window over the whole frame, newest row first.
windowSpec = Window.orderBy(F.desc("date"))

In [11]:
# Mean of Temperature evaluated over windowSpec, displayed in chronological order.
(
    df
    .select("date", "Temperature")
    .withColumn("mean_temp", F.mean("Temperature").over(windowSpec))
    .orderBy("date")
    .show()
)

+-------------------+-----------+------------------+
|               date|Temperature|         mean_temp|
+-------------------+-----------+------------------+
|2015-02-04 17:51:00|      23.18|23.138333333333335|
|2015-02-04 17:51:59|      23.15|23.125000000000004|
|2015-02-04 17:53:00|      23.15| 23.11666666666667|
|2015-02-04 17:54:00|      23.15|23.108333333333334|
|2015-02-04 17:55:00|       23.1| 23.09583333333333|
|2015-02-04 17:55:59|       23.1|23.091666666666665|
|2015-02-04 17:57:00|       23.1|23.091666666666665|
|2015-02-04 17:57:59|       23.1| 23.09166666666667|
|2015-02-04 17:58:59|       23.1|23.083333333333332|
|2015-02-04 18:00:00|     23.075|23.066666666666666|
|2015-02-04 18:01:00|     23.075|23.054166666666664|
|2015-02-04 18:02:00|       23.1|23.041666666666668|
|2015-02-04 18:03:00|       23.1|23.015833333333333|
|2015-02-04 18:04:00|      23.05|             22.99|
|2015-02-04 18:04:59|       23.0|22.963333333333335|
|2015-02-04 18:06:00|       23.0|22.9449999999

In [16]:
# lag() over windowSpec pairs every row with the row one position earlier in
# the window's ordering; the result is shown as "nextTemp".
(
    df
    .select("date", "Temperature")
    .withColumn("nextTemp", F.lag("Temperature", 1).over(windowSpec))
    .show()
)

+-------------------+----------------+----------------+
|               date|     Temperature|        nextTemp|
+-------------------+----------------+----------------+
|2015-02-10 09:33:00|            21.1|            null|
|2015-02-10 09:32:00|            21.1|            21.1|
|2015-02-10 09:30:59|            21.1|            21.1|
|2015-02-10 09:29:59|           21.05|            21.1|
|2015-02-10 09:29:00|           21.05|           21.05|
|2015-02-10 09:28:00|           21.05|           21.05|
|2015-02-10 09:27:00|            21.0|           21.05|
|2015-02-10 09:26:00|          21.025|            21.0|
|2015-02-10 09:24:59|            21.0|          21.025|
|2015-02-10 09:23:59|            21.0|            21.0|
|2015-02-10 09:23:00|            21.0|            21.0|
|2015-02-10 09:22:00|            21.0|            21.0|
|2015-02-10 09:21:00|            21.0|            21.0|
|2015-02-10 09:20:00|        20.95875|            21.0|
|2015-02-10 09:19:00|         20.9175|        20

In [19]:
def prep_data_spark(spark, window_size_data, sampling_factor, sample_strategy, dataset_path=None):
    """
    Windows time series data that is in (or can be loaded into) a spark dataframe.

    The frame must provide the columns 'time', 'value' and 'label'. Rows are
    ordered by 'time' descending; for each row the columns ``value_1`` ...
    ``value_<window_size_data>`` receive the 'value' of the rows 1..N positions
    earlier in that ordering, and 'label' is replaced by an aggregate of the
    labels of the current row and those same N rows.

    Parameters
    ----------
    spark: spark session or sql dataframe
    window_size_data: int >= 1, number of lagged value columns per window
    sampling_factor: accepted for interface compatibility; not used here
    sample_strategy: strategy to use for label aggregation. One of 'mean'
        (average of the labels) or 'boolean' (maximum of the labels)
    dataset_path: str, Default None. path to parquet file; required when
        ``spark`` is a spark session

    Returns
    -------
    windows: spark.sql.DataFrame containing windows; rows that cannot fill a
        full window are dropped

    Raises
    ------
    TypeError: ``spark`` is neither a DataFrame nor a session with a dataset_path
    ValueError: ``window_size_data`` is smaller than 1
    KeyError: ``sample_strategy`` is not a known strategy
    """
    from pyspark.sql import SparkSession
    from pyspark.sql.dataframe import DataFrame
    from pyspark.sql.window import Window
    from pyspark.sql import functions as F

    TIME_COL = 'time'
    VALUE_COL = 'value'
    LABEL_COL = 'label'

    agg_strategies = {'mean': F.mean, 'boolean': F.max}

    if isinstance(spark, DataFrame):
        df = spark
    elif isinstance(spark, SparkSession) and dataset_path is not None:
        df = spark.read.parquet(dataset_path)
    else:
        raise TypeError("Expected spark context + filepath or spark.sql.DataFrame")

    # The body previously referenced an undefined name `window_size`;
    # bind it to the actual parameter.
    window_size = window_size_data
    if window_size < 1:
        raise ValueError(f"window_size_data must be >= 1, got {window_size}")

    # cast timestamp column to timestamp type
    df = df.withColumn(TIME_COL, df[TIME_COL].cast('timestamp'))

    # apply windowing to values pairs
    windowSpec = Window.orderBy(F.col(TIME_COL).desc())
    # Same ordering, framed to the current row plus the `window_size` rows
    # before it, i.e. exactly the rows feeding value, value_1 ... value_N.
    windowSpecLabels = Window.orderBy(F.col(TIME_COL).desc()).rowsBetween(-window_size, 0)

    agg_labels = agg_strategies[sample_strategy](LABEL_COL).over(windowSpecLabels)

    # DataFrames are immutable: withColumn returns a new frame, so the result
    # must be re-assigned on every iteration or the column is silently lost.
    windowed_df = df
    for i in range(1, window_size + 1):
        windowed_df = windowed_df.withColumn(f"{VALUE_COL}_{i}", F.lag(VALUE_COL, i).over(windowSpec))

    # Aggregate the labels BEFORE dropping incomplete windows; otherwise the
    # label frame would be evaluated on the already-filtered rows and would no
    # longer cover the rows that supplied the lagged values.
    windowed_df = windowed_df.withColumn(LABEL_COL, agg_labels)

    # remove rows that can't fill a full window
    return windowed_df.where(F.col(f"{VALUE_COL}_{window_size}").isNotNull())