# Time Series Persister
Usage:
- Copy `TimeSeriesPersister.env.sample` to `TimeSeriesPersister.env`
- Edit `TimeSeriesPersister.env` to match your environment
- Execute the cells

**Read Configuration From `.env` File**

In [None]:
#r "nuget: dotenv.net, 3.0.0"
using System.Text.RegularExpressions;
using dotenv.net;

// Load key/value settings from the local .env file (see the usage notes at the top of the notebook).
var envVars = DotEnv.Fluent().WithEnvFiles("./TimeSeriesPersister.env").Read();

// Event hub (Kafka source)
var eventHubConnectionString = envVars["EVENTHUB_CONNECTIONSTRING"];

// The namespace is the first host label after "sb://". The trailing dot is escaped so that it
// only matches a literal '.', not an arbitrary character.
var namespaceMatch = Regex.Match(eventHubConnectionString, @"sb://([^.]+)\.");

// The EntityPath value ends at the next ';' (or at the end of the string), so a trailing ';'
// or additional key/value pairs after EntityPath do not leak into the instance name.
var entityPathMatch = Regex.Match(eventHubConnectionString, @"EntityPath=([^;\r\n]+)");

if (!namespaceMatch.Success || !entityPathMatch.Success)
{
    // Fail fast with a clear message; the connection string itself is a secret and is
    // deliberately not echoed here.
    throw new System.InvalidOperationException(
        "EVENTHUB_CONNECTIONSTRING must contain an 'sb://<namespace>.' endpoint and an 'EntityPath=<event hub name>' entry.");
}

var eventHubsNamespace = namespaceMatch.Groups[1].Value;
var eventHubsInstance = entityPathMatch.Groups[1].Value.Trim();

// Delta Table (sink)
var storageAccountKey = envVars["STORAGE_ACCOUNT_KEY"];
var storageAccountName = envVars["STORAGE_ACCOUNT_NAME"];
var delta_lake_container_name = envVars["DELTA_LAKE_CONTAINER_NAME"];
var blobName = envVars["BLOB_NAME"];

**Setup**

In [None]:
#r "nuget:Microsoft.Spark"
#r "nuget: Microsoft.Spark.Extensions.Delta, 2.1.0"
using System;
using System.Collections.Generic;
using Microsoft.Spark.Sql;
using Microsoft.Spark.Sql.Streaming;
using Microsoft.Spark.Sql.Types;
using Microsoft.Spark.Extensions.Delta;
using static Microsoft.Spark.Sql.Functions;

// Location of the Delta table inside the configured storage account and container.
var path = $"abfss://{delta_lake_container_name}@{storageAccountName}.dfs.core.windows.net/{blobName}";

// 9093 is the port used to communicate with Event Hubs, see [troubleshooting guide](https://docs.microsoft.com/azure/event-hubs/troubleshooting-guide)
string bootstrapServers = $"{eventHubsNamespace}.servicebus.windows.net:9093";

// JAAS login configuration: the user name is the literal text "$ConnectionString" and the
// password is the full Event Hubs connection string.
string eh_sasl = $"org.apache.kafka.common.security.plain.PlainLoginModule required username=\"$ConnectionString\" password=\"{eventHubConnectionString}\";";

// Get (or create) the Spark session, registering the storage account key under the
// account-specific configuration entry.
var spark = SparkSession
    .Builder()
    .Config($"fs.azure.account.key.{storageAccountName}.dfs.core.windows.net", storageAccountKey)
    .GetOrCreate();

Loading extensions from `Microsoft.Data.Analysis.Interactive.dll`

In [None]:
// Source: subscribe to the Event Hubs instance through Spark's "kafka" format,
// authenticating with SASL_SSL / PLAIN and the JAAS configuration built in the Setup cell.
// NOTE(review): the original cell carried two disabled options on the reader,
// ("failOnDataLoss", "false") and ("checkpointLocation", "/tmp/kafka_cp.txt"). They remain
// disabled here, so no explicit checkpoint location is configured - confirm whether restart
// recovery is required for this query.
var kafkaSource = spark
    .ReadStream()
    .Format("kafka")
    .Option("kafka.bootstrap.servers", bootstrapServers)
    .Option("subscribe", eventHubsInstance)
    .Option("kafka.sasl.mechanism", "PLAIN")
    .Option("kafka.security.protocol", "SASL_SSL")
    .Option("kafka.sasl.jaas.config", eh_sasl)
    .Option("kafka.request.timeout.ms", "60000")
    .Option("kafka.session.timeout.ms", "60000")
    .Load();

// Sink: every micro-batch is reshaped and appended to the Delta table at `path`.
// The trigger is a processing-time trigger with an interval argument of 2000
// (presumably milliseconds - confirm against the Trigger API).
var streamingDf = kafkaSource
    .WriteStream()
    .Trigger(Trigger.ProcessingTime(2000))
    .ForeachBatch((batch, batchId) =>
    {
        // Partition columns are derived from the record's "timestamp" column;
        // the "value" column is cast to a string and stored as "time-series".
        var recordTimestamp = batch["timestamp"];
        var payload = Col("value").Cast("string");

        var timeSeries = batch
            .WithColumn("year", Year(recordTimestamp))
            .WithColumn("month", Month(recordTimestamp))
            .WithColumn("day", DayOfMonth(recordTimestamp))
            .WithColumn("time-series", payload)
            .Select("year", "month", "day", "time-series");

        // Echo the batch's schema and first rows to the notebook output.
        timeSeries.PrintSchema();
        timeSeries.Show();

        // Append the batch to the Delta table, partitioned by year/month/day.
        var deltaWriter = timeSeries
            .Write()
            .PartitionBy("year", "month", "day")
            .Format("delta")
            .Mode(SaveMode.Append);
        deltaWriter.Save(path);
    })
    .Start();

// Block this cell until the streaming query terminates.
streamingDf.AwaitTermination();

In [None]:
// `path` and `spark` are re-created here with the same expressions used in the Setup cell.
var path = $"abfss://{delta_lake_container_name}@{storageAccountName}.dfs.core.windows.net/{blobName}";
var spark = SparkSession
    .Builder()
    .Config($"fs.azure.account.key.{storageAccountName}.dfs.core.windows.net", storageAccountKey)
    .GetOrCreate();

// Load the Delta table written by the streaming query as a batch DataFrame.
var deltaReader = spark.Read().Format("delta");
var receivedTimeSeries = deltaReader.Load(path);

// Print the table's schema followed by up to 100 rows.
receivedTimeSeries.PrintSchema();
receivedTimeSeries.Show(100);

root
 |-- body: binary (nullable = true)
 |-- partition: string (nullable = true)
 |-- offset: string (nullable = true)
 |-- sequenceNumber: long (nullable = true)
 |-- enqueuedTime: timestamp (nullable = true)
 |-- publisher: string (nullable = true)
 |-- partitionKey: string (nullable = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- systemProperties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)

+--------------------+---------+-----------+--------------+--------------------+---------+------------+--------------------+----------------+----+-----+---+
|                body|partition|     offset|sequenceNumber|        enqueuedTime|publisher|partitionKey|          properties|systemProperties|year|month|day|
+--------------------+---------+-----------+----