
# Extract data from TimescaleDB and load it into Delta Lake

This notebook shows how to extract data from a TimescaleDB database over JDBC and load it into Delta Lake using Python.

## Step 0: Set Timezone & Get the parameters

In [0]:
# --- Job parameters ---------------------------------------------------------
# Table that is queried on the TimescaleDB side.
table_name = "bronze_table"
# Table that the extracted rows are appended to.
destination_table_name = "bronze_delta_table"
# Length of the extraction window; must be expressed in whole minutes.
trigger_period = 5

# --- Spark session ----------------------------------------------------------
# Pin the session timezone; otherwise the timezone information of the
# timestamps coming from TimescaleDB disappears.
spark.conf.set("spark.sql.session.timeZone", "Asia/Bangkok")


## Step 1: Connection information

First define some variables to programmatically create these connections.

In [0]:
import os
import warnings

# JDBC driver class used by Spark to talk to TimescaleDB (PostgreSQL protocol).
driver = "org.postgresql.Driver"

database_host = "alto-workshop-timescaledb.postgres.database.azure.com"
database_port = "5432" # update if you use a non-default port
database_name = "postgres" # eg. postgres

# Credentials must not be stored in the notebook source: anyone who can read
# the notebook (or its version history) could connect to the database.
# They are read from the environment instead; set TIMESCALEDB_USER and
# TIMESCALEDB_PASSWORD on the cluster (for example from a secret store).
# NOTE(review): the credentials that were previously committed in this cell
# should be considered leaked and rotated.
user = os.environ.get("TIMESCALEDB_USER")
password = os.environ.get("TIMESCALEDB_PASSWORD")
if not user or not password:
    # Warn here so the cause is obvious before the JDBC read fails later on.
    warnings.warn(
        "TIMESCALEDB_USER and/or TIMESCALEDB_PASSWORD is not set; "
        "the JDBC connection to TimescaleDB will fail."
    )

# The URL carries no credentials, so it is safe to print.
url = f"jdbc:postgresql://{database_host}:{database_port}/{database_name}"

print(url)

jdbc:postgresql://alto-workshop-timescaledb.postgres.database.azure.com:5432/postgres


## Step 2: Constructing the filter

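We will select only the rows whose timestamp falls inside the most recently completed `trigger_period`-minute window.

For now, the window is derived from the current timestamp, obtained with the `pendulum` library. Because of this, a failed run cannot be retried for the same window and past windows cannot be backfilled. We may need to improve this, for example by passing the end of the window in as a notebook parameter.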

In [0]:
import pendulum

# Current wall-clock time in Bangkok.
now = pendulum.now(tz='Asia/Bangkok')

# Minutes elapsed since the last multiple of trigger_period within the hour
# (0 when "now" already sits on such a boundary).
minutes_past_boundary = now.minute % trigger_period

# Window end: "now" rounded down to that boundary, with seconds and
# microseconds cleared.
end = now.subtract(minutes=minutes_past_boundary).set(second=0, microsecond=0)

# Window start: exactly trigger_period minutes before the end.
start = end.subtract(minutes=trigger_period)

# Parenthesised subquery with an alias, selecting the half-open window
# [start, end) from the source table.
extract_query = f"""(SELECT * FROM {table_name} WHERE timestamp >= '{start}' AND timestamp < '{end}') as filtered_data"""


## Step 2: Reading the data

We will extract the data from **TimescaleDB** with the above filter applied.

In [0]:
# All JDBC reader settings in one place; `dbtable` is the filtered subquery
# built above rather than a plain table name.
jdbc_options = {
    "driver": driver,
    "url": url,
    "dbtable": extract_query,
    "user": user,
    "password": password,
}

# DataFrame over the rows selected by the extraction query.
source_table = spark.read.format("jdbc").options(**jdbc_options).load()

In [0]:
# Show the extracted rows, or report the window when nothing was returned.
if source_table.isEmpty():
    print(f"There is no data between {start} and {end}")
else:
    display(source_table)

timestamp,device_id,aggregation_type,datapoint,value
2023-09-29T21:40:00.000+0700,eb27641363d2b2a091jdar,mode_1min,online_status,"""online"""
2023-09-29T21:40:00.000+0700,eb27641363d2b2a091jdar,mode_1min,presence_state,"""occupied"""
2023-09-29T21:40:00.000+0700,eb27641363d2b2a091jdar,mean_1min,sensitivity,100.0
2023-09-29T21:40:00.000+0700,eba63b92a045a9e8dbibaj,mean_1min,noise,76.0
2023-09-29T21:40:00.000+0700,eba63b92a045a9e8dbibaj,mean_1min,temperature,25.0
2023-09-29T21:40:00.000+0700,eba63b92a045a9e8dbibaj,mean_1min,co2,683.0
2023-09-29T21:40:00.000+0700,eba63b92a045a9e8dbibaj,mean_1min,pm25,4.0
2023-09-29T21:40:00.000+0700,eba63b92a045a9e8dbibaj,mean_1min,illuminance,17.0
2023-09-29T21:40:00.000+0700,eba63b92a045a9e8dbibaj,mean_1min,humidity,68.0
2023-09-29T21:40:00.000+0700,ebc130be2d36e91da6nj92,mode_1min,state,"""off"""


## Step 3: Write the data to the destination Delta table

In [0]:
# Append the extracted window to the destination table. Nothing is written
# when the window is empty. Append mode adds the rows on every run, and this
# cell does not deduplicate a re-run of the same window.
if not source_table.isEmpty():
    appender = source_table.write.mode("append")
    appender.saveAsTable(destination_table_name)
else:
    print(f"There is no data between {start} and {end}")