# Uploading data to InfluxDB

## Requirements

1. [InfluxDB installed](https://www.influxdata.com/downloads/).
2. Export your InfluxDB API token as the `INFLUXDB_TOKEN` environment variable.
3. Download preprocessed CSV data using `../scripts/fetch_data.py`.

## Processing

In [1]:
import pandas as pd
import os
import influxdb_client
from influxdb_client.client.write_api import SYNCHRONOUS
import csv

### Setting up the InfluxDB connection

In [2]:
# Directory holding the preprocessed CSV input and the generated line-protocol file.
# NOTE: the misspelled name `DATA_PREROCESSED_DIR` is kept because other cells reference it;
# `DATA_PREPROCESSED_DIR` is the correctly spelled alias for new code.
DATA_PREROCESSED_DIR = "../data_csv"
DATA_PREPROCESSED_DIR = DATA_PREROCESSED_DIR
DATA_PREPROCESSED_FILE = DATA_PREPROCESSED_DIR + "/hk_temp.csv"

# Target InfluxDB instance.
bucket = "radem"
org = "radem"
url = "http://localhost:8086"

# The API token is read from the environment so that it never ends up in the notebook.
# Fail fast with a clear message instead of creating a client with `token=None`,
# which would only surface later as an authorization error during the upload.
token = os.environ.get("INFLUXDB_TOKEN")
if not token:
    raise RuntimeError(
        "INFLUXDB_TOKEN environment variable is not set; "
        "export your InfluxDB API token before running this notebook."
    )

client = influxdb_client.InfluxDBClient(
    url=url,
    token=token,
    org=org
)

# Synchronous writes: each `write_api.write(...)` call returns once the request has completed.
write_api = client.write_api(write_options=SYNCHRONOUS)

### Reading preprocessed CSV data

In [26]:
df = pd.read_csv(DATA_PREPROCESSED_FILE)

# Expected CSV columns:
# time,CEU Temperature (1),P&IDH Temperature (2),EDH Temperature (3),DDH Temperature (4),PCU Temperature (5)

# Convert time to ns for InfluxDB.
# The datetime resolution is pinned to nanoseconds before the integer cast so that
# `time_ns` does not depend on whichever resolution pandas infers while parsing
# (a plain cast of a non-ns datetime column would yield a different unit).
df['time_ns'] = (
    pd.to_datetime(df['time'], format="%Y-%m-%d %H:%M:%S")
    .astype('datetime64[ns]')
    .astype('int64')
)

# All temperature readings are stored as 64-bit integers.
temperature_columns = [
    "CEU Temperature (1)",
    "P&IDH Temperature (2)",
    "EDH Temperature (3)",
    "DDH Temperature (4)",
    "PCU Temperature (5)",
]
df = df.astype({column: "int64" for column in temperature_columns})

df

Unnamed: 0,time,CEU Temperature (1),P&IDH Temperature (2),EDH Temperature (3),DDH Temperature (4),PCU Temperature (5),time_ns
0,2023-09-01 00:00:03,-20,-19,-18,-20,-19,1693526403000000000
1,2023-09-01 00:01:03,-20,-19,-18,-20,-19,1693526463000000000
2,2023-09-01 00:02:03,-20,-19,-18,-20,-19,1693526523000000000
3,2023-09-01 00:03:03,-20,-18,-18,-20,-19,1693526583000000000
4,2023-09-01 00:04:03,-20,-19,-18,-20,-19,1693526643000000000
...,...,...,...,...,...,...,...
327428,2024-04-17 08:31:42,-13,-11,-11,-13,-12,1713342702000000000
327429,2024-04-17 08:32:42,-13,-11,-11,-13,-12,1713342762000000000
327430,2024-04-17 08:33:42,-13,-11,-11,-13,-12,1713342822000000000
327431,2024-04-17 08:34:42,-13,-11,-11,-13,-12,1713342882000000000


### Converting to InfluxDB Line Protocol

In [45]:
batch_size = 1000000

# InfluxDB measurement name -> source temperature column in `df`.
measurement_columns = {
    "temp1_ceu": "CEU Temperature (1)",
    "temp2_pidh": "P&IDH Temperature (2)",
    "temp3_edh": "EDH Temperature (3)",
    "temp4_ddh": "DDH Temperature (4)",
    "temp5_pcu": "PCU Temperature (5)",
}
# Measurement converted by this run. Every measurement is written to the same
# `line` column, so exactly one is exported at a time; change this key to export another.
measurement = "temp5_pcu"
value_column = measurement_columns[measurement]

print(f"BATCH_SIZE = {batch_size}")
print(f"INPUT_SIZE = {len(df)}")

time_start = pd.Timestamp.now()
count = 0
for batch in range(0, len(df), batch_size):
    # `.loc` slices by label and includes the end label, so `batch_end` is the last
    # row of the batch (this relies on the default 0..n-1 index produced by `read_csv`).
    batch_end = min(batch+batch_size-1, len(df)-1)
    batch_indices = slice(batch, batch_end)

    print(f"Processing batch of {batch_indices.stop - batch_indices.start + 1} records, from {batch_indices.start} to {batch_indices.stop}.")

    # Build "<measurement> value=<int>i <time_ns>" with vectorized string operations,
    # reusing the nanosecond timestamps already stored in `time_ns`.
    df.loc[batch_indices, 'line'] = (
        measurement + " " +
        "value=" + df.loc[batch_indices, value_column].astype(str) + "i " +
        df.loc[batch_indices, 'time_ns'].astype(str)
    )

    count += len(df.loc[batch_indices, 'line'])

time_total = pd.Timestamp.now() - time_start
print(f"Processed {count} records in {time_total.total_seconds()} seconds")
print("SUCCESS")

BATCH_SIZE = 1000000
INPUT_SIZE = 327433
Processing batch of 327433 records, from 0 to 327432.
Processed 327433 records in 0.19677 seconds
SUCCESS


### Save data to InfluxDB Line Protocol file

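Example line (measurement and optional tags, then fields, then a nanosecond timestamp): `my_measurement,event_type=e,channel=0 value=123 1556813561098000000`. The lines generated in this notebook carry no tags and mark integer field values with an `i` suffix, e.g. `value=-19i`.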


In [46]:
# Write one line-protocol record per line.
# The lines are written directly rather than through `to_csv(sep='\n', quoting=QUOTE_NONE)`:
# the file is not CSV, so no delimiter/quoting workaround is needed, and `newline="\n"`
# keeps LF line endings on every platform.
line_file_path = DATA_PREROCESSED_DIR + "/influx_line_protocol_temp.line"
with open(line_file_path, "w", encoding="utf-8", newline="\n") as line_file:
    line_file.writelines(line + "\n" for line in df['line'])

### Read data from InfluxDB Line Protocol file

In [47]:
# Read the line-protocol file back verbatim, one record per row in column `line`.
# The file is read as plain text rather than through `read_csv` with a NUL separator,
# so no CSV-specific parsing is applied to the lines. Blank lines are skipped, and an
# empty file yields an empty frame.
line_file_path = DATA_PREROCESSED_DIR + "/influx_line_protocol_temp.line"
with open(line_file_path, encoding="utf-8") as line_file:
    df_lines = pd.DataFrame(
        {'line': [raw_line.rstrip("\n") for raw_line in line_file if raw_line.strip()]}
    )

In [48]:
# Display the frame read back from the line-protocol file as the cell output.
df_lines

Unnamed: 0,line
0,temp5_pcu value=-19i 1693526403000000000
1,temp5_pcu value=-19i 1693526463000000000
2,temp5_pcu value=-19i 1693526523000000000
3,temp5_pcu value=-19i 1693526583000000000
4,temp5_pcu value=-19i 1693526643000000000
...,...
327428,temp5_pcu value=-12i 1713342702000000000
327429,temp5_pcu value=-12i 1713342762000000000
327430,temp5_pcu value=-12i 1713342822000000000
327431,temp5_pcu value=-12i 1713342882000000000


### Upload data to InfluxDB

In [49]:
# Upload the line-protocol records in consecutive batches of at most `batch_size` rows.
batch_size = 1000000
total_lines = len(df_lines)

for batch in range(0, total_lines, batch_size):
    # `.loc` slices by label and includes the end label, so the end is the
    # label of the last row that belongs to this batch.
    batch_end = min(batch + batch_size, total_lines) - 1
    batch_indices = slice(batch, batch_end)
    batch_length = batch_indices.stop - batch_indices.start + 1

    print(f"Uploading batch of {batch_length} records, from {batch_indices.start} to {batch_indices.stop}.")

    batch_records = df_lines.loc[batch_indices, 'line']
    write_api.write(bucket, org, batch_records)

# Flush the write API once all batches have been submitted.
write_api.flush()

Uploading batch of 327433 records, from 0 to 327432.


In [19]:
# Load every line of the v2 line-protocol file into a list (line endings are kept).
# NOTE(review): this path (`../data_processed/...`) differs from the `.line` file written
# under `DATA_PREROCESSED_DIR` earlier in this notebook, and `lines` is not used in the
# visible cells - confirm whether this cell is still needed.
with open("../data_processed/influx_line_protocol_v2.line") as line_file:
    lines = list(line_file)
