In [None]:
%matplotlib inline
import os
import json
import math
import datetime
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
plt.rcParams['agg.path.chunksize'] = 10000
plt.rcParams['figure.figsize'] = [12, 8]
plt.style.use('ggplot')
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: f'{x:.3f}')

from tqdm import tqdm

In [None]:
%%time

import sys
import pandas as pd
import boto3
import datetime

AWS_S3_BUCKET = "siemensstack-siemensindustryedgedemo614cc6a9-1j50wrw7e75bn"

# Reset first so a failed load never leaves a stale frame from an earlier run.
data = None

# NOTE: despite its name this is a boto3 *resource*; the name is kept because
# it is a notebook-level variable.
s3_client = boto3.resource(
    "s3"
)
# One low-level client, created once and reused for every download
# (previously a new client was constructed for each object).
s3_object_client = boto3.client("s3")

date = datetime.datetime.now().date()
year = date.strftime("%Y")

s3Bucket = s3_client.Bucket(AWS_S3_BUCKET)

# Only keys that start with the current year are loaded. The prefix filter is
# applied by S3 itself instead of listing the whole bucket and filtering here.
frames = []
for my_bucket_object in s3Bucket.objects.filter(Prefix=year):
    response = s3_object_client.get_object(Bucket=AWS_S3_BUCKET, Key=my_bucket_object.key)
    status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")
    if status == 200:
        csvdata = pd.read_csv(response.get("Body"))
        frames.append(csvdata)
    else:
        print(f"Unsuccessful S3 get_object response. Status - {status}")

if not frames:
    # Fail with an explicit message instead of the opaque
    # "'NoneType' object is not subscriptable" a few lines further down.
    raise RuntimeError(
        f"No CSV objects with key prefix {year!r} could be loaded from bucket {AWS_S3_BUCKET!r}"
    )

# Concatenate once; growing the frame inside the loop copies it on every iteration.
data = pd.concat(frames, axis=0)
# Timestamps are parsed with an explicit format; the trailing 'Z' is matched
# literally, so the resulting values are timezone-naive.
data['playload.vals.ts'] = pd.to_datetime(data['playload.vals.ts'], format='%Y-%m-%dT%H:%M:%S.%fZ')

In [None]:
# (rows, columns) of the combined frame built from the S3 CSV objects.
data.shape

In [None]:
# Column dtypes; 'playload.vals.ts' was converted with pd.to_datetime in the load cell.
data.dtypes

In [None]:
# Preview the first rows of the combined frame.
data.head()

In [None]:
# Summary statistics for the columns pandas includes by default.
data.describe()

In [None]:
import json

# Read the metadata file that sits next to the notebook. The context manager
# closes the file handle deterministically (a bare open() passed to json.load
# leaves it to the garbage collector), and UTF-8 is requested explicitly so the
# result does not depend on the platform's default encoding.
with open('meta.json', encoding='utf-8') as meta_file:
    meta = json.load(meta_file)
meta

In [None]:
# Take the data point definitions of the first data point group of the first
# connection. Assumes meta.json contains at least one connection with at least
# one 'dataPoints' entry — TODO confirm; other entries are ignored here.
dataPointDefinitions = meta['connections'][0]['dataPoints'][0]['dataPointDefinitions']
dataPointDefinitions

In [None]:
# Tabular view of the data point definitions for easier inspection.
dataPointDefinitions_df = pd.DataFrame(dataPointDefinitions)
dataPointDefinitions_df

In [None]:
# Column dtypes pandas inferred for the definitions table.
dataPointDefinitions_df.dtypes

## Analyze Data

In [None]:
# Notebook-wide settings read by get_timeseries() and the cells below.

# Resampling frequency of the regular time grid.
freq = '1Min'  # '1Min' or '1H'
# Column whose distinct values identify the individual time series.
id_feature = 'clientID'
# Column holding the measured value. Which signal a row belongs to is selected
# separately by filtering on 'playload.vals.id' in the cells that call get_timeseries().
label_feature = 'playload.vals.val'
# Timestamp column (converted to datetime in the load cell).
time_feature = 'playload.vals.ts'
# Columns summarised as categorical values in the next cell.
sparse_features = ['clientID', 'topic', 'protocol', 'playload.seq', 'playload.vals.id', 'playload.vals.qc']
# NOTE(review): not referenced anywhere else in the visible notebook.
dynamic_dense_features = []
# Overall first and last timestamp; get_timeseries() uses them as the bounds of its time grid.
start_time = data[time_feature].min()
end_time = data[time_feature].max()
print('start_time:', start_time)
print('end_time:', end_time)

In [None]:
# For every categorical column print: number of distinct values, the first
# five distinct values, and the number of missing entries.
for sparse_feature in sparse_features:
    # Compute the distinct values once and count NaNs vectorised; the built-in
    # sum() would iterate the whole column element by element in Python.
    distinct_values = data[sparse_feature].unique()
    missing_count = data[sparse_feature].isna().sum()
    print(sparse_feature+':', len(distinct_values), distinct_values[:5], '... na:', missing_count)

In [None]:
def get_timeseries(df, dense_feature):
    """Build one regularly sampled series per id from the rows of ``df``.

    Reads the notebook-level settings ``id_feature``, ``time_feature``,
    ``freq``, ``start_time`` and ``end_time``.

    Args:
        df: Frame containing the ``id_feature``, ``time_feature`` and
            ``dense_feature`` columns.
        dense_feature: Name of the numeric column to aggregate.

    Returns:
        A list of ``pandas.Series``, one per distinct non-null id in ``df`` (in
        the sorted order ``groupby`` yields), each named after its id and
        indexed by the ``freq`` grid spanning ``start_time``..``end_time``.
        Values are the per-bin mean; gaps are forward-filled, then
        backward-filled, and anything still missing becomes 0. The list is
        empty when ``df`` has no rows with a non-null id.
    """
    # Column-less frame whose index is the regular grid every series is aligned to.
    dense_df = pd.DataFrame({time_feature: [start_time, end_time]})
    dense_df = dense_df.set_index(time_feature).resample(freq).asfreq()

    # Group by the scalar key, not a one-element list: with a list, pandas >= 2.0
    # yields 1-tuples as group names, which would turn the column names into tuples.
    for name, group in df.groupby(id_feature):
        tmp_df = pd.DataFrame({name: group[dense_feature], time_feature: group[time_feature]})
        tmp_df = tmp_df.set_index(time_feature).resample(freq).mean()
        # Left join keeps the grid; bins without data for this id stay NaN.
        dense_df = dense_df.join(tmp_df)

    # Treat infinities as missing, then fill: previous value, next value, finally 0.
    dense_df = dense_df.replace([np.inf, -np.inf], np.nan)
    dense_df = dense_df.ffill().bfill().fillna(0)

    # One series per joined column. Counting distinct ids instead would include
    # NaN ids, which groupby drops, and index past the last column.
    return [dense_df.iloc[:, i] for i in range(dense_df.shape[1])]

In [None]:
def visualize_timeseries(timeseries, dense_feature):
    """Plot up to the first four series on a two-column grid of subplots.

    Args:
        timeseries: Sequence of ``pandas.Series`` (as returned by
            ``get_timeseries``). Only the first four are drawn; the rest are
            silently ignored.
        dense_feature: Text used as the y-axis label of every subplot.

    Returns:
        None. The figure is created through ``plt.subplots``; nothing is drawn
        when ``timeseries`` is empty.
    """
    num_plots = min(len(timeseries), 4)
    if num_plots == 0:
        # A zero-row subplot grid is rejected by matplotlib, so an empty input
        # (e.g. an id with no rows) is a no-op rather than an error.
        return
    # One or two rows of two axes each, depending on how many series are drawn.
    row_num = math.ceil(num_plots / 2)
    fig, axs = plt.subplots(row_num, 2, figsize=(20, 20), sharex=True)
    axx = axs.ravel()
    for i in range(num_plots):
        timeseries[i].plot(ax=axx[i])
        axx[i].set_xlabel("date")
        axx[i].set_ylabel(dense_feature)
        axx[i].grid(which='minor', axis='x')

In [None]:
def save_timeseries(timeseries, filename):
    """Write the series to ``filename`` as JSON Lines, one object per series.

    Each line has the form ``{"start": <str of first index label>, "target": [...]}``.
    Missing values (float NaN) are written as the string ``"NaN"``. Infinite
    values are not handled specially and are emitted as ``json.dumps`` renders them.

    Args:
        timeseries: Sequence of non-empty ``pandas.Series``.
        filename: Destination path; an existing file is overwritten.
    """
    def _quote_nan(value):
        # Replace NaN *values* before serialising. Substituting the text 'NaN'
        # in the dumped string would also rewrite string values that merely
        # contain those letters and yield invalid JSON.
        if isinstance(value, float) and math.isnan(value):
            return "NaN"
        if isinstance(value, dict):
            return {key: _quote_nan(item) for key, item in value.items()}
        if isinstance(value, (list, tuple)):
            return [_quote_nan(item) for item in value]
        return value

    # newline='\n' keeps the line terminator a bare LF on every platform,
    # matching what the previous binary-mode writes produced.
    with open(filename, 'w', encoding='utf-8', newline='\n') as fp:
        for series in timeseries:
            record = {
                "start": str(series.index[0]),
                "target": series.tolist()
            }
            fp.write(json.dumps(_quote_nan(record)) + "\n")

In [None]:
# Inspect the rows whose 'playload.vals.id' equals 101.
data[data['playload.vals.id']==101]

In [None]:
# Inspect the rows whose 'playload.vals.id' equals 105.
data[data['playload.vals.id']==105]

In [None]:
!mkdir -p output

In [None]:
%%time

# Build, plot and export the per-client series for data point id 101.
# The subset is filtered once and reused; the missing-value count is computed
# vectorised instead of summing the boolean column element by element.
subset_101 = data[data['playload.vals.id']==101]
print('playload.vals.id:', '101', subset_101['playload.vals.val'].isna().sum())
data_timeseries = get_timeseries(subset_101, label_feature)
visualize_timeseries(data_timeseries, label_feature)
save_timeseries(data_timeseries, 'output/'+label_feature+'.json')

In [None]:
%%time

# Build, plot and export the per-client series for data point id 105.
# The subset is filtered once and reused; the missing-value count is computed
# vectorised instead of summing the boolean column element by element.
subset_105 = data[data['playload.vals.id']==105]
print('playload.vals.id:', '105', subset_105['playload.vals.val'].isna().sum())
data_timeseries = get_timeseries(subset_105, label_feature)
visualize_timeseries(data_timeseries, label_feature)
# NOTE(review): this path is identical to the one the id-101 cell writes to, so
# that export is overwritten here. The path is kept because the file name may be
# consumed elsewhere — confirm whether an id-specific name is wanted.
save_timeseries(data_timeseries, 'output/'+label_feature+'.json')

In [None]:
%%time

# Build, plot and export the per-client series for every distinct data point
# id, writing one id-suffixed JSON Lines file per id into output/.
for vals_id in data['playload.vals.id'].unique():
    # Filter once per id and reuse the subset for the count and the series.
    subset = data[data['playload.vals.id']==vals_id]
    print('playload.vals.id:', vals_id, subset['playload.vals.val'].isna().sum())
    data_timeseries = get_timeseries(subset, label_feature)
    visualize_timeseries(data_timeseries, label_feature)
    save_timeseries(data_timeseries, 'output/'+label_feature+'_'+str(vals_id)+'.json')

In [None]:
# Split boundaries for the 2022-04-13 to 2022-04-19 window.
# The three ranges are contiguous: test starts where training ends and
# predict starts where test ends.

DATETIME_START_OF_TRAIN = "2022-04-13 00:00:00"
DATETIME_END_OF_TRAIN = "2022-04-17 23:00:00"
DATETIME_START_OF_TEST = DATETIME_END_OF_TRAIN
DATETIME_END_OF_TEST = "2022-04-18 00:00:00"
DATETIME_START_OF_PREDICT = DATETIME_END_OF_TEST
DATETIME_END_OF_PREDICT = "2022-04-19 07:00:00"

# Re-assigns the notebook-level `freq` that get_timeseries() also reads.
freq = '1Min'
# Window step: the test/predict cells advance their end bound by this many minutes.
prediction_length = 10
# One day of minutes. NOTE(review): not referenced in the visible cells —
# presumably a model hyperparameter used later; confirm.
context_length = 24*60

# Data point id whose values form the "target"; every other id becomes a dynamic feature.
target_vals_id = 105  # 101, 105, ...

In [None]:
# Parse the split boundaries into Timestamps.
# The `freq` keyword previously passed here was deprecated in pandas 1.4 and
# removed in pandas 2.0 (it raises TypeError there). Nothing in the visible
# notebook reads a Timestamp's freq, so it is simply omitted.
start_dataset = pd.Timestamp(DATETIME_START_OF_TRAIN)
end_training = pd.Timestamp(DATETIME_END_OF_TRAIN)
start_test = pd.Timestamp(DATETIME_START_OF_TEST)
end_test = pd.Timestamp(DATETIME_END_OF_TEST)
start_predict = pd.Timestamp(DATETIME_START_OF_PREDICT)
end_predict = pd.Timestamp(DATETIME_END_OF_PREDICT)
print('start_dataset:', start_dataset)
print('end_training:', end_training)
print('start_test:', start_test)
print('end_test:', end_test)
print('start_predict:', start_predict)
print('end_predict:', end_predict)

In [None]:
# Number of `freq` steps between the start of the dataset and the end of the
# predict range. An unrecognised `freq` leaves max_length at 0.
dataset_span = end_predict - start_dataset
max_length = 0
if freq == '1Min':
    # Floor-divide the whole span. Using only `.days` would discard the partial
    # day (e.g. 6 days 7 hours counted as 6 days).
    max_length = dataset_span // pd.Timedelta(minutes=1)
elif freq == '1H':
    max_length = dataset_span // pd.Timedelta(hours=1)
elif freq == '1D':
    max_length = dataset_span.days
elif freq == '1M':
    # Calendar-month difference; day of month is ignored.
    max_length = (end_predict.year-start_dataset.year)*12+(end_predict.month-start_dataset.month)
print('max_length:', max_length)

In [None]:
%%time

# Collect the string form of every distinct id, in the sorted order groupby
# yields, printing a progress line every 1000 groups.
# NOTE(review): the ids come from the whole frame, while `timeseries` is later
# built from the target-id subset only; if a client has no rows for the target
# id the two lists differ in length/order — confirm they always line up.
ids = []
cnt = 0
for name, _group in data.groupby(id_feature):
    if cnt % 1000 == 0:
        print('cnt:', cnt)
    ids.append(str(name))
    cnt += 1

num_timeseries = len(ids)
print('num_timeseries:', num_timeseries)

In [None]:
# Target series: one resampled series per client for the rows of the target data point id.
timeseries = get_timeseries(data[data['playload.vals.id']==target_vals_id], label_feature)

In [None]:
# Every data point id other than the target contributes one list of per-client
# series; the outer list is ordered as the ids first appear in the data.
dynamic_dense_timeseries = []
for vals_id in data['playload.vals.id'].unique():
    if vals_id == target_vals_id:
        continue
    rows_for_id = data[data['playload.vals.id'] == vals_id]
    dense_timeseries = get_timeseries(rows_for_id, label_feature)
    dynamic_dense_timeseries.append(dense_timeseries)

### Train and Test splits

In [None]:
# One training record per series, covering [start_dataset, end_training).
# Label slicing on the time index includes the upper bound, so the last element
# is dropped to make the window half-open.
training_data = []
for i in range(num_timeseries):
    train_window = timeseries[i][start_dataset:end_training][:-1]
    training_data.append({
        # "start" must be the timestamp of the first *target* value. Taking it
        # from the unsliced series is wrong whenever the series begins before
        # start_dataset.
        "start": str(train_window.index[0]) if len(train_window) else str(start_dataset),
        "target": train_window.tolist(),
        "dynamic_feat": [dense_timeseries[i][start_dataset:end_training][:-1].tolist() for dense_timeseries in dynamic_dense_timeseries],
        "id": ids[i]
    })
# Record count and length of the first target (0 when there are no series).
print(len(training_data), len(training_data[0]["target"]) if training_data else 0)

In [None]:
# 1Min
# Rolling test windows: for every series, emit one record per end bound
# end_training + k * prediction_length minutes (k = 1, 2, ...) up to and
# including end_test. Each record covers [start_dataset, end).
test_data = []
test_step = datetime.timedelta(minutes=prediction_length)
for i in range(num_timeseries):
    end = end_training + test_step
    while end <= end_test:
        # Label slicing includes the upper bound; drop it to keep the window half-open.
        test_window = timeseries[i][start_dataset:end][:-1]
        test_data_i = {
            # Timestamp of the first target value (the unsliced series may start earlier).
            "start": str(test_window.index[0]) if len(test_window) else str(start_dataset),
            "target": test_window.tolist(),
            "dynamic_feat": [dense_timeseries[i][start_dataset:end][:-1].tolist() for dense_timeseries in dynamic_dense_timeseries],
            "id": ids[i]
        }
        test_data.append(test_data_i)
        # Report the record that was just appended. Previously this ran after
        # `end` had been advanced and always measured series 0.
        print(len(test_data), len(test_data_i["target"]))
        end = end + test_step

In [None]:
# 1Min
# Rolling predict windows: for every series, emit one record per end bound
# end_test + k * prediction_length minutes (k = 1, 2, ...) up to and
# including end_predict. Each record covers [start_dataset, end).
predict_data = []
predict_step = datetime.timedelta(minutes=prediction_length)
for i in range(num_timeseries):
    end = end_test + predict_step
    while end <= end_predict:
        # Label slicing includes the upper bound; drop it to keep the window half-open.
        predict_window = timeseries[i][start_dataset:end][:-1]
        predict_data_i = {
            # Timestamp of the first target value (the unsliced series may start earlier).
            "start": str(predict_window.index[0]) if len(predict_window) else str(start_dataset),
            "target": predict_window.tolist(),
            "dynamic_feat": [dense_timeseries[i][start_dataset:end][:-1].tolist() for dense_timeseries in dynamic_dense_timeseries],
            "id": ids[i]
        }
        predict_data.append(predict_data_i)
        # Report the record that was just appended. Previously this ran after
        # `end` had been advanced and always measured series 0.
        print(len(predict_data), len(predict_data_i["target"]))
        end = end + predict_step

In [None]:
def write_dicts_to_file(path, data):
    """Write an iterable of JSON-serialisable dicts to ``path`` as JSON Lines.

    Float NaN values, at any nesting depth inside lists, tuples and dicts, are
    written as the string ``"NaN"``. Infinite values are not handled specially
    and are emitted as ``json.dumps`` renders them.

    Args:
        path: Destination file; an existing file is overwritten.
        data: Iterable of dicts, one output line per dict.
    """
    def _quote_nan(value):
        # Replace NaN *values* before serialising. Substituting the text 'NaN'
        # in the dumped string would also rewrite string values (such as an id)
        # that merely contain those letters and yield invalid JSON.
        if isinstance(value, float) and math.isnan(value):
            return "NaN"
        if isinstance(value, dict):
            return {key: _quote_nan(item) for key, item in value.items()}
        if isinstance(value, (list, tuple)):
            return [_quote_nan(item) for item in value]
        return value

    # newline='\n' keeps the line terminator a bare LF on every platform,
    # matching what the previous binary-mode writes produced.
    with open(path, 'w', encoding='utf-8', newline='\n') as fp:
        for d in data:
            fp.write(json.dumps(_quote_nan(d)) + "\n")

In [None]:
%%time
# Persist the three splits as JSON Lines files in the working directory; the
# file names carry the resampling frequency (e.g. train_1Min.json).
write_dicts_to_file("train_"+freq+".json", training_data)
write_dicts_to_file("test_"+freq+".json", test_data)
write_dicts_to_file("predict_"+freq+".json", predict_data)