In [41]:
from crontab import CronTab
import datetime as dt
import numpy as np
import os
import pandas as pd
from pathlib import Path
from sqlalchemy import select, text
from sqlalchemy.orm import sessionmaker
import sys
import tensorflow as tf
from time import strftime
import timeit

# Add path of subdirectory containing own modules
modules_path = os.path.join(os.getcwd(), 'data_collect_app')
if modules_path not in sys.path:
    sys.path.append(modules_path)

import finrail_db

In [42]:
# Define directory for tensorboard log files
def dir_logs(parent_dir='tf_log'):
    return Path(parent_dir) / strftime('%Y_%m_%d_%H_%M_%S')


2024-03-07 14:21:11.623123: I tensorflow/tsl/profiler/lib/profiler_session.cc:104] Profiler session initializing.
2024-03-07 14:21:11.623160: I tensorflow/tsl/profiler/lib/profiler_session.cc:119] Profiler session started.
2024-03-07 14:21:11.628688: I tensorflow/tsl/profiler/lib/profiler_session.cc:131] Profiler session tear down.


In [3]:
def tweak_train(df_):
    '''Function takes DataFrame as returned from SQL-query and returns processed DataFrame
    Transformations:
        - DataType: update to all columns
        - Introducing columns "commuter" and "long_distance" by grouping by date and train category
          and then unstacking ones
        - pushing the date information from index to own column
        - Renaming and setting back nested column names
        
    '''
    return (df_
    .astype({
        'date': 'datetime64',
        'train_cat': 'category',
        'total_length': np.float32
    })
    .groupby(['date', 'train_cat'])
    .max().unstack()
    .reset_index()
    .set_axis(['date', 'commuter', 'long_distance'], axis=1)
           )

# Open fire and read stored SQL query to variable
with open('sql_query.txt', 'r') as w:
    sql_query_str = w.read()
    
# Open SQL connection and send query. This query will:
# 1. Sum length of all wagon in a journey section
# 2. Choose maximum length of all wagons among journey sections for each train
# 3. Sum length of wagons for all trains per day, grouped by train category (Commuter, Long-distance)
with engine.connect() as connection:
    df = pd.read_sql_query(text(sql_query_str), connection)

In [4]:
# Creates tables in finrail db, returns database engine
engine = finrail_db.create_tables(db_str='mysql+mysqlconnector://root:admin123@localhost:5000/finrail')
# Apply tweak_train to output of SQL query to obtain desired time series
df = tweak_train(df)

date             datetime64[ns]
commuter                float32
long_distance           float32
dtype: object

In [5]:
def timeseries_window(data, seq_length, shift=1, stride=1):
    '''Function takes dataset and returns dataset containing windows with data from input dataset.
    Parameters:
        data <tf.data.Dataset> input dataset
        seq_length <int> defines length of windows in output dataset
        shift <int> defines how many time steps of gap are between two consecutive windows
        stride <int> defines how many time steps are between two consecutive output data points
        
    Return:
        <tf.data.Dataset> Dataset containing windows of seq_length based on input dataset data
    '''
    data = data.window(size=seq_length, shift=shift, stride=stride, drop_remainder=True)
    data = data.flat_map(lambda x: x) # flatten nested Dataset structure returned by .window()
    return data.batch(seq_length) # batch of size seq_length will give one window in each batch

def timeseries_dataset_seq2seq(data, forecast_length=1, seq_length=7):
    '''Function takes Dataset and returns Dataset with windows suitable to train a 
    sequence to sequence RNN
    Parameters:
        data <tf.data.Dataset> input dataset
        forecast_length <int> number of time steps to be forecasted into the future
        seq_length <int> length of sequences fed to RNN (number of consecutive time steps 
        in one training instance)
    '''
    data = timeseries_window(data, forecast_length+1) # First dimension one time step longer than
                                                      # forecast_length, as targets are generated as well
    data = timeseries_window(data, seq_length) # Second dimension consists of windows of size sequence length
    return data.map(lambda x: (x[:, 0], x[:, 1:])) # map to tuple (training instance, target)

In [37]:
#training set until 2020 including, scaling down training data by a factor of 1E5
commuter_train = tf.data.Dataset.from_tensor_slices(df['commuter'][:1847].values / 1E5) 
#creating sequences and targets for training
commuter_train = timeseries_dataset_seq2seq(commuter_train, 14, 30)
# cache dataset to avoid previos calculation to be done every epoch during training
commuter_train = commuter_train.cache()
# Shuffle training data, reshuffling after every epoch for better convergence
commuter_train = commuter_train.shuffle(500, seed=42, reshuffle_each_iteration=True)
# Batch training data 
commuter_train = commuter_train.batch(32).prefetch(1)

In [22]:
#Input layer stack that defines input shape and will scale down inputs by a factor of 1E5
input_processing = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(None, 1)),
    tf.keras.layers.Normalization(mean=0, variance=1E10)
])

#Output layer that will scale up predictions by a factor of 1E5
output_processing = tf.keras.Sequential([
    tf.keras.layers.Normalization(mean=0, variance=1E-10)
])

#RNN laer stack for a sequence to sequence model for univariate time series
rnn_seq2seq = tf.keras.Sequential([
    tf.keras.layers.LSTM(32, return_sequences=True),
    tf.keras.layers.Dense(14, activation='linear')  
])

#Complete model including Input, Output and RNN layer stacks
rnn_seq2seq_complete = tf.keras.Sequential([
    input_processing,
    rnn_seq2seq,
    output_processing
])

#Model used during training, to avoid calculating scaling on every iteration
rnn_seq2seq_training = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(None, 1)),
    rnn_seq2seq
])


In [46]:
# Define callback for Tensorboard update
current_dir = dir_logs()
callback_tensorboard = tf.keras.callbacks.TensorBoard(current_dir, profile_batch=100)

rnn_seq2seq_training.compile(optimizer='adam', loss='mse')
rnn_seq2seq_training.fit(commuter_train, epochs = 50, callbacks=[callback_tensorboard])

Epoch 1/50


2024-03-07 14:26:26.461721: I tensorflow/tsl/profiler/lib/profiler_session.cc:104] Profiler session initializing.
2024-03-07 14:26:26.461761: I tensorflow/tsl/profiler/lib/profiler_session.cc:119] Profiler session started.
2024-03-07 14:26:26.463046: I tensorflow/tsl/profiler/lib/profiler_session.cc:131] Profiler session tear down.


Epoch 2/50
Epoch 3/50


2024-03-07 14:26:29.470059: I tensorflow/tsl/profiler/lib/profiler_session.cc:104] Profiler session initializing.
2024-03-07 14:26:29.470090: I tensorflow/tsl/profiler/lib/profiler_session.cc:119] Profiler session started.
2024-03-07 14:26:29.493162: I tensorflow/tsl/profiler/lib/profiler_session.cc:70] Profiler session collecting data.
2024-03-07 14:26:29.498201: I tensorflow/tsl/profiler/lib/profiler_session.cc:131] Profiler session tear down.


Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.src.callbacks.History at 0x7f1ccc985030>

In [106]:
result = rnn_seq2seq.predict(a)
result[:, -1, :]



array([[89215.28 , 54716.67 , 49396.44 , 84032.18 , 96815.91 , 86950.54 ,
        91648.24 , 87465.06 , 54566.914, 49603.492, 83723.25 , 98576.27 ,
        86204.13 , 91678.1  ]], dtype=float32)

In [68]:
commuter_train

<_BatchDataset element_spec=(TensorSpec(shape=(None, None), dtype=tf.float32, name=None), TensorSpec(shape=(None, None, None), dtype=tf.float32, name=None))>

In [102]:
a = tf.Tensor()
for i, (train, target) in enumerate(commuter_train):
    a = train
    if i > 2:
        break
a = a[0, :]
a = a[np.newaxis, :, np.newaxis]
a.shape

TensorShape([1, 30, 1])

In [39]:
commuter_test = tf.data.Dataset.from_tensor_slices(df['commuter'][1847:]) # Data from 2021 onwards
#commuter_test = timeseries_window(commuter_test, 60)
#j = int(0)
#a = 0
#for i in commuter_test.as_numpy_iterator():
#    if (j < 2):
#        print(rnn_seq2seq(i[np.newaxis, :, np.newaxis].copy()).shape)
#    j += 1

def eval_seq2seq_model(model, data, forecast_length=1, seq_length=7, batch_size=100):
    data = timeseries_window(data, forecast_length+seq_length)
    data = data.map(lambda x: (x[:seq_length], x[seq_length:]))

    data = data.batch(batch_size)
    # Predict and keep only last sequence of prediction
    prediction = tf.data.Dataset.from_tensor_slices(rnn_seq2seq.predict(data)[:, -1, :])
    prediction = prediction.batch(batch_size)
    data = tf.data.Dataset.zip(data, prediction)
    mse = np.zeros(14, dtype=np.float32)
    for i in data.as_numpy_iterator():
        sequences, target = i[0]
        pred = i[1]
        mse = np.sum(np.square(pred - target), axis=0) / batch_size
        print(mse[0]/1E6)
        mse += mse
    print('\n')
    print(mse[0]/1E6)
    #for i in data.as_numpy_iterator():
    #    result = rnn_seq2seq(i[0][:, :, np.newaxis])
    #    target = i[1][:, np.newaxis, :]
    #    print(result[0, -1, :] - target[0, :, :])
    #    print(np.sqrt(np.sum(np.square(result[:, -1, :] - target)) / batch_size / forecast_length))

eval_seq2seq_model(rnn_seq2seq_complete, commuter_test, forecast_length=14, seq_length=30)

6377.252864
6401.092608
6634.546688
6466.147328
6719.310848
6609.618432
6410.134016
6175.944704
7012.801024
7358.948864
6293.075456
1215.83744


2431.67488


In [5]:
test_rnn = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(None, 1)),
    tf.keras.layers.LSTM(3, return_sequences=True)
])
test_rnn.compile(loss='mse', optimizer='adam')
x_training_data = np.random.rand(500, 1)
x_train = tf.data.Dataset.from_tensor_slices(x_training_data)
x_train = timeseries_dataset_seq2seq(x_train)
y_training_data = np.random.rand(500, 3)
y_train = tf.data.Dataset.from_tensor_slices(y_training_data)


test_rnn.fit(x=x_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7f83c8841f60>

In [47]:
test_rnn.predict(np.random.rand(1, 10, 1))
np.random.rand(1, 60, 1).shape



(1, 60, 1)

In [182]:
a = np.zeros(14, dtype=np.float32)

for i in range(22):
    b = np.random.rand(14)
    a += b
    print(b)
a

[0.67388077 0.18984745 0.61122116 0.31137044 0.50503867 0.18929037
 0.61636287 0.97836272 0.70692446 0.79188471 0.11336993 0.01475838
 0.83786145 0.50861334]
[0.81867838 0.79164872 0.11088158 0.29023907 0.57815831 0.0342671
 0.00434693 0.2741532  0.15625099 0.78368318 0.18649465 0.0013391
 0.85996041 0.29445972]
[0.01330934 0.24549398 0.08873156 0.71743076 0.21148537 0.72991601
 0.66889605 0.83391747 0.81458398 0.35312731 0.35237431 0.73738273
 0.96517253 0.52211691]
[0.08045153 0.92695746 0.30384688 0.22169256 0.85231981 0.36898274
 0.69632564 0.17785737 0.99774497 0.93419654 0.98740287 0.81108123
 0.90829649 0.51305405]
[0.23834562 0.90531048 0.00863958 0.08093868 0.22606262 0.85072973
 0.22908413 0.73069026 0.762691   0.32499139 0.65763599 0.64558172
 0.69650676 0.10202842]
[0.60028532 0.07613098 0.38184849 0.11085824 0.82751313 0.58460833
 0.13727838 0.87527711 0.51458954 0.46814201 0.28440379 0.12228945
 0.7698344  0.23207313]
[0.32205817 0.39806014 0.68102314 0.28397828 0.0140300

array([11.24642  , 13.268541 , 11.237595 ,  7.1986403,  8.0593405,
       10.96287  ,  9.327009 , 12.3163805, 11.230288 , 10.433632 ,
        8.914517 , 12.7504015, 11.564948 ,  8.877827 ], dtype=float32)

In [None]:
# This blocks evaluates all possible keys in the nested dictionary "wagon" in compositions of one day

properties_dict = dict()
for train in k.json():
    for journey in (train['journeySections']):
        for wagon in journey['wagons']:
            for i, prop in enumerate(wagon.keys()):
                try:
                    properties_dict[prop]
                except:
                    properties_dict[prop] = prop
print(properties_dict.keys())

In [None]:
r.json()

In [26]:
Session = sessionmaker(bind=engine)
session = Session()
session.add(bsp)
session.commit()

In [5]:
with open('test.txt', 'w') as w:
    w.write('haha')