# Test with markdown only

In [None]:
from crontab import CronTab
import datetime as dt
import json
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from pathlib import Path
import re
import requests
from scipy.stats import shapiro, pearsonr
import seaborn as sns
from sqlalchemy import (select, text, create_engine, Column, Integer, VARCHAR, DATE, DATETIME, 
                        ForeignKey, Boolean, FLOAT, func)
from sqlalchemy.orm import sessionmaker, declarative_base
import sys
import tensorflow as tf
from time import strftime
import timeit

# Make the project's own modules importable by adding their subdirectories
# (relative to the current working directory) to the module search path.
modules_path = [
    os.path.join(os.getcwd(), 'data_collect_app'),  # location of finrail_db
    os.path.join(os.getcwd(), 'prediction_rnn_app'),  # location of finrail_rnn_model
]
# Append only missing entries, so re-running this cell does not duplicate them
for path in modules_path:
    if path not in sys.path:
        sys.path.append(path)

# Load own code (it is also used by the containers and is therefore stored in the app subdirectories)
import finrail_db
import finrail_rnn_model

# Load tensorboard
%load_ext tensorboard

# Set random seed for reproducibility: the seed is fixed to 42 so that repeated
# runs of this notebook start from the same random state (per the Keras
# documentation this utility seeds Python's `random`, NumPy and TensorFlow)
tf.keras.utils.set_random_seed(42)

In [None]:
# Define directory for tensorboard log files
def dir_logs(parent_dir='tf_log'):
    '''Build the path of a time-stamped subdirectory below a parent directory.
    The subdirectory is named after the current local date and time, so each training run
    can write its Tensorboard logs to a directory of its own. The path is only composed
    here; no directory is created on disk.
    Parameters:
        parent_dir <str> name of the parent directory

    Returns:
        <file path object> parent directory joined with a 'YYYY_MM_DD_hh_mm_ss' subdirectory'''
    run_stamp = strftime('%Y_%m_%d_%H_%M_%S')
    return Path(parent_dir).joinpath(run_stamp)

# Read data from database to Dataframe
# 1. Create engine on database
# The connection URL can be overridden with the FINRAIL_DB_URL environment variable,
# so credentials do not have to be edited in the notebook itself. Without the
# variable the local development URL below is used, as before.
engine = create_engine(
    os.environ.get(
        'FINRAIL_DB_URL',
        'mysql+mysqlconnector://root:admin123@localhost:5000/finrail',
    )
)
# 2. Read query from file
# Path components are passed separately so the separator matches the platform
path_query = os.path.join(os.getcwd(), 'prediction_rnn_app', 'timeseries_query.txt')
# Explicit encoding, because the platform default differs between systems
# (assumes the query file is UTF-8 or plain ASCII - TODO confirm)
with open(path_query, 'r', encoding='utf-8') as f:
    sql_timeseries_query = f.read()
# 3. Execute query and store result in Dataframe
# (2 series on daily basis called "commuter" and "long-distance")
df = finrail_rnn_model.read_timeseries_from_database(engine=engine, str_query=sql_timeseries_query)
# Clean and add one-hot-encoded information about next day in series
df = finrail_rnn_model.tweak_timeseries(df)

# Preparation of training, validation and test dataset
# Separate datasets are built for commuter and long_distance, because fitting one model
# to both series simultaneously caused overfitting in one of the series

# Settings shared by all six datasets
next_day_columns = ('next_day_H', 'next_day_S', 'next_day_W')  # one-hot columns about the next day
window_length = 21  # passed as seq_length to every dataset

# Ranges handed to prepare_training_dataset for each split
# (presumably row index ranges of df - confirm in finrail_rnn_model)
rows_train = (0, 2577)  # training set until 2022 including
rows_val = (2577, 2942)  # validation set is 2023
rows_test = (2942, None)  # test set from 2024-01-01 to latest date in data

# Extra options that are used for the validation and test datasets only
eval_options = {'batch_size': 500, 'reshuffle_each_iteration': False}

# The datasets are created in a fixed order (train, validation, test; commuter before
# long_distance within each split) to keep runs with the fixed random seed comparable

# training
commuter_train = finrail_rnn_model.prepare_training_dataset(
    df, ['commuter', *next_day_columns], rows_train, seq_length=window_length
)
long_distance_train = finrail_rnn_model.prepare_training_dataset(
    df, ['long_distance', *next_day_columns], rows_train, seq_length=window_length
)

# validation
commuter_val = finrail_rnn_model.prepare_training_dataset(
    df, ['commuter', *next_day_columns], rows_val, **eval_options, seq_length=window_length
)
long_distance_val = finrail_rnn_model.prepare_training_dataset(
    df, ['long_distance', *next_day_columns], rows_val, **eval_options, seq_length=window_length
)

# test
commuter_test = finrail_rnn_model.prepare_training_dataset(
    df, ['commuter', *next_day_columns], rows_test, **eval_options, seq_length=window_length
)
long_distance_test = finrail_rnn_model.prepare_training_dataset(
    df, ['long_distance', *next_day_columns], rows_test, **eval_options, seq_length=window_length
)