In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from datetime import datetime
from datetime import timedelta

In [None]:
# The dataset has been split into batches by "split_csv.ipynb". One batch folder is expected to
# hold the data files of 10 stocks (presumably CSV: the training loop only picks up "*.csv").
# Change BATCH_FOLDER to train models and produce predictions for a different set of stocks.
BATCH_FOLDER = 'batch_26'

# Directory where models that reach ACCURACY_THRESHOLD are saved.
TRAINING_MODEL_PATH = '../trainingModel'
# Directory that is scanned for the CSV files of the selected batch.
DATASET_PATH = f'../trainingDataset/{BATCH_FOLDER}'
# Root directory for TensorBoard logs (one sub-directory per training attempt).
LOG_DIR = '../trainingLogs'
# Minimum accuracy (in percent) a model must reach on the validation rows to be saved.
ACCURACY_THRESHOLD = 90
# Maximum number of training attempts per file before giving up.
MAX_RETRIES = 3
# Number of consecutive rows fed to the model to predict the next row.
WINDOW_SIZE = 32
# Number of windows per training batch.
BATCH_SIZE = 64
# Buffer size used when shuffling the windows.
SHUFFLE_BUFFER_SIZE = 1000
# Row index that separates training rows (before it) from validation rows (from it onwards).
SPLIT_TIME = 3000

In [None]:
def normalize_feature(data):
    """Standardize a feature to zero mean and unit standard deviation.

    Args:
        data: Array-like of numeric values for one feature.

    Returns:
        A tuple ``(normalized_data, (mean, std))`` where ``mean`` and ``std``
        are the statistics of the input, so the transformation can be undone
        later with ``value * std + mean``.
    """
    data = np.asarray(data, dtype=float)
    mean = np.mean(data)
    std = np.std(data)
    # A constant series has std == 0; dividing by it would fill the output with
    # NaN/inf. Dividing by 1 instead yields all zeros, and because the real std
    # (0) is still returned, denormalizing maps every value back to the mean.
    divisor = std if std > 0 else 1.0
    normalized_data = (data - mean) / divisor
    return normalized_data, (mean, std)

In [None]:
def denormalize_data(data, stats):
    """Undo per-feature standardization.

    Args:
        data: Normalized values whose last axis lines up with the features.
        stats: Sequence of ``(mean, std)`` pairs, one pair per feature.

    Returns:
        ``data`` rescaled by each feature's std and shifted by its mean.
    """
    stat_table = np.array(stats)
    # Column 0 holds the means, column 1 the standard deviations.
    offset, scale = stat_table[:, 0], stat_table[:, 1]
    return data * scale + offset

In [None]:
def model_forecast(model, data, window_size):
    """Run one-step-ahead predictions over every sliding window of ``data``.

    Window ``t`` covers rows ``t .. t + window_size - 1``; there are
    ``len(data) - window_size`` windows, so the last row of ``data`` is never
    part of an input window.

    Args:
        model: Object exposing ``predict(batch)``.
        data: Array-like whose first axis is time.
        window_size: Number of consecutive rows per input window.

    Returns:
        The predictions with singleton axes squeezed out, or an empty array
        when ``data`` is too short to form a window.
    """
    data = np.asarray(data)
    num_windows = len(data) - window_size
    if num_windows <= 0:
        return np.array([])

    # Gather all windows with one fancy-indexing step: row t of `positions`
    # holds the indices t, t+1, ..., t+window_size-1.
    positions = np.arange(num_windows)[:, np.newaxis] + np.arange(window_size)[np.newaxis, :]
    windows = data[positions]

    # A single batched predict call replaces one predict call per window,
    # which is dominated by per-call overhead.
    return np.asarray(model.predict(windows)).squeeze()

In [None]:
def parse_data_from_file(filename):
    """Load a CSV file and return its normalized feature columns.

    The first row is skipped as a header, and columns 1, 2 and 3 are read and
    treated as low, high and close. Each column is normalized on its own with
    ``normalize_feature``, using statistics computed over the entire file.

    Args:
        filename: Path of the comma-separated file to read.

    Returns:
        ``(times, features, stats_low, stats_high, stats_close)`` where
        ``times`` is ``0 .. n_rows - 1``, ``features`` stacks the three
        normalized columns along axis 1, and each ``stats_*`` is whatever
        ``normalize_feature`` returned for that column.
    """
    raw = np.loadtxt(filename, delimiter=',', skiprows=1, usecols=(1, 2, 3))

    scaled_columns = []
    column_stats = []
    # Column order is low, high, close.
    for column_index in range(3):
        scaled, stats = normalize_feature(raw[:, column_index])
        scaled_columns.append(scaled)
        column_stats.append(stats)

    features = np.stack(scaled_columns, axis=1)
    times = np.arange(len(raw))
    stats_low, stats_high, stats_close = column_stats

    return times, features, stats_low, stats_high, stats_close

In [None]:
def windowed_dataset(series, window_size):
    """Turn a series into a shuffled, batched dataset of (window, next row) pairs.

    Args:
        series: Sequence of rows ordered in time.
        window_size: Number of consecutive rows used as model input.

    Returns:
        A ``tf.data.Dataset`` whose elements are batches of
        ``(inputs, target)``: ``inputs`` are ``window_size`` consecutive rows
        and ``target`` is the row that immediately follows them.
    """
    span = window_size + 1  # input rows plus the one target row
    return (
        tf.data.Dataset.from_tensor_slices(series)
        .window(span, shift=1, drop_remainder=True)
        # Each window is itself a dataset; batch it into a single tensor.
        .flat_map(lambda rows: rows.batch(span))
        .shuffle(SHUFFLE_BUFFER_SIZE)
        # Everything but the last row is the input, the last row is the label.
        .map(lambda rows: (rows[:-1], rows[-1]))
        .batch(BATCH_SIZE)
        .prefetch(tf.data.AUTOTUNE)
    )

In [None]:
def train_val_split(time, data, split_time=SPLIT_TIME):
    """Split time steps and data into a training part and a validation part.

    Args:
        time: Sliceable sequence of time steps.
        data: Sliceable sequence of values aligned with ``time``.
        split_time: Index of the first validation element.

    Returns:
        ``(time_train, data_train, time_valid, data_valid)``.
    """
    train_part = slice(None, split_time)
    valid_part = slice(split_time, None)
    return time[train_part], data[train_part], time[valid_part], data[valid_part]

In [None]:
def create_model():
    """Build and compile the Conv1D + stacked-LSTM forecasting network.

    The network takes windows of ``WINDOW_SIZE`` rows with 3 features each and
    outputs 3 values. It is compiled with Huber loss, the Adam optimizer
    (learning rate 0.001) and MAE as the reported metric.

    Returns:
        The compiled ``tf.keras`` Sequential model.
    """
    keras = tf.keras
    layers = keras.layers

    network = keras.models.Sequential([
        keras.Input(shape=(WINDOW_SIZE, 3)),
        layers.Conv1D(filters=32, kernel_size=3, strides=1, padding="causal", activation="relu"),
        # The first LSTM returns the full sequence so the second one can consume it.
        layers.LSTM(64, return_sequences=True, dropout=0.3, recurrent_dropout=0.2),
        layers.LSTM(32, dropout=0.2, recurrent_dropout=0.1),
        layers.Dense(16, activation="relu", kernel_regularizer=keras.regularizers.l2(0.01)),
        # One output per feature.
        layers.Dense(3),
    ])

    network.compile(
        loss=keras.losses.Huber(delta=1.0),
        optimizer=keras.optimizers.Adam(learning_rate=0.001),
        metrics=["mae"],
    )

    return network

In [None]:
def calculate_accuracy(true_values, predicted_values, tolerance=0.05):
    """Return the percentage of predictions within a relative tolerance.

    A prediction counts as accurate when its absolute error is at most
    ``tolerance`` times the magnitude of the true value.

    Args:
        true_values: Array-like of reference values.
        predicted_values: Array-like of predictions, broadcastable against
            ``true_values``.
        tolerance: Allowed relative error (0.05 means 5 %).

    Returns:
        Accuracy in percent (0-100), averaged over all elements.
    """
    true_values = np.asarray(true_values, dtype=float)
    predicted_values = np.asarray(predicted_values, dtype=float)

    differences = np.abs(true_values - predicted_values)
    # Compare against the magnitude of the target: with a signed bound, a
    # negative true value gives a negative tolerance that no error can satisfy.
    within_tolerance = differences <= tolerance * np.abs(true_values)
    return np.mean(within_tolerance) * 100

In [None]:
def compute_metrics(true_series, forecast):
    """Return the (MSE, MAE) pair between the true series and the forecast.

    Both values are whatever ``tf.keras.losses.MSE`` / ``tf.keras.losses.MAE``
    return for the given inputs.
    """
    losses = tf.keras.losses
    return losses.MSE(true_series, forecast), losses.MAE(true_series, forecast)

In [None]:
def process_file(filename):
    """Train a forecasting model for one CSV file and save it if it is accurate enough.

    The file is parsed and split at ``SPLIT_TIME``. A fresh model is then
    trained up to ``MAX_RETRIES`` times. After each attempt the validation rows
    are forecast and scored:

    * accuracy below 70 %: give up on this file immediately;
    * accuracy of at least ``ACCURACY_THRESHOLD``: save the model under
      ``TRAINING_MODEL_PATH`` and stop;
    * otherwise: retry with a newly initialised model.

    Args:
        filename: Path of the CSV file to process.

    Returns:
        None. Progress is reported with ``print``; models are written to disk.
    """
    print(f"Processing file: {filename}")
    time, features, stats_low, stats_high, stats_close = parse_data_from_file(filename)

    # The validation split needs at least WINDOW_SIZE + 1 rows to form one
    # (window, target) pair. Without it there is no val_loss for the callbacks
    # to monitor and nothing to score, so skip the file instead of failing and
    # aborting the rest of the batch.
    min_rows = SPLIT_TIME + WINDOW_SIZE + 1
    if len(features) < min_rows:
        print(f"Not enough rows in {filename} ({len(features)} < {min_rows}). Skipping to next file.")
        return

    _, features_train, _, features_valid = train_val_split(time, features)

    train_dataset = windowed_dataset(features_train, WINDOW_SIZE)
    validation_dataset = windowed_dataset(features_valid, WINDOW_SIZE)

    # These do not depend on the trained model, so compute them once.
    stock_name = os.path.basename(filename).split('.')[0]
    stats = [stats_low, stats_high, stats_close]
    series_valid_denorm = denormalize_data(features_valid, stats)

    for attempt in range(MAX_RETRIES):
        # Drop Keras state left over from earlier attempts/files so memory
        # does not grow with every model that is created.
        tf.keras.backend.clear_session()
        model = create_model()

        # Name the log sub-directory after the file name and the training start time.
        file_log_dir = os.path.join(
            LOG_DIR,
            f"{stock_name}_{datetime.now().strftime('%Y%m%d-%H%M%S')}"
        )

        tensorboard_callback = tf.keras.callbacks.TensorBoard(
            log_dir=file_log_dir,
            histogram_freq=1,
            write_graph=True,
            write_images=True
        )

        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
        reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)

        model.fit(
            train_dataset,
            epochs=50,
            validation_data=validation_dataset,
            callbacks=[early_stopping, reduce_lr, tensorboard_callback],
            verbose=1
        )

        # Start WINDOW_SIZE rows before the split so that the first forecast
        # lines up with the first validation row.
        rnn_forecast = model_forecast(model, features[SPLIT_TIME - WINDOW_SIZE:], WINDOW_SIZE)
        rnn_forecast_denorm = denormalize_data(rnn_forecast, stats)

        mse, mae = compute_metrics(series_valid_denorm, rnn_forecast_denorm)
        mean_mae = mae.numpy().mean()
        print(f"mse: {mse.numpy().mean():.2f}, mae: {mean_mae:.2f} for forecast")

        accuracy = calculate_accuracy(series_valid_denorm, rnn_forecast_denorm, tolerance=0.05)

        if accuracy < 70.0:
            print(f"Accuracy ({accuracy:.2f}%) is below 70%. Skipping to the next file.")
            break

        if accuracy >= ACCURACY_THRESHOLD:
            # Make sure the target directory exists before writing the model file.
            os.makedirs(TRAINING_MODEL_PATH, exist_ok=True)
            model_filename = f"{stock_name}_A{int(accuracy)}_M{int(mean_mae)}.h5"
            model_save_path = os.path.join(TRAINING_MODEL_PATH, model_filename)
            model.save(model_save_path)
            print(f"Model saved to {model_save_path} with accuracy {accuracy:.2f}% and MAE {mean_mae:.2f}")
            break

        print(f"Attempt {attempt + 1}: Accuracy ({accuracy:.2f}%) below threshold ({ACCURACY_THRESHOLD}%). Retrying...")
    else:
        # Reached only when every attempt ran without hitting a `break`.
        print(f"Max retries reached for {filename}. Skipping to next file.")

In [None]:
# Train one model per CSV file found in the selected batch folder.
for file in os.listdir(DATASET_PATH):
    if not file.endswith('.csv'):
        continue  # ignore anything that is not a CSV dataset
    filepath = os.path.join(DATASET_PATH, file)
    process_file(filepath)

In [None]:
def format_decimal(value):
    """Format a number as a string rounded to zero decimal places."""
    return format(value, ".0f")

In [None]:
def _forecast_recursively(model, seed_window, steps):
    """Predict ``steps`` future rows, feeding every prediction back as input."""
    window = np.asarray(seed_window)[np.newaxis, ...]  # add the batch dimension
    predictions = []
    for _ in range(steps):
        pred = model.predict(window)
        predictions.append(pred.squeeze())
        # Drop the oldest row and put the new prediction at the end.
        # np.roll returns a copy, so the caller's array is never modified.
        window = np.roll(window, -1, axis=1)
        window[0, -1] = pred
    return np.array(predictions)


def forecast_and_save(models_path, dataset_path, output_path, window_size, horizon=7):
    """Load saved models, forecast ``horizon`` days ahead and write the results to CSV.

    For every ``.h5`` file in ``models_path`` the matching dataset
    ``<stock>.csv`` is looked up in ``dataset_path``. The last ``window_size``
    normalized rows seed a recursive forecast, which is denormalized and
    written to ``<model name>_forecast.csv`` in ``output_path`` with the
    columns ``timestamp,low,high,close``.

    Args:
        models_path: Directory containing the saved ``.h5`` models.
        dataset_path: Directory containing the datasets the models were trained on.
        output_path: Directory that receives the forecast CSV files; it is
            created if it does not exist.
        window_size: Number of trailing rows used as model input.
        horizon: Number of days to forecast (default 7).

    Raises:
        ValueError: If ``horizon`` is smaller than 1.
    """
    if horizon < 1:
        raise ValueError(f"horizon must be at least 1, got {horizon}")

    if output_path:
        os.makedirs(output_path, exist_ok=True)

    for model_file in os.listdir(models_path):
        if not model_file.endswith('.h5'):
            continue

        # Model files are named "<stock>_A<accuracy>_M<mae>.h5". Strip the two
        # trailing fields from the right so stock names that contain
        # underscores themselves still resolve to the correct dataset.
        model_stem = os.path.splitext(model_file)[0]
        dataset_name = model_stem.rsplit('_', 2)[0] + '.csv'
        dataset_file = os.path.join(dataset_path, dataset_name)

        # Check for the dataset before paying the cost of loading the model.
        if not os.path.exists(dataset_file):
            print(f"Dataset file for {model_file} not found. Skipping...")
            continue

        print(f"Forecasting for model {model_file} using dataset {dataset_name}...")

        # Load and preprocess the dataset.
        _, features, stats_low, stats_high, stats_close = parse_data_from_file(dataset_file)
        if len(features) < window_size:
            print(f"Dataset {dataset_name} has fewer than {window_size} rows. Skipping...")
            continue
        stats = [stats_low, stats_high, stats_close]

        model = tf.keras.models.load_model(os.path.join(models_path, model_file))

        # Seed the forecast with the last `window_size` rows, then denormalize.
        forecast = _forecast_recursively(model, features[-window_size:], horizon)
        forecast_denorm = denormalize_data(forecast, stats)

        # Load the original dataset to get the last date.
        original_data = pd.read_csv(dataset_file)
        last_date = pd.to_datetime(original_data['timestamp'].iloc[-1])

        # NOTE(review): dates advance by calendar days, so weekends are
        # included - confirm this matches the data's trading calendar.
        forecast_dates = [last_date + timedelta(days=i) for i in range(1, horizon + 1)]

        # Format the forecast and save it to CSV.
        output_file = os.path.join(output_path, f"{model_stem}_forecast.csv")
        with open(output_file, 'w') as f:
            f.write('timestamp,low,high,close\n')
            for date, row in zip(forecast_dates, forecast_denorm):
                formatted_row = [date.strftime('%Y-%m-%d')] + [format_decimal(value) for value in row]
                f.write(','.join(formatted_row) + '\n')

        print(f"Saved forecast to {output_file}")