### Import packages and prepare data

In [None]:
# Import packages
# Basic
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from termcolor import colored

# Data analyze
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Deep learning
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import GridSearchCV, RepeatedKFold, cross_val_score
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

In [None]:
# Read the dataset
df = pd.read_csv('Real_data.csv')

# Data preprocess
# Expose 'ID_new' as 'CID' and keep only the modelling columns. The explicit
# .copy() makes df an independent frame, so the column assignments below do not
# write into a slice of the raw frame (SettingWithCopyWarning on pandas < 3).
df = df.rename(columns={'ID_new': 'CID'})[['CID', 'AMT', 'TIME', 'TD', 'LNDV']].copy()

# Apply exponential function to 'LNDV' while keeping zeros unchanged
# (vectorised: exp() is taken everywhere, then exact-zero rows are reset to 0).
lndv = df['LNDV']
df['LNDV'] = np.exp(lndv).where(lndv != 0, 0.0)

# Target column: the next 'LNDV' value within the same CID. Rows that have no
# next value (e.g. the last row of each CID) are dropped.
df['NDV'] = df.groupby('CID')['LNDV'].shift(-1)
df = df.dropna(subset=['NDV']).copy()

# Data normalization
# df_old keeps the unscaled values so results can be mapped back later.
# 'CID' and 'TIME' are left unscaled.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# further down, so test-set statistics leak into the scaling — consider fitting
# on the training CIDs only.
df_old = df.copy()
scaler = StandardScaler()
scaled_cols = ['AMT', 'TD', 'LNDV', 'NDV']
df[scaled_cols] = scaler.fit_transform(df[scaled_cols])

In [None]:
# Split data into training and test sets
# The split is made on the unique CID values, so all rows of a given CID land
# in the same set (30 % of the CIDs go to the test set, fixed random_state).
feature_cols = ['TD', 'AMT', 'LNDV']
target_col = 'NDV'

cids = df['CID'].unique()
train_cids, test_cids = train_test_split(cids, test_size=0.3, random_state=12)

in_train = df['CID'].isin(train_cids)
in_test = df['CID'].isin(test_cids)
train_df, test_df = df[in_train], df[in_test]

X_train, y_train = train_df[feature_cols], train_df[target_col]
X_test, y_test = test_df[feature_cols], test_df[target_col]

### Run DL methods to get the predictions

In [None]:
def reverse_normalize_column(normalized_col, original_col):
    """Map standardized values back to the scale of ``original_col``.

    Inverts ``z = (x - mean) / std`` with the mean and the *population*
    standard deviation (``ddof=0``) of ``original_col``. ``StandardScaler``
    scales with the population standard deviation, whereas pandas'
    ``Series.std()`` defaults to the sample estimate (``ddof=1``); using the
    default would leave the restored values off by a factor of
    ``sqrt(n / (n - 1))``.

    Parameters
    ----------
    normalized_col : array-like (pandas Series or NumPy array, any shape)
        Standardized values, e.g. a scaled column or model predictions.
    original_col : pandas Series or NumPy array
        The unscaled column whose statistics were used for the scaling.

    Returns
    -------
    Same type and shape as ``normalized_col``, on the original scale.
    """
    mean = original_col.mean()
    std = original_col.std(ddof=0)
    # A constant column is scaled with a factor of 1 by StandardScaler
    # (it avoids dividing by zero), so the inverse must use 1 as well.
    if std == 0:
        std = 1.0
    return normalized_col * std + mean

In [None]:
# Early-stopping callback; the same instance is passed to the fit() call of
# each of the three models below.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',  # Monitor the validation loss (not the training loss)
    patience=20  # Stop training if no improvement for 20 consecutive epochs
)

In [None]:
# Seed Python, NumPy and the Keras backend so that weight initialisation and
# training are repeatable after "Restart Kernel -> Run All".
# 12 is the same value used as random_state for the train/test split.
keras.utils.set_random_seed(12)

# Set the model
# Feed-forward neural network: a Flatten layer followed by three Dense layers
# (64 -> 8 -> 1). The last Dense layer has no activation, giving a single
# linear output for regression.
model = keras.Sequential([
    keras.Input(shape=(X_train.shape[1],)),
    keras.layers.Flatten(),
    keras.layers.Dense(64, activation=tf.nn.relu),
    keras.layers.Dense(8, activation=tf.nn.relu),
    keras.layers.Dense(1),
])

# Options: SGD, RMSprop, Adagrad, Adadelta, Adam, Adamax, Nadam
# `opt` is rebound by the later model cells; the compile cell that follows
# uses whichever optimizer was assigned last.
opt = keras.optimizers.Adamax(learning_rate=0.01)
model.summary()

In [None]:
# Train the dense network: MSE loss, with MAE / RMSE / R^2 tracked as metrics.
# 30 % of the training rows are held out for validation (validation_split),
# which is what the early-stopping callback monitors.
nn_metrics = ['mean_absolute_error', 'root_mean_squared_error', 'r2_score']
model.compile(optimizer=opt, loss='mean_squared_error', metrics=nn_metrics)
model.fit(
    X_train,
    y_train,
    epochs=100,
    batch_size=64,
    validation_split=0.3,
    callbacks=[early_stopping],
    verbose=0,
)

In [None]:
# Predict on the test CIDs (standardized scale), then map the predictions back
# to the original NDV scale using the unscaled NDV column kept in df_old.
y_pred = model.predict(X_test)
pred_real = reverse_normalize_column(
    normalized_col=y_pred,
    original_col=df_old['NDV'],
)

In [None]:
# Test-set scores for the dense network. Both y_test and y_pred are on the
# standardized scale, so the RMSE is in standard-deviation units.
rmse = root_mean_squared_error(y_test, y_pred)  # Calculate RMSE
r2 = r2_score(y_test, y_pred)  # Calculate R^2 Score

print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R^2 Score: {r2}")

In [None]:
# The recurrent models below take input of shape (samples, 1, n_features):
# each row is presented as a one-step sequence.
# np.asarray accepts both the original DataFrame and an already-reshaped
# array, so re-running this cell is a harmless no-op instead of failing with
# AttributeError on `.values`. The feature count is read from the data rather
# than hard-coded.
# NOTE: after this cell X_train / X_test are 3-D arrays; the dense-network
# cells above expect the 2-D frames and should not be re-run afterwards
# without re-running the split cell first.
X_train = np.asarray(X_train).reshape(-1, 1, X_train.shape[-1])
X_test = np.asarray(X_test).reshape(-1, 1, X_test.shape[-1])

In [None]:
# Set the GRU model
# Input is one time step with 3 features per sample. Stack:
# GRU(64) -> Dense(32, softmax) -> Dense(8, relu) -> Dense(1, linear output).
# NOTE(review): softmax on a hidden layer is unusual for a regression network
# (its outputs are constrained to sum to 1) — confirm this is intentional.
gru_layers = [
    keras.Input(shape=(1, 3)),
    keras.layers.GRU(64),
    keras.layers.Dense(32, activation=tf.nn.softmax),
    keras.layers.Dense(8, activation=tf.nn.relu),
    keras.layers.Dense(1),
]
grumodel = keras.Sequential(gru_layers)

# Options: RMSprop, Adam, Adamax, Nadam
# Rebinds `opt`; the GRU compile cell that follows picks up this optimizer.
opt = keras.optimizers.RMSprop(learning_rate=0.001)
grumodel.summary()

In [None]:
# Train the GRU model with the same loss, metrics and fit settings as the
# dense network (MSE loss; MAE / RMSE / R^2 metrics; 30 % validation split).
gru_metrics = ['mean_absolute_error', 'root_mean_squared_error', 'r2_score']
gru_fit_options = dict(
    epochs=100,
    batch_size=64,
    validation_split=0.3,
    callbacks=[early_stopping],
    verbose=0,
)
grumodel.compile(optimizer=opt, loss='mean_squared_error', metrics=gru_metrics)
grumodel.fit(X_train, y_train, **gru_fit_options)

In [None]:
# GRU predictions on the test CIDs (standardized scale), then converted back
# to the original NDV scale with the unscaled NDV column from df_old.
y_pred_gru = grumodel.predict(X_test)
pred_real_gru = reverse_normalize_column(
    normalized_col=y_pred_gru,
    original_col=df_old['NDV'],
)

In [None]:
# Test-set scores for the GRU model, computed on the standardized scale.
rmse_gru = root_mean_squared_error(y_test, y_pred_gru)  # Calculate RMSE
r2_gru = r2_score(y_test, y_pred_gru)  # Calculate R^2 Score

print(f"Root Mean Squared Error (RMSE): {rmse_gru}")
print(f"R^2 Score: {r2_gru}")

In [None]:
# Set the LSTM model
# Same layout as the GRU model with the recurrent layer swapped for LSTM:
# LSTM(64) -> Dense(32, softmax) -> Dense(8, relu) -> Dense(1, linear output),
# fed with one time step of 3 features per sample.
# NOTE(review): softmax on a hidden layer is unusual for a regression network
# (its outputs are constrained to sum to 1) — confirm this is intentional.
lstm_layers = [
    keras.Input(shape=(1, 3)),
    keras.layers.LSTM(64),
    keras.layers.Dense(32, activation=tf.nn.softmax),
    keras.layers.Dense(8, activation=tf.nn.relu),
    keras.layers.Dense(1),
]
lstmmodel = keras.Sequential(lstm_layers)

# Options: RMSprop, Adam, Adamax, Nadam
# Rebinds `opt`; note the learning rate here (0.01) differs from the GRU's.
opt = keras.optimizers.RMSprop(learning_rate=0.01)
lstmmodel.summary()

In [None]:
# Train the LSTM model with the same loss, metrics and fit settings as the
# other two models (MSE loss; MAE / RMSE / R^2 metrics; 30 % validation split).
lstm_metrics = ['mean_absolute_error', 'root_mean_squared_error', 'r2_score']
lstm_fit_options = dict(
    epochs=100,
    batch_size=64,
    validation_split=0.3,
    callbacks=[early_stopping],
    verbose=0,
)
lstmmodel.compile(optimizer=opt, loss='mean_squared_error', metrics=lstm_metrics)
lstmmodel.fit(X_train, y_train, **lstm_fit_options)

In [None]:
# LSTM predictions on the test CIDs (standardized scale), then converted back
# to the original NDV scale with the unscaled NDV column from df_old.
y_pred_lstm = lstmmodel.predict(X_test)
pred_real_lstm = reverse_normalize_column(
    normalized_col=y_pred_lstm,
    original_col=df_old['NDV'],
)

In [None]:
# Test-set scores for the LSTM model, computed on the standardized scale.
rmse_lstm = root_mean_squared_error(y_test, y_pred_lstm)  # Calculate RMSE
r2_lstm = r2_score(y_test, y_pred_lstm)  # Calculate R^2 Score

print(f"Root Mean Squared Error (RMSE): {rmse_lstm}")
print(f"R^2 Score: {r2_lstm}")

### Summarize the results for evaluation

In [None]:
# Flatten the arrays
# Each model's de-normalized predictions are collapsed to 1-D so they can be
# used as DataFrame columns.
pred_real_flat, pred_real_gru_flat, pred_real_lstm_flat = (
    preds.flatten() for preds in (pred_real, pred_real_gru, pred_real_lstm)
)

# Create the DataFrame
# One column per model, in the order NN, GRU, LSTM.
pred_df = pd.DataFrame(
    dict(
        NN=pred_real_flat,
        GRU=pred_real_gru_flat,
        LSTM=pred_real_lstm_flat,
    )
)

In [None]:
# Reset the indices of both dataframes
# test_df keeps the row labels of the full frame while pred_df is numbered
# from 0, so both get a fresh 0..n-1 index before being placed side by side.
test_data_reset, pred_df_reset = (
    test_df.reset_index(drop=True),
    pred_df.reset_index(drop=True),
)
# Concatenate the dataframes along axis 1
frames_side_by_side = [test_data_reset, pred_df_reset]
res = pd.concat(frames_side_by_side, axis=1)

In [None]:
# Reverse normalization for 'AMT', 'TD', 'LNDV', and 'NDV'
# Each scaled column of `res` is mapped back using the matching unscaled
# column in df_old; `res` itself is left untouched (assign returns a new frame).
columns_to_reverse = ['AMT', 'TD', 'LNDV', 'NDV']
restored_columns = {
    name: reverse_normalize_column(res[name], df_old[name])
    for name in columns_to_reverse
}
df_new = res.assign(**restored_columns)

# Final table: identifiers, observed values, and the three models' predictions.
report_columns = ["CID", "TIME", "LNDV", "NDV", "NN", "GRU", "LSTM"]
df_new = df_new[report_columns]