In [None]:
import sqlite3
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tcn import TCN, tcn_full_summary

In [None]:
import os

def fetch_selected_data(source_db_path, table_name, selected_columns):
    """Load the selected columns of one SQLite table into a DataFrame.

    Parameters
    ----------
    source_db_path : str or path-like
        Path of the SQLite database file to open.
    table_name : str
        Name of the table to read. It is formatted into the SQL text as-is.
    selected_columns : sequence of str
        Column names to select; they are also used, in the same order, as the
        column labels of the returned DataFrame.

    Returns
    -------
    pandas.DataFrame
        One row per fetched table row, one column per entry of
        ``selected_columns``.

    Notes
    -----
    The table and column names are interpolated directly into the query
    string, so only trusted identifiers must be passed.
    """
    columns_str = ','.join(selected_columns)
    query = f"SELECT {columns_str} FROM {table_name}"

    conn = sqlite3.connect(source_db_path)
    try:
        # try/finally guarantees the connection is released even when the
        # query fails (for example on an unknown table or column).
        cursor = conn.cursor()
        cursor.execute(query)
        rows = cursor.fetchall()
    finally:
        conn.close()

    return pd.DataFrame(rows, columns=selected_columns)

In [None]:
# Columns read from the database. The list order is the DataFrame column order:
# the scaling loop skips the first two entries, and the positivity filter looks
# at the numeric columns from 'runTime' onward.
selected_columns = [
    'progName',
    'jobID',
    'relativeStartTime',
    'startMonthDaySin',
    'startMonthDayCos',
    'startDaytimeSin',
    'startDaytimeCos',
    'runTime',
    'numProc',
    'numOST',
    'stripeSize',
    'totalFile',
    'totalIOReq',
    'totalMetaReq',
    'mdsCPUMean',
    'mdsOPSMean',
    'seqWritePct',
    'seqReadPct',
    'consecWritePct',
    'consecReadpct',
    'writeBytesTotal',
    'readBytesTotal',
    'totalReadReq',
    'totalWriteReq',
    'totalOpenReq',
    'totalSeekReq',
    'totalStatReq',
]

# Database file and table that hold the job records.
source_db_path = './total_final.db'
table_name = 'total_final'

data = fetch_selected_data(
    source_db_path=source_db_path,
    table_name=table_name,
    selected_columns=selected_columns,
)

In [None]:
# Keep only the rows whose numeric columns, from 'runTime' onward, are all
# strictly positive (zeros are dropped as well as negative values).
numeric_columns = data.iloc[:, data.columns.get_loc('runTime'):].select_dtypes(include=['int', 'float']).columns
# .copy() detaches the filtered frame from `data`; columns are assigned on it in
# later cells, which otherwise triggers pandas' SettingWithCopyWarning.
data_positive = data[(data[numeric_columns] > 0).all(axis=1)].copy()

In [None]:
trigono_columns = ['startMonthDaySin', 'startMonthDayCos', 'startDaytimeSin', 'startDaytimeCos']

# Rescale every column after the first two ('progName', 'jobID').
# NOTE: this cell rewrites `data_positive` column by column, so running it twice
# scales already-scaled values; re-run the filtering cell first.
relativeStartTime_max = data_positive['relativeStartTime'].max()
# log-space min/max of 'runTime', recorded below so the scaling can be inverted
min_val_runtime = 0
max_val_runtime = 0
for column in data_positive.columns[2:]:
    if column in trigono_columns:
        # (value + 1) / 2 -- presumably maps sin/cos values from [-1, 1] to [0, 1]
        data_positive[column] = (data_positive[column] + 1.0) / 2.0
    elif column == 'relativeStartTime':
        # fraction of the largest relativeStartTime in the data
        data_positive[column] = data_positive[column] / relativeStartTime_max
    elif column == 'stripeSize':
        # rank scaling: average rank mapped linearly so rank 1 -> 0 and rank n -> 1.
        # The rank lives in a local Series (no temporary DataFrame column), and a
        # single-row frame maps to 0 instead of dividing by zero.
        stripe_rank = data_positive['stripeSize'].rank(method='average')
        data_positive['stripeSize'] = (stripe_rank - 1) / max(len(stripe_rank) - 1, 1)
    else:
        # log(x + 0.01) transformation
        log_values = np.log(data_positive[column] + 0.01)
        # min-max normalization of the log values
        min_val = log_values.min()
        max_val = log_values.max()
        value_range = max_val - min_val
        if value_range == 0:
            # constant column: 0 / 0 would fill it with NaN, so map it to 0
            data_positive[column] = 0.0
        else:
            data_positive[column] = (log_values - min_val) / value_range
        if column == 'runTime':
            min_val_runtime = min_val
            max_val_runtime = max_val

In [None]:
# 'runTime' has been min-max scaled at this point, so a value of exactly 0 marks
# the rows holding the smallest run time; keep every other row.
data_positive = data_positive.loc[data_positive['runTime'].ne(0)]

In [None]:
# Encode 'progName' as integer category codes in a new 'progName_encoded' column.
# assign() returns a new frame instead of writing a column into the filtered
# frame, which avoids pandas' SettingWithCopyWarning.
data_positive = data_positive.assign(
    progName_encoded=data_positive['progName'].astype('category').cat.codes
)

In [None]:
# Numeric feature matrix: every column except the target ('runTime') and the
# name / identifier columns. The encoded program name is kept as its own array.
non_feature_columns = ['runTime','progName', 'jobID', 'progName_encoded']
X = data_positive.drop(columns=non_feature_columns).values
X_progName = data_positive['progName_encoded'].values
y = data_positive['runTime'].values

# The leading 80% of the rows (in their current order) form the training portion,
# the trailing 20% the test portion.
split_index = int(len(X) * 0.8)
train_rows = slice(None, split_index)
test_rows = slice(split_index, None)

X_train, X_test = X[train_rows], X[test_rows]
X_progName_train, X_progName_test = X_progName[train_rows], X_progName[test_rows]
y_train, y_test = y[train_rows], y[test_rows]

In [None]:
# Carve the validation set out of the training portion: its leading 80% stays
# training data, its trailing 20% becomes validation data.
# NOTE: the training arrays are overwritten here, so running this cell again
# shrinks them a second time; re-run the previous cell first.
split_index = int(len(X_train) * 0.8)
fit_rows = slice(None, split_index)
valid_rows = slice(split_index, None)

X_train, X_valid = X_train[fit_rows], X_train[valid_rows]
X_progName_train, X_progName_valid = X_progName_train[fit_rows], X_progName_train[valid_rows]
y_train, y_valid = y_train[fit_rows], y_train[valid_rows]

In [None]:
def create_sequences(X, X_progName, y, sequence_length):
    """Cut aligned sliding windows out of the feature, program-name and target arrays.

    Window ``i`` covers rows ``i .. i + sequence_length - 1`` of ``X`` and
    ``X_progName``; its target is the ``y`` value of the window's last row.

    Args:
        X: per-row numeric features, indexable by row slice.
        X_progName: per-row program-name codes, same length as ``X``.
        y: per-row targets, same length as ``X``.
        sequence_length: number of consecutive rows per window (>= 1).

    Returns:
        Tuple ``(X_seq, progName_seq, y_seq)`` of numpy arrays holding
        ``len(X) - sequence_length + 1`` windows / targets. When the input is
        shorter than ``sequence_length`` the three arrays are empty.

    Raises:
        ValueError: if ``sequence_length`` is smaller than 1 (which would pair
            empty windows with unrelated targets) or if the three inputs do
            not have the same length.
    """
    if sequence_length < 1:
        raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")
    if not (len(X) == len(X_progName) == len(y)):
        raise ValueError(
            "X, X_progName and y must have the same length, got "
            f"{len(X)}, {len(X_progName)} and {len(y)}"
        )

    # range() is empty when there are fewer rows than one full window
    window_starts = range(len(X) - sequence_length + 1)

    X_seq = [X[start:start + sequence_length] for start in window_starts]
    progName_seq = [X_progName[start:start + sequence_length] for start in window_starts]
    # the target of a window is the value at its last row
    y_seq = [y[start + sequence_length - 1] for start in window_starts]

    return np.array(X_seq), np.array(progName_seq), np.array(y_seq)

In [None]:
from keras.models import Model
from keras.layers import Dense, Dropout, Input, Embedding, Concatenate, Concatenate
from keras.metrics import MeanAbsolutePercentageError, MeanAbsoluteError
from keras.callbacks import EarlyStopping

def build_model(sequence_length, n_features, progName_size, embedding_dim):
    """Assemble the (uncompiled) two-input TCN regression model.

    The program-name codes are embedded, the embedding is concatenated with the
    numeric features along the last axis, the result is passed through a TCN
    created with ``padding='causal'`` and ``return_sequences=False``, and a
    single linear unit produces the output.

    Args:
        sequence_length: number of time steps in each input window.
        n_features: number of numeric features per time step.
        progName_size: number of distinct program names; the embedding is
            created with ``input_dim=progName_size + 1``.
        embedding_dim: size of each program-name embedding vector.

    Returns:
        A Keras ``Model`` whose inputs are ``[progName_input, numeric_input]``
        (in that order) and whose output comes from a ``Dense(1)`` layer.
    """
    features_in = Input(shape=(sequence_length, n_features), name='numeric_input')
    prog_in = Input(shape=(sequence_length,), name='progName_input')

    prog_embedded = Embedding(
        input_dim=progName_size + 1,
        output_dim=embedding_dim,
        input_length=sequence_length,
    )(prog_in)

    # embedding first, numeric features second, joined on the feature axis
    merged = Concatenate(axis=-1)([prog_embedded, features_in])

    tcn_features = TCN(padding='causal', return_sequences=False)(merged)

    prediction = Dense(1, activation='linear')(tcn_features)

    return Model(inputs=[prog_in, features_in], outputs=prediction)

In [None]:
# directory for saving model
model_dir = 'saved_models'
os.makedirs(model_dir, exist_ok=True)

# number of consecutive rows in each input window
sequence_length = 40
n_features = X_train.shape[1]
progName_size = data_positive['progName'].nunique()
embedding_dim = 30

# early stopping callback: stop once val_loss has not improved for 5 epochs and
# restore the best weights seen
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# transform data to sequence
X_train_seq, X_progName_train_seq, y_train_seq = create_sequences(X_train, X_progName_train, y_train, sequence_length)
X_valid_seq, X_progName_valid_seq,  y_valid_seq = create_sequences(X_valid, X_progName_valid, y_valid, sequence_length)
X_test_seq, X_progName_test_seq, y_test_seq = create_sequences(X_test, X_progName_test, y_test, sequence_length)

model = build_model(sequence_length, n_features, progName_size, embedding_dim)
# `metrics` is passed as a list, the form Keras documents for this argument
model.compile(optimizer="adam", loss="mse",
              metrics=[MeanAbsolutePercentageError()])
# input order matches build_model: program-name windows first, numeric windows second
model.fit([X_progName_train_seq, X_train_seq], y_train_seq,
          epochs = 100,
          validation_data=([X_progName_valid_seq, X_valid_seq], y_valid_seq),
          callbacks=[early_stopping])

# evaluate() returns the loss followed by the compiled metrics: [MSE, MAPE]
eval_result = model.evaluate([X_progName_test_seq, X_test_seq], y_test_seq)
current_mse = eval_result[0]
current_mape = eval_result[1]
print(f"Sequence length {sequence_length} MSE: {current_mse} MAPE: {current_mape}")

model_path = os.path.join(model_dir, "tsala_runtime.h5")
model.save(model_path)

In [None]:
# Predict on the test windows. `X_progName_test_seq` / `X_test_seq` / `y_test_seq`
# are the windows built in the training cell and `model` is the network trained
# there; this cell previously read `best_model` and `best_sequence_length`, which
# are not defined anywhere in the visible notebook and fail on a fresh kernel.
predicted_data = model.predict([X_progName_test_seq, X_test_seq])

In [None]:
from sklearn.metrics import r2_score
# R^2 between the test targets and the model predictions; both are still on the
# normalized 'runTime' scale at this point.
r_squared = r2_score(y_true=y_test_seq, y_pred=predicted_data)
print(f'R2: {r_squared: .2f}')

In [None]:
# Convert the test targets and the predictions with `reverse_transform`, passing
# the log-space min/max of 'runTime' recorded during normalization.
# NOTE(review): `reverse_transform` is not defined in the visible part of this
# notebook -- presumably it inverts the min-max + log(x + 0.01) scaling; confirm
# it is defined before this cell so the notebook runs on a fresh kernel.
original_y_true = reverse_transform(y_test_seq, min_val_runtime, max_val_runtime)
# np.array(...) makes an ndarray copy of the predictions before converting them
original_y_predict = reverse_transform(np.array(predicted_data), min_val_runtime, max_val_runtime)