In [6]:
import pandas as pd
import numpy as np
import seaborn as sns
from tensorboard.backend.event_processing.event_accumulator import EventAccumulator
import matplotlib.pyplot as plt
import os
import re

from datetime import datetime
import plotly
import plotly.offline as pyo
import plotly.io as pio
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# pandas display / behaviour options, applied in the listed order.
for option_name, option_value in (
    ('display.max_columns', 500),
    ('display.width', 1000),
    ('display.max_rows', 10000),
    # None turns off the chained-assignment check.
    ('mode.chained_assignment', None),
    # Floats are shown with three decimal places.
    ('display.float_format', lambda value: '%.3f' % value),
):
    pd.set_option(option_name, option_value)

# Default plotly renderer for figures displayed in this notebook.
pio.renderers.default = 'notebook+pdf'

# Plotly's default colour cycle, kept under a short module-level name.
PLOTLY_COLORS = plotly.colors.DEFAULT_PLOTLY_COLORS

In [55]:
# Root location that every log directory below is joined onto.
S3_BASE_DIR = 's3://dissertation-data-dmiller'

# Each SEQ_CONTAINER_EXP_* dict maps a model label to a lightning_logs version
# directory. The labels become the 'Model' column and the CSV file names
# written by tf_to_disk.
# NOTE(review): the 10 / 20 / 30 suffix presumably identifies the experiment
# group (it is passed as `exp` to tf_to_disk) -- confirm what it denotes.

# NOTE(review): 'LSTM SEQ 1' points at "sequence-length-10-window-1" and
# 'LSTM SEQ 40' at "...-window-20", while the other entries use "window-10" --
# confirm these paths are the intended runs.
SEQ_CONTAINER_EXP_10 = {
    'LSTM SEQ 1':  os.path.join(S3_BASE_DIR,'lstm-experiments/ordinal-sequence-length-10-window-1/lightning_logs/version_0'),
    'LSTM SEQ 10': os.path.join(S3_BASE_DIR, 'lstm-experiments/ordinal-sequence-length-10-window-10/lightning_logs/version_0'),
    'LSTM SEQ 20': os.path.join(S3_BASE_DIR, 'lstm-experiments/ordinal-sequence-length-20-window-10/lightning_logs/version_0'),
    'LSTM SEQ 30': os.path.join(S3_BASE_DIR, 'lstm-experiments/ordinal-sequence-length-30-window-10/lightning_logs/version_0'),
    'LSTM SEQ 40': os.path.join(S3_BASE_DIR, 'lstm-experiments/ordinal-sequence-length-40-window-20/lightning_logs/version_1'),
 }

# NOTE(review): 'LSTM SEQ 20' uses a "window-10" path identical to the one in
# SEQ_CONTAINER_EXP_10, and 'LSTM SEQ 40' uses exactly the same path as
# 'LSTM SEQ 30' (sequence-length-30) -- both look like copy-paste slips; verify
# against the bucket before trusting the resulting curves.
SEQ_CONTAINER_EXP_20 = {
    'LSTM SEQ 1':  os.path.join(S3_BASE_DIR,'lstm-experiments/ordinal-sequence-length-1-window-20/lightning_logs/version_0'),
    'LSTM SEQ 10': os.path.join(S3_BASE_DIR,'lstm-experiments/ordinal-sequence-length-10-window-20/lightning_logs/version_1'),
    'LSTM SEQ 20': os.path.join(S3_BASE_DIR,'lstm-experiments/ordinal-sequence-length-20-window-10/lightning_logs/version_0'),
    'LSTM SEQ 30': os.path.join(S3_BASE_DIR,'lstm-experiments/ordinal-sequence-length-30-window-20/lightning_logs/version_0'),
    'LSTM SEQ 40': os.path.join(S3_BASE_DIR,'lstm-experiments/ordinal-sequence-length-30-window-20/lightning_logs/version_0'),
}  

# NOTE(review): 'LSTM SEQ 1' reads from the "sequence-length-10" directory
# (version_3), the same directory as 'LSTM SEQ 10' (version_1) -- confirm that
# version_3 really is the sequence-length-1 run.
SEQ_CONTAINER_EXP_30 = {
    'LSTM SEQ 1': os.path.join(S3_BASE_DIR, 'lstm-experiments/ordinal-sequence-length-10/lightning_logs/version_3'),
    'LSTM SEQ 10': os.path.join(S3_BASE_DIR, 'lstm-experiments/ordinal-sequence-length-10/lightning_logs/version_1'),
    'LSTM SEQ 20': os.path.join(S3_BASE_DIR, 'lstm-experiments/ordinal-sequence-length-20/lightning_logs/version_0'),
    'LSTM SEQ 30': os.path.join(S3_BASE_DIR, 'lstm-experiments/ordinal-sequence-length-30/lightning_logs/version_0'),
    'LSTM SEQ 30 H': os.path.join(S3_BASE_DIR, 'lstm-experiments/heuristic-ordinal-sequence-length-30/lightning_logs/version_8'),
    'LSTM SEQ 40': os.path.join(S3_BASE_DIR, 'lstm-experiments/ordinal-sequence-length-40/lightning_logs/version_0')
}

# Column subsets keyed by metric-group name; metric_to_disk looks the group up
# here and keeps only these columns from every per-model frame.
METRIC_MATRIX = {
    'losses': ['train_loss_e', 'val_loss_e', 'Model', 'Experiment'],
}


In [129]:
def _get_metrics_from_tensorboard(event_acc, scalar):
    train_metrics, val_metrics = event_acc.Scalars(f'{scalar}/train'), event_acc.Scalars(f'{scalar}/valid')
    train_df, val_df = pd.DataFrame(train_metrics), pd.DataFrame(val_metrics)

    train_df.drop(columns=['wall_time'], inplace=True)
    val_df.drop(columns=['wall_time'], inplace=True)

    train_df.rename(columns={'value': f'train_{scalar}'}, inplace=True)
    val_df.rename(columns={'value': f'val_{scalar}'}, inplace=True)
    results = pd.merge(train_df, val_df, on='step', how='outer')
    results = results.dropna()
    results = results.drop(columns=['step'])
    return results

def top_metrics(results):
    """Build a summary table with one row per experiment.

    Args:
        results: mapping of experiment name -> metrics DataFrame.

    Returns:
        DataFrame whose rows are the dicts produced by ``_best_metrics`` for
        each entry, in the mapping's iteration order.
    """
    summary_rows = [
        _best_metrics(frame, experiment_name)
        for experiment_name, frame in results.items()
    ]
    return pd.DataFrame(summary_rows)

def tensorboard_results(log_dir, experiment_name):
    """Collect loss, accuracy, precision and recall curves for one run.

    Args:
        log_dir: location handed to ``EventAccumulator``.
        experiment_name: label stored in the ``Model`` column of the result.

    Returns:
        DataFrame with the train/val columns of ``loss_e``, ``acc``, ``prec``
        and ``rec`` side by side, plus a ``Model`` column.
    """
    accumulator = EventAccumulator(log_dir)
    accumulator.Reload()

    # Each metric gets a fresh 0..n-1 index so the frames line up by position
    # when concatenated column-wise.
    per_metric = [
        _get_metrics_from_tensorboard(accumulator, scalar_name).reset_index(drop=True)
        for scalar_name in ('loss_e', 'acc', 'prec', 'rec')
    ]

    combined = pd.concat(per_metric, axis=1)
    combined['Model'] = experiment_name
    return combined

def _best_metrics(df, name):

    # last row from columns
    return {
        'Experiment': name,
        'BCE Loss Train': df['train_loss_e'].iloc[-1].round(4),
        'BCE Loss': df['val_loss_e'].iloc[-1].round(4),
        'Accuracy': df['val_acc'].iloc[-1].round(4),
        'Precision': df['val_prec'].iloc[-1].round(4),
        'Recall': df['val_rec'].iloc[-1].round(4),
    }

def combine_subsets(df_matrix, col_subset, exp):
    """Stack the selected columns of every frame in ``df_matrix``.

    Args:
        df_matrix: mapping of model label -> DataFrame; only the values are used.
        col_subset: list of column names to keep from each frame.
        exp: accepted but not used by this function.

    Returns:
        Row-wise concatenation of ``frame[col_subset]`` for every frame, in
        the mapping's iteration order (original row indices are kept).
    """
    return pd.concat([frame[col_subset] for frame in df_matrix.values()])


def tf_to_disk(seq_container, exp):
    """Export the metrics of every run in ``seq_container`` to CSV files.

    Args:
        seq_container: mapping of model label -> log directory.
        exp: experiment identifier; selects the output directory
            ``result_csv/result_csv_{exp}``.

    Each run is written to ``<output dir>/<label>.csv`` where the label is
    lower-cased and every non-alphanumeric character is replaced by ``_``.
    """
    # Load everything first so no file is written if any run fails to load.
    tf_matrix = {
        label: tensorboard_results(log_dir, label)
        for label, log_dir in seq_container.items()
    }

    # Create the output directory once; exist_ok avoids the check-then-create
    # race and makes re-runs harmless.
    out_dir = os.path.join('result_csv', f'result_csv_{exp}')
    os.makedirs(out_dir, exist_ok=True)

    for label, df in tf_matrix.items():
        f_out = re.sub(r'[^a-zA-Z0-9]', '_', label).lower()
        df.to_csv(os.path.join(out_dir, f'{f_out}.csv'), index=False)

        
def generate_metrics(exp):
    """Load every per-model CSV of one experiment from disk.

    Args:
        exp: experiment identifier; files are read from
            ``result_csv/result_csv_{exp}``.

    Returns:
        dict mapping a model label to its DataFrame. The label is derived from
        the file name (non-alphanumerics -> spaces, upper-cased, ``CSV``
        removed, stripped), e.g. ``lstm_seq_10.csv`` -> ``'LSTM SEQ 10'``.
        Every frame gets an ``Experiment`` column set to ``exp``. Entries are
        in sorted file-name order.
    """
    path = os.path.join('result_csv', f'result_csv_{exp}')

    # Only regular *.csv files are data: skip sub-directories such as
    # .ipynb_checkpoints and stray files that would otherwise crash or pollute
    # read_csv. Sorting makes the model order independent of the filesystem.
    csv_names = sorted(
        f_name for f_name in os.listdir(path)
        if f_name.lower().endswith('.csv') and os.path.isfile(os.path.join(path, f_name))
    )

    metric_container = {}
    for f_name in csv_names:
        label = re.sub(r'[^a-zA-Z0-9]', ' ', f_name).upper().replace('CSV', '').strip()
        frame = pd.read_csv(os.path.join(path, f_name))
        frame['Experiment'] = exp
        metric_container[label] = frame
    return metric_container


def metric_to_disk(file_sub_list, metric):
    """Merge one metric group across experiments into a single CSV.

    Args:
        file_sub_list: iterable of experiment identifiers understood by
            ``generate_metrics``.
        metric: key into ``METRIC_MATRIX`` selecting the columns to keep.

    Writes ``result_csv/result_csv_{metric}.csv`` (no index column).
    """
    per_experiment = [
        combine_subsets(generate_metrics(exp_id), METRIC_MATRIX[metric], exp_id)
        for exp_id in file_sub_list
    ]

    destination = os.path.join('result_csv', f'result_csv_{metric}.csv')
    pd.concat(per_experiment).to_csv(destination, index=False)
    

def plot_graph(df, subset, exp, title):
    """Plot per-model loss curves for one experiment and save them as a PNG.

    Args:
        df: DataFrame with at least ``Experiment``, ``Model``, ``step`` and the
            loss columns.
        subset: list of columns to keep before plotting.
        exp: experiment identifier; rows are filtered on ``Experiment == exp``
            and the image goes to ``lstm_plots/lstm_plots_{exp}``.
        title: selects the split -- if it contains ``'Training'`` the
            ``train_loss_e`` column is plotted and the file is
            ``loss_train.png``; otherwise ``val_loss_e`` / ``loss_val.png``.
            The text itself is not drawn on the figure.
    """
    is_training = 'Training' in title
    loss_column = 'train_loss_e' if is_training else 'val_loss_e'
    split_name = 'train' if is_training else 'val'

    plot_df = df[df['Experiment'] == exp][subset]
    fig = px.line(plot_df, x='step', y=loss_column, color='Model', line_group='Model')

    fig.update_layout(
        xaxis_title='Epoch',
        yaxis_title='BCE Loss',
        width=800,
        height=400
    )

    # write_image does not create missing directories, so make sure the
    # target folder exists (a fresh checkout has no lstm_plots/ tree).
    out_dir = os.path.join('lstm_plots', f'lstm_plots_{exp}')
    os.makedirs(out_dir, exist_ok=True)
    fig.write_image(os.path.join(out_dir, f'loss_{split_name}.png'))
 



In [19]:
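# One-off export steps: they write the CSV files under result_csv/ that the
# following cell reads. Uncomment and run to regenerate them.
# NOTE(review): presumably left disabled so a re-run does not re-read the
# event logs from S3 -- confirm.
# tf_to_disk(SEQ_CONTAINER_EXP_10, 10)
# tf_to_disk(SEQ_CONTAINER_EXP_20, 20)
# tf_to_disk(SEQ_CONTAINER_EXP_30, 30)
# metric_to_disk([10, 20, 30], 'losses')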

In [71]:
# Load the combined loss table, number the rows of every (Model, Experiment)
# group 0, 1, 2, ... as `step`, and keep only steps below 80.
losses = (
    pd.read_csv('result_csv/result_csv_losses.csv')
    .assign(step=lambda frame: frame.groupby(['Model', 'Experiment']).cumcount())
    .loc[lambda frame: frame['step'] < 80]
)

In [130]:
# Columns handed to plot_graph for every figure.
loss_plot_columns = ['step', 'train_loss_e', 'val_loss_e', 'Model', 'Experiment']

# One training and one validation figure per experiment, in that order.
for graph_sub in [10, 20, 30]:
    for plot_title in ('Training BCE Loss', 'Validation BCE Loss'):
        plot_graph(losses, loss_plot_columns, graph_sub, plot_title)

