# Examine History Files
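> Reads in all csv files in the target directory.  
> For each file:  
> * finds the epoch with the lowest validation prediction loss (`val_out_pred_loss`, or `val_loss` when that column is absent)
> * plots subplots of the entire training run (loss and accuracy), with the best epoch marked in RED  
> * Afterwards, shows a table ordered by prediction loss, listing losses, accuracies, best epoch, and total epochs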


In [None]:
import os, glob
import numpy as np
import pandas as pd
import matplotlib.image as mpimg
from matplotlib import pyplot as plt

# Don't truncate the column displays (model names can be long).
# pandas >= 1.0 spells "unlimited width" as None; the old sentinel -1 is
# deprecated and rejected by newer releases. Pre-1.0 pandas only accepts an
# int here, so fall back to -1 when None is refused.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [None]:
def processHistoryFile( history, figure=None, plotCol=0, showImage=False,
                        lossYLim=(0.0, 0.035), accYLim=(0.96, 1.0)):
    """
    Read a single train history csv file, plot it, return useful data
    
    history - filename of csv history file
    figure - figure to plot into (a new 14x6 figure is created when None)
    plotCol - 0 or 1 for which column to plot in (taken modulo 2)
    showImage - Display the png of the graph after plotting the history
    lossYLim - (bottom, top) y-axis limits for the loss subplot
    accYLim - (bottom, top) y-axis limits for the accuracy subplot
    
    Returns: best epoch (pd.Series: ['epoch', 'acc', 'pred_acc', 'loss', 'pred_loss', 'num_epochs', 'model'])
    
    The "best" epoch is the row with the lowest validation loss. The columns
    'out_pred_*' / 'val_out_pred_*' are used when 'val_out_pred_loss' is
    present in the file, otherwise the plain 'acc' / 'loss' / 'val_*' columns.
    """
    plotCol %= 2
    
    # read in the file
    pdHist = pd.read_csv(history)
    
    # get something printable: the file name without its directory and
    # without the last '_'-separated token, joined with spaces
    modelName = ' '.join(os.path.basename(history).split('_')[0:-1])
    
    if figure is None:
        figure = plt.figure(figsize=(14,6))
    
    # pick the column names this history file uses
    if 'val_out_pred_loss' in pdHist:
        acc = 'out_pred_acc'
        pred_acc = 'val_out_pred_acc'
        loss = 'out_pred_loss'
        pred_loss = 'val_out_pred_loss'
    else:
        acc = 'acc'
        pred_acc = 'val_acc'
        loss = 'loss'
        pred_loss = 'val_loss'
    
    # idxmin() returns an index *label*, so look the row up with .loc
    bestRow = pdHist.loc[pdHist[pred_loss].idxmin()]
    
    # save the best, under file-independent names
    pdBest = pd.Series({ 'epoch' : bestRow['epoch'],
                         'acc' : bestRow[acc],
                         'pred_acc' : bestRow[pred_acc],
                         'loss' : bestRow[loss],
                         'pred_loss' : bestRow[pred_loss],
                         'num_epochs' : len(pdHist),
                         'model' : modelName})
    
    # plot into the figure we were given (not whichever figure is current):
    # losses on the top row, accuracies on the bottom row, column = plotCol
    ax = figure.add_subplot(2, 2, 1 + plotCol)
    pdHist[[loss, pred_loss]].plot(ax=ax, sharex=True, title=modelName)
    ax.plot( pdBest['epoch'], pdBest['pred_loss'], 'or', markersize=6)
    ax.set_ylim(*lossYLim)
    ax = figure.add_subplot(2, 2, 3 + plotCol)
    pdHist[[acc, pred_acc]].plot(ax=ax, sharex=True )
    ax.plot( pdBest['epoch'], pdBest['pred_acc'], 'or', markersize=6)
    ax.set_ylim(*accYLim)
    ax.set_xlabel("epoch (best epoch in red)")
    
    # the right-hand column completes a figure, so display it
    if plotCol == 1:
        plt.show()
    
    if showImage:
        # load the image first so a missing file does not leave an empty figure behind
        try:
            img = mpimg.imread(history.replace("_historylog.csv", ".png"))
        except OSError:
            # missing or unreadable png: showing it is best-effort only
            print( "No image found for this history")
        else:
            plt.figure(figsize=(10, 14))
            imgAx = plt.subplot(1,1,1)
            imgAx.imshow(img)
            imgAx.axis('off')
            plt.show()
    
    return pdBest
    

In [None]:
# testing
#history = "result/glove_learn_caps_historylog.csv"
#processed = processHistoryFile( history, showImage=True)


In [None]:
def processHistoryDir( directory):
    """
    Process training output history.csv files
    
    Every *.csv file directly inside the directory is passed to
    processHistoryFile, two files per figure (left column, then right column).
    
    directory - the directory to walk through relative to current directory or absolute path
    
    Returns: a pandas dataframe containing best epoch of each history file,
             sorted by ascending 'pred_loss', with columns
             ['pred_loss', 'pred_acc', 'loss', 'acc', 'epoch', 'num_epochs', 'model'].
             An empty dataframe with those columns is returned when the
             directory contains no csv files.
    NOTE - this function does not walk recursively
    """
    columnOrder = ['pred_loss', 'pred_acc', 'loss', 'acc', 'epoch', 'num_epochs', 'model']
    bestRows = []
    fig = None

    # process all csv files, in name order
    for fileNum, history in enumerate(sorted(glob.glob(os.path.join(directory, "*.csv")))):

        # a fresh figure for every pair of files
        plotCol = fileNum % 2
        if plotCol == 0:
            fig = plt.figure(figsize=(14, 6))

        bestRows.append( processHistoryFile( history, fig, plotCol, False))

    # nothing found: return an empty, correctly-shaped table instead of failing on the sort
    if not bestRows:
        return pd.DataFrame(columns=columnOrder)

    # processHistoryFile only shows a figure once its right-hand column is
    # filled, so an odd file count leaves the last figure pending
    if len(bestRows) % 2 == 1:
        plt.show()

    # one row per history file (DataFrame.append no longer exists in pandas 2.x)
    pdBest = pd.DataFrame(bestRows)
    pdBest = pdBest.sort_values('pred_loss').reset_index(drop=True)

    # rearrange column order
    return pdBest[columnOrder]

# Run all of the cells above, then run the cell below to process a directory of history files

In [None]:
# Plot and summarise every history csv in the "result" directory (path relative to the current directory)
pdBest = processHistoryDir( "result")

In [None]:
# Show the summary table; processHistoryDir returns it sorted by ascending pred_loss
pdBest

In [None]:
# Same table ordered by pred_acc, highest first
# (sort_values returns a sorted copy; pdBest itself is not modified)
pdBest.sort_values('pred_acc', ascending=False)