In [1]:
from datetime import timedelta
from typing import Tuple
import pandas as pd
import numpy as np
import os, json
from trials.Utilities import get_spec
from typing import List
from functools import reduce
import plotly.graph_objs as graph_objs
import matplotlib.cm as cm
from scipy.spatial import Delaunay
import scipy.linalg as linalg
import time




Loading dataset from file... This may take a few minutes...
Instructions for updating:
Use eager execution and: 
`tf.data.TFRecordDataset(path)`
Loaded dataset in 36 seconds


In [2]:
def load_epoch_data(trial: str, epoch: str) -> np.array:
    """Return the median accuracy of every heredity file stored for one epoch.

    Each file found in ``D://Graph Data/<trial>/<epoch>/Heredity`` is parsed as
    JSON into a collection of hashes. Every hash is resolved through
    ``get_spec(..., stop_halfway=True)`` and the median of the resulting
    ``total_accuracy`` values becomes one entry of the returned array.

    Args:
        trial: Name of the trial directory.
        epoch: Name of the epoch directory inside the trial.

    Returns:
        A 1-D array with one median per file, in the order in which
        ``os.listdir`` reports the files.

    Raises:
        FileNotFoundError: If the heredity directory does not exist
            (propagated from ``os.listdir``).
    """
    heredity_path = os.path.join('D://', 'Graph Data', trial, epoch, 'Heredity')

    # NOTE(review): os.listdir gives no ordering guarantee; callers that index
    # the result positionally (e.g. ``[-1]``) depend on the platform's order.
    data: List[float] = []
    for file_name in os.listdir(heredity_path):
        file_path = os.path.join(heredity_path, file_name)

        # Use a context manager so the handle is closed as soon as the JSON is
        # parsed instead of lingering until garbage collection.
        with open(file_path, 'r', encoding='utf8') as hash_file:
            hashes = json.load(hash_file)

        specs = [get_spec(hsh, stop_halfway=True) for hsh in hashes]
        scores = [ind.get_data().total_accuracy for ind in specs]

        data.append(np.median(scores))

    return np.array(data)


def value_to_color(z_val: np.ndarray, color_map, v_min, v_max):
    """Map a value to an ``"rgb(r,g,b)"`` string using a colormap callable.

    ``z_val`` is normalised linearly so that ``v_min`` maps to 0 and ``v_max``
    maps to 1, the normalised value is passed to ``color_map`` and the first
    three returned channels are scaled to 0-255 with round-half-up.

    Args:
        z_val: Value to colour.
        color_map: Callable that takes the normalised value and returns four
            channels ``(R, G, B, alpha)``, each expected in ``[0, 1]``.
        v_min: Value mapped to the low end of the colormap.
        v_max: Value mapped to the high end of the colormap.

    Returns:
        A string of the form ``"rgb(R,G,B)"`` with integer channels.

    Raises:
        ValueError: If ``v_min`` is greater than ``v_max``.
    """
    if v_min > v_max:
        raise ValueError("VMax must be >= VMin")

    span = float(v_max - v_min)
    if span == 0:
        # Degenerate range (all values identical): normalising would divide by
        # zero, so use the middle of the colormap instead.
        t = 0.5
    else:
        t = (z_val - v_min) / span  # normalize val

    # The alpha channel is not representable in an "rgb(...)" string.
    R, G, B, _alpha = color_map(t)

    return f"rgb({int(R * 255 + 0.5):d},{int(G * 255 + 0.5):d},{int(B * 255 + 0.5):d})"


def tri_indices(simplices: np.ndarray):
    """Split triangle index triplets into three per-corner index lists.

    Args:
        simplices: Iterable of triplets; each triplet holds the three vertex
            indices of one triangle.

    Returns:
        A lazy generator producing three lists: the first, second and third
        vertex index of every triangle, in that order.
    """
    def corner_column(corner):
        # Gather the index stored at position `corner` of every triangle.
        return list(map(lambda triangle: triangle[corner], simplices))

    return (corner_column(corner) for corner in (0, 1, 2))


def plotly_trisurf(
    x: List[float],
    y: List[float],
    z: List[float],
    simplices: np.ndarray,
    color_map=cm.RdBu,
    plot_edges=None,
):
    """Build the plotly traces for a triangulated surface.

    Each triangle is coloured by the mean z value of its three vertices,
    normalised between the smallest and largest triangle mean.

    Args:
        x, y, z: Coordinates of the vertices, one entry per vertex.
        simplices: Array of shape (no_triangles, 3); every row holds the three
            vertex indices of one triangle.
        color_map: Colormap callable forwarded to ``value_to_color``.
        plot_edges: When ``None`` only the mesh is returned; any other value
            also produces a line trace with the triangle outlines.

    Returns:
        ``[mesh]`` when ``plot_edges`` is ``None``, otherwise ``[mesh, lines]``.
    """
    points3D = np.vstack((x, y, z)).T

    # Vertices of the surface triangles. This must be a list rather than a lazy
    # `map`: it is traversed once for the face colours and a second time for
    # the edges, and an iterator would already be exhausted on the second pass.
    tri_vertices = [points3D[index] for index in simplices]

    # Mean z value of the three vertices of every triangle.
    z_mean = [np.mean(tri[:, 2]) for tri in tri_vertices]
    min_z_mean = np.min(z_mean)
    max_z_mean = np.max(z_mean)
    face_color = [value_to_color(zz, color_map, min_z_mean, max_z_mean) for zz in z_mean]

    I, J, K = tri_indices(simplices)
    triangles = graph_objs.Mesh3d(
        x=x,
        y=y,
        z=z,
        facecolor=face_color,
        i=I,
        j=J,
        k=K,
    )

    if plot_edges is None:  # the triangle sides are not plotted
        return [triangles]

    # Xe, Ye, Ze hold the coordinates of the edge end points of every triangle.
    # Each outline visits vertices 0, 1, 2 and 0 again to close the loop; the
    # trailing None separates consecutive triangles.
    Xe, Ye, Ze = [], [], []
    for tri in tri_vertices:
        for k in range(4):
            vertex_x, vertex_y, vertex_z = tri[k % 3]
            Xe.append(vertex_x)
            Ye.append(vertex_y)
            Ze.append(vertex_z)
        Xe.append(None)
        Ye.append(None)
        Ze.append(None)

    # define the lines to be plotted
    lines = graph_objs.Scatter3d(
        x=Xe, y=Ye, z=Ze, mode="lines", line=dict(color="rgb(50,50,50)", width=1.5)
    )
    return [triangles, lines]


def fit_surface(x: np.ndarray, y: np.ndarray, z: np.ndarray, resolution: int = 5) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Fit the plane ``z = a*x + b*y + c`` by least squares and sample it on a grid.

    Args:
        x, y, z: Coordinates of the sample points, one entry per point.
        resolution: Number of grid nodes along each axis; must be at least 2.

    Returns:
        ``(X, Y, Z)`` arrays of shape ``(resolution, resolution)``. ``X`` and
        ``Y`` form a mesh spanning the min/max of the x and y samples and ``Z``
        is the fitted plane evaluated on that mesh.
    """
    assert resolution >= 2

    samples = np.column_stack((x, y, z))
    lower = samples.min(axis=0)
    upper = samples.max(axis=0)

    # Regular grid covering the bounding box of the samples in the x/y plane.
    grid_x = np.linspace(lower[0], upper[0], resolution)
    grid_y = np.linspace(lower[1], upper[1], resolution)
    X, Y = np.meshgrid(grid_x, grid_y)

    # Design matrix [x, y, 1]; only the solution vector of lstsq is needed.
    design = np.column_stack((samples[:, :2], np.ones(samples.shape[0])))
    plane = linalg.lstsq(design, samples[:, 2])[0]

    Z = plane[0] * X + plane[1] * Y + plane[2]
    return X, Y, Z



In [3]:
# Bookkeeping for a progress estimate: number of trials that were missing at
# the previous check, and the wall-clock time at which that check happened.
last_missing = 0
last_check = time.time()

In [4]:
# Collect the median / average / standard deviation of the accuracy scores for
# every (tournament size, best size) combination that has data on disk.
# Each result dict is keyed by column 'R=<tournament size>' and then by row
# 'Top <best size>'. Combinations without data are counted in `curr_missing`.
curr_missing = 0

N_POP = 100
N_DROP = 5
STOP_HALFWAY = True

data_median = dict()
data_average = dict()
data_std = dict()

for tournament_size in range(5, N_POP - N_DROP):
    row_median = dict()
    row_average = dict()
    row_std = dict()
    col_key = f'R={tournament_size}'

    # Only combinations with best_size < tournament_size are evaluated.
    for best_size in range(5, tournament_size):
        row_key = f'Top {best_size}'

        trial_name = f"Select {best_size} Tournament n={tournament_size}% + Drop 5 Worst"
        heredity_path = os.path.join('D://', 'Graph Data', trial_name, 'Epoch 1', 'Heredity')
        if not os.path.exists(heredity_path):
            curr_missing += 1
            continue

        generations = os.listdir(heredity_path)
        if not generations:
            # The directory exists but holds no generation file yet; treat the
            # trial as missing instead of failing on generations[-1].
            curr_missing += 1
            continue

        # NOTE(review): os.listdir gives no ordering guarantee, so the last
        # entry is only the latest generation if the platform happens to list
        # the files in that order - confirm against the file naming scheme.
        file_name = generations[-1]
        file_path = os.path.join(heredity_path, file_name)

        # Close the handle as soon as the JSON has been parsed.
        with open(file_path, 'r', encoding='utf8') as hash_file:
            hashes = json.load(hash_file)

        specs_half = [get_spec(hsh, stop_halfway=STOP_HALFWAY) for hsh in hashes]
        scores_half = [ind.get_data().total_accuracy for ind in specs_half]

        row_median[row_key] = np.median(scores_half)
        row_average[row_key] = np.average(scores_half)
        row_std[row_key] = np.std(scores_half)

    data_median[col_key] = row_median
    data_average[col_key] = row_average
    data_std[col_key] = row_std


In [5]:
# Disabled progress estimate: compares the current number of missing trials
# with the count stored at the previous check to extrapolate the time left.
# NOTE(review): `secs_per_model` is computed as new_trials / time_since_check,
# i.e. trials per second, and the message prints that same ratio labelled as
# "models per minute" - fix the units before re-enabling this cell.
# time_since_check = time.time() - last_check
# new_trials = last_missing - curr_missing
# secs_per_model = new_trials/time_since_check
# secs_till_done = curr_missing/secs_per_model
# duration = timedelta(seconds=secs_till_done)
# print(f'Done loading data, {curr_missing} trials still missing')
# print(f'{new_trials} models since last check, {new_trials/time_since_check:0.2f} models per minute, {duration} till completion')
# last_missing = curr_missing
# last_check = time.time()

In [16]:
def create_comparison_graph(
        comparison_from: pd.DataFrame,
        comparison_to: str,
        color_scale
):
    """Plot a score table as a 3-D surface relative to a baseline trial.

    The baseline is the last value returned by
    ``load_epoch_data(comparison_to, 'Epoch 1')``. It is subtracted from every
    cell of ``comparison_from`` and the difference is multiplied by 100.
    Cells that are absent, NaN or exactly zero are left out of the plot.

    Args:
        comparison_from: Table with columns named ``'R=<r>'`` and rows named
            ``'Top <t>'``; only r and t from 1 to 99 are looked up.
        comparison_to: Name of the baseline trial.
        color_scale: Colorscale used for the semi-transparent fitted plane.

    Returns:
        A plotly figure holding the triangulated surface and the fitted plane.
    """
    baseline_score = load_epoch_data(comparison_to, 'Epoch 1')[-1]
    deltas = (comparison_from - baseline_score) * 100

    tournament_sizes = []  # x axis
    top_ns = []            # y axis
    differences = []       # z axis

    for r in range(1, 100):
        column = f'R={r}'
        for t in range(1, 100):
            try:
                value = deltas.loc[f'Top {t}', column]
            except KeyError:
                # This combination is not part of the table.
                continue

            if np.isnan(value) or value == 0:
                continue

            tournament_sizes.append(r)
            top_ns.append(t)
            differences.append(value)

    # Triangulate in the (tournament size, top N) plane, then lift to 3-D.
    triangulation = Delaunay(np.column_stack((tournament_sizes, top_ns)))
    traces = plotly_trisurf(
        tournament_sizes,
        top_ns,
        differences,
        triangulation.simplices,
        color_map=cm.RdBu,
    )

    base_axis = {
        'showbackground': True,
        'backgroundcolor': "rgb(230, 230,230)",
        'gridcolor': "rgb(255, 255, 255)",
        'zerolinecolor': "rgb(255, 255, 255)",
    }
    scene = {
        'xaxis': {**base_axis, 'title': 'Tournament Size'},
        'yaxis': {**base_axis, 'title': 'Top N'},
        'zaxis': {**base_axis, 'title': 'Accuracy vs. Greedy (%)'},
        'aspectratio': {'x': 1, 'y': 1, 'z': 0.5},
    }
    layout = graph_objs.Layout(
        width=900,
        height=500,
        margin={'l': 0, 'r': 0, 'b': 0, 't': 0},
        scene=scene,
    )
    figure = graph_objs.Figure(data=traces, layout=layout)

    # Overlay the least-squares plane through the plotted points.
    plane_x, plane_y, plane_z = fit_surface(tournament_sizes, top_ns, differences)
    figure.add_surface(x=plane_x, y=plane_y, z=plane_z, colorscale=color_scale, opacity=0.25)

    return figure


In [17]:
# Compare the median scores of every configuration with the greedy baseline
# trial and display the resulting surface.
figure = create_comparison_graph(
    comparison_from=pd.DataFrame(data_median),
    comparison_to="Baseline - Select 5 Random + Drop 5 Greedy",
    color_scale='RdBu',
)
figure.show()