In [41]:
import json
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from trials.Utilities import get_spec

In [42]:
from typing import List
from functools import reduce
import plotly.graph_objs as graph_objs

import matplotlib.cm as cm
from scipy.spatial import Delaunay

def load_epoch_data(trial: str, epoch: str) -> np.ndarray:
    """Return the median accuracy of every file stored for one epoch.

    Each file in ``graphs/<trial>/<epoch>/Heredity`` is read as a JSON list of
    hashes. Every hash is resolved with ``get_spec(hsh, stop_halfway=True)``
    and the median of the resulting ``get_data().total_accuracy`` values
    becomes one entry of the returned array.

    Args:
        trial: Name of the trial directory below ``graphs``.
        epoch: Name of the epoch directory below the trial directory.

    Returns:
        A 1-D array holding one median score per file, ordered by natural
        (number-aware) file name.

    Raises:
        FileNotFoundError: If the Heredity directory does not exist.
    """
    heredity_path = os.path.join('graphs', trial, epoch, 'Heredity')

    def natural_key(name: str) -> list:
        # re.split with a capturing group alternates text / digit-run tokens,
        # so odd positions are always the numeric parts ("10" sorts after "2").
        tokens = re.split(r'([0-9]+)', name)
        return [int(tok) if idx % 2 else tok for idx, tok in enumerate(tokens)]

    # os.listdir() yields entries in arbitrary order, but callers index the
    # result positionally (e.g. [-1]), so impose a deterministic order.
    # NOTE(review): assumes file names encode the generation order - confirm.
    ordered_names = sorted(os.listdir(heredity_path), key=natural_key)

    data: List[float] = []
    for file_name in ordered_names:
        file_path = os.path.join(heredity_path, file_name)

        # Context manager closes the handle promptly instead of leaking it.
        with open(file_path, 'r', encoding='utf8') as handle:
            hashes = json.load(handle)

        specs = [get_spec(hsh, stop_halfway=True) for hsh in hashes]
        scores = [ind.get_data().total_accuracy for ind in specs]

        data.append(np.median(scores))

    return np.array(data)

def value_to_color(z_val: float, color_map, v_min, v_max):
    """Map a scalar onto ``color_map`` and format it as an ``rgb(r,g,b)`` string.

    Args:
        z_val: Value to colour.
        color_map: Callable that takes the normalised value and returns an
            ``(R, G, B, alpha)`` 4-tuple; each colour channel is scaled by 255
            here, so channels are expected in ``[0, 1]``.
        v_min: Value that maps to the start of the colormap.
        v_max: Value that maps to the end of the colormap.

    Returns:
        A string such as ``"rgb(128,64,0)"``; the alpha channel is discarded.

    Raises:
        ValueError: If ``v_min`` is greater than ``v_max``.
    """
    if v_min > v_max:
        raise ValueError("VMax must be >= VMin")

    span = float(v_max - v_min)
    # A zero-width range (e.g. a perfectly flat surface) used to divide by
    # zero. Map it to 0 instead, the same convention matplotlib's Normalize
    # uses when vmin == vmax.
    t = (z_val - v_min) / span if span else 0.0

    R, G, B, _alpha = color_map(t)

    # "+ 0.5" then int() rounds each channel to the nearest integer.
    return f"rgb({int(R * 255 + 0.5):d},{int(G * 255 + 0.5):d},{int(B * 255 + 0.5):d})"


def tri_indices(simplices: np.ndarray):
    """Lazily yield the three per-corner index lists of a triangulation.

    ``simplices`` holds one vertex-index triplet per triangle. The generator
    produces, in order, the list of first indices, the list of second indices
    and the list of third indices, so it can be unpacked as ``i, j, k``.
    """
    for corner in range(3):
        column = []
        for triangle in simplices:
            column.append(triangle[corner])
        yield column


def plotly_trisurf(
    x: List[float],
    y: List[float],
    z: List[float],
    simplices: np.ndarray,
    color_map=cm.RdBu,
    plot_edges=None,
):
    """Build the plotly traces for a triangulated surface.

    Args:
        x: x coordinates of the surface vertices.
        y: y coordinates of the surface vertices.
        z: z coordinates of the surface vertices.
        simplices: Vertex-index triplets defining the triangulation, one row
            per triangle (shape ``(no_triangles, 3)``).
        color_map: Colormap forwarded to ``value_to_color`` to colour each
            face by the mean z of its three vertices.
        plot_edges: ``None`` returns the mesh only. Any other value, including
            ``False``, also returns a line trace with the triangle edges.

    Returns:
        ``[mesh]`` when ``plot_edges`` is ``None``, else ``[mesh, lines]``,
        where ``mesh`` is a ``graph_objs.Mesh3d`` and ``lines`` a
        ``graph_objs.Scatter3d``.
    """
    points3D = np.vstack((x, y, z)).T

    # Vertices of the surface triangles. This must be a real list: a lazy
    # map() iterator would be exhausted by the z_mean pass below and leave
    # nothing for the edge pass, which then failed on an empty reduce().
    tri_vertices = [points3D[index] for index in simplices]

    # Mean z coordinate of each triangle's vertices, used to pick its colour.
    z_mean = [np.mean(tri[:, 2]) for tri in tri_vertices]
    min_z_mean = np.min(z_mean)
    max_z_mean = np.max(z_mean)
    face_color = [value_to_color(zz, color_map, min_z_mean, max_z_mean) for zz in z_mean]

    I, J, K = tri_indices(simplices)
    triangles = graph_objs.Mesh3d(
        x=x,
        y=y,
        z=z,
        facecolor=face_color,
        i=I,
        j=J,
        k=K,
    )

    if plot_edges is None:  # the triangle sides are not plotted
        return [triangles]

    # Build Xe, Ye, Ze: per triangle the closed path v0 -> v1 -> v2 -> v0,
    # followed by None, which separates consecutive triangles in the trace.
    edge_coords = ([], [], [])
    for tri in tri_vertices:
        for axis, coords in enumerate(edge_coords):
            coords.extend(tri[k % 3][axis] for k in range(4))
            coords.append(None)
    Xe, Ye, Ze = edge_coords

    # define the lines to be plotted
    lines = graph_objs.Scatter3d(
        x=Xe, y=Ye, z=Ze, mode="lines", line=dict(color="rgb(50,50,50)", width=1.5)
    )
    return [triangles, lines]



In [46]:
# Population size that the selection parameters are checked against.
N_POP = 100
# Forwarded to get_spec() when resolving the stored hashes.
STOP_HALFWAY = True


def _generation_order(name: str) -> list:
    """Sort key comparing digit runs numerically, so "10" sorts after "2"."""
    # With a capturing group re.split alternates text / digit-run tokens;
    # odd positions are therefore always the numeric parts.
    tokens = re.split(r'([0-9]+)', name)
    return [int(tok) if idx % 2 else tok for idx, tok in enumerate(tokens)]


# Number of parameter combinations with no usable results on disk.
missing = 0

# Outer key: column label "R=<tournament size>";
# inner key: row label "Top <best size>".
data_median = dict()
data_average = dict()
data_std = dict()
for tournament_size in range(1, 100):
    row_median = dict()
    row_average = dict()
    row_std = dict()
    col_key = f'R={tournament_size}'

    for best_size in range(1, 100):
        # Only combinations with best_size < tournament_size are considered.
        if best_size >= tournament_size:
            continue

        n_random = int(tournament_size)
        n_select = int(best_size)
        n_remain = N_POP - n_select

        row_key = f'Top {best_size}'

        # Recorded as NaN when fewer than n_random individuals remain after
        # taking n_select out of N_POP.
        if n_remain < n_random:
            row_median[row_key] = float('NaN')
            row_average[row_key] = float('NaN')
            row_std[row_key] = float('NaN')
            continue

        trial_name = f"Tournament N-{int(tournament_size)}% {int(best_size)} Best Half"
        heredity_path = os.path.join('graphs', trial_name, 'Epoch 1', 'Heredity')
        if not os.path.exists(heredity_path):
            missing += 1
            continue

        # os.listdir() returns entries in arbitrary order; sort so that [-1]
        # is deterministic.
        # NOTE(review): assumes file names encode the generation order - confirm.
        generations = sorted(os.listdir(heredity_path), key=_generation_order)
        if not generations:
            # The directory exists but holds no generation file yet.
            missing += 1
            continue

        file_name = generations[-1]
        file_path = os.path.join(heredity_path, file_name)

        # Context manager closes the handle instead of leaking one per trial.
        with open(file_path, 'r', encoding='utf8') as handle:
            hashes = json.load(handle)
        specs_half = [get_spec(hsh, stop_halfway=STOP_HALFWAY) for hsh in hashes]
        scores_half = [ind.get_data().total_accuracy for ind in specs_half]

        row_median[row_key] = np.median(scores_half)
        row_average[row_key] = np.average(scores_half)
        row_std[row_key] = np.std(scores_half)

    data_median[col_key] = row_median
    data_average[col_key] = row_average
    data_std[col_key] = row_std

print(f'Done loading data, {missing} trials still missing')

Done loading data, 905 trials still missing


In [50]:
# Preview the median table: one column per "R=<tournament size>" key and one
# row per "Top <best size>" key; combinations that were never stored appear
# as NaN.
pd.DataFrame(data_median)

Unnamed: 0,R=1,R=2,R=3,R=4,R=5,R=6,R=7,R=8,R=9,R=10,...,R=90,R=91,R=92,R=93,R=94,R=95,R=96,R=97,R=98,R=99
Top 1,,0.827349,0.837966,0.835086,0.839343,0.837590,0.834435,0.833909,0.834135,0.835837,...,0.840520,,,,0.835587,,,,0.838016,
Top 2,,,0.832757,0.828425,0.829077,0.830579,0.833759,0.838091,0.832282,0.837365,...,0.837190,0.839493,0.839794,0.835061,0.838116,0.838467,0.839769,,,
Top 3,,,,0.834711,0.833784,0.830379,0.829978,0.836213,0.829377,0.833734,...,0.838992,,,,,,0.835762,,,
Top 4,,,,,0.829377,0.835236,0.827699,0.830629,0.833909,0.834285,...,0.832031,,,0.834986,,,,,,
Top 5,,,,,,0.836363,0.834560,0.832532,0.836438,0.835512,...,0.833734,,,,,0.831330,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Top 94,,,,,,,,,,,...,,,,,,,,,,
Top 95,,,,,,,,,,,...,,,,,,,,,,
Top 96,,,,,,,,,,,...,,,,,,,,,,
Top 97,,,,,,,,,,,...,,,,,,,,,,


In [47]:
# Reference score for the comparison below: the last entry that
# load_epoch_data returns for trial 'Baseline Random Half', 'Epoch 1'.
# NOTE(review): "last" depends on the order in which load_epoch_data reads
# the files - confirm it corresponds to the final generation.
baseline_score = load_epoch_data('Baseline Random Half', 'Epoch 1')[-1]
baseline_score

0.8310546875

In [52]:
# Build the median table and drop rows / columns that contain no data at all.
df_median = pd.DataFrame(data_median)
df_median = df_median.dropna(how='all', axis=0).dropna(how='all', axis=1)
# All non-NaN medians as one flat list, taken before the baseline shift below.
# NOTE(review): `flat` is not used again in this cell - confirm whether a
# later cell relies on it.
flat = [value for sublist in df_median.values for value in sublist]
flat = list(filter(lambda value: not np.isnan(value), flat))
# Express each median as its difference to baseline_score, scaled by 100.
df_median = df_median - baseline_score
df_median = df_median * 100

# Scatter points for the surface, one entry per (r, t) cell that has a value.
ys=[] # t of each point, i.e. the 'Top {t}' row label
xs=[] # r of each point, i.e. the 'R={r}' column label
zs=[] # table value at that cell

for r in range(1, 100):
    for t in range(1, 100):
        try:
            value = df_median.loc[f'Top {t}', f'R={r}']
        except KeyError:
            # Label absent from the table (dropped above or never created).
            continue

        if np.isnan(value):
            continue
        # NOTE(review): cells exactly equal to the baseline are skipped; the
        # reason is not evident from this cell - confirm this is intended.
        if value == 0:
            continue
        ys.append(t)
        xs.append(r)
        zs.append(value)

# Triangulate the (x, y) points only; zs is passed separately to
# plotly_trisurf. `triangles` is then rebound from the Delaunay result to the
# list returned by plotly_trisurf.
triangles = Delaunay(np.vstack([xs, ys]).T)
triangles = plotly_trisurf(xs, ys, zs, triangles.simplices)
# Axis styling shared by all three scene axes.
base_axis = dict(
    showbackground=True,
    backgroundcolor="rgb(230, 230,230)",
    gridcolor="rgb(255, 255, 255)",
    zerolinecolor="rgb(255, 255, 255)",
)
layout = graph_objs.Layout(
    title='Tournament Selection: Value over Baseline',
    width=800,
    height=800,
    scene=dict(
        xaxis=dict(**base_axis, title='Tournament Size'),
        yaxis=dict(**base_axis, title='Top N'),
        zaxis=dict(**base_axis, title='Value over Baseline (%)'),
        aspectratio=dict(x=1, y=1, z=0.5),
    ),
)
fig1 = graph_objs.Figure(data=triangles, layout=layout)
fig1.show()