# Read analysed

In [None]:
import utils
import altair as alt
import pandas as pd
import re
import math
from analysePhenotype import readAnalysed

## First, let's take a look at the raw data

In [None]:
# Result directories to analyse, one per (assembly mode, dimensionality) pair.
# The helper functions below take the last two path components as
# `assemblyMode` and `nDim`, so every entry must end in <mode>/<dim>.
paths = [
    # Directories of the earlier 210906 run, kept for reference:
    #'/mnt/extraspace/joakim/210906/seeded/2d',
    #'/mnt/extraspace/joakim/210906/seeded/3d',
    #'/mnt/extraspace/joakim/210906/stochastic/2d',
    #'/mnt/extraspace/joakim/210906/stochastic/3d'
    '/mnt/extraspace/joakim/210924/seeded/2d',
    '/mnt/extraspace/joakim/210924/seeded/3d',
    '/mnt/extraspace/joakim/210924/stochastic/2d',
    '/mnt/extraspace/joakim/210924/stochastic/3d'
]
# Turn off Altair's row limit for the charts below.
alt.data_transformers.disable_max_rows()

### How much time did it take?

In [None]:
def printTiming(path):
    ! grep "Total run time" $path/*.out

In [None]:
# Report the run-time lines for every configured result directory.
for path in paths:
    printTiming(path)

### How large are the files?

In [None]:
def printSize(path):
    ! ls -lh $path/*.h5

In [None]:
# Report the HDF5 file sizes for every configured result directory.
for path in paths:
    printSize(path)

## How many rules did we sample, and how many were valid?

In [None]:
def statsFromPath(path):
    outputMessages = ! grep -h "Done!" $path/polycubes*.out
    nValid = nUnbounded = nNondet = 0
    for row in outputMessages:
        nP, nU, nN = [int(n) for n in re.compile('(\d+)').findall(row)]
        nValid += nP
        nUnbounded += nU
        nNondet += nN
    assemblyMode, nDim = path.split('/')[-2:]
    df = pd.DataFrame({
        'type': ['Valid', 'Non-deterministic', 'Unbounded'],
        'count': [nValid, nUnbounded, nNondet]
    })
    df['assemblyMode'] = assemblyMode
    df['nDim'] = nDim
    
    print('Loaded {} {}:'.format(assemblyMode, nDim))
    print('  Sampled {:,d} rules in total'.format(nValid + nNondet + nUnbounded))
    print('  Found {:,d} valid rules. Also found {:,d} unbounded and {:,d} nondeterministic rules'.format(nValid, nUnbounded, nNondet))
    
    return df
def statsFromPaths(paths):
    """Stack the `statsFromPath` tables of all `paths` into one DataFrame."""
    perPathStats = []
    for path in paths:
        perPathStats.append(statsFromPath(path))
    return pd.concat(perPathStats)

In [None]:
# Horizontal bars of sample counts per assembly mode, coloured by rule
# validity, with one chart row per dimensionality.
alt.Chart(statsFromPaths(paths)).mark_bar().encode(
    x=alt.X('count:Q', title="Samples"),
    y=alt.Y('assemblyMode:N', title=None),
    color=alt.Color('type:O', title="Rule validity", scale=alt.Scale(scheme='set1')),
    row = alt.Row('nDim:N', title=None)
)

## How many phenotypes of each size did we find?

In [None]:
# OEIS A000162: number of 3-dimensional polyominoes (polycubes) with n cells.
# The sequence starts at n = 1 (a(1) = 1, a(2) = 1, a(3) = 2, a(4) = 8, ...).
oeisA000162 = [1, 1, 2, 8, 29, 166, 1023, 6922, 48311, 346543, 2522522, 18598427, 138462649, 1039496297, 7859514470, 59795121480]
# Sizes must therefore run from 1, not 0; numbering from 0 shifted the whole
# reference curve one size to the left.
oeisA000162Plot = alt.Chart(pd.DataFrame({
    'count': oeisA000162,
    'size': range(1, len(oeisA000162) + 1)
})).mark_line(color='red', clip=True).encode(
    alt.X("size:Q", scale=alt.Scale(domain=(1,100))),
    alt.Y("count:Q")
)

In [None]:
# OEIS A000988 (one-sided polyominoes), listed from n = 0, so the list index
# is the size. Drawn as a red reference line.
oeisA000988 = [1, 1, 1, 2, 7, 18, 60, 196, 704, 2500, 9189, 33896, 126759, 476270, 1802312, 6849777, 26152418]
oeisA000988Plot = (
    alt.Chart(pd.DataFrame({'count': oeisA000988, 'size': range(len(oeisA000988))}))
    .mark_line(color='red', clip=True)
    .encode(
        alt.X("size:Q", scale=alt.Scale(domain=(1,100))),
        alt.Y("count:Q"),
    )
)

In [None]:
# Preview of the two reference curves in a 2x2 grid: A000988 twice on the top
# row, A000162 twice on the bottom row.
(oeisA000988Plot | oeisA000988Plot) & (oeisA000162Plot | oeisA000162Plot)

In [None]:
def readAndAssign(path):
    [f] = ! ls $path/*.ftr
    df = readAnalysed(f)
    assemblyMode, nDim = path.split('/')[-2:]
    df['assemblyMode'] = assemblyMode
    df['nDim'] = nDim
    return df
def readMulti(paths):
    """Concatenate the tagged tables of all `paths` (see `readAndAssign`)."""
    frames = []
    for path in paths:
        frames.append(readAndAssign(path))
    return pd.concat(frames)

In [None]:
def distrPlot(path):
    [f] = ! ls $path/*.ftr
    assemblyMode, nDim = path.split('/')[-2:]
    df = readAnalysed(f)
    return alt.Chart(df).mark_bar(clip=True).encode(
        alt.X("size:Q", scale=alt.Scale(domain=(1,100))),
        alt.Y("count()", scale=alt.Scale(type='log', domain=(1, 100000)), title="Output count"),
        tooltip=["count()", "size"]
    ).properties(
        width=300,
        height=200,
        title="{} {}".format(assemblyMode, nDim)
    ) + (oeisA000162Plot if nDim == '3d' else oeisA000988Plot)

In [None]:
# 2x2 grid: 2d on the top row, 3d on the bottom; seeded left, stochastic right.
(
    (
        distrPlot('/mnt/extraspace/joakim/210924/seeded/2d')
        | distrPlot('/mnt/extraspace/joakim/210924/stochastic/2d')
    )
    & (
        distrPlot('/mnt/extraspace/joakim/210924/seeded/3d')
        | distrPlot('/mnt/extraspace/joakim/210924/stochastic/3d')
    )
)

In [None]:
# Row counts per size for every directory in `paths`, faceted by assembly
# mode (columns) and dimensionality (rows), with a log-scaled count axis.
alt.Chart(readMulti(paths)).mark_bar().encode(
    alt.X("size:O"),
    alt.Y("count()", scale=alt.Scale(type='log'), title="Phenotype count"),
    tooltip=["count()", "size"],
    column = alt.Column('assemblyMode', title="Assembly mode"),
    row = alt.Row('nDim:N', title=None)
).properties(
    width=300,
    height=200,
    title="Distribution of phenotypes sizes"
)

In [None]:
# Same size distribution, but for the 2d directories of the 210918 run (not
# the entries of `paths`), with assembly mode shown as bar colour.
alt.Chart(readMulti(['/mnt/extraspace/joakim/210918/seeded/2d',
    '/mnt/extraspace/joakim/210918/stochastic/2d'])).mark_bar().encode(
    alt.X("size:O"),
    alt.Y("count()", scale=alt.Scale(type='log'), title="Phenotype count"),
    tooltip=["count()", "size"],
    color = alt.Color('assemblyMode', title="Assembly mode")
).properties(
    width=300,
    height=200,
    title="Distribution of phenotypes sizes"
)

## Zoo plots

In [None]:
def listShapes(path, nSampled, nDim, method, minCount=1, nMer = None, limit=100):
    title = 'Frequency vs complexity, {:.1E} samples of {}D {}'.format(nSampled, nDim, method)
    tmp = "{}/{}/{}d".format(path, method, nDim)
    [filename] = !ls $tmp/*.ftr
    print(filename)
    df = readAnalysed(filename)
    if nMer is not None:
        df = df.loc[df['size'] == nMer]
    df = df.sort_values(['count'], ascending=False)
    return df['minNt_r'].tolist()[:limit], df['count'].tolist()[:limit]

In [None]:
# 50 most frequent 16-mers of the 2D seeded reference calculation.
listShapes(
    path='/mnt/extraspace/joakim/refcalc',
    nSampled=1e9,
    nDim=2,
    method='seeded',
    minCount=1,
    nMer=16,
    limit=50,
)

In [None]:
# 50 most frequent 8-mers of the 3D seeded 210924 run.
listShapes(
    path='/mnt/extraspace/joakim/210924',
    nSampled=1e8,
    nDim=3,
    method='seeded',
    minCount=1,
    nMer=8,
    limit=50,
)

In [None]:
def plotZoo(path, size, sampled, dims=[400,400], sizeScaling='log'):
    """Draw all rows of one size as a grid of points ranked by frequency.

    Rows of the analysed file `path` whose `size` equals `size` are ranked by
    descending `count` (ties broken by descending `minLz_r`) and laid out
    row by row on a grid whose column count follows the aspect ratio of
    `dims`. Point size and colour both encode `count / sampled`; each point
    links to the polycubes viewer URL built from `minLz_r`.

    Parameters
    ----------
    path : str
        Analysed file; the two directory names above it are used in the
        title as assembly mode and dimensionality.
    size : int
        Value of the `size` column to show.
    sampled : number
        Divisor that turns `count` into a frequency.
    dims : sequence of two numbers
        Chart width and height in pixels.
    sizeScaling : str
        Scale type for the point-size encoding.
    """
    analysed = readAnalysed(path)
    source = analysed.loc[analysed['size'] == size]
    chartWidth, chartHeight = dims[0], dims[1]
    nColumns = math.ceil(math.sqrt(len(source) * chartWidth / chartHeight))
    assemblyMode, nDim = path.split('/')[-3:-1]

    ranked = alt.Chart(source).transform_window(
        rank="rank()",
        sort=[
            alt.SortField("count", order="descending"),
            # Tie-breaker so that rows with equal counts get a stable order.
            alt.SortField("minLz_r", order="descending"),
        ]
    )
    positioned = ranked.transform_calculate(
        freq='datum.count/{}'.format(sampled),
        url='"https://akodiat.github.io/polycubes/?rule="+datum.minLz_r',
        row="floor((datum.rank-1)/{})".format(nColumns),
        col="(datum.rank-1) % {}".format(nColumns)
    )
    points = positioned.mark_point(filled=True, size=100).encode(
        x=alt.X("col:O", axis=None),
        y=alt.Y("row:O", axis=None),
        # NOTE(review): 'minNt' is listed twice -- possibly another column was intended.
        tooltip=['count', 'rank:Q', 'minLz', 'minNc', 'minNt', 'sizeId', 'minNt', 'minLz_r'],
        size=alt.Size('freq:Q', title="Frequency", scale=alt.Scale(type=sizeScaling)),
        color=alt.Color('freq:Q', scale=alt.Scale(scheme="redyellowgreen")),
        href='url:N'
    )
    titled = points.properties(
        width=chartWidth,
        height=chartHeight,
        title='{}-mer frequencies ({} {})'.format(size, assemblyMode, nDim)
    )
    return titled.configure_view(strokeWidth=0)

In [None]:
# 16-mers, seeded 2d (210906 run).
plotZoo(
    path='/mnt/extraspace/joakim/210906/seeded/2d/out_24426_analysed.ftr',
    size=16,
    sampled=1e7,
    dims=[300, 200],
)

In [None]:
# 16-mers, stochastic 2d (210924 run).
plotZoo(
    path='/mnt/extraspace/joakim/210924/stochastic/2d/out_5281_analysed.ftr',
    size=16,
    sampled=1e8,
    dims=[300, 200],
)

In [None]:
# 8-mers, seeded 2d (210906 run).
plotZoo(
    path='/mnt/extraspace/joakim/210906/seeded/2d/out_24426_analysed.ftr',
    size=8,
    sampled=1e7,
    dims=[300, 200],
)

In [None]:
# 16-mers, seeded 3d (210906 run), default log size scaling.
plotZoo(
    path='/mnt/extraspace/joakim/210906/seeded/3d/out_13170_analysed.ftr',
    size=16,
    sampled=1e7,
    dims=[600, 400],
)

In [None]:
# Same 3d 16-mer data with linear point-size scaling.
plotZoo(
    path='/mnt/extraspace/joakim/210906/seeded/3d/out_13170_analysed.ftr',
    size=16,
    sampled=1e7,
    dims=[600, 400],
    sizeScaling='linear',
)

# Genotypes

We also have the full dataset, which stores the genotype rules found for each phenotype.

In [None]:
import h5py
def parseRule(rule):
    """Return `rule` as a `str`, decoding it if it is not one already.

    Decoding is best-effort: if `rule.decode()` fails, a message is printed
    and `rule` is returned unchanged. Only `Exception` subclasses are
    handled, so KeyboardInterrupt and SystemExit still propagate (the
    previous bare `except:` swallowed them).
    """
    if isinstance(rule, str):
        return rule
    try:
        return rule.decode()
    except Exception:
        print("Could not parse {}".format(rule))
        return rule
def getRules(path, n, shape, index):
    """Lazily yield the parsed entries of `f[n][shape]["pheno_<index>"]`.

    The HDF5 file at `path` is opened read-only when iteration starts and is
    closed once the generator is exhausted or discarded; previously the
    handle was never closed.

    Parameters
    ----------
    path : str
        HDF5 file to read.
    n, shape : str
        Keys of the two nested groups (the cells below pass e.g. '6-mer'
        and "2.2.2").
    index : int
        Number used to build the dataset name `pheno_<index>`.
    """
    with h5py.File(path, 'r') as f:
        for v in f[n][shape]["pheno_{}".format(index)]:
            yield parseRule(v)

Let's check out one of the 8-mers

In [None]:
def calcPhenoRules(path, n, shape, index):
    [pathH5] = ! ls $path/*.h5
    data = []
    for r in getRules(pathH5, n, shape, index):
        rSimpl = utils.simplifyRuleset(utils.parseHexRule(r))
        simplHex = utils.ruleToHex(rSimpl)
        data.append({
            'lz': utils.lzFromHexRule(simplHex),
            'nc': max(face['color'] for rule in rSimpl for face in rule),
            'nt': len(rSimpl),
            'rule': simplHex
        })
    print(data[0]['rule'])
    return pd.DataFrame(data)

def plotComplDistr(path, n, shape, index, measure='lz'):
    """Histogram of one measure over the rules of a phenotype.

    The data comes from `calcPhenoRules(path, n, shape, index)`. `measure`
    names the column to bin (at most 50 bins); a red rule marks its mean.
    """
    base = alt.Chart(calcPhenoRules(path, n, shape, index))
    histogram = base.mark_bar().encode(
        alt.X("{}:Q".format(measure), bin=alt.Bin(maxbins=50)),
        alt.Y("count()"),
        tooltip=["count()", "lz", "nc", "nt"]
    ).properties(
        width=500,
        height=200,
        title="Complexity distribution for {} of dimensions {}".format(n, shape)
    )
    meanMarker = base.mark_rule(color='red').encode(
        x='mean({}):Q'.format(measure),
        size=alt.value(5)
    )
    return histogram + meanMarker

In [None]:
# 6-mer with shape 2.2.2, default measure ('lz').
plotComplDistr(
    path='/mnt/extraspace/joakim/210906/seeded/3d',
    n='6-mer',
    shape="2.2.2",
    index=0,
)

In [None]:
# Same 6-mer, binned by the 'nc' column. The closing quote of 'nc' was
# missing, which made this cell a SyntaxError.
plotComplDistr('/mnt/extraspace/joakim/210906/seeded/3d', '6-mer', "2.2.2", 0, measure='nc')

In [None]:
# 16-mer with shape 3.3.2.
plotComplDistr(
    path='/mnt/extraspace/joakim/210906/seeded/3d',
    n='16-mer',
    shape="3.3.2",
    index=0,
)

In [None]:
# 16-mer with shape 4.4.1.
plotComplDistr(
    path='/mnt/extraspace/joakim/210906/seeded/3d',
    n='16-mer',
    shape="4.4.1",
    index=0,
)

In [None]:
# 8-mer with shape 2.2.2 (3d run).
plotComplDistr(
    path='/mnt/extraspace/joakim/210906/seeded/3d',
    n='8-mer',
    shape="2.2.2",
    index=0,
)

In [None]:
# 8-mer with shape 3.3.1 (2d run).
plotComplDistr(
    path='/mnt/extraspace/joakim/210906/seeded/2d',
    n='8-mer',
    shape="3.3.1",
    index=0,
)