# Read analysed

In [None]:
import utils
import altair as alt
import pandas as pd
import re
import math
from analysePhenotype import readAnalysed

## First, let's take a look at the raw data

In [None]:
# Result directories, one per dataset. The last two components of each path
# (e.g. "seeded", "2d") are split off further down and used as the
# assemblyMode / nDim labels in the tables and charts.
paths = [
    '/mnt/extraspace/joakim/210906/seeded/2d',
    '/mnt/extraspace/joakim/210906/seeded/3d',
    '/mnt/extraspace/joakim/210906/stochastic/2d',
    '/mnt/extraspace/joakim/210906/stochastic/3d'
]
# Turn off Altair's maximum-row check so the full DataFrames can be plotted.
alt.data_transformers.disable_max_rows()

### How much time did it take?

In [None]:
def printTiming(path):
    """Print the "Total run time" lines from the ``*.out`` files in *path*.

    The search is done by ``grep`` through the IPython shell escape, so this
    only works inside IPython/Jupyter. The grep output goes directly to the
    cell output and nothing is returned.
    """
    ! grep "Total run time" $path/*.out

In [None]:
# Show the reported run time of every dataset listed in `paths`.
for path in paths:
    printTiming(path)

## How many did we sample and how many were valid?

In [None]:
def statsFromPath(path):
    outputMessages = ! grep -h "Done!" $path/polycubes*.out
    nValid = nUnbounded = nNondet = 0
    for row in outputMessages:
        nP, nU, nN = [int(n) for n in re.compile('(\d+)').findall(row)]
        nValid += nP
        nUnbounded += nU
        nNondet += nN
    assemblyMode, nDim = path.split('/')[-2:]
    df = pd.DataFrame({
        'type': ['Valid rules', 'Non-deterministic', 'Unbounded'],
        'count': [nValid, nUnbounded, nNondet]
    })
    df['assemblyMode'] = assemblyMode
    df['nDim'] = nDim
    
    print('Loaded {} {}:'.format(assemblyMode, nDim))
    print('  Sampled {:,d} rules in total'.format(nValid + nNondet + nUnbounded))
    print('  Found {:,d} valid rules. Also found {:,d} unbounded and {:,d} nondeterministic rules'.format(nValid, nUnbounded, nNondet))
    
    return df
def statsFromPaths(paths):
    """Concatenate the statsFromPath summaries of every directory in *paths*."""
    frames = []
    for resultDir in paths:
        frames.append(statsFromPath(resultDir))
    return pd.concat(frames)

In [None]:
# Bars of sample counts per dimensionality, coloured by rule category,
# with one row of panels per assembly mode.
alt.Chart(
    statsFromPaths(paths),
    title='Proportion of valid rules in different datasets',
).mark_bar().encode(
    alt.X('count:Q', title="Samples"),
    alt.Y('nDim:N', title=None),
    alt.Color('type:N'),
    alt.Row('assemblyMode', title=None),
)

## How many phenotypes of each size did we find?

In [None]:
def readAndAssign(path):
    [f] = ! ls $path/*.ftr
    df = readAnalysed(f)
    assemblyMode, nDim = path.split('/')[-2:]
    df['assemblyMode'] = assemblyMode
    df['nDim'] = nDim
    return df
def readMulti(paths):
    """Load every directory in *paths* with readAndAssign and stack the results."""
    frames = []
    for resultDir in paths:
        frames.append(readAndAssign(resultDir))
    return pd.concat(frames)

In [None]:
# Number of phenotypes per size on a log-scaled count axis, in a grid of
# panels: rows are assembly modes, columns are dimensionalities.
alt.Chart(
    readMulti(paths),
    width=500,
    height=200,
    title="Distribution of phenotypes sizes",
).mark_bar().encode(
    x=alt.X("size:O"),
    y=alt.Y("count()", scale=alt.Scale(type='log')),
    row=alt.Row('assemblyMode:N'),
    column=alt.Column('nDim:N'),
    tooltip=["count()", "size"],
)

## Zoo plots

In [None]:
def plotZoo(path, size, sampled, dims=(400, 400), sizeScaling='log'):
    """Plot every phenotype of a given size as a grid of frequency-scaled points.

    Phenotypes are ranked by descending ``count`` (ties broken by descending
    ``minLz_r``) and laid out row by row in rank order. Point size and colour
    both encode ``count / sampled``.

    Args:
        path: Path to an analysed file readable by ``readAnalysed``. The two
            directory names above the file are shown in the title.
        size: Value of the ``size`` column to select.
        sampled: Divisor that turns ``count`` into a frequency.
        dims: ``(width, height)`` of the plot in pixels; any indexable pair.
        sizeScaling: Scale type for the point-size encoding, e.g. ``'log'``
            or ``'linear'``.

    Returns:
        The configured Altair chart.
    """
    df = readAnalysed(path)
    source = df.loc[df['size'] == size]
    # Number of grid columns, chosen so the grid of points roughly matches
    # the aspect ratio of the plot area.
    width = math.ceil(math.sqrt(len(source) * dims[0]/dims[1]))
    assemblyMode, nDim = path.split('/')[-3:-1]
    return alt.Chart(source).transform_window(
        rank="rank()",
        sort=[
            alt.SortField("count", order="descending"),
            alt.SortField("minLz_r", order="descending"), # Needed if counts are equal
        ]
    ).transform_calculate(
        freq='datum.count/{}'.format(sampled),
        row="floor((datum.rank-1)/{})".format(width),
        col="(datum.rank-1) % {}".format(width)
    ).mark_point(
        filled=True, size=100
    ).encode(
        x=alt.X("col:O", axis=None),
        y=alt.Y("row:O", axis=None),
        # 'minNt' used to be listed twice; each field now appears once.
        tooltip=['count', 'rank:Q', 'minLz', 'minNc', 'minNt', 'sizeId', 'minLz_r'],
        size = alt.Size('freq:Q', title="Frequency", scale=alt.Scale(type=sizeScaling)),
        color = alt.Color('freq:Q', scale=alt.Scale(scheme="redyellowgreen")),
    ).properties(
        width=dims[0],
        height=dims[1],
        title = '{}-mer frequencies ({} {})'.format(size, assemblyMode, nDim)
    ).configure_view(
        strokeWidth=0
    )

In [None]:
# 16-mers of the seeded 2d dataset; 1e7 is passed as `sampled`, the divisor
# plotZoo uses to turn counts into frequencies.
plotZoo('/mnt/extraspace/joakim/210906/seeded/2d/out_24426_analysed.ftr', 16, 1e7, [300,200])

In [None]:
# 16-mers of the seeded 3d dataset, with the default logarithmic size scale.
plotZoo('/mnt/extraspace/joakim/210906/seeded/3d/out_13170_analysed.ftr', 16, 1e7, [600,400])

In [None]:
# Same data as the previous plot, but with a linear point-size scale.
plotZoo('/mnt/extraspace/joakim/210906/seeded/3d/out_13170_analysed.ftr', 16, 1e7, [600,400], sizeScaling='linear')

# Genotypes

We also have the full dataset, which contains every genotype rule.

In [None]:
import h5py
def parseRule(rule):
    """Return *rule* as a ``str``, decoding it when it is bytes-like.

    Strings are returned unchanged. Any other value has its ``decode()``
    method called. If the value has no such method, or its content cannot be
    decoded, a message is printed and the value is returned as it was, so a
    single bad entry does not abort a whole iteration.
    """
    if isinstance(rule, str):
        return rule
    try:
        return rule.decode()
    # Only the failures decode() is expected to produce are caught, so that
    # KeyboardInterrupt and SystemExit are no longer swallowed.
    except (AttributeError, UnicodeDecodeError):
        print("Could not parse {}".format(rule))
        return rule
def getRules(path, n, shape, index):
    """Yield the rules stored for one phenotype in an HDF5 file.

    The entries are read from ``f[n][shape]["pheno_<index>"]`` and passed
    through ``parseRule``.

    The file is opened when iteration starts and is closed as soon as the
    generator is exhausted, closed or garbage-collected; previously the file
    handle was never closed. As a consequence, a missing file or key is
    reported on the first iteration step rather than at call time.

    Args:
        path: Path to the HDF5 file.
        n: Name of the top-level group (the callers pass e.g. ``'16-mer'``).
        shape: Name of the group below it (the callers pass e.g. ``'3.3.2'``).
        index: Number appended to ``pheno_`` to select the dataset.

    Yields:
        Each entry of the dataset after ``parseRule``.
    """
    with h5py.File(path, 'r') as f:
        for v in f[n][shape]["pheno_{}".format(index)]:
            yield parseRule(v)

Let's check out one of the 8-mers

In [None]:
def calcPhenoRules(path, n, shape, index):
    [pathH5] = ! ls $path/*.h5
    data = []
    for r in getRules(pathH5, n, shape, index):
        rSimpl = utils.simplifyRuleset(utils.parseHexRule(r))
        simplHex = utils.ruleToHex(rSimpl)
        data.append({
            'lz': utils.lzFromHexRule(simplHex),
            'nc': max(face['color'] for rule in rSimpl for face in rule),
            'nt': len(rSimpl),
            'rule': simplHex
        })
    print(data[0]['rule'])
    return pd.DataFrame(data)

def plotComplDistr(path, n, shape, index, measure='lz'):
    """Plot a histogram of one measure over a phenotype's rules, with its mean.

    The data comes from ``calcPhenoRules(path, n, shape, index)``. The
    histogram and the red mean marker both use the *measure* column.
    Previously the histogram was always binned on ``lz``, so any other
    *measure* put the mean marker on an axis showing a different quantity.

    Args:
        path: Dataset directory, passed on to ``calcPhenoRules``.
        n: Group name, also shown in the title.
        shape: Shape name, also shown in the title.
        index: Phenotype index, passed on to ``calcPhenoRules``.
        measure: Column to plot, one of the columns that ``calcPhenoRules``
            produces (``'lz'``, ``'nc'`` or ``'nt'``). Defaults to ``'lz'``.

    Returns:
        A layered Altair chart: the histogram plus the mean rule.
    """
    base = alt.Chart(calcPhenoRules(path, n, shape, index))
    histogram = base.mark_bar().encode(
        alt.X("{}:Q".format(measure), bin=alt.Bin(maxbins=50)),
        alt.Y("count()"),
        tooltip=["count()", "lz", "nc", "nt"]
    ).properties(
        width=500,
        height=200,
        title="Complexity distribution for {} of dimensions {}".format(n, shape)
    )
    meanRule = base.mark_rule(color='red').encode(
        x='mean({}):Q'.format(measure),
        size=alt.value(5)
    )
    return histogram + meanRule

In [None]:
# `lz` distribution of the rules stored as pheno_0 under 6-mer / 2.2.2 (seeded 3d).
plotComplDistr('/mnt/extraspace/joakim/210906/seeded/3d', '6-mer', "2.2.2", 0)

In [None]:
# `lz` distribution of the rules stored as pheno_0 under 16-mer / 3.3.2 (seeded 3d).
plotComplDistr('/mnt/extraspace/joakim/210906/seeded/3d', '16-mer', "3.3.2", 0)

In [None]:
# `lz` distribution of the rules stored as pheno_0 under 16-mer / 4.4.1 (seeded 3d).
plotComplDistr('/mnt/extraspace/joakim/210906/seeded/3d', '16-mer', "4.4.1", 0)

In [None]:
# `lz` distribution of the rules stored as pheno_0 under 8-mer / 2.2.2 (seeded 3d).
plotComplDistr('/mnt/extraspace/joakim/210906/seeded/3d', '8-mer', "2.2.2", 0)