# Frequency vs rank

In [None]:
import utils
import altair as alt
import pandas as pd
from analysePhenotype import readAnalysed

Check what files we have to analyse:

In [None]:
# IPython shell escape: list every .ftr file two directory levels below the 210906 data directory.
! ls -lh /mnt/extraspace/joakim/210906/*/*/*.ftr

Drop phenotypes found fewer than 100 times

We will likely plot more than 5000 rows, so disable Altair's default row limit:

In [None]:
# Lift Altair's row limit so that charts built from large data frames are not rejected.
alt.data_transformers.disable_max_rows()

Define the plotting helper:

In [None]:
def plotRank(path, nSampled, nDim, method, minCount=1, nMer=None, saveHTML=False):
    """Plot phenotype frequency against frequency rank on log-log axes.

    Args:
        path: Location of the analysed data, passed straight to
            ``readAnalysed``. The loaded table must provide the columns
            referenced below (``count``, ``size``, ``minLz``, ``minNc``,
            ``minNt``, ``minLz_r``, ``rotsymms``, ``reflsymms``,
            ``invsymms``).
        nSampled: Total number of samples. Every count is divided by it to
            obtain a frequency, and it is shown in the chart title.
        nDim: Dimensionality; used only in the chart title.
        method: Name of the sampling method; used only in the chart title.
        minCount: Rows whose ``count`` is below this value are dropped.
        nMer: If not ``None``, keep only rows whose ``size`` equals it and
            mention it in the title.
        saveHTML: If truthy, also save the chart as HTML. A string is used
            as the output path; any other truthy value saves to
            ``freq_vs_rank.html`` (the original behaviour).

    Returns:
        The Altair chart.
    """
    df = readAnalysed(path)
    df = df.loc[df['count'] >= minCount]
    if nMer is None:
        title = 'Frequency vs rank, {:.1E} samples of {}D {}'.format(
            nSampled, nDim, method)
    else:
        df = df.loc[df['size'] == nMer]
        title = '{}-mer frequency vs rank, {:.1E} samples of {}D {}'.format(
            nMer, nSampled, nDim, method)

    chart = alt.Chart(df).mark_circle(size=60).encode(
        # Rank has to be quantitative: a log scale is a continuous scale and
        # cannot be applied to an ordinal field.
        alt.X('rank:Q', scale=alt.Scale(type='log'), title='Rank'),
        alt.Y('freq:Q', scale=alt.Scale(type='log'), title='Frequency'),
        href='url:N',
        color=alt.Color('minLz:Q', scale=alt.Scale(scheme='inferno')),
        tooltip=['url:N', 'count', 'freq:Q', 'minLz', 'minNc', 'minNt']
    ).transform_calculate(
        freq='datum.count/{}'.format(nSampled),
        url='"https://akodiat.github.io/polycubes/?rule="+datum.minLz_r',
        # Calculated alongside the other fields but not used by any encoding.
        symmetries='1+datum.rotsymms+datum.reflsymms+datum.invsymms'
    ).transform_window(
        # Rank 1 is the most frequently found phenotype.
        rank='rank()',
        sort=[alt.SortField('count', order='descending')]
    ).properties(width=350, height=200, title=title)

    if saveHTML:
        # Accept an explicit output path so that successive calls do not
        # overwrite each other's file; plain True keeps the old file name.
        outPath = saveHTML if isinstance(saveHTML, str) else 'freq_vs_rank.html'
        chart.save(outPath)
    return chart

In [None]:
# Frequency vs rank for the seeded 2D run; minCount=1 keeps rows with count >= 1.
plotRank(
    path='/mnt/extraspace/joakim/210906/seeded/2d/out_24426_analysed.ftr',
    nSampled=1e7,
    nDim=2,
    method='seeded',
    minCount=1,
)

In [None]:
# Frequency vs rank for the stochastic 2D run; minCount=1 keeps rows with count >= 1.
plotRank(
    path='/mnt/extraspace/joakim/210906/stochastic/2d/out_18511_analysed.ftr',
    nSampled=1e7,
    nDim=2,
    method='stochastic',
    minCount=1,
)

# 16-mers

In [None]:
# Seeded 2D run restricted to rows of size 16.
# NOTE(review): minCount is 10 here but 1 in the stochastic 16-mer cell; confirm the difference is intended.
plotRank(
    path='/mnt/extraspace/joakim/210906/seeded/2d/out_24426_analysed.ftr',
    nSampled=1e7,
    nDim=2,
    method='seeded',
    minCount=10,
    nMer=16,
)

In [None]:
# Stochastic 2D run restricted to rows of size 16.
plotRank(
    path='/mnt/extraspace/joakim/210906/stochastic/2d/out_18511_analysed.ftr',
    nSampled=1e7,
    nDim=2,
    method='stochastic',
    minCount=1,
    nMer=16,
)