In [None]:
# Baseline size that a later cell divides by to express df.MBs as a
# percentage of the original (presumably the .npz size in MB -- confirm).
npz_size = 45_185

In [None]:
# Hand-recorded sizes keyed by (minimum LD score kept, decimals kept).
# The plotting cells below label these values as MB.
sizes = {
    (0.001, 3): 3572,
    (0.01, 3): 1295,
    (0.1, 5): 289,
    (0.1, 4): 242,
    (0.1, 3): 190,
    (0.1, 1): 132,
    (0.1, 2): 114,
}

In [None]:
# Slice of `sizes` at a 0.1 threshold: maps decimals kept -> size.
fixed_precision = {
    n_decimals: size
    for (threshold, n_decimals), size in sizes.items()
    if threshold == 0.1
}

In [None]:
# Display the decimals -> size mapping built for the 0.1 threshold.
fixed_precision

In [None]:
# Slice of `sizes` at 3 decimals: maps minimum LD score kept -> size.
fixed_decimals = {
    threshold: size
    for (threshold, n_decimals), size in sizes.items()
    if n_decimals == 3
}

In [None]:
# Display the threshold -> size mapping built for 3 decimals.
fixed_decimals

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import LogNorm

In [None]:
%matplotlib inline

In [None]:
# Size against the LD-score threshold for the runs that kept 3 decimals.
# The figure is written to disk rather than shown explicitly.
plt.scatter(fixed_decimals.keys(), fixed_decimals.values())
plt.gca().set(
    xlabel='Minimum LD Score Kept',
    ylabel='MB',
    title='3 Decimals',
)
plt.savefig('fixed_decimals.png', dpi=1000)

In [None]:
# Size against the number of decimals for the runs with a 0.1 threshold.
# The figure is written to disk rather than shown explicitly.
plt.scatter(fixed_precision.keys(), fixed_precision.values())
plt.gca().set(
    xlabel='Decimals',
    ylabel='MB',
    title='0.1 Minimum LD Score Kept',
)
plt.savefig('fixed_precision.png', dpi=1000)

In [None]:
# Bubble plot of every hand-recorded run: x is the threshold, y the number of
# decimals, and the marker area is a quarter of the recorded size.
plt.scatter(
    [threshold for threshold, _ in sizes],
    [n_decimals for _, n_decimals in sizes],
    s=np.array(list(sizes.values())) / 4,
)
plt.gca().set(ylabel='Decimals', xlabel='Minimum LD Score Kept')

# Write each size next to its marker, offset slightly up and to the left.
for (x, y), v in sizes.items():
    plt.annotate(v, xy=(x, y), xytext=(x - .004, y + .4))
plt.show()

In [None]:
# Hand-recorded lossless sizes keyed by chromosome number; the next cell
# divides them by 1000 and labels the result GB.
lossless = {
    21: 18560,
    2: 104785,
    20: 27320,
    22: 18912,
}

In [None]:
# Lossless size per chromosome, with the recorded values divided by 1000.
plt.scatter(lossless.keys(), np.array(list(lossless.values())) / 1000)
plt.gca().set(xlabel='Chromosome', xlim=(0, 23), ylabel='GB')

plt.show()

In [None]:
import pandas as pd

In [None]:
# Load the whitespace-separated listing of hyperparameter runs, skipping its
# first line, and keep only fields 4 and 8 (renamed to MBs and name).
# `sep=r'\s+'` is the documented equivalent of the deprecated
# `delim_whitespace=True`.
# NOTE(review): the layout looks like `ls -l`-style output -- confirm.
df = pd.read_csv('../data/hyperparameters.csv', sep=r'\s+', skiprows=1, header=None)[[4, 8]]
df.columns = ['MBs', 'name']

In [None]:
# Number of decimals kept, parsed from the file name: the single character
# that follows the first 'd'. Names without a 'd' yield NaN.
df['decimals'] = (
    df['name']
    .str.split('d')
    .str.get(1)
    .str.slice(stop=1)
    .astype(float)
)

In [None]:
# Minimum LD score kept, parsed from the file name: the text between the
# first 'p' and the following 'd', with a trailing ".h5" removed and '_' used
# as the decimal point. Names without a 'p' yield NaN.
# regex=False makes both replacements literal on every pandas version; under
# the old regex default the '.' in ".h5" matched any character.
df['precision'] = (
    df['name']
    .str.split('p').str[1]
    .str.split('d').str[0]
    .str.replace(".h5", "", regex=False)
    .str.replace('_', '.', regex=False)
    .astype(float)
)

In [None]:
# Strip trailing 'M'/'B' characters from the size strings and store them as
# integers. Not re-runnable: the column is no longer a string afterwards.
df['MBs'] = (
    df['MBs']
    .str.rstrip('MB')
    .astype(int)
)

In [None]:
# Keep only the parsed hyperparameters and the size; the raw name is dropped.
df = df[[
    'decimals',
    'precision',
    'MBs',
]]

In [None]:
# Bubble plot of the parsed grid on a log x-axis; marker area is the size.
plt.scatter(df['precision'], df['decimals'], s=df['MBs'])
plt.gca().set(
    ylabel='Decimals',
    xlabel='Minimum LD Score Kept',
    xscale="log",
)
plt.show()

In [None]:
# Runs that kept exactly 3 decimals: size against the LD-score threshold.
three = df[df['decimals'] == 3]

plt.scatter(three['precision'], three['MBs'])
plt.gca().set(
    xlabel='Minimum LD Score Kept',
    ylabel='MB',
    title='3 Decimals',
)
# Same file name as the hand-recorded version of this plot saved earlier in
# the notebook, so this figure replaces it on disk.
plt.savefig('fixed_decimals.png', dpi=1000)

In [None]:
# Show the runs ordered from largest to smallest.
df.sort_values(by='MBs', ascending=False)

In [None]:
# Re-express every size as a percentage of npz_size, rounded to 2 decimals.
# The MBs column is overwritten, so running this cell twice rescales twice.
df['MBs'] = (100 * df['MBs'] / npz_size).round(2)

In [None]:
import seaborn as sns

In [None]:
# Use seaborn's "paper" plotting context for the figures below.
sns.set_theme(context='paper')

In [None]:
# Bubble chart of the hyperparameter grid. Rows with any missing value are
# dropped before plotting; marker size and colour both encode df.MBs.
ax = sns.scatterplot(
    data=df.dropna(),
    x='precision',
    y='decimals',
    size="MBs",
    hue="MBs",
    palette=sns.color_palette('crest', as_cmap=True),
    sizes=(50, 250),
)

# df.MBs was rescaled to a percentage of npz_size a few cells earlier, so the
# legend is titled as a percentage instead of the stale "Size in MB".
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderpad=2, labelspacing=2, title="% of Original Size")
ax.set_ylabel('Decimals')
ax.set_yticks(range(0, 6))
ax.set_xlabel('Minimum LD Score Kept')
ax.set_xscale("log")
plt.tight_layout()
plt.show()

# maybe I can just add to this and then add lines/overwrite labels

In [None]:
#scratch
# Same bubble chart as the previous cell but fed the unfiltered frame.

ax = sns.scatterplot(
    data=df,
    x='precision',
    y='decimals',
    size="MBs",
    hue="MBs",
    palette=sns.color_palette('crest', as_cmap=True),
    sizes=(50, 250),
)

# df.MBs was rescaled to a percentage of npz_size a few cells earlier, so the
# legend is titled as a percentage instead of the stale "Size in MB".
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderpad=2, labelspacing=2, title="% of Original Size")
ax.set_ylabel('Decimals')
ax.set_yticks(range(0, 6))
ax.set_xlabel('Minimum LD Score Kept')
ax.set_xscale("log")
plt.tight_layout()
plt.show()

In [None]:
# keep all LD
# Runs with no precision value are placed at x = 0 and the x-axis is narrowed
# to a tiny window around 0, so only those runs are visible.
# fillna with a column mapping returns a new frame; the previous chained
# `full_ld.precision.fillna(0, inplace=True)` only updates a temporary under
# pandas Copy-on-Write and would leave full_ld unchanged.
full_ld = df.fillna({'precision': 0})
ax = sns.scatterplot(
    data=full_ld,
    x='precision',
    y='decimals',
    size="MBs",
    hue="MBs",
    palette=sns.color_palette('crest', as_cmap=True),
    sizes=(50, 250),
)

# df.MBs was rescaled to a percentage of npz_size a few cells earlier, so the
# legend is titled as a percentage instead of the stale "Size in MB".
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderpad=2, labelspacing=2, title="% of Original Size")
ax.set_ylabel('Decimals')
ax.set_yticks(range(0, 6))
plt.xticks([0], labels=[None])
ax.set_xlim(-.0001, .0001)
ax.set_xlabel('All LD Scores Kept')
plt.tight_layout()
plt.show()

In [None]:
# keep all decimals
# Runs with no decimals value are placed at y = 0 and the y-axis is narrowed
# to (-0.5, 0.5), so only those runs are visible.
# fillna with a column mapping returns a new frame; the previous chained
# `full_decimals.decimals.fillna(0, inplace=True)` only updates a temporary
# under pandas Copy-on-Write and would leave full_decimals unchanged.
full_decimals = df.fillna({'decimals': 0})
ax = sns.scatterplot(
    data=full_decimals,
    x='precision',
    y='decimals',
    size="MBs",
    hue="MBs",
    palette=sns.color_palette('crest', as_cmap=True),
    sizes=(50, 250),
)

# df.MBs was rescaled to a percentage of npz_size a few cells earlier, so the
# legend is titled as a percentage instead of the stale "Size in MB".
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderpad=2, labelspacing=2, title="% of Original Size")
ax.set_ylabel('Full Decimals Kept')
plt.yticks([0], labels=[None])
ax.set_ylim(-.5, .5)
ax.set_xlabel('Minimum LD Score Kept')
ax.set_xscale("log")
plt.tight_layout()
plt.show()

In [None]:
# Every missing hyperparameter is replaced by 0 and both axes are narrowed
# around the origin, so only runs drawn at (0, 0) are visible.
# Note: this rebinds `lossless`, previously the per-chromosome dict.
lossless = df.fillna(0)
ax = sns.scatterplot(
    data=lossless,
    x='precision',
    y='decimals',
    size="MBs",
    hue="MBs",
    palette=sns.color_palette('crest', as_cmap=True),
    sizes=(50, 250),
)

# df.MBs was rescaled to a percentage of npz_size a few cells earlier, so the
# legend is titled as a percentage instead of the stale "Size in MB".
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderpad=2, labelspacing=2, title="% of Original Size")

plt.xticks([0], labels=[None])
ax.set_xlim(-.0001, .0001)
ax.set_xlabel('All LD Scores Kept')
ax.set_ylabel('Full Decimals Kept')
plt.yticks([0], labels=[None])
ax.set_ylim(-.5, .5)
plt.tight_layout()
plt.show()

In [None]:
# Display the frame bound to `lossless` (df with missing values filled by 0
# when the preceding cell has been run).
lossless

In [None]:
# 2x2 grid with shared axes; ax1 is the top row and ax2 the bottom row.
fig, (ax1, ax2) = plt.subplots(2, 2, sharex='all', sharey='all')

# Draw the lossy grid in the top-right panel.
ax = sns.scatterplot(
    data=df,
    x='precision',
    y='decimals',
    size="MBs",
    hue="MBs",
    palette=sns.color_palette('crest', as_cmap=True),
    sizes=(50, 250),
    ax=ax1[1],
)

# df.MBs was rescaled to a percentage of npz_size a few cells earlier, so the
# legend is titled as a percentage instead of the stale "Size in MB".
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderpad=2, labelspacing=2, title="% of Original Size")
# Label the panel that holds the data. The pyplot helpers used before act on
# the current axes, which after plt.subplots is the last panel created, so
# the labels ended up on a different subplot from the scatter.
ax.set_ylabel('Decimals')
ax.set_yticks(range(0, 6))
ax.set_xlabel('Minimum LD Score Kept')
ax.set_xscale("log")
fig.tight_layout()

# full_ld = df.copy()
# full_ld.precision.fillna(0, inplace=True)
# # full_ld.dropna(inplace=True)
# ax = sns.scatterplot(data=full_ld, x='precision', y='decimals', size="MBs", hue="MBs", palette=sns.color_palette('crest', as_cmap=True),
#                 sizes=(50,250), ax=ax1[0]

# )

# ax.get_legend().remove()
# # plt.ylabel('Decimals')
# # plt.yticks(range(0,6))
# # plt.xticks([0], labels=[None])
# # plt.xlim((-.0001, .0001))
# # plt.xlabel('All LD Scores Kept')
# # plt.tight_layout()

# full_decimals = df.copy()
# full_decimals.decimals.fillna(0, inplace=True)
# ax = sns.scatterplot(data=full_decimals, x='precision', y='decimals', size="MBs", hue="MBs", palette=sns.color_palette('crest', as_cmap=True),
#                 sizes=(50,250), ax=ax2[1]

# )

# ax.get_legend().remove()

# plt.ylabel('Full Decimals Kept')
# plt.yticks([0], labels=[None])
# plt.ylim((-.5, .5))
# plt.xlabel('Minimum LD Score Kept')
# plt.xscale("log")

# lossless = df.fillna(0)
# ax=sns.scatterplot(data=lossless, x='precision', y='decimals', size="MBs", hue="MBs", palette=sns.color_palette('crest', as_cmap=True),
#                 sizes=(50,250), ax=ax2[0])


# ax.get_legend().remove()


# plt.xticks([0], labels=[None])
# plt.xlim((-.0001, .0001))
# plt.xlabel('All LD Scores Kept')
# plt.ylabel('Full Decimals Kept')
# plt.yticks([0], labels=[None])
# plt.ylim((-.5, .5))

In [None]:
# remove meaningless values
# Keep a run only when its threshold is at least 10**-decimals. A missing
# threshold counts as 1e-4 and a missing decimals value as 6 for this test.
df = df[
    df['precision'].fillna(1e-4) >= 10 ** (-df['decimals'].fillna(6))
]

In [None]:
# Decimals x precision table of sizes, with missing hyperparameters shown
# under the label 'Full'.
pivoted = df.fillna('Full').pivot(index='decimals', columns='precision', values='MBs')
# Rotate the last column to the front, then flip the row order.
pivoted = pivoted[[*pivoted.columns[-1:], *pivoted.columns[:-1]]]
pivoted = pivoted.iloc[::-1]
pivoted

In [None]:
# Copy of the default colormap that paints masked/missing cells white.
cmap = plt.get_cmap().copy()
cmap.set_bad('white')

# Log-coloured heatmap of the whole pivot table, written to disk.
sns.heatmap(
    pivoted,
    annot=True,
    yticklabels=['Full', 5, 4, 3, 2, 1],
    linewidths=1,
    cmap=cmap,
    norm=LogNorm(),
)
plt.gca().set(
    xlabel='Minimum LD Score Kept',
    ylabel='Decimals',
    title='% of Original Size (Chromosome 21)',
)

plt.savefig('../graphics/plots/full_heatmap.png', dpi=1000)


In [None]:
# Heatmap of the pivot table without its first row and first column, reusing
# the `cmap` built in the previous cell; written to disk.
sns.heatmap(
    pivoted.iloc[1:, 1:],
    annot=True,
    yticklabels=[5, 4, 3, 2, 1],
    linewidths=1,
    cmap=cmap,
    norm=LogNorm(),
)
plt.gca().set(
    xlabel='Minimum LD Score Kept',
    ylabel='Decimals',
    title='% of Original Size (Chromosome 21)',
)

plt.savefig('../graphics/plots/lossy_heatmap.png', dpi=1000)



In [None]:
# Full grid, lossless settings included. Missing hyperparameters get sentinel
# coordinates so they can be drawn: decimals -> 6 and precision -> 10**-3.5.
# Both axes relabel those sentinel positions as 'Full' below.
# This overwrites df in place; the fills have no effect once the NaNs are gone.
df.decimals = df.decimals.fillna(6)
df.precision = df.precision.fillna(10**(-3.5))

ax = sns.scatterplot(data=df, x='precision', y='decimals', size="MBs", hue="MBs", palette=sns.color_palette('crest', as_cmap=True),
                sizes=(50,250)

)

ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderpad=2, labelspacing=2, title="% of Original Size")
plt.ylabel('Decimals')
plt.yticks(range(1,7), labels=[1,2,3,4,5, 'Full'])
plt.xlabel('Minimum LD Score Kept')
plt.xscale("log")
plt.xticks([10**(-3.5), 1e-3, 1e-2, 1e-1], labels=['Full', 1e-3, 1e-2, 1e-1])

# Dashed guides separating the 'Full' row and 'Full' column from the rest.
plt.axhline(y=5.5, ls='--', lw=3, color='black')
plt.axvline(x=10**(-3.25),  ls='--', lw=3, color='black')

# First save: the plot without annotations.
plt.tight_layout()
plt.savefig('../graphics/plots/full.png', dpi=1000)

# Row with the largest decimals and the smallest precision, i.e. the point at
# both sentinel coordinates. Rebinds the notebook-level name `lossless`.
lossless = df[(df.decimals == df.decimals.max()) & (df.precision == df.precision.min())].iloc[0]
ax.annotate(lossless.MBs, (lossless.precision, lossless.decimals),
            xytext=(lossless.precision, lossless.decimals + .3),
            arrowprops={'arrowstyle': "-", 'color': 'black'})

# Row with the smallest value in the MBs column.
smallest = df[df.MBs == df.MBs.min()].iloc[0]

# NOTE(review): the text offsets below subtract a constant from x on a log
# axis; if the result is <= 0 the label position is invalid -- confirm
# against the data.
ax.annotate(smallest.MBs, (smallest.precision, smallest.decimals),
            xytext=(smallest.precision - .1, smallest.decimals + .3),
            arrowprops={'arrowstyle': "-", 'color': 'black'})

# Smallest MBs value that is still above 1.
greater_than_one = df[df.MBs > 1]
greater_than_one = greater_than_one[greater_than_one.MBs == greater_than_one.MBs.min()].iloc[0]
ax.annotate(greater_than_one.MBs, (greater_than_one.precision, greater_than_one.decimals),
            xytext=(greater_than_one.precision - .01, greater_than_one.decimals + .2))

# Largest MBs value that is still below 1.
smaller_than_one = df[df.MBs < 1]
smaller_than_one = smaller_than_one[smaller_than_one.MBs == smaller_than_one.MBs.max()].iloc[0]
ax.annotate(smaller_than_one.MBs, (smaller_than_one.precision, smaller_than_one.decimals),
            xytext=(smaller_than_one.precision - .01, smaller_than_one.decimals + .2))

# plt.xticks([0, .001, .01, .1])
# Second save: the same figure with the four annotations added.
plt.tight_layout()
plt.savefig('../graphics/plots/full_annotated.png', dpi=1000)
# plt.show()

In [None]:
# Runs that kept 3 decimals, excluding the 'Full' sentinel precision of
# 10**-3.5 and any threshold of 0.5 or more.
three = df[(df.decimals == 3) & (df.precision > 10**(-3.5)) & (df.precision < .5)]

# No hue/size variable is mapped here, so the `palette` and `sizes` arguments
# that used to be passed were ignored by seaborn and have been removed.
ax = sns.scatterplot(data=three, x='precision', y='MBs')

ax.set_xlabel('Minimum LD Score Kept')
ax.set_ylabel('% of Original Size')
ax.set_title('3 Decimals')

plt.tight_layout()
plt.savefig('../graphics/plots/fixed_decimals.png', dpi=1000)

In [None]:
# Same chart as the full plot but without the 'Full' decimals row.
# Missing hyperparameters get sentinel coordinates (decimals -> 6,
# precision -> 10**-3.5); the fills have no effect once the NaNs are gone.
df.decimals = df.decimals.fillna(6)
df.precision = df.precision.fillna(10**(-3.5))

# Only rows with decimals below the sentinel value 6 are drawn.
ax = sns.scatterplot(data=df[df.decimals < 6], x='precision', y='decimals', size="MBs", hue="MBs", palette=sns.color_palette('crest', as_cmap=True),
                sizes=(50,250)

)

ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderpad=2, labelspacing=2, title="% of Original Size")
plt.ylabel('Decimals')
plt.yticks(range(1,6), labels=[1,2,3,4,5])
plt.xlabel('Minimum LD Score Kept (logscale)')
plt.xscale("log")
plt.xticks([10**(-3.5), 1e-3, 1e-2, 1e-1], labels=['Full', 1e-3, 1e-2, 1e-1])

# Dashed guide separating the 'Full' precision column from the rest.
plt.axvline(x=10**(-3.25),  ls='--', lw=3, color='black')

# First save: the plot without annotations.
plt.tight_layout()
plt.savefig('../graphics/plots/full_precision.png', dpi=1000)

# NOTE(review): the annotated rows below are selected from the whole of df,
# not from the plotted subset, so a label can refer to a point that is not
# drawn -- confirm this is intended.
smallest = df[df.MBs == df.MBs.min()].iloc[0]

ax.annotate(smallest.MBs, (smallest.precision, smallest.decimals),
            xytext=(smallest.precision - .1, smallest.decimals + .3),
            arrowprops={'arrowstyle': "-", 'color': 'black'})

# Smallest MBs value that is still above 1.
greater_than_one = df[df.MBs > 1]
greater_than_one = greater_than_one[greater_than_one.MBs == greater_than_one.MBs.min()].iloc[0]
ax.annotate(greater_than_one.MBs, (greater_than_one.precision, greater_than_one.decimals),
            xytext=(greater_than_one.precision - .01, greater_than_one.decimals + .2))

# Largest MBs value that is still below 1.
smaller_than_one = df[df.MBs < 1]
smaller_than_one = smaller_than_one[smaller_than_one.MBs == smaller_than_one.MBs.max()].iloc[0]
ax.annotate(smaller_than_one.MBs, (smaller_than_one.precision, smaller_than_one.decimals),
            xytext=(smaller_than_one.precision - .01, smaller_than_one.decimals + .2))

# plt.xticks([0, .001, .01, .1])
# Second save: the same figure with the annotations added.
plt.tight_layout()
plt.savefig('../graphics/plots/full_precision_annotated.png', dpi=1000)
# plt.show()

In [None]:
# Lossy-only chart: both the 'Full' decimals row and the 'Full' precision
# column are excluded.
# Missing hyperparameters get sentinel coordinates (decimals -> 6,
# precision -> 10**-3.5); the fills have no effect once the NaNs are gone.
df.decimals = df.decimals.fillna(6)
df.precision = df.precision.fillna(10**(-3.5))

# Only rows strictly inside both sentinel values are drawn.
ax = sns.scatterplot(data=df[(df.decimals < 6) & (df.precision > 10**(-3.5))], x='precision', y='decimals', size="MBs", hue="MBs", palette=sns.color_palette('crest', as_cmap=True),
                sizes=(50,250)

)

ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), borderpad=2, labelspacing=2, title="% of Original Size")
plt.ylabel('Decimals')
plt.yticks(range(1,6), labels=[1,2,3,4,5])
plt.xlabel('Minimum LD Score Kept (logscale)')
plt.xscale("log")
# plt.xticks([10**(-3.5), 1e-3, 1e-2, 1e-1], labels=['Full', 1e-3, 1e-2, 1e-1])

# First save: the plot without annotations.
plt.tight_layout()
plt.savefig('../graphics/plots/lossy.png', dpi=1000)

# NOTE(review): the annotated rows below are selected from the whole of df,
# not from the plotted subset, so a label can refer to a point that is not
# drawn -- confirm this is intended.
smallest = df[df.MBs == df.MBs.min()].iloc[0]

ax.annotate(smallest.MBs, (smallest.precision, smallest.decimals),
            xytext=(smallest.precision - .1, smallest.decimals + .3),
            arrowprops={'arrowstyle': "-", 'color': 'black'})

# Smallest MBs value that is still above 1.
greater_than_one = df[df.MBs > 1]
greater_than_one = greater_than_one[greater_than_one.MBs == greater_than_one.MBs.min()].iloc[0]
ax.annotate(greater_than_one.MBs, (greater_than_one.precision, greater_than_one.decimals),
            xytext=(greater_than_one.precision - .01, greater_than_one.decimals + .2))

# Largest MBs value that is still below 1.
smaller_than_one = df[df.MBs < 1]
smaller_than_one = smaller_than_one[smaller_than_one.MBs == smaller_than_one.MBs.max()].iloc[0]
ax.annotate(smaller_than_one.MBs, (smaller_than_one.precision, smaller_than_one.decimals),
            xytext=(smaller_than_one.precision - .01, smaller_than_one.decimals + .2))

# plt.xticks([0, .001, .01, .1])
# Second save: the same figure with the annotations added.
plt.tight_layout()
plt.savefig('../graphics/plots/lossy_annotated.png', dpi=1000)
# plt.show()

In [None]:
# Size against decimals for the runs with a 0.06 threshold, excluding the
# 'Full' decimals sentinel (6). Rebinds `fixed_precision` to a DataFrame.
fixed_precision = df[(df.decimals < 6) & (df.precision == .06)]

# No hue/size variable is mapped here, so the `palette` and `sizes` arguments
# that used to be passed were ignored by seaborn and have been removed.
ax = sns.scatterplot(data=fixed_precision, x='decimals', y='MBs')

ax.set_xlabel('Decimals')
ax.set_ylabel('% of Original Size')
ax.set_title('.06 Minimum LD Score Kept')
ax.set_xticks(range(1, 6))

plt.tight_layout()
# Written to its own file: the following cell saves the 0.1-threshold figure
# as fixed_precision.png, which previously overwrote this one.
plt.savefig('../graphics/plots/fixed_precision_0_06.png', dpi=1000)

In [None]:
# Size against decimals for the runs with a 0.1 threshold, excluding the
# 'Full' decimals sentinel (6). Rebinds `fixed_precision` to a DataFrame.
fixed_precision = df[(df.decimals < 6) & (df.precision == .1)]

# No hue/size variable is mapped here, so the `palette` and `sizes` arguments
# that used to be passed were ignored by seaborn and have been removed.
ax = sns.scatterplot(data=fixed_precision, x='decimals', y='MBs')

ax.set_xlabel('Decimals')
ax.set_ylabel('% of Original Size')
ax.set_title('.1 Minimum LD Score Kept')
ax.set_xticks(range(1, 6))

plt.tight_layout()
plt.savefig('../graphics/plots/fixed_precision.png', dpi=1000)

In [None]:
# Exploratory joint plot of precision against decimals, coloured by MBs.
# NOTE(review): `ax` is bound to whatever sns.jointplot returns, which is
# presumably a grid object rather than a single Axes -- confirm.
ax = sns.jointplot(data=df, x='precision', y='decimals', hue="MBs")
# Applies a log x-scale to whichever axes is current after the call above.
plt.xscale("log")



In [None]:
# Exploratory heatmap of the two hyperparameter columns, one row per run.
sns.heatmap(data=df[['precision', 'decimals']])

In [None]:
# Exploratory image rendering of the whole frame, one pixel per cell.
plt.imshow(X=df)