In [None]:
import pandas as pd 
from collections import defaultdict
import random
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
%matplotlib inline
# Seed the stdlib RNG. Note that the analysis cell re-seeds with random.seed(52)
# before it samples leaves, so this value does not influence that sampling.
random.seed(2)
#figure(figsize=(12,6), dpi=300)
def diff(L1, L2):
    """Compare two sequences position by position over their common prefix.

    Only the first ``min(len(L1), len(L2))`` positions are compared.

    Returns:
        A tuple ``(mismatch_fraction, mean_abs_difference)``: the fraction of
        compared positions whose values differ, and the mean of ``abs(x - y)``
        over those positions.

    Raises:
        ZeroDivisionError: if either sequence is empty (nothing to compare).
    """
    n_compared = min(len(L1), len(L2))
    mismatches = 0
    abs_total = 0
    # zip stops at the shorter input, i.e. after exactly n_compared pairs.
    for left, right in zip(L1, L2):
        mismatches += (left != right)
        abs_total += abs(left - right)
    return mismatches / n_compared, abs_total / n_compared
def to_dict(item):
    """Read the CSV at ``item`` and return it as ``{column name: list of values}``."""
    return pd.read_csv(item).to_dict(orient='list')
# Display label for each method key. The keys ('gk', 'nb', 'vf0') are the ones the
# analysis cell accumulates distances under, and the labels are what the boxplot
# uses for its x-axis groups.
map_ = {'gk': 'Ginkgo','nb':'Error Model \nInferred topology', 'vf0':'Error Model \nTrue topology'}

In [None]:
#n = 20
# Build the per-replicate distance table for the 20-cell "halfevents" simulation.
# For every replicate, each method's copy-number profiles are compared with the
# ground truth on a random sample of leaves; the per-leaf Hamming and L1 values
# returned by diff() are averaged and stored as one row per (method, distance).
REP_DIR_TEMPLATE = 'BD_output/simulated/20cell_halfevents/rep{}/'
N_REPS = 9        # replicates rep0 .. rep8
MAX_LEAVES = 25   # upper bound on the number of leaves sampled per replicate
# NOTE: the misspelling 'estiamted' is kept on purpose -- the plotting cell selects
# hue levels by this exact string, so the two must stay byte-identical.
L1_LABEL = 'Average L1 norm between true and estiamted integer copy number'
HAMMING_LABEL = 'Average Hamming distance between true and estimated integer copy number'

random.seed(52)
records = []
for rep in range(N_REPS):
    rep_dir = REP_DIR_TEMPLATE.format(rep)

    # Ground truth: drop the coordinate columns, keep one column per leaf.
    # NOTE(review): 'END ' (with a trailing space) is kept from the original;
    # presumably the header in gt_leaf.segcopy is padded, which would also explain
    # why the leaf names are stripped below -- confirm against the file.
    truth_table = pd.read_csv(rep_dir + 'gt_leaf.segcopy', sep='\t')
    truth_table = truth_table.drop(columns=['CHR', 'START', 'END '])
    truth = {name.strip(): values
             for name, values in truth_table.to_dict(orient='list').items()}

    # Ginkgo: drop the coordinate columns and then the first remaining column.
    # NOTE(review): presumably that first column is not a leaf -- TODO confirm.
    ginkgo_table = pd.read_csv(rep_dir + 'gt.cnp', sep='\t')
    ginkgo_table = ginkgo_table.drop(columns=['CHR', 'START', 'END']).iloc[:, 1:]

    # Insertion order (gk, nb, vf0) fixes the row order of the output table.
    estimates = {
        'gk': ginkgo_table.to_dict(orient='list'),
        'nb': to_dict(rep_dir + 'cnp_vf0_mcctree.csv'),
        'vf0': to_dict(rep_dir + 'cnp_vf0_truetree.csv'),
    }

    leaves = list(truth)
    if not leaves:
        raise ValueError('no leaf columns found in ' + rep_dir + 'gt_leaf.segcopy')
    # Never ask for more leaves than exist: random.sample raises ValueError when
    # the sample is larger than the population.
    sampled = random.sample(leaves, min(MAX_LEAVES, len(leaves)))

    hamming_sum = dict.fromkeys(estimates, 0.0)
    l1_sum = dict.fromkeys(estimates, 0.0)
    for leaf in sampled:
        for method, profiles in estimates.items():
            # diff() returns (Hamming fraction, mean L1) in that fixed order, so
            # the values are used directly instead of being re-derived via min/max.
            hamming, l1 = diff(truth[leaf], profiles[leaf])
            hamming_sum[method] += hamming
            l1_sum[method] += l1

    # Average over the leaves that were actually summed.
    n_sampled = len(sampled)
    for method, total in l1_sum.items():
        records.append((map_[method], L1_LABEL, total / n_sampled))
    for method, total in hamming_sum.items():
        records.append((map_[method], HAMMING_LABEL, total / n_sampled))

# Build the frame once instead of appending row by row.
df = pd.DataFrame(records, columns=['Method', 'Distance', 'Value'])
df.to_csv('BD_output/figure_data/n20_c90_cnp.csv')

In [None]:
# Boxplot of the distance table stored in n20_c90_cnp.csv: one group per method,
# one box per distance type.
plt.figure(figsize=(9, 6), dpi=300)
df = pd.read_csv('BD_output/figure_data/n20_c90_cnp.csv')
sns.set_theme(style="whitegrid")

# x-axis groups, in display order, looked up from the method-key -> label mapping.
method_order = [map_[method_key] for method_key in ('gk', 'nb', 'vf0')]
# Hue levels; these must match the 'Distance' strings in the CSV exactly.
distance_order = [
    'Average Hamming distance between true and estimated integer copy number',
    'Average L1 norm between true and estiamted integer copy number',
]

ax = sns.boxplot(
    data=df,
    x="Method",
    y="Value",
    hue="Distance",
    order=method_order,
    hue_order=distance_order,
    width=0.5,
    linewidth=1.5,
    whis=2,
)

# Hide the top and right frame lines.
for side in ('right', 'top'):
    ax.spines[side].set_visible(False)

ax.set_ylim(0, 0.5)
ax.set_ylabel('Value', fontsize=16)
ax.set_xlabel('N = 20, c = 90', fontsize=16)
ax.legend(loc='upper right', borderaxespad=1, fontsize='10')
plt.show()