In [1]:
%matplotlib widget
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
import pandas as pd
import re

def strip_tree(tree):
    """Strip a tree string down to its structural characters.

    Every digit, colon, underscore, dot and lowercase ASCII letter is
    removed; all other characters (parentheses, commas, uppercase labels,
    ...) are kept in their original order.

    Parameters
    ----------
    tree : str
        Tree text, e.g. ``'(G:0.37,(N:0.11,B:0.03):0.25)'``.

    Returns
    -------
    str
        The input with the characters listed above deleted.
    """
    # Raw string: '\d' and '\.' are invalid escapes in a normal str literal.
    return re.sub(r'[\d:_.a-z]', '', tree)

In [2]:
def load_trees(fname):
    """Return the ';'-terminated entries found on the first line of a file.

    Only the first line is read. It is right-stripped and split on ';', and
    the final piece (whatever follows the last ';') is discarded.

    Parameters
    ----------
    fname : str or path-like
        Path of the text file to read.

    Returns
    -------
    list of str
        The pieces before each ';' on the first line; ``[]`` if the file is
        empty or the first line contains no ';'.
    """
    with open(fname, 'r') as fin:
        # readline() yields '' for an empty file, which falls through to [].
        first_line = fin.readline()
    return first_line.rstrip().split(';')[:-1]

In [5]:
# Trees come in consecutive batches ("runs") of N_CHROM entries, one per 'chr'
# value 1..N_CHROM.
N_CHROM = 29

# (exclusive upper bound on the run index, config label), in ascending order.
# A run at or beyond the last bound gets config None.
CONFIG_RUN_LIMITS = [
    (5, 'shasta'),
    (10, 'hifiasm'),
    (20, 'hs_shuf'),
    (40, 'either'),
    (43, 'peregrine'),
    (46, 'raven'),
    (49, 'flye'),
    (50, 'hicanu'),
    (60, 'all_shuf'),
    (80, 'all_eith'),
]

trees = load_trees("joined_trees.txt")
rows = []

for i, tree in enumerate(trees):
    comb = strip_tree(tree)

    # First three label characters once the brackets and commas are gone.
    x = comb.replace('(', '').replace(')', '').replace(',', '')[:3]
    # Order code: the first two labels sorted, followed by the third.
    code = '-'.join(sorted(x[:2]) + [x[2]])

    # Position in the flat list -> (run index, 1-based chr within the run).
    run, chrom_idx = divmod(i, N_CHROM)
    chrom = chrom_idx + 1

    config = next((name for limit, name in CONFIG_RUN_LIMITS if run < limit), None)

    # Store the same run index that selected `config`; dividing by 30 here
    # would drift out of step with the 29-entry batches.
    rows.append({'run': run, 'chr': chrom, 'order': code, 'config': config})
df = pd.DataFrame(rows)

Unnamed: 0_level_0,run,chr,order
config,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
all_eith,580,580,580
all_shuf,290,290,290
either,580,580,580
flye,87,87,87
hicanu,29,29,29
hifiasm,145,145,145
hs_shuf,290,290,290
peregrine,87,87,87
raven,87,87,87
shasta,145,145,145


In [20]:
# Count plot of 'chr' split by 'order', one panel per 'config', four panels per row.
sns.catplot(
    data=df,
    kind='count',
    x='chr',
    hue='order',
    col='config',
    col_wrap=4,
)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

<seaborn.axisgrid.FacetGrid at 0x7fcc8e0dc340>

In [43]:
# Keep rows from four configs and count the rows in each (chr, order) group.
qq = (
    df[df['config'].isin(['hifiasm', 'shasta', 'either', 'hs_shuf'])]
    .groupby(['chr', 'order'])
    .count()
    .reset_index()
)

# After count(), the 'run' column holds the per-group row count.
plt.figure()
sns.scatterplot(data=qq, x='chr', y='run', hue='order')

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

Unnamed: 0,chr,order,run,config
3,2,B-P-O,40,40
11,6,B-O-P,40,40
17,9,B-O-P,40,40
18,10,B-P-O,40,40
29,15,B-P-O,40,40
48,23,B-P-O,39,39
50,24,B-P-O,40,40
53,26,B-P-O,40,40
59,29,B-P-O,40,40


In [23]:
# ';'-separated tree strings; the trailing replace() chain collapses the taxon
# names to single uppercase letters (gaur->G, bsw->B, nellore->N, obv->O, pied->P).
# NOTE(review): presumably one reference tree per chr - confirm against the data source.
data='(gaur:0.376151,(nellore:0.117037,(bsw:0.0383929,(obv:0.036079,pied:0.036079):0.00231387):0.0786446):0.259113);(gaur:0.384387,(nellore:0.104898,(pied:0.0358684,(bsw:0.0336749,obv:0.0336749):0.00219359):0.0690295):0.279489);(gaur:0.393264,(nellore:0.112438,(pied:0.04183,(bsw:0.037261,obv:0.037261):0.00456896):0.0706083):0.280826);(gaur:0.399114,(nellore:0.122466,(bsw:0.0433282,(obv:0.0387218,pied:0.0387218):0.00460634):0.0791373):0.276648);(gaur:0.366217,(nellore:0.118095,(pied:0.0400524,(bsw:0.0372951,obv:0.0372951):0.00275733):0.0780421):0.248123);(gaur:0.394545,(nellore:0.111131,(pied:0.0436057,(bsw:0.0387256,obv:0.0387256):0.00488007):0.067525):0.283414);(gaur:0.409183,(nellore:0.112553,(bsw:0.0389752,(obv:0.0360255,pied:0.0360255):0.00294963):0.0735783):0.29663);(gaur:0.397388,(nellore:0.122268,(obv:0.0454757,(bsw:0.0351746,pied:0.0351746):0.0103011):0.0767922):0.275121);(gaur:0.398349,(nellore:0.105709,(pied:0.0400871,(bsw:0.0381708,obv:0.0381708):0.00191624):0.0656224):0.29264);(gaur:0.386962,(nellore:0.102029,(bsw:0.0406497,(obv:0.0385982,pied:0.0385982):0.00205157):0.061379):0.284933);(gaur:0.369279,(nellore:0.114112,(pied:0.0387218,(bsw:0.0361617,obv:0.0361617):0.00256012):0.0753905):0.255167);(gaur:0.380651,(nellore:0.13181,(bsw:0.0528749,(obv:0.0441349,pied:0.0441349):0.00874002):0.0789346):0.248842);(gaur:0.391107,(nellore:0.111082,(pied:0.0343925,(bsw:0.0314762,obv:0.0314762):0.00291624):0.0766893):0.280025);(gaur:0.39991,(nellore:0.104656,(bsw:0.0374423,(obv:0.0363872,pied:0.0363872):0.00105507):0.0672136):0.295254);(gaur:0.36453,(nellore:0.122061,(bsw:0.043176,(obv:0.040549,pied:0.040549):0.00262703):0.0788854):0.242469);(gaur:0.399314,(nellore:0.112428,(bsw:0.0385897,(obv:0.0364024,pied:0.0364024):0.00218733):0.0738385):0.286885);(gaur:0.388082,(nellore:0.11466,(pied:0.0401085,(bsw:0.0382637,obv:0.0382637):0.00184477):0.0745517):0.273422);(gaur:0.352804,(nellore:0.111778,(pied:0.0426599,(bsw:0.0388053,obv:0.0388053):0.00385459):0.0691186):0.2410
26);(gaur:0.397195,(nellore:0.100784,(bsw:0.0366994,(obv:0.0356394,pied:0.0356394):0.00105997):0.0640849):0.29641);(gaur:0.328738,(nellore:0.0994499,(pied:0.0365601,(bsw:0.0348973,obv:0.0348973):0.00166282):0.0628898):0.229288);(gaur:0.403084,(nellore:0.116689,(bsw:0.0409729,(obv:0.0313408,pied:0.0313408):0.00963216):0.0757156):0.286395);(gaur:0.391599,(nellore:0.0879211,(bsw:0.0306894,(obv:0.0303176,pied:0.0303176):0.000371842):0.0572316):0.303678);(gaur:0.330951,(nellore:0.123869,(pied:0.0526,(bsw:0.0465642,obv:0.0465642):0.00603579):0.0712689):0.207082);(gaur:0.365075,(nellore:0.116235,(bsw:0.0401447,(obv:0.0372307,pied:0.0372307):0.00291405):0.0760902):0.24884);(gaur:0.42264,(nellore:0.118208,(obv:0.0437409,(bsw:0.0410686,pied:0.0410686):0.00267231):0.0744673):0.304432);(gaur:0.390656,(nellore:0.113633,(obv:0.0397816,(bsw:0.0373379,pied:0.0373379):0.00244372):0.0738513):0.277023);(gaur:0.398647,(nellore:0.144023,(pied:0.036351,(bsw:0.0354978,obv:0.0354978):0.000853173):0.107672):0.254624);(gaur:0.384289,(nellore:0.141181,(obv:0.0394741,(bsw:0.0383582,pied:0.0383582):0.00111593):0.101707):0.243107);(gaur:0.355713,(nellore:0.129896,(bsw:0.0420809,(obv:0.0386343,pied:0.0386343):0.00344661):0.0878151):0.225817)'.replace('gaur','G').replace('bsw','B').replace('nellore','N').replace('obv','O').replace('pied','P')
# Reduce every tree in `data` to its label characters and drop the first two.
raw = []
for newick in data.split(';'):
    labels = strip_tree(newick).replace('(', '').replace(')', '').replace(',', '')
    raw.append(labels[2:])

# Order code per tree: everything after the first label, sorted, then the first label.
SNP = ['-'.join(sorted(trio[1:]) + [trio[0]]) for trio in raw]

['O-P-B',
 'B-O-P',
 'B-O-P',
 'O-P-B',
 'B-O-P',
 'B-O-P',
 'O-P-B',
 'B-P-O',
 'B-O-P',
 'O-P-B',
 'B-O-P',
 'O-P-B',
 'B-O-P',
 'O-P-B',
 'O-P-B',
 'O-P-B',
 'B-O-P',
 'B-O-P',
 'O-P-B',
 'B-O-P',
 'O-P-B',
 'O-P-B',
 'B-O-P',
 'O-P-B',
 'B-P-O',
 'B-P-O',
 'B-O-P',
 'B-P-O',
 'O-P-B']

In [80]:
for t in ['hifiasm','shasta','hs_shuf','either','peregrine','raven','flye','hicanu','all_shuf','all_eith']:
    # Most frequent 'order' for each chr value 1..29 within this config.
    most_c = [
        Counter(df[(df['chr'] == chrom) & (df['config'] == t)]['order']).most_common(1)[0][0]
        for chrom in range(1, 30)
    ]
    # Number of positions where that order equals the corresponding SNP entry.
    c = sum(consensus == reference for consensus, reference in zip(most_c, SNP))
    print(t, c)

hifiasm 10
shasta 14
hs_shuf 13
either 10
peregrine 11
raven 7
flye 11
hicanu 10
all_shuf 9
all_eith 11


In [88]:
# Count plot of 'order' split by 'config', one panel per 'chr', four panels per row.
g = sns.catplot(
    data=df,
    kind='count',
    x='order',
    hue='config',
    col='chr',
    col_wrap=4,
)
for panel_idx, panel in enumerate(g.axes):
    # Fully transparent point at 'B-P-O'; presumably there so that category
    # is registered on every panel's x-axis - TODO confirm.
    panel.scatter('B-P-O', 15, alpha=0)
    # Vertical line at the SNP order with the same index as this panel.
    panel.axvline(SNP[panel_idx])


Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [None]:
import pandas as pd
# NOTE: this rebinds `df`; the frame built from the tree rows earlier in the
# notebook is no longer reachable under that name from here on.
df = pd.read_csv('bad_regions.csv')

In [None]:
# Pairwise plots over the columns of df, coloured by the 'asm' column.
sns.pairplot(data=df,hue='asm')

In [None]:
# Per-'asm' mean of the numeric columns. numeric_only=True is explicit because
# pandas >= 2.0 raises TypeError on non-numeric columns instead of dropping them.
df.groupby('asm').mean(numeric_only=True)

In [None]:
import scipy.stats as ss
# Mann-Whitney U test comparing 'N_unaligned' between the rows whose 'asm'
# is 'P_hifiasm' and those whose 'asm' is 'P_shasta'.
ss.mannwhitneyu(
    df.loc[df['asm'] == 'P_hifiasm', 'N_unaligned'],
    df.loc[df['asm'] == 'P_shasta', 'N_unaligned'],
)

In [None]:
# For each run 300..309: the 'asm' labels of that run's rows, ordered by
# ascending 'N_uncalled' (ties keep their row order, since sorted() is stable).
f = [
    [
        asm
        for asm, _uncalled in sorted(
            ((rec['asm'], rec['N_uncalled']) for _, rec in df[df['run'] == run_id].iterrows()),
            key=lambda pair: pair[1],
        )
    ]
    for run_id in range(300, 310)
]
f

In [None]:
# Lines of orders.txt with trailing whitespace removed. The context manager
# closes the file handle, which a bare open() inside the comprehension never does.
with open('orders.txt') as orders_file:
    orders = [l.rstrip() for l in orders_file]