In [1]:
import json
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import sys

In [2]:
# Load the TR region catalog (tab-separated, header row supplies the column
# names) and key it by genomic coordinates so it can be joined on them later.
catalog = pd.read_csv(
    "/Users/english/code/adotto/regions/adotto_TRregions_v1.1.bed",
    sep='\t',
).set_index(['chr', 'start', 'end'])

In [3]:
# Load the HG002 benchmark regions. The file has no header row, so the column
# names are supplied here; the frame is keyed by genomic coordinates.
BENCHMARK_COLUMNS = ['chr', 'start', 'end', 'tier', 'repl', 'var_state', 'entropy', 'mat_ad', 'pat_ad']
benchmark = pd.read_csv(
    "/Users/english/code/adotto/benchmark/GIABTR_benchmark.6.26/GIABTR.HG002.benchmark.regions.bed.gz",
    sep='\t',
    names=BENCHMARK_COLUMNS,
).set_index(['chr', 'start', 'end'])

In [4]:
# Row counts of catalog and benchmark, and the fraction of catalog regions
# that appear in the benchmark.
catalog.shape[0], benchmark.shape[0], benchmark.shape[0] / catalog.shape[0]

(1784804, 1706853, 0.9563251763218819)

In [5]:
# Keep only Tier1 benchmark regions and attach the catalog columns for the
# same (chr, start, end) key (left join on the shared index).
data = benchmark.loc[benchmark['tier'] == "Tier1"].join(catalog)

In [6]:
# Flag regions whose `interspersed` column holds anything other than the '.' placeholder.
data['is_inter'] = data['interspersed'].ne('.')

def get_max_motif(x):
    """Return the length of the longest motif in a JSON-encoded annotation list.

    `x` is a JSON string that decodes to a sequence of objects, each carrying
    a 'motif' entry. Raises ValueError if the decoded sequence is empty.
    """
    annotations = json.loads(x)
    return max(len(anno['motif']) for anno in annotations)
# Length of the longest annotated motif per region, parsed from the JSON in `annos`.
data['max_motif'] = data['annos'].apply(get_max_motif)

# Bin regions by longest motif length. With right=False every bin is
# left-closed / right-open, so these edges yield [2, 8), [8, 20), [20, 50),
# [50, 100) and [100, sys.maxsize). The labels must state those same edges
# (the previous labels "[9, 20)", "[21,50)", "[51, 100)", ">100" did not).
# Values below 2 fall outside every bin and get NaN.
data['motif_length_bin'] = pd.cut(data['max_motif'], bins=[2, 8, 20, 50, 100, sys.maxsize], right=False,
              labels=["[2, 8)", "[8, 20)", "[20, 50)", "[50, 100)", ">=100"])
# Row-wise maximum of the two per-haplotype columns mat_ad / pat_ad.
data['max_delta'] = data[['mat_ad','pat_ad']].max(axis=1)
# Boolean mask of regions whose var_state has its lowest bit set
# (presumably "variant in HG002" -- confirm against the benchmark README).
hg002_subset = data['var_state'].apply(lambda x: x & 0x1) == 1


# Provisional copy-number estimate from max_delta; it is overwritten below
# once the per-region deltas from n_ad.txt have been loaded.
data['approx_copies'] = data['max_delta'] / data['max_motif']

# n_ad.txt: headerless TSV of chr/start/end/delta, keyed like `data`.
real_deltas = pd.read_csv("n_ad.txt", sep='\t', names=["chr", "start", "end", "delta"]).set_index(['chr', 'start', 'end'])
# Select the single column explicitly so the assignment aligns a Series on the index.
data['real_delta'] = real_deltas['delta']
data['approx_copies'] = data['real_delta'] / data['max_motif']

FileNotFoundError: [Errno 2] No such file or directory: 'n_ad.txt'

In [None]:
# Figure 2c: distribution of max_delta per motif-length bin for HG002
# regions that are not interspersed (outliers hidden).
p = sb.boxplot(
    data=data.loc[hg002_subset & ~data['is_inter']],
    x="motif_length_bin",
    y="max_delta",
    showfliers=False,
)
p.set(title="Motif x Delta (F2c)")

In [None]:
# There's a general trend that longer motifs create longer expansions/contractions of the region.
# However, for regions where the maximum motif is over 100bp, we see an increase in 'partial' copies.
# This is possibly due to the maximum motif length annotation being a super-sequence of the
# (TODO: this note is unfinished -- complete the sentence above.)

In [None]:
# max_delta per motif-length bin for all HG002 regions, split by whether
# the region is interspersed (outliers hidden).
p = sb.boxplot(
    data=data.loc[hg002_subset],
    x="motif_length_bin", y="max_delta", hue='is_inter',
    showfliers=False,
)

In [None]:
# max_delta per motif-length bin, HG002 regions without interspersed
# repeats only (outliers hidden).
p = sb.boxplot(
    data=data.loc[hg002_subset & (~data['is_inter'])],
    x="motif_length_bin", y="max_delta",
    showfliers=False,
)

In [None]:
# How many Tier1 regions are / are not interspersed (could add another note about these).
data.loc[:, "is_inter"].value_counts()

In [None]:
# Number of non-interspersed HG002 regions with positive vs negative approx_copies.
print(data.loc[hg002_subset & ~data['is_inter'], 'approx_copies'].gt(0).sum(),
      data.loc[hg002_subset & ~data['is_inter'], 'approx_copies'].lt(0).sum())

In [None]:
# Figure 2d: stacked histogram of approx_copies for non-interspersed HG002
# regions whose absolute copy change lies in [1, 21], coloured by motif bin.
view = data.loc[
    hg002_subset
    & data['approx_copies'].abs().between(1, 21)
    & ~data['is_inter']
]
p = sb.histplot(data=view, x='approx_copies', hue="motif_length_bin",
                multiple='stack', binwidth=1)
plt.xticks(ha='left')
p.set(yscale='log', title="approximate expansion/contraction copy number (F2d)")

# For this we'll be looking for symmetry. 
There will, however, be an imbalance at larger allele deltas, because contractions are limited to at most the length of the region's span whereas expansions can continue indefinitely.

In [None]:
# Why we sometimes want to exclude interspersed repeats
# Stacked histogram of approx_copies for interspersed HG002 regions only.
# Note the filter keeps values in [1, 21] without .abs(), i.e. positive only.
view = data[hg002_subset & (data['approx_copies'].between(1, 21)) & (data['is_inter'])]
# x must name an existing column: `fake_approx_copies` is never created in
# this notebook, so plot the same `approx_copies` column the filter uses.
p = sb.histplot(data=view, 
                x='approx_copies', hue="motif_length_bin", 
                multiple='stack', 
                binwidth=1)
p.set(yscale='log')

# Percent of TR regions in Tiers

In [None]:
# Number of benchmark regions in each tier.
benchmark.loc[:, "tier"].value_counts()

In [None]:
# Per-tier region counts as a fraction of all benchmark rows.
benchmark['tier'].value_counts().div(len(benchmark))

In [None]:
# Total bases spanned (sum of end - start) by the regions of each tier.
spans = {}
for tier in ("Tier1", "Tier2"):
    # reset_index turns the chr/start/end index levels back into columns
    v = benchmark[benchmark['tier'] == tier].reset_index()
    spans[tier] = (v['end'] - v['start']).sum()
t1_span, t2_span = spans["Tier1"], spans["Tier2"]
print('t1', t1_span, 't2', t2_span)

In [None]:
# Unanimous agreement