In [12]:
import pandas as pd
import seaborn as sb
import glob
import joblib
import numpy as np

In [None]:
# Per sample numbers

In [11]:
# Paths of the joblib dumps holding per-sample variant counts (one file per
# chromosome, judging by how they are iterated below).
# Sorted so the load order -- and therefore the sample order of every table
# built from these files -- is deterministic across runs and filesystems.
all_dfs = sorted(glob.glob("persamp_varnumbers/*jl"))
# NOTE(security): joblib.load unpickles arbitrary objects; only load files
# from a trusted source.
chrom_persample = [joblib.load(path) for path in all_dfs]

In [14]:
# Accumulate each sample's count array over every loaded chromosome dict.
sample_sum = {}
for chrom in chrom_persample:
    for sample, values in chrom.items():
        if sample in sample_sum:
            sample_sum[sample] += values
        else:
            # Copy on first sight: storing `values` itself would make the
            # in-place `+=` above write into the array still held by
            # `chrom_persample`, so re-running this cell would double count.
            sample_sum[sample] = values.copy()

In [27]:
# Grand total over the first four columns for one example sample.
sample_sum['HG00096'][:, :4].sum()

7506675

In [37]:
# Row 1 as a fraction of the column total, for the first four columns of one
# example sample (row 1 is presumably the within-TR counts -- TODO confirm).
# NOTE(review): the recorded output below has five values, but the `[:, :4]`
# slice keeps only four columns -- the output looks stale; re-run this cell.
s = sample_sum['HG00096'][:, :4]
s[1] / s.sum(axis=0)

array([0.20591961, 0.35057034, 0.65898265, 0.71786304, 0.49732879])

In [28]:
# One row per sample: total count over the first four columns of its array.
vars_per_sample = pd.DataFrame(
    [(sample, counts[:, :4].sum()) for sample, counts in sample_sum.items()],
    columns=['sample', 'tot_vars'],
)
vars_per_sample

Unnamed: 0,sample,tot_vars
0,HG00096,7506675
1,HG00171,7716979
2,HG00512,7950401
3,HG00513,7949054
4,HG00514,9854081
...,...,...
81,NA21309,7080185
82,li:HG00733,6550635
83,li:NA12878,6269644
84,li:NA24385,6307540


In [30]:
# Summary statistics of the per-sample totals, cast to int for display
# (the cast truncates the fractional part of mean/std/quantiles).
vars_per_sample['tot_vars'].describe().astype(int)

count          86
mean      7250447
std        994311
min       5774879
25%       6258978
50%       7250783
75%       7765342
max      10227465
Name: tot_vars, dtype: int64

In [34]:
def pct_tr(d):
    """Return row 1's share of the grand total over the first four columns of ``d``.

    Only columns 0-3 take part; any further column is ignored.
    NOTE(review): row 1 is presumably the within-TR counts -- confirm against
    whatever produced these arrays.
    """
    counts = d[:, :4]
    row1_total = counts[1].sum()
    return row1_total / counts.sum()

# One row per sample: the overall fraction returned by pct_tr().
pct_tr_per_sample = pd.DataFrame(
    [(sample, pct_tr(counts)) for sample, counts in sample_sum.items()],
    columns=['sample', 'pct_tr'],
)
pct_tr_per_sample

Unnamed: 0,sample,pct_tr
0,HG00096,0.241313
1,HG00171,0.245533
2,HG00512,0.233581
3,HG00513,0.237805
4,HG00514,0.227258
...,...,...
81,NA21309,0.253136
82,li:HG00733,0.254720
83,li:NA12878,0.251977
84,li:NA24385,0.253190


In [39]:
def pct_tr_type(k, d):
    """Return ``[k, f0, f1, ...]`` where ``f_i`` is row 1 of column ``i`` over that column's total.

    ``k`` is passed through unchanged as the first element (used as the row
    label by the caller); ``d`` is not modified.
    """
    per_column_fraction = d[1] / d.sum(axis=0)
    return [k, *per_column_fraction]

# One row per sample with the row-1 fraction of each column.
# NOTE: this rebinds `pct_tr_per_sample`, replacing the single-column
# 'pct_tr' table built earlier under the same name.
pct_tr_per_sample = pd.DataFrame(
    [pct_tr_type(*item) for item in sample_sum.items()],
    columns=['sample', 'snp', 'five', 'fifty', 'sv', 'len'],
)
pct_tr_per_sample

Unnamed: 0,sample,snp,five,fifty,sv,len
0,HG00096,0.205920,0.350570,0.658983,0.717863,0.497329
1,HG00171,0.211059,0.357604,0.663484,0.711061,0.485174
2,HG00512,0.198931,0.332674,0.659010,0.702700,0.462575
3,HG00513,0.202685,0.351580,0.662701,0.708800,0.471683
4,HG00514,0.205112,0.251086,0.658276,0.703133,0.444972
...,...,...,...,...,...,...
81,NA21309,0.212807,0.380759,0.669610,0.727189,0.470844
82,li:HG00733,0.217738,0.377267,0.668760,0.735432,0.507921
83,li:NA12878,0.213776,0.376909,0.667226,0.738703,0.489362
84,li:NA24385,0.212839,0.379759,0.665112,0.738439,0.518887


In [40]:
# Distribution of the per-column fractions across samples.
pct_tr_per_sample.describe()

Unnamed: 0,snp,five,fifty,sv,len
count,86.0,86.0,86.0,86.0,86.0
mean,0.212091,0.366297,0.666016,0.722369,0.464789
std,0.00684,0.025523,0.005652,0.011762,0.022267
min,0.189031,0.251086,0.654101,0.691322,0.401017
25%,0.208962,0.360253,0.661121,0.714486,0.450411
50%,0.214178,0.378982,0.668138,0.725985,0.465788
75%,0.216863,0.381,0.670546,0.730871,0.481645
max,0.2214,0.386498,0.674651,0.739112,0.518887


In [3]:
# Tab-separated table of variant counts per TR-catalog region
# (columns chrom/start/end plus the count columns).
pvcf = pd.read_csv("TRcatalog_pVCFcounts.txt.gz", sep='\t')

In [4]:
# Tab-separated genome-wide counts keyed by `chrom`, with the same count
# columns as the per-region table.
genome = pd.read_csv("genome_pVCFcounts.txt", sep='\t')

In [5]:
# Column totals of the genome-wide table.
# NOTE(review): `chrom` is a string column, so it gets "summed" by
# concatenation and the result is dtype object (see the output); the totals
# used later drop `chrom` before summing.
genome.sum()

chrom          chr1chr2chr3chr4chr5chr6chr7chr8chr9chr10chr11...
snp                                                    106410565
five                                                    14421648
fifty                                                    3138977
sv                                                        763953
total_len                                             1185371549
total_count                                            124735143
dtype: object

In [6]:
# Column totals for the genome (tots_g) and the TR catalog (tots_p), with the
# label/coordinate columns dropped so both Series carry only the count columns.
tots_g = genome.drop(columns=['chrom']).sum()
tots_p = pvcf.drop(columns=['chrom', 'start', 'end']).sum()

In [7]:
# TR-catalog totals as a fraction of the genome-wide totals, per column.
tots_p / tots_g

snp            0.182234
five           0.259509
fifty          0.697908
sv             0.742466
total_len      0.410679
total_count    0.207577
dtype: float64

In [8]:
# Absolute TR-catalog totals per column.
tots_p

snp             19391644
five             3742549
fifty            2190718
sv                567209
total_len      486807444
total_count     25892120
dtype: int64

In [19]:
# Total of `total_count` over all TR regions (matches tots_p['total_count']).
pvcf['total_count'].sum()

25892120

In [5]:
# Fraction of TR regions whose `total_count` is zero.
(pvcf['total_count'] == 0).mean()

0.2724349564433966

# What percent of the TR regions have no variants?
- no variants at all
- no non-SNP variants
- no variants >= 5bp

In [41]:
# Peek at the per-region table (pandas truncates the display of long frames).
pvcf

Unnamed: 0,chrom,start,end,snp,five,fifty,sv,total_len,total_count
0,chr1,9975,10498,1165,54,5,3,491,1227
1,chr1,10602,11025,651,18,7,15,2684,691
2,chr1,11195,11472,91,7,0,0,7,98
3,chr1,15797,15871,14,4,0,0,4,18
4,chr1,16687,16768,11,4,2,0,24,17
...,...,...,...,...,...,...,...,...,...
1784799,chrY,56881152,56881235,14,6,5,0,51,25
1784800,chrY,56884295,56884465,227,12,13,0,205,252
1784801,chrY,56884637,56884703,109,2,0,0,2,111
1784802,chrY,56885593,56885665,89,3,0,0,5,92


In [43]:
# Row-wise non-SNP count ('five' + 'fifty' + 'sv') for the first five regions.
pvcf[['five', 'fifty', 'sv']].head().sum(axis=1)

0    62
1    40
2     7
3     4
4     6
dtype: int64

In [44]:
# Fraction of TR regions lacking each class of variant; one boolean mask per
# printed label, reported in the same order as before.
no_variant_masks = {
    'none at all': pvcf['total_count'] == 0,
    'no non-snps': pvcf[['five', 'fifty', 'sv']].sum(axis=1) == 0,
    'no >=5bp': pvcf[['fifty', 'sv']].sum(axis=1) == 0,
}
for label, mask in no_variant_masks.items():
    print(label, mask.mean())

none at all 0.2724349564433966
no non-snps 0.4745254941158805
no >=5bp 0.810680612549053


In [46]:
# TR regions with at least one 'fifty' or 'sv' variant. Regions that only have
# 'snp'/'five' variants are excluded, so this is narrower than "any variation".
view = pvcf[(pvcf[['fifty', 'sv']].sum(axis=1) != 0)]

In [49]:
# Summed length (end - start) of the selected regions over a fixed denominator.
# NOTE(review): 2_923_715_986 is presumably the genome length in bp -- confirm
# its source and consider naming it as a constant.
(view['end'] - view['start']).sum() / 2_923_715_986

0.03035558495591849

In [53]:
# Share of the genome-wide 'fifty' + 'sv' totals that fall in the TR catalog.
tots_p[['fifty', 'sv']].sum() / tots_g[['fifty', 'sv']].sum()

0.706629890876854

In [74]:
# Write the regions currently held in `view` as a headerless, tab-separated
# 3-column BED file.
# NOTE(review): despite the file name, `view` was selected on 'fifty'/'sv' only.
view[['chrom', 'start', 'end']].to_csv('TR_with_any_variant.bed', sep='\t', header=False, index=False)

In [75]:
# Complement of the earlier selection: regions with no 'fifty' and no 'sv'
# variants, written as a headerless, tab-separated 3-column BED file.
# NOTE: this rebinds `view`; cells that expect the "with variant" selection
# must not be re-run after this one.
view = pvcf[pvcf[['fifty', 'sv']].sum(axis=1).eq(0)]
view.loc[:, ['chrom', 'start', 'end']].to_csv('TR_without_any_variant.bed', sep='\t', header=False, index=False)

# Checking dbSNP

In [67]:
# Key columns identifying a TR region; used as the groupby / de-duplication key
# for the dbSNP tables.
sub = ['chrom', 'start', 'end']

In [69]:
# Collapse the chr1 dbSNP counts to one row per region key, summing the
# remaining columns of rows that share a key.
all_dbsnp = (
    pd.read_csv("TRcatalog_chr1_dbSNPcounts.txt.gz", sep='\t')
    .groupby(sub, as_index=False)
    .sum()
)
# Shape after de-duplicating on the key (keys are already unique after the
# groupby, so this doubles as a sanity check).
all_dbsnp.drop_duplicates(subset=sub).shape

(145464, 8)

In [71]:
# Shape of the subset of regions with a non-zero 'fifty' + 'sv' dbSNP count.
all_dbsnp[(all_dbsnp[['fifty', 'sv']].sum(axis=1) != 0)].shape

(115377, 8)

In [73]:
# Share of chr1 TR-catalog regions with a non-zero 'fifty' + 'sv' dbSNP count.
# The numerator is computed from `all_dbsnp` rather than hard-coded from an
# earlier cell's output, so it stays correct if the input file changes.
(all_dbsnp[['fifty', 'sv']].sum(axis=1) != 0).sum() / (pvcf['chrom'] == 'chr1').sum()

0.791326593599539

In [66]:
# Collapse the chr1 "common" dbSNP counts to one row per region key, summing
# the remaining columns of rows that share a key.
common_dbsnp = (
    pd.read_csv("TRcatalog_chr1_dbSNP_commoncounts.txt.gz", sep='\t')
    .groupby(sub, as_index=False)
    .sum()
)
# Shape after de-duplicating on the key (keys are already unique after the
# groupby, so this doubles as a sanity check).
common_dbsnp.drop_duplicates(subset=sub).shape

(78988, 8)

In [68]:
# Shape of the subset of regions with a non-zero 'fifty' + 'sv' common-dbSNP count.
common_dbsnp[(common_dbsnp[['fifty', 'sv']].sum(axis=1) != 0)].shape

(37968, 8)

In [72]:
# Share of chr1 TR-catalog regions with a non-zero 'fifty' + 'sv' common-dbSNP
# count. The numerator is computed from `common_dbsnp` rather than hard-coded
# from an earlier cell's output, so it stays correct if the input file changes.
(common_dbsnp[['fifty', 'sv']].sum(axis=1) != 0).sum() / (pvcf['chrom'] == 'chr1').sum()

0.26040795050822346