# HMM Islands Modelling and Analysis

## 0. Prerequisites

### 0.1. Dependencies

In [1]:
import re
import pickle as pkl
import numpy as np
from operator import itemgetter
from scipy.stats import mannwhitneyu, chi2
from kagami.comm import l, paste, smap, pmap, unpack, fold
from kagami.dtypes import Table

### 0.2. Routines

In [5]:
def _unique(x, ignore = ''):
    """Return the sorted distinct values of `x`, dropping those equal to `ignore`."""
    distinct = np.unique(x)
    keep = distinct != ignore
    return distinct[keep]

In [6]:
def _ordunique(a):
    """Return the distinct values of `a` in order of first appearance."""
    arr = np.asarray(a)
    # np.unique reports the index of the first occurrence of every distinct
    # value; sorting those indices restores the original encounter order.
    first_seen = np.unique(arr, return_index = True)[1]
    return arr[np.sort(first_seen)]

### 0.3. Load merged data

In [126]:
# Load the merged table from HDF. `otab` is the table that the cells below
# read from and extend with island columns.
otab = Table.loadhdf(
    '../data/temporal/BMT_filter_withoutAF_90percent.filtered.merged_table.hdf',
)

## 1. Identify HMM Islands

In [66]:
def _hmm_islands(col, name):
    """Label runs of HMM state 3 ("islands") scaffold by scaffold.

    Reads the module-level `otab`. Within each scaffold, rows whose value in
    `col` is -1 are skipped, the remaining states are concatenated into one
    string, and every run of three or more consecutive '3' characters becomes
    one island. Island ids are consecutive integers that keep counting across
    scaffolds; rows outside any island (or with state -1) keep the id -1.

    Parameters
    ----------
    col : str
        Name of the `otab` column holding the HMM states.
    name : str
        Column name given to the island-id tables that are produced.

    Returns
    -------
    dict
        Maps each scaffold (in order of first appearance) to a tuple
        `(itab, lns, pos)` where `itab` is a one-column Table of island ids
        for the scaffold's rows, `lns` lists the number of rows in each
        island and `pos` lists the sorted `ridx_.pos` values of each island.
    """
    sfs = otab.ridx_.scaffold
    usf = _ordunique(sfs)

    _r = re.compile(r'(3{3,})')

    def _summ(s, lasti):
        # rows belonging to this scaffold
        insf = sfs == s
        stab = otab[insf,col]
        itab = Table(-np.ones_like(stab, dtype = int),
                     rownames = otab.rows_[insf], colnames = [name])

        sv = stab.X_[:,0].astype(int)
        valid = sv != -1
        # NOTE(review): string offsets are used as row offsets below, which
        # assumes paste() joins without a separator and that every state
        # prints as a single character -- confirm.
        ss = paste(sv[valid].astype(str))
        res = list(_r.finditer(ss))

        # island id per non-missing row; ids continue from the previous scaffold
        iv = -np.ones(len(ss), dtype = int)
        for i,r in enumerate(res): iv[r.start():r.end()] = lasti + i
        itab.X_[valid] = iv.reshape((-1,1))

        stab = stab[valid]
        lns = smap(res, lambda x: len(x.group()))
        pos = smap(res, lambda x: np.sort(stab.ridx_.pos[slice(*x.span())]))
        # Next free id. Equals max(iv) + 1 whenever islands were found, and is
        # also well defined when the scaffold has no non-missing rows, where
        # np.max on the empty `iv` would raise.
        return (itab, lns, pos), lasti + len(res)

    odct, lid = {}, 0
    for sf in usf:
        odct[sf],lid = _summ(sf,lid)
    return odct

In [67]:
# Detect islands separately for the BM and the MT HMM state columns; the
# second argument is the column name used for the resulting island ids.
bmhdct = _hmm_islands('BM_HMM_State', 'BM_HMM_Island')
mthdct = _hmm_islands('MT_HMM_State', 'MT_HMM_Island')

Save the island dictionaries to file to avoid re-running the island detection

In [127]:
# Common path prefix of the cached island dictionaries.
oprefix = '../data/temporal/BMT_filter_withoutAF_90percent.filtered.'

# Uncomment to (re)write the cache after running the island detection:
# with open(oprefix + 'BM_hmmdct.pkl', 'wb') as f: pkl.dump(bmhdct, f)
# with open(oprefix + 'MT_hmmdct.pkl', 'wb') as f: pkl.dump(mthdct, f)

# Reload the cached results. These pickles are produced by this notebook;
# pickle must not be used on files from untrusted sources.
with open(oprefix + 'BM_hmmdct.pkl', 'rb') as f:
    bmhdct = pkl.load(f)
with open(oprefix + 'MT_hmmdct.pkl', 'rb') as f:
    mthdct = pkl.load(f)

Insert the island IDs into the merged table

In [128]:
# Stack the per-scaffold island-id tables (first element of every dictionary
# value) row-wise into one table per comparison. The dictionaries are the
# `bmhdct` / `mthdct` objects created or loaded above.
itabs = smap(bmhdct.values(), lambda x: x[0])
bmitab = fold(itabs, lambda x,y: x.append(y, axis = 0))

itabs = smap(mthdct.values(), lambda x: x[0])
mtitab = fold(itabs, lambda x,y: x.append(y, axis = 0))

In [129]:
# Add both island-id tables to `otab` as extra columns (axis = 1).
# NOTE(review): indexing with otab.rows_ presumably selects rows by name so
# the island tables line up with otab's row order -- confirm.
otab = otab.append(bmitab[otab.rows_], axis = 1).append(mtitab[otab.rows_], axis = 1)

## 2. Outliers

### 2.1 Overall outliers

In [131]:
# One row of p-values per comparison (BM first, MT second).
pvals = otab[:,['BM_Waples_Test_P', 'MT_Waples_Test_P']].X_.T
# A locus is an outlier when its p-value is present (not NaN) and below 0.01.
bmoutls, mtoutls = smap(pvals, lambda p: ~np.isnan(p) & (p < 0.01))
print(f'BM outliers = {np.sum(bmoutls)}')
print(f'MT outliers = {np.sum(mtoutls)}')

BM outliers = 29649
MT outliers = 11065


In [132]:
# Distinct gene ids (empty string excluded) among the outlier rows of each comparison.
bmsgids, mtsgids = (
    _unique(otab[mask].ridx_.gene, '')
    for mask in (bmoutls, mtoutls)
)
print(f'BM genes with outlier = {len(bmsgids)}')
print(f'MT genes with outlier = {len(mtsgids)}')

BM genes with outlier = 4103
MT genes with outlier = 1935


### 2.2 Reversal outliers

In [133]:
# Allele frequencies, one column per population (B, M, T).
afs = otab[:,['B_AFs', 'M_AFs', 'T_AFs']].X_
# Rows whose B frequency exceeds 0.5 are flipped to 1 - AF in all three
# columns, so the B column never exceeds 0.5 afterwards.
# NOTE(review): the assignment is in place; if `.X_` is a view into `otab`
# this also rewrites the table's AF columns -- confirm it is a copy.
majors = afs[:,0] > 0.5
afs[majors] = 1- afs[majors]

In [134]:
# Reverse loci: the M frequency is a strict peak or a strict trough between B and T.
rloci = (
    ((afs[:,0] < afs[:,1]) & (afs[:,1] > afs[:,2])) |
    ((afs[:,0] > afs[:,1]) & (afs[:,1] < afs[:,2]))
)
print(f'reverse loci = {np.sum(rloci)}')

# Directional loci: the frequency strictly rises or strictly falls from B through M to T.
dloci = (
    ((afs[:,0] < afs[:,1]) & (afs[:,1] < afs[:,2])) |
    ((afs[:,0] > afs[:,1]) & (afs[:,1] > afs[:,2]))
)
print(f'directional loci = {np.sum(dloci)}')

reverse loci = 371517
directional loci = 182653


In [135]:
# Loci that are outliers in both comparisons.
outls = bmoutls & mtoutls

# Restrict the reverse / directional masks to those shared outliers.
orloci = rloci & outls
odloci = dloci & outls
print(f'reverse outlier loci = {np.sum(orloci)}')
print(f'directional outlier loci = {np.sum(odloci)}')

reverse outlier loci = 1753
directional outlier loci = 18


In [136]:
# Loci that carry an island id (!= -1) in both the BM and the MT column.
iolaps = np.all(otab[:,['BM_HMM_Island', 'MT_HMM_Island']].X_.astype(int) != -1, axis = 1)

# Shared outliers inside overlapping islands, split by reverse / directional pattern.
iorloci = rloci & iolaps & outls
iodloci = dloci & iolaps & outls
print(f'reverse outlier loci in island overlapping regions = {np.sum(iorloci)}')
print(f'directional outlier loci in island overlapping regions = {np.sum(iodloci)}')

reverse outlier loci in island overlapping regions = 1109
directional outlier loci in island overlapping regions = 10


In [137]:
# Island ids, one row per comparison (BM first, MT second).
iids = otab[:,['BM_HMM_Island', 'MT_HMM_Island']].X_.astype(int).T

# Distinct island ids (excluding -1) that contain at least one reverse outlier locus.
bmiids, mtiids = (_unique(row[iorloci], -1) for row in iids)
print(f'BM islands with reverse outlier loci = {len(bmiids)}')
print(f'MT islands with reverse outlier loci = {len(mtiids)}')

# For each selected island collect its non-empty gene ids, then take the
# union over all selected islands of the comparison.
bmgids, mtgids = (
    np.unique(fold(
        [_unique(otab.ridx_.gene[ids == i], '') for i in uid],
        np.union1d,
    ))
    for ids, uid in zip(iids, [bmiids,mtiids])
)
print(f'BM islands with reverse outlier loci contain genes = {len(bmgids)}')
print(f'MT islands with reverse outlier loci contain genes = {len(mtgids)}')

BM islands with reverse outlier loci = 390
MT islands with reverse outlier loci = 406
BM islands with reverse outlier loci contain genes = 473
MT islands with reverse outlier loci contain genes = 377


### 2.3 Stats

Test whether BM islands are significantly larger than MT islands

In [138]:
def _neuc(x):
    """Inclusive span of the values in `x`: max - min + 1."""
    return np.max(x) - np.min(x) + 1

def _island_size(dval):
    """Map every island id of one scaffold entry to its positional span.

    `dval` is indexed as `dval[0]` (an object whose `.X_[:, 0]` holds island
    ids, -1 meaning "no island") and `dval[2]` (one collection of positions
    per island). The distinct non-negative ids, in sorted order, are paired
    with the position collections in the order given.
    """
    ids = np.unique(dval[0].X_[:,0])
    ids = ids[ids != -1]
    positions = dval[2]
    # every island id must come with exactly one position collection
    assert len(ids) == len(positions)
    return dict(zip(ids, map(_neuc, positions)))

# One {island id: size} dictionary per scaffold, for each comparison.
bmldcts, mtldcts = (
    [_island_size(v) for v in hdct.values()]
    for hdct in (bmhdct, mthdct)
)
# Merge the per-scaffold dictionaries; island ids are distinct across
# scaffolds, so the updates do not overwrite each other.
bmldct, mtldct = {}, {}
for d in bmldcts: bmldct.update(d)
for d in mtldcts: mtldct.update(d)

In [139]:
# Sizes of the islands that contain reverse outlier loci, per comparison.
# An explicit lookup list is used instead of itemgetter(*ids)(sizes) because
# itemgetter returns a bare scalar for a single id and raises TypeError for
# no ids; the list form always yields a 1-D array.
bmilens, mtilens = (
    np.array([sizes[i] for i in ids])
    for ids, sizes in zip([bmiids,mtiids], [bmldct,mtldct])
)
print(f'Length of BM islands with reverse outlier loci = {np.sum(bmilens)}')
print(f'Length of MT islands with reverse outlier loci = {np.sum(mtilens)}')

Length of BM islands with reverse outlier loci = 1954661
Length of MT islands with reverse outlier loci = 1231181


In [140]:
def _mwu(x, y):
    """One-sided Mann-Whitney U test with alternative 'greater' for x versus y."""
    return mannwhitneyu(x, y, alternative = 'greater')

print(f'BM islands size larger than MT = {_mwu(bmilens, mtilens,)}')

BM islands size larger than MT = MannwhitneyuResult(statistic=95519.0, pvalue=2.31166416481555e-07)


Test whether the number of reversal outliers is significantly larger than expected

In [143]:
N = otab.nrow
# Per population, the two count columns as an (N, 1, 2) block ...
bcnts, mcnts, tcnts = (
    otab[:, list(cols)].X_.reshape((N,1,2))
    for cols in (
        ('B_Counts0','B_Counts1'),
        ('M_Counts0','M_Counts1'),
        ('T_Counts0','T_Counts1'),
    )
)
# ... stacked along axis 1 into (N, 3, 2): locus x population (B, M, T) x count column.
cnts = np.hstack([bcnts, mcnts, tcnts])

def _chi2(counts):
    """Vectorised Pearson chi-square test (1 d.o.f., no continuity correction).

    Parameters
    ----------
    counts : array of shape (n, 4)
        Each row holds the cells `a1, b1, a2, b2` of the 2x2 table
        `[[a1, b1], [a2, b2]]`.

    Returns
    -------
    ndarray of shape (n,)
        Upper-tail p-value of every table. Tables with an all-zero row or
        column divide by zero; no guard is applied here.
    """
    a1, b1, a2, b2 = counts.T
    s1, s2 = a1 + b1, a2 + b2
    stats = (s1+s2)/(s1*s2) * np.power(a1*b2-a2*b1,2)/((a1+a2)*(b1+b2))
    # The survival function keeps precision for tiny p-values, whereas
    # 1 - cdf rounds to exactly 0 once cdf is within machine epsilon of 1.
    pvals = chi2.sf(stats, 1)
    return pvals

def _perm(_):
    """Run one permutation of the module-level `cnts` and count reversal outliers.

    The argument is ignored. Returns a tuple `(n_reversal, table)`: the number
    of loci that show a reversal pattern and have both permuted p-values below
    0.01, and an array whose columns are the three permuted first counts
    followed by the BM and MT p-values.
    """
    # Shuffle every 1-D slice along the population axis; +1 avoids zero division.
    # NOTE(review): slices along axis 1 are shuffled independently, so the two
    # count columns of a locus receive different population orders -- confirm
    # this is the intended null model.
    # NOTE(review): the global np.random state is used; if pmap runs forked
    # worker processes they may start from identical RNG state -- confirm.
    pcnt = np.apply_along_axis(np.random.permutation, 1, cnts) + 1
    bmpvals = _chi2(pcnt[:,:2,:].reshape((N,4)))
    mtpvals = _chi2(pcnt[:,1:,:].reshape((N,4)))

    # significant in both permuted comparisons
    pouts = (bmpvals < 0.01) & (mtpvals < 0.01)

    # reversal is judged on the first count column only
    lcnts = pcnt[:,:,0]
    first, middle, last = lcnts.T
    peak = (first < middle) & (middle > last)
    trough = (first > middle) & (middle < last)
    rlocs = (peak | trough) & pouts
    return np.sum(rlocs), np.column_stack([lcnts, bmpvals, mtpvals])

# Number of permutations; `perms` holds one `_perm` result per permutation.
nperm = 100
perms = pmap(range(nperm), _perm)

In [144]:
# Reversal-outlier count of every permutation.
mrevs = np.array([result[0] for result in perms])
# Fraction of permutations with at least as many reversal outliers as observed.
# NOTE(review): this estimate is exactly 0 when no permutation reaches the
# observed count; (k + 1) / (nperm + 1) is the usual non-zero alternative.
pperc = np.sum(mrevs >= np.sum(orloci)) / nperm
print(f'on {nperm} permutations p-value = {pperc} ({int(np.round(np.mean(mrevs)))} vs {np.sum(orloci)})')

on 100 permutations p-value = 0.0 (1015 vs 1753)


## 3. Island Summary

In [148]:
# Island ids as integers, one column per comparison (BM, MT). Unlike the
# earlier `iids` this array is not transposed.
iids = otab[:,['BM_HMM_Island', 'MT_HMM_Island']].X_.astype(int)

In [153]:
# Per comparison, count the rows whose island id is not -1.
print(f'number of HGD SNPs in islands = {l(np.sum(iids != -1, axis = 0))}')

number of HGD SNPs in islands = [128969, 49148]


In [154]:
# Number of distinct island ids (excluding -1) per comparison.
inums = [_unique(column, -1).shape[0] for column in iids.T]
print(f'number of islands = {inums}')

number of islands = [6111, 2879]


In [160]:
# Fst values, one column per comparison (BM, MT).
fsts = otab[:,['BM_Fst', 'MT_Fst']].X_
# Mean Fst over the rows that carry an island id in the matching comparison.
print(f'BM mean Fst in islands = {np.mean(fsts[iids[:,0]!=-1,0])}')
print(f'MT mean Fst in islands = {np.mean(fsts[iids[:,1]!=-1,1])}')

BM mean Fst in islands = 0.10570131843532941
MT mean Fst in islands = 0.10120572020806545


In [157]:
def _lens(idx):
    """Count how many entries of `idx` carry each island id.

    Entries equal to -1 are ignored. The counts are returned as an array
    ordered by ascending island id.
    """
    idx = np.asarray(idx)
    # A single np.unique pass replaces one full-array comparison per island id.
    _, counts = np.unique(idx[idx != -1], return_counts = True)
    return counts
# Per comparison (BM, MT), the number of rows in every island.
lens = pmap(iids.T, _lens)
print(f'BM island SNPs mean = {np.mean(lens[0])}, std = {np.std(lens[0])}')
print(f'MT island SNPs mean = {np.mean(lens[1])}, std = {np.std(lens[1])}')

BM island SNPs mean = 21.10440189821633, std = 28.718204951372215
MT island SNPs mean = 17.071205279610975, std = 19.48960056963467


In [159]:
def _size(dct):
    """Flat array of island spans (max - min + 1 of positions) over all values of `dct`.

    The third element of every dictionary value is read as the list of
    per-island position collections.
    """
    return np.hstack([
        [np.max(p)-np.min(p)+1 for p in value[2]]
        for value in dct.values()
    ])
# Island spans for BM and MT, then the mean span of each.
bmhsize, mthsize = smap((bmhdct, mthdct), _size)
print(f'mean island size = {[np.mean(bmhsize), np.mean(mthsize)]}')

mean island size = [2427.6489936180656, 1713.4529350468913]
