# HMM Islands Modelling and Analysis

## 0. Prerequisites

### 0.1. Dependencies

In [30]:
import re
import pickle as pkl
import numpy as np
from kagami.comm import paste, smap, fold
from kagami.dtypes import Table

### 0.2. Routines

In [8]:
def _unique(x, ignore = ''):
    """Return the sorted distinct values of `x`, leaving out those equal to `ignore`."""
    distinct = np.unique(x)
    keep = distinct != ignore
    return distinct[keep]

In [10]:
def _ordunique(a):
    """Return the distinct values of `a`, kept in order of first appearance."""
    arr = np.asarray(a)
    # np.unique reports, for each distinct value, the index of its first occurrence.
    first_seen = np.unique(arr, return_index = True)[1]
    first_seen.sort()
    return arr[first_seen]

### 0.3. Load merged data

In [9]:
# Merged table read from HDF; later cells read from it and append columns to it.
otab = Table.loadhdf('../data/temporal/BMT_filter_withoutAF_90percent.filtered.merged_table.hdf')

## 1. Identify HMM Islands

In [21]:
def _hmm_islands(col, name):
    """Find HMM "islands" in one state column of the module-level `otab`.

    An island is a run of three or more consecutive rows in state 3 within
    one scaffold, counted after rows whose state is -1 have been dropped.
    Island ids are integers that keep increasing across scaffolds, so each
    id is unique over the whole table.

    Parameters
    ----------
    col : str
        Name of the `otab` column holding the HMM states.
    name : str
        Column name given to the per-scaffold island-id tables.

    Returns
    -------
    dict
        Maps each scaffold (in order of first appearance) to a tuple
        `(itab, lns, pos)`: `itab` is a one-column Table of island ids
        (-1 where the row is not in an island), `lns` holds the length of
        each island and `pos` the sorted `ridx_.pos` values of its rows.
    """
    sfs = otab.ridx_.scaffold
    usf = _ordunique(sfs)

    # Three or more consecutive '3' characters, i.e. a run of state 3.
    _r = re.compile('(3{3,})')

    def _summ(s, lasti):
        """Summarise scaffold `s`; `lasti` is the first island id still unused."""
        stab = otab[sfs == s,col]
        # Every row starts out as -1, meaning "not part of an island".
        itab = Table(-np.ones_like(stab, dtype = int),
                     rownames = otab.rows_[sfs == s], colnames = [name])

        sv = stab.X_[:,0].astype(int)
        # NOTE(review): assumes `paste` joins the states without a separator and
        # that every state prints as a single character, so offsets in the string
        # equal row offsets among the valid rows — confirm.
        ss = paste(sv[sv != -1].astype(str))
        res = list(_r.finditer(ss))

        iv = -np.ones(len(ss), dtype = int)
        for i,r in enumerate(res):
            iv[r.start():r.end()] = i
        # Shift the scaffold-local ids past the ids already handed out.
        iv[iv != -1] += lasti
        itab.X_[sv != -1] = iv.reshape((-1,1))

        stab = stab[sv != -1]
        lns = smap(res, lambda x: len(x.group()))
        pos = smap(res, lambda x: np.sort(stab.ridx_.pos[slice(*x.span())]))
        # Matches never overlap and each labels at least three rows, so exactly
        # len(res) new ids were used. Counting the matches instead of taking
        # np.max(iv) gives the same value and also works when the scaffold has
        # no valid rows (np.max raises ValueError on an empty array).
        return (itab, lns, pos), lasti + len(res)

    odct, lid = {}, 0
    for sf in usf:
        odct[sf],lid = _summ(sf,lid)
    return odct

In [22]:
# Island summaries for the BM and MT state columns, in that order.
bmdct, mtdct = (
    _hmm_islands('BM_HMM_State', 'BM_HMM_Island'),
    _hmm_islands('MT_HMM_State', 'MT_HMM_Island'),
)

Save the results to file to avoid re-running the island detection

In [25]:
oprefix = '../data/temporal/BMT_filter_withoutAF_90percent.filtered.'

# Uncomment to write the island dictionaries to disk:
# with open(oprefix + 'BM_hmmdct.pkl', 'wb') as f: pkl.dump(bmdct, f)
# with open(oprefix + 'MT_hmmdct.pkl', 'wb') as f: pkl.dump(mtdct, f)

# Read the pickled dictionaries back, replacing any in-memory bmdct / mtdct.
with open(oprefix + 'BM_hmmdct.pkl', 'rb') as f:
    bmdct = pkl.load(f)
with open(oprefix + 'MT_hmmdct.pkl', 'rb') as f:
    mtdct = pkl.load(f)

Insert the island ids into the table

In [31]:
# Each dictionary value is a tuple whose first element is the per-scaffold
# island table; the tables are appended to one another along axis 0.
itabs = smap(bmdct.values(), lambda entry: entry[0])
bmitab = fold(itabs, lambda acc, nxt: acc.append(nxt, axis = 0))

# Same procedure for the MT dictionary.
itabs = smap(mtdct.values(), lambda entry: entry[0])
mtitab = fold(itabs, lambda acc, nxt: acc.append(nxt, axis = 0))

In [32]:
# Index both island tables by otab's row names, then append them along axis 1.
otab = (
    otab
    .append(bmitab[otab.rows_], axis = 1)
    .append(mtitab[otab.rows_], axis = 1)
)

## 2. Outliers

In [36]:
# After the transpose, the first row holds the BM p-values and the second the MT ones.
pvals = otab[:,['BM_Waples_Test_P', 'MT_Waples_Test_P']].X_.T
# An outlier is a non-NaN p-value below 0.01.
bmoutls, mtoutls = smap(pvals, lambda p: (~np.isnan(p)) & (p < 0.01))
print(f'BM outliers = {np.sum(bmoutls)}')
print(f'MT outliers = {np.sum(mtoutls)}')

BM outliers = 16607
MT outliers = 4631


In [4]:
# Distinct gene ids on outlier rows; `_unique` drops the empty-string id.
# NOTE(review): `aftab` is not defined in any cell shown in this notebook, while
# the outlier masks were computed from `otab` — confirm that `aftab` is loaded
# and that its rows line up with the masks.
bmsgids = _unique(aftab[bmoutls].ridx_.gene)
print(f'BM genes with outlier = {len(bmsgids)}')

mtsgids = _unique(aftab[mtoutls].ridx_.gene)
print(f'MT genes with outlier = {len(mtsgids)}')

BM genes with outlier = 4103
MT genes with outlier = 1935
