# Filter Genes and SNPs

## 0. Prerequisites

### 0.1. Dependencies

In [1]:
import re
import numpy as np
from operator import itemgetter
from kagami.comm import unpack, smap, pmap, drop, pick, pickmap, collapse, paste
from kagami.dtypes import Table
from kagami.portals import tablePortal

### 0.2. Routines

In [2]:
# Raw string: the regex escape `\d` must reach `re` untouched; in a normal
# string literal it is an invalid Python escape sequence.
_gidre = re.compile(r'gene=(Dapma7bEVm\d{6});')

def _gid(x):
    """Return the gene ID stored in the last field of a GFF row.

    The last element of `x` is searched for `gene=Dapma7bEVm<6 digits>;` and
    the captured ID of the first match is returned.
    Raises IndexError when the field contains no such attribute.
    """
    return _gidre.findall(x[-1])[0]

def _gids(x):
    """Return the sorted unique gene IDs extracted from every row in `x`."""
    return np.unique(smap(x, _gid))

In [3]:
def _unique(x, ignore = ''):
    """Return the sorted distinct values of `x`, leaving out entries equal to `ignore`."""
    distinct = np.unique(x)
    keep = distinct != ignore
    return distinct[keep]

## 1. Load Data

### 1.1. Load GFF

In [4]:
# Read the GFF annotation with tablePortal.loadtsv.
gff = tablePortal.loadtsv('../../data/references/dmagset7finloc9b.puban.gff')

# Discard empty rows and rows whose first field starts with '#'.
gff = drop(gff, lambda row: len(row) == 0 or row[0].startswith('#'))

# Every remaining row must have the same number of fields.
assert len({len(row) for row in gff}) == 1
print(f'total data lines = {len(gff)}')

total data lines = 366882


In [5]:
# Keep rows whose third field is 'mRNA' and whose first field starts with 'scaffold'.
mgff = pick(gff, lambda row: row[2] == 'mRNA' and row[0].startswith('scaffold'))
# Reduce each row to fields 0, 3, 4 and the last one; later cells read them as
# scaffold name, integer range bounds and the attribute string.
mgff = smap(mgff, itemgetter(0, 3, 4, -1))
print(f'total mRNA lines in scaffolds = {len(mgff)}')

total mRNA lines in scaffolds = 43950


### 1.2. Filter by top scaffolds

Select genes in top scaffolds

In [6]:
# Load the list of top scaffolds and reduce it to its sorted unique names.
topscf = tablePortal.loadcsv('../../data/references/top_293_scaffolds')
topscf = np.unique(topscf)
print(f'top scaffolds = {len(topscf)}')

top scaffolds = 293


In [7]:
# Keep only the mRNA rows located on one of the top scaffolds.
mgff = pick(mgff, lambda row: row[0] in topscf)
# Every top scaffold must still be represented by at least one mRNA row.
assert len(np.unique(smap(mgff, itemgetter(0)))) == len(topscf)
print(f'mRNA lines in top scaffolds = {len(mgff)}')

mRNA lines in top scaffolds = 38240


### 1.3. Load SNPs

Option 1: Load SNPs from vcflib result

In [8]:
# Load the result table and build one SNP name per row by pasting the first two
# columns together with '_'; duplicates are removed by np.unique.
snpdm = tablePortal.loadtsv('../../data/temporal/BMT_filter_withoutAF_90percent_BM_AFs')
snpdm = np.array(snpdm)
snplst = np.unique(smap(snpdm[:, :2], lambda loc: paste(loc, sep = '_')))

Option 2: Load SNPs from list file

In [None]:
# Load the SNP names from the list file and flatten them into a 1-D array.
snplst = tablePortal.loadcsv('../../data/temporal/BMT_filter_withoutAF_90percent_BMT_shared_loci')
snplst = np.array(snplst).flatten()

In [9]:
# One-column table of zeros with one row per SNP name. The scaffold and the
# integer position are parsed from the '<scaffold>_<pos>' names and attached
# as row indices.
snps = Table(
    np.zeros((len(snplst), 1)),
    rownames = snplst,
    rowindex = {
        'scaffold': smap(snplst, lambda name: name.split('_', 1)[0]),
        'pos': smap(snplst, lambda name: int(name.split('_', 1)[1])),
    },
)

# sort position: order rows by (number after the first 8 characters of the
# scaffold name, position), both compared as integers
sids = sorted(
    snps.rows_,
    key = lambda name: (int(name.split('_', 1)[0][8:]), int(name.split('_', 1)[1])),
)
snps = snps[sids]

print(f'total number of SNPs = {snps.nrow}')

total number of SNPs = 1257108


## 2. Filter Genes

### 2.1. Filter by gene quality 

Select strong and non-split genes

In [10]:
# All gene IDs present on the top scaffolds.
agids = _gids(mgff)
print(f'unique genes in top scaffolds = {len(agids)}')

def _sgids(x):
    """Gene IDs of rows whose attribute field contains a 'Split=...;' entry."""
    return _gids(pick(x, lambda v: re.search('Split=.+?;', v[-1]) is not None))

sgids = _sgids(mgff)
print(f'split genes = {len(sgids)}')

def _wgids(x):
    """Gene IDs of rows whose attribute field lacks 'quality=Class:Strong'."""
    return _gids(pick(x, lambda v: 'quality=Class:Strong' not in v[-1]))

wgids = _wgids(mgff)
print(f'non-strong genes = {len(wgids)}')

# A gene is kept only if none of its rows is split or non-strong.
genes = np.setdiff1d(agids, np.union1d(sgids, wgids))
print(f'gene number after filtering = {len(genes)}')

unique genes in top scaffolds = 35362
split genes = 3989
non-strong genes = 15025
gene number after filtering = 17197


Save top genes if necessary

In [11]:
# Write the retained gene IDs as a single-column table.
tablePortal.savecsv(
    genes[:, np.newaxis],
    '../../data/references/top_293_scaffolds_strong_nonsplit_genes',
)

True

In [12]:
# mRNA rows belonging to the retained genes.
ggff = pick(mgff, lambda row: _gid(row) in genes)
assert len(genes) == len(ggff) # No gene should exist in two lines

# Scaffolds that still carry at least one retained gene.
gscf = np.unique(smap(ggff, lambda row: row[0]))
print(f'number of scaffolds contain genes after filtering = {len(gscf)}')

number of scaffolds contain genes after filtering = 291


### 2.2. Filter by SNPs

Select genes that contain at least one SNP

In [13]:
def _filter(scf):
    """Return the rows of `ggff` on scaffold `scf` whose range contains at least one SNP.

    The result is a tuple of rows; it is empty when no gene range on the
    scaffold contains a SNP position.
    """
    rows = pick(ggff, lambda row: row[0] == scf)
    bounds = np.array(smap(rows, itemgetter(1,2)), dtype = int)
    assert np.all(bounds[:,1] - bounds[:,0] > 0)

    positions = snps.ridx_.pos[snps.ridx_.scaffold == scf]

    def _has_snp(rng):
        # True when any SNP position lies within the inclusive range [rng[0], rng[1]]
        return np.any(np.logical_and(rng[0] <= positions, positions <= rng[1]))

    hits = np.where(smap(bounds, _has_snp))[0]
    if len(hits) == 0: return ()

    kept = itemgetter(*hits)(rows)
    # With a single index, itemgetter returns the row itself (first element is
    # the scaffold string) rather than a tuple of rows; wrap it in that case.
    if isinstance(kept[0], str):
        return (kept,)
    return kept
sgffs = pmap(gscf, _filter)

In [14]:
# Combine the per-scaffold results into one collection of rows.
ggff = collapse(sgffs)
# All rows must have the same number of fields, and each gene may appear once only.
assert len({len(row) for row in ggff}) == 1
assert len(_gids(ggff)) == len(ggff)
print(f'mRNA lines with SNPs = {len(ggff)}')

mRNA lines with SNPs = 16862


### 2.3. Filter by gene overlaps

Remove SNPs located in a region where more than one gene overlaps, or outside any gene

Remove genes whose SNPs are all located in a region that overlaps another gene

In [15]:
def _filter(scf):
    """Select, for scaffold `scf`, the SNPs covered by exactly one gene and the genes holding them.

    Returns a pair `(rows, ssnp)`:
      - `rows`: tuple of the `ggff` rows on `scf` whose inclusive range
        contains at least one SNP covered by exactly one gene range
        (empty tuple when there is none);
      - `ssnp`: the rows of `snps` on `scf` whose position is covered by
        exactly one gene range.
    """
    sgff = pick(ggff, lambda x: x[0] == scf)
    # reshape keeps the (n, 2) layout even when no gene is left on this scaffold
    rngs = np.array(smap(sgff, itemgetter(1,2)), dtype = int).reshape(-1, 2)
    assert np.all(rngs[:,1] - rngs[:,0] > 0)

    ssnp = snps[snps.ridx_.scaffold == scf]
    spos = ssnp.ridx_.pos

    # Per-position coverage counter; positions are 1-based, hence the `-1` offsets.
    # `initial = 0` lets the size computation cope with empty ranges / positions.
    size = max(np.max(rngs, initial = 0), np.max(spos, initial = 0))
    cnts = np.zeros(size, dtype = np.uint8) # less than 255 genes overlapping
    for rng in rngs: cnts[rng[0]-1:rng[1]] += 1

    # SNPs covered by exactly one gene; the same mask selects the table rows
    uniq = cnts[spos-1] == 1
    rpos = spos[uniq]
    ssnp = ssnp[uniq]

    slns = np.where(smap(rngs, lambda x: np.any(np.logical_and(x[0] <= rpos, rpos <= x[1]))))[0]
    # Always a tuple of rows, including the empty and the single-row case
    return (tuple(sgff[i] for i in slns), ssnp)
sgffs, ssnps = zip(*pmap(gscf, _filter))

In [16]:
# Combine the per-scaffold gene rows and SNP tables.
ggff = collapse(sgffs)
gsnps = collapse(ssnps)
# All rows must have the same number of fields, and each gene may appear once only.
assert len({len(row) for row in ggff}) == 1
assert len(_gids(ggff)) == len(ggff)
print(f'mRNA lines with unique SNPs = {len(ggff)}')
print(f'SNPs in filtered genes and outside overlapping regions = {len(gsnps)}')

mRNA lines with unique SNPs = 12430
SNPs in filtered genes and outside overlapping regions = 527112


Save filtered genes if necessary

In [17]:
# Write the IDs of the genes that passed all filters as a single-column table.
gids = _gids(ggff)
tablePortal.savecsv(
    gids[:, np.newaxis],
    '../../data/temporal/BMT_filter_withoutAF_90percent_BMT_shared_loci_filtered_genes',
)

True

## 3. Filter SNPs

### 3.1. Select SNPs outside any gene

In [18]:
def _filter(scf):
    """Return the rows of `snps` on scaffold `scf` that lie outside every mRNA range in `mgff`.

    A SNP is outside when its position is not within the inclusive
    [start, end] range of any `mgff` row on the same scaffold.
    """
    sgff = pick(mgff, lambda x: x[0] == scf)
    rngs = np.array(smap(sgff, itemgetter(1,2)), dtype = int)
    assert np.all(rngs[:,1] - rngs[:,0] > 0)

    ssnp = snps[snps.ridx_.scaffold == scf]
    spos = ssnp.ridx_.pos

    # Evaluate the range test once per SNP and use the result directly as the
    # row mask, instead of rebuilding it with a quadratic membership scan.
    starts, ends = rngs[:,0], rngs[:,1]
    outside = np.array(
        [not np.any(np.logical_and(starts <= p, p <= ends)) for p in spos],
        dtype = bool,
    )
    return ssnp[outside]
ssnps = pmap(topscf, _filter)

In [19]:
# Combine the per-scaffold SNP tables returned by pmap into a single object
# (collapse presumably concatenates them — confirm against kagami) and report its size.
osnps = collapse(ssnps)
print(f'SNPs outside genes = {len(osnps)}')

SNPs outside genes = 394686


### 3.2. Save filtered SNPs to file

In [20]:
# Union of the SNPs kept inside filtered genes and the SNPs outside any gene.
asnps = gsnps + osnps

# Raw string: the regex escape `\d` must reach `re` untouched; in a normal
# string literal it is an invalid Python escape sequence.
_re = re.compile(r'scaffold(\d{5})_(\d+)')
# Order rows by (scaffold number, position), both compared as integers.
asnps = asnps[sorted(asnps.rows_, key = lambda x: smap(_re.findall(x)[0],int))]

print(f'number of SNPs after filtering = {len(asnps)}')

number of SNPs after filtering = 921798


In [None]:
# Write the names of the retained SNPs as a single-column table.
tablePortal.savecsv(
    np.array(asnps.rows_)[:, np.newaxis],
    '../../data/temporal/BMT_filter_withoutAF_90percent_BMT_shared_loci_filtered_loci',
)