In [9]:
import numpy as np
import scipy
import pandas
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('white')
sns.set_style('ticks')
sns.set_context('notebook')
import h5py
import allel; print('scikit-allel', allel.__version__)
import warnings
warnings.filterwarnings('ignore')

scikit-allel 1.1.9


In [2]:
import os
from os.path import join

# Project root. Set the TLX3_WORKDIR environment variable to run the notebook
# on another machine; when it is unset the original location is used.
WORKDIR = os.environ.get('TLX3_WORKDIR', '/home/sergio/Res_CIML/TLX3_project')
SCRIPTS = join(WORKDIR, 'scripts')
DATADIR = join(WORKDIR, 'data')

In [3]:
### Functions
def chrom2num(st):
    """Parse a genomic region string into chromosome name and integer bounds.

    Parameters
    ----------
    st : str
        Region written as ``'<chrom>:<start>-<end>'``, for example
        ``'chr12:77,033,211-78,041,433'``. Commas used as thousands
        separators in the coordinates are ignored, as is surrounding
        whitespace.

    Returns
    -------
    tuple
        ``(chrom, start, end)`` where ``chrom`` is a ``str`` and ``start`` /
        ``end`` are ``int``.

    Raises
    ------
    ValueError
        If there is no ``':'`` separator, the chromosome name is empty, the
        coordinate part is not exactly ``start-end``, or a coordinate is not
        an integer.
    """
    # Split on the LAST ':' so contig names that themselves contain ':'
    # (e.g. 'HLA-A*01:01') are kept whole.
    chrm, sep, span = st.strip().rpartition(':')
    if not sep or not chrm:
        raise ValueError("region %r is not of the form 'chrom:start-end'" % (st,))

    bounds = span.split('-')
    if len(bounds) != 2:
        raise ValueError("region %r must contain exactly one 'start-end' pair" % (st,))

    # int() raises ValueError itself for a non-numeric coordinate.
    pl = int(bounds[0].replace(',', ''))
    pr = int(bounds[1].replace(',', ''))

    return chrm, pl, pr

def plot_variant_density(pos, window_size, title=None):
    """Plot variant density along a chromosome in fixed-width windows.

    Parameters
    ----------
    pos : array-like of numbers
        Variant positions in bp.
    window_size : number
        Window width in bp; must be positive.
    title : str, optional
        Axes title; no title is set when this is falsy.

    Raises
    ------
    ValueError
        If ``pos`` is empty or ``window_size`` is not positive.

    Notes
    -----
    Draws on a new 12x3 inch figure and returns nothing. Every window is
    normalised by the full ``window_size``, including the last one, which may
    extend past the last variant.
    """
    pos = np.asarray(pos)
    if pos.size == 0:
        raise ValueError('pos is empty: there are no variants to plot')
    if window_size <= 0:
        raise ValueError('window_size must be positive')

    # setup windows
    # Use enough whole windows to reach pos.max(). A plain
    # np.arange(pos.min(), pos.max(), window_size) ends before pos.max(), so
    # the trailing variants fall outside every bin and are silently dropped
    # (and a span shorter than one window yields no bin at all).
    first, last = pos.min(), pos.max()
    n_windows = max(1, int(np.ceil((last - first) / window_size)))
    bins = first + window_size * np.arange(n_windows + 1)

    # use window midpoints as x coordinate
    x = (bins[1:] + bins[:-1])/2

    # compute variant density in each window
    h, _ = np.histogram(pos, bins=bins)
    y = h / window_size

    # plot
    fig, ax = plt.subplots(figsize=(12, 3))
    sns.despine(ax=ax, offset=10)
    ax.plot(x, y)
    ax.set_xlabel('Chromosome position (bp)')
    ax.set_ylabel('Variant density (bp$^{-1}$)')
    if title:
        ax.set_title(title)

def plot_variant_hist_2d(f1, f2, variants, downsample):
    """Draw a hexbin plot of the joint distribution of two variant fields.

    ``variants[f1]`` and ``variants[f2]`` are read in full with ``[:]`` and
    then thinned to every ``downsample``-th value. The plot goes on a new
    6x6 inch figure; nothing is returned.
    """
    every_nth = slice(None, None, downsample)
    xs = variants[f1][:][every_nth]
    ys = variants[f2][:][every_nth]

    figure, axes = plt.subplots(figsize=(6, 6))
    sns.despine(ax=axes, offset=10)
    axes.hexbin(xs, ys, gridsize=20)
    axes.set(
        xlabel=f1,
        ylabel=f2,
        title='Variant %s versus %s joint distribution' % (f1, f2),
    )

def plot_variant_hist(f, variants, bins=30, down=200):
    """Draw a histogram of one variant field.

    ``variants[f]`` is read in full with ``[:]`` and thinned to every
    ``down``-th value before being binned into ``bins`` bins. The plot goes on
    a new 7x5 inch figure; nothing is returned.
    """
    values = variants[f][:][slice(None, None, down)]

    figure, axes = plt.subplots(figsize=(7, 5))
    sns.despine(ax=axes, offset=10)
    axes.hist(values, bins=bins)
    axes.set(
        xlabel=f,
        ylabel='No. variants',
        title='Variant %s distribution' % f,
    )

In [4]:
### VCF direct
ftlx = join(
    DATADIR,
    'tracks/WGS-WES/Germline/FERRIER_09_Germline.allchr.snpEff.p.SAL.SAL10_1.vcf',
)
#ftlxg = 'tracks/WGS/Germline/FERRIER_09_Germline.allchr.snpEff.p.SAL.SAL10_1.vcf.gz'

# Read every field of the VCF. ALT is requested with 4 values per variant and
# the snpEff annotation is split into separate fields by ANNTransformer
# (optional).
cs = allel.read_vcf(
    ftlx,
    fields='*',
    numbers={'ALT': 4},
    transformers=allel.ANNTransformer(),
)


In [5]:
# Variant-level fields of the same VCF as a DataFrame, with the same options
# as the read above (all fields, 4 ALT values, snpEff annotation split into
# separate columns -- optional).
var = allel.vcf_to_dataframe(
    ftlx,
    fields='*',
    numbers={'ALT': 4},
    transformers=allel.ANNTransformer(),
)

In [None]:
# Genotype array to special class GenotypeArray
# (wraps the 'calldata/GT' entry of the dict returned by allel.read_vcf)
gt = allel.GenotypeArray(cs['calldata/GT'])
    ##- typical functions
    # gt.is_het()
    # gt.count_het(axis=1)
    # ac = gt.count_alleles()

In [None]:
#gt1 = gt.subset([2,7,9,12,45,67,124])
#gt2= gt.subset([3,8,10,13,46,68,125])

#gt3 = gt2.concatenate(gt1)



In [None]:
#gt1

In [None]:
# List the distinct values found in the ANN_Feature_Type and
# ANN_Transcript_BioType annotation columns.
print(sorted(var['ANN_Feature_Type'].unique()))
print(sorted(var['ANN_Transcript_BioType'].unique()))

In [None]:
# Keep only the variants whose annotated feature type is 'transcript'.
# The rows keep their original labels from `var` in the index; those labels
# are used further down to pick the matching genotype rows.
cod_var = var[var['ANN_Feature_Type']=='transcript']

In [None]:
# Number of variants before and after the transcript filter. Both counts are
# printed explicitly: a bare `len(cod_var)` in the middle of a cell is
# evaluated and thrown away, because only the last expression is displayed.
print(len(var))
print(len(cod_var))
cod_var[['ANN_Gene_Name','ANN_Feature_Type', 'ANN_Transcript_BioType']].head()

In [None]:
# Select the genotype rows for the transcript variants.
# NOTE(review): the labels in cod_var.index are used here as row positions of
# `gt`; this assumes `var` has the default 0..n-1 index and the same variant
# order as `cs` (both are parsed from the same VCF) -- confirm.
cod_ind=cod_var.index
cod_gt=gt[cod_ind]

## Now we have the pair {cod_var, cod_gt} for transcripts only

In [None]:
#cod_var.head(12)
# Histogram of log(QUAL) over the transcript variants.
# Only QUAL > 0 is used: a missing (NaN) or non-positive QUAL has no finite
# logarithm, and a single NaN/-inf makes the histogram range non-finite so the
# plot fails. When every QUAL is positive the result is unchanged.
# a, b, c receive what plt.hist returns: bin counts, bin edges, drawn patches.
a,b,c = plt.hist(np.log(cod_var.loc[cod_var['QUAL'] > 0, 'QUAL']), bins=100)
plt.xlabel('log(QUAL)')
plt.ylabel('No. variants');

In [None]:
#cod_gt[2,:]

## Restrict var data to a genomic region

In [None]:
# Region of interest as 'chrom:start-end'; chrom2num splits it into the
# chromosome name `c` and the integer bounds `l` (left) and `r` (right).
# Note that `c` re-binds the name used for the histogram patches above.
st  ='chr12:77,033,211-78,041,433'
c,l,r = chrom2num(st)
print(c,l,r)

In [None]:
# Variants on chromosome `c` whose POS lies inside the region [l, r].
# Both end coordinates of a 'chrom:start-end' region belong to the region, so
# the bounds are inclusive (Series.between includes both ends by default);
# strict '>' / '<' comparisons would drop variants sitting exactly on a bound.
var_reg = var[(var['CHROM'] == c) & var['POS'].between(l, r)]

In [None]:
 # Variant density across the selected region in 35 bp windows, titled with
 # the chromosome name. (The one-space indent of this cell is kept as is.)
 plot_variant_density(var_reg['POS'], window_size=35, title=c)

In [None]:
### Plot density for all chromosomes
# for ch in df['CHROM'].unique():
#     dfc = df[df['CHROM']==ch]
#    plot_windowed_variant_density(dfc['POS'], window_size=100000, title=ch+' , raw variant density')

## To work with chunked tables we need an HDF5 file

In [None]:
### Save to hdf5
#import sys
#allel.vcf_to_hdf5(ftlx,'FERRIER_09_Germline.allchr.snpEff.p.SAL.SAL10_1_Shrt.h5', 
#                  fields='*', alt_number=4,transformers=allel.ANNTransformer(),log=sys.stdout, vlen=False)

In [6]:
### HDF5 from VCF database
# NOTE(review): this file is expected under DATADIR, whereas the commented-out
# vcf_to_hdf5 call above writes to a bare file name -- confirm where the .h5
# file actually lives.
ftlxh5 =join(DATADIR,'tracks/WGS-WES/Germline/FERRIER_09_Germline.allchr.snpEff.p.SAL.SAL10_1_Shrt.h5')

# read HDF5 file
# Opened read-only and left open: var_tb and the genotype arrays built later
# read from it.
csh = h5py.File(ftlxh5,mode='r')
# Chunked variant table over the 'variants' group, limited to the columns
# listed in `names`.
var_tb = allel.VariantChunkedTable(csh['variants'], 
                                   names=['CHROM', 'POS', 'REF', 'ALT', 'DP', 'MQ', 'QD', 'is_snp',
                                             'ANN_AA_length',
                                             'ANN_Allele',
                                             'ANN_Annotation',
                                             'ANN_Annotation_Impact',
                                             'ANN_Feature_ID',
                                             'ANN_Feature_Type',
                                             'ANN_Gene_ID',
                                             'ANN_Gene_Name',
                                             'ANN_Rank',
                                             'ANN_Transcript_BioType','numalt'])

In [None]:
#a,b,c=plt.hist(var_tb['DP'][:], bins=10)
#csh['variants/REF']

## Now we can work with filters

In [7]:
#fltr_expr = '(QD > 5) & (MQ > 40) & (DP > 1500) & (DP < 3000)'
#fltr_expr="ANN_Feature_Type==b'transcript'"

# Active filter: SNPs only (the two alternatives above are disabled).
fltr_expr="is_snp==True"

# Evaluate the expression on the table; the result is used below as a
# per-variant boolean mask. The trailing [:] presumably loads it into memory
# as a plain array -- TODO confirm.
var_tb_fltr = var_tb.eval(fltr_expr)[:]

#var_tb
#var_tb_fltr
# Number of variants passing the filter (this is the value the cell displays).
np.count_nonzero(var_tb_fltr)
#np.count_nonzero(~var_tb_fltr)

#list(csh['calldata'].keys())
#list(csh['variants'].keys())

909904

In [8]:
## apply filter
# var_pass: the rows of var_tb selected by the boolean mask var_tb_fltr.
var_pass = var_tb.compress(var_tb_fltr)


## Genotype from HDF5 

In [None]:
# Names of the per-sample (calldata) datasets stored in the HDF5 file.
list(csh['calldata'].keys())

In [None]:
# Chunked genotype array backed by the 'calldata/GT' dataset of the open HDF5
# file; the bare `gth` displays it.
gth = allel.GenotypeChunkedArray(csh['calldata/GT'])
gth

In [None]:
# Sample names stored in the HDF5 file; the hard-coded sample table in the
# next cell should list them in this order -- verify.
list(csh['samples'])

In [None]:
import pandas as pd

# Sample sheet: one row per sample, with its cell type.
samples = pd.DataFrame(
    {
        'sample': [b'AC3812', b'AC3813', b'AC3814', b'AC3815'],
        'cell_type': ['TAP', 'TAP', 'TLX3', 'TLX3'],
    }
)

# Boolean masks over the samples, one per cell type.
TLX = samples['cell_type'] == 'TLX3'
TAP = samples['cell_type'] == 'TAP'


## Subset genotype on transcripts and samples

In [None]:
# Genotypes of the filtered variants (rows selected by var_tb_fltr), split by
# cell type (columns selected by the TLX / TAP sample masks).
# NOTE(review): the sample masks are pandas Series passed directly as the
# column selection -- confirm subset() accepts them as is.
gth_tlx = gth.subset(var_tb_fltr, TLX)
gth_tap = gth.subset(var_tb_fltr, TAP)

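#### Now we have three tables: {var_pass, gth_tlx, gth_tap} restricted to the variants selected by `fltr_expr` (SNPs with the current filter; transcripts only if the `ANN_Feature_Type` filter is used instead)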

In [None]:
# Missing and heterozygous calls as a percentage of the filtered variants,
# computed separately for the TLX3 and TAP genotype subsets.
# axis=0 presumably counts over variants, giving one value per sample --
# TODO confirm against the scikit-allel docs.
n_variants = len(var_pass)
pc_missing_tlx = gth_tlx.count_missing(axis=0)[:] * 100 / n_variants
pc_het_tlx = gth_tlx.count_het(axis=0)[:] * 100 / n_variants

pc_missing_tap = gth_tap.count_missing(axis=0)[:] * 100 / n_variants
pc_het_tap = gth_tap.count_het(axis=0)[:] * 100 / n_variants



print('TLX3 missing = ', pc_missing_tlx)
print('TLX3 hetero = ', pc_het_tlx)

print('TAP missing = ', pc_missing_tap)
print('TAP hetero = ', pc_het_tap)

In [None]:
# Number of segregating variants within each cell-type subset, derived from
# the per-variant allele counts of that subset.
tlx_seg = gth_tlx.count_alleles().count_segregating()
tap_seg = gth_tap.count_alleles().count_segregating()

print('TLX segregating = ', tlx_seg)
print('TAP segregating = ', tap_seg)

# CASE 1: Variants for list of genes - tumor suppressors

In [None]:
import EnrichRLib as erl

# Read the gene sets from the GMT file and take the 'T-ALL-suppressor' set as
# the gene list `gl`.
# NOTE(review): the path is relative, so it resolves against the current
# working directory rather than WORKDIR/DATADIR -- confirm the intended root.
tall_sup = erl.read_gmt('gene_lists/T-ALL_suppressor.gmt')
gl = tall_sup['T-ALL-suppressor']

gl

In [None]:
# working with pair {cod_var, cod_gt}
# Upper-case the gene names so they can be matched against the gene list.
# `assign` returns a new frame instead of writing into `cod_var` in place:
# `cod_var` was derived from `var` by filtering, and the in-place write raises
# a SettingWithCopyWarning that the notebook-wide warnings filter hides.
# The index is unchanged, so `cod_var` stays aligned with `cod_gt`.
cod_var = cod_var.assign(ANN_Gene_Name=cod_var['ANN_Gene_Name'].str.upper())

In [None]:
#cod_var['ANN_Gene_Name'].head()

# Transcript variants whose gene name is in the gene list `gl`. The match is
# exact; gene names were upper-cased in the previous cell, and `gl` is
# presumably upper-case as well -- verify.
cod_var_gs = cod_var.loc[cod_var['ANN_Gene_Name'].isin(gl)]

In [None]:
# Genotype rows for the gene-list variants.
# cod_gt has one row per row of cod_var, in the same order, so it must be
# indexed by POSITION within cod_var. cod_var_gs.index holds the original row
# labels of `var`; used directly they select the wrong rows of cod_gt (or run
# past its end). get_indexer translates those labels into positions in cod_var.
cod_gt_gs = cod_gt[cod_var.index.get_indexer(cod_var_gs.index)]


In [None]:
# TLX3 count homo/hetero
# Columns 2: are taken as the TLX3 samples (AC3814, AC3815 in the sample
# table) -- assumes the genotype columns follow that sample order; confirm.
# tlx_homalt is the homozygous-alternate mask for those columns; the second
# line displays the number of homozygous-alternate calls.
tlx_homalt = cod_gt_gs[:,2:].is_hom_alt()[:]
cod_gt_gs[:,2:].count_hom_alt()


In [None]:
# TAP count homo/hetero
# Columns :2 are taken as the TAP samples (AC3812, AC3813 in the sample
# table) -- assumes the genotype columns follow that sample order; confirm.
# tap_homalt is the homozygous-alternate mask for those columns; the last
# line displays the number of homozygous-alternate calls.
tap_homalt = cod_gt_gs[:,:2].is_hom_alt()[:]

cod_gt_gs[:,:2].count_hom_alt()

In [None]:
# Re-number the gene-list variants 0..n-1 so they line up row by row with the
# genotype masks (the old labels are kept in an 'index' column).
cod_var_gs_r = cod_var_gs.reset_index()

# tlx_homalt / tap_homalt hold one column per sample, so each is reduced to a
# single boolean per variant before being used as a row mask; indexing the
# frame with the 2-D mask itself does not express a per-variant condition.
# `any` keeps a variant when at least one sample of the group is homozygous
# for the alternate allele -- switch to `.all(axis=1)` to require every sample.

# tlx
cod_var_gs_tlx = cod_var_gs_r[np.asarray(tlx_homalt).any(axis=1)]

# tap
cod_var_gs_tap = cod_var_gs_r[np.asarray(tap_homalt).any(axis=1)]

In [None]:
# Columns shown in the result tables: location, alleles and the main
# annotation fields. ALT_1 / ALT_2 are presumably the first two of the ALT
# columns produced by the numbers={'ALT': 4} option -- confirm.
cols = ['CHROM', 'POS', 'REF', 'ALT_1', 'ALT_2',
        'ANN_Annotation',
        'ANN_Annotation_Impact',
        'ANN_Feature_ID',
        'ANN_Feature_Type',
        'ANN_Gene_ID',
        'ANN_Gene_Name',
        'ANN_Rank',
        'ANN_Transcript_BioType']

# Gene-list variants selected with the TLX3 homozygous-alternate mask.
cod_var_gs_tlx[cols]

In [None]:
# Gene-list variants selected with the TAP homozygous-alternate mask.
cod_var_gs_tap[cols]

# CASE 2: Variants in enhancers

In [None]:
import pybedtools as pb

In [None]:
# Enhancer intervals (ChromHMM track, going by the file name) as a BedTool and
# as a DataFrame.
# NOTE(review): the path is relative, so it resolves against the current
# working directory, unlike the DATADIR-based paths used for the VCF/HDF5.
enh = pb.BedTool('tracks/Enhancers_ChromHMM.bed')
enh_df = enh.to_dataframe()

In [None]:
#enh_df.head()

In [None]:
# Same VCF file that is parsed with scikit-allel earlier in the notebook.
# The path is built from DATADIR, exactly as it is there, so `ftlx` keeps one
# meaning for the whole notebook and does not depend on the working directory.
ftlx = join(DATADIR, 'tracks/WGS-WES/Germline/FERRIER_09_Germline.allchr.snpEff.p.SAL.SAL10_1.vcf')
var_b = pb.BedTool(ftlx)

In [None]:
# Combine the variants with the enhancer intervals and save the result.
# pybedtools overloads `+`; presumably it keeps the variant records that
# overlap at least one enhancer -- confirm against the pybedtools docs.
# The output has no VCF header (see the file name); the commented shell
# command in a later cell prepends one.
var_enh = (var_b + enh).saveas('tracks/WGS-WES/Germline/Vars_Enh_noHeader.vcf')

In [None]:
# Number of records in the variants-in-enhancers result.
len(var_enh)

In [None]:
# Concat with header

# !cat tracks/WGS-WES/Germline/Germline_header.txt tracks/WGS-WES/Germline/Vars_Enh_noHeader.vcf > tracks/WGS-WES/Germline/Vars_Enh.vcf

In [None]:
# Load the saved records into a DataFrame. The column names are the VCF fixed
# fields, FORMAT, and the four sample names from the sample table.
var_enh_df = var_enh.to_dataframe(names=['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','AC3812','AC3813','AC3814','AC3815'])

In [None]:
# Preview the first rows of the variants-in-enhancers table.
var_enh_df.head()

In [None]:
#var_enh_df['INFO'][16]

In [None]:
#plot_variant_hist_2d('QD', 'MQ', var, downsample=500)



#“MQ” is average mapping quality across all samples.
#plot_variant_hist('MQ', var, down=2)

#“QD” is a slightly odd statistic but turns out to be very useful 
# for finding poor quality SNPs. Roughly speaking, high numbers 
# mean that evidence for variation is strong (concentrated), 
# low numbers mean that evidence is weak (dilute).


#x = var['QD'][:][::1000]
#plot_variant_hist('QD', var, bins=30, down=500)

#ac = gt.count_alleles()
#ac

