In [11]:
# Third-party numerical / genomics stack.
import numpy as np
import scipy
import pandas as pd
import pybedtools as pb

# Plotting: inline rendering, with the default figure DPI raised to 150.
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
mpl.rcParams['figure.dpi']= 150
import seaborn as sns


# Optional dependencies, currently disabled:
#import h5py
#import allel

# Project-local helper modules (must be importable from the kernel's path).
import EnrichRLib as erl
import RNA_expression_processing as rn
import Enh_Mut_Manip as emm

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and dtype warnings from
# pandas/numpy; consider narrowing the filter once the notebook is stable.
import warnings
warnings.filterwarnings('ignore')

In [30]:
# Project settings
import os
from os.path import join

# Root of the project tree. The default is the original author's checkout;
# export TLX3_WORKDIR to run the notebook on another machine without editing
# this cell. An unset or empty variable falls back to the default.
WORKDIR = os.environ.get('TLX3_WORKDIR') or '/home/sergio/Res_CIML/TLX3_project'
SCRIPTS = join(WORKDIR, 'scripts')
DATADIR = join(WORKDIR, 'data')
# Directory holding the TLX3 WGS VCF loaded further down.
WGS = join(DATADIR, 'tracks/WGS-WES/Germline')
# Directory holding the relative regulatory-potential table loaded further down.
RP = join(DATADIR, 'tracks/MARGE/relativeRP/bam_input')

# Data loading

### Genome-wide mutations in TLX3

In [7]:
## == Genome-wide variant calls (VCF) for TLX3
vcf_path = join(WGS, 'TLX3_WGS.vcf.gz')
tlx_gn = pb.BedTool(vcf_path)

In [18]:
## == Expression table: TLX3 vs RAG
expr_path = join(DATADIR, 'tracks', 'TLX3vsRAG-results_genesNames.txt')
tbl = pd.read_table(expr_path, index_col=0)
# Optional significance filter, currently disabled:
#   tbl = tbl[(tbl.padj < 0.05)].dropna()

# -- Phenotype labels: three 'TLX3' entries followed by three 'RAG' entries.
#    Presumably aligned with the six sample columns listed in `cols` below
#    (TODO confirm against the table).
A, B = 'TLX3', 'RAG'
classes = [A, A, A, B, B, B]

# Gene name first, then the six per-sample columns.
cols = [
    'Gene_name',
    'TLX3.1_1', 'TLX3.1_5', 'TLX3.1_P',
    'R2.RAG1W.RAG1', 'RAGS.RAGZ', 'RAGZ',
]

# Keep only those columns, then index the table by its first column
# ('Gene_name').
tbn = tbl[cols]
expr = tbn.set_index(keys=cols[0])
# Optional upper-casing of gene names, currently disabled:
#   expr.index = expr.index.str.upper()

In [19]:
## == Enhancers (ChromHMM BED track), also as a DataFrame
enh_bed_path = join(DATADIR, 'tracks/Enhancers_ChromHMM.bed')
enh = pb.BedTool(enh_bed_path)
enh_df = enh.to_dataframe()

# Enhancer <-> gene lookup tables, one per direction.
# NOTE(review): header=1 makes pandas treat the file's second line as the
# header (replaced here by `names`), so the first line is skipped as well.
# Confirm that both files really start with two non-data lines.
enh2gn_path = join(DATADIR, 'tracks/Enhancers_ChromHMM_enh2genes.txt')
gn2enh_path = join(DATADIR, 'tracks/Enhancers_ChromHMM_genes2enh.txt')

# Enhancer -> genes
enh2gn = pd.read_table(enh2gn_path, header=1, names=['enhancers', 'gene_name'])
# Gene -> enhancers
gn2enh = pd.read_table(gn2enh_path, header=1, names=['gene_name', 'enhancers'])


In [34]:
# Regulatory potentials (one row per transcript, see rpt.head())
rpt = pd.read_table(join(RP, 'RAG_TLX_TAP_relativeRP_mm10mm9.txt'))

# Columns kept in the per-gene table: gene name plus the two relative RPs.
A, B = 'TLX_rel_RP', 'RAG_rel_RP'
cols = ['gene_name', A, B]

# -- transform: order rows by descending TLX-vs-RAG log fold change, keep the
#    first row for each gene name (i.e. the one with the highest lgFC), then
#    reduce to `cols` and index by gene name. `rpt` itself is left untouched.
rp = (
    rpt
    .sort_values('lgFC_TLXvsRAG', axis=0, ascending=False)
    .drop_duplicates(subset='gene_name')
    [cols]
    .set_index(keys=cols[0])
)


In [36]:
# Preview the first rows of the raw table `rpt` (as read from disk, before the
# sort / de-duplication that produces `rp`).
rpt.head()

Unnamed: 0,chr,start,end,gene_id,gene_name,strand,RAG_raw_RP,RAG_rel_RP,TLX_raw_RP,TLX_rel_RP,TAP_raw_RP,TAP_rel_RP,chr_mm9,strand_mm9,start_mm9,end_mm9,lgFC_TLXvsRAG,lgFC_TAPvsRAG,lgFC_TAPvsTLX
0,chr1,134235457,134235458,NM_001282945,Adora1,-,1323.139261,0.718,1292.018637,0.701,536.407687,0.291,chr1,-,136132034,136132035,-0.034569,-1.302965,-1.268395
1,chr1,134235457,134235458,NM_001039510,Adora1,-,1323.139261,0.718,1292.018637,0.701,536.407687,0.291,chr1,-,136132034,136132035,-0.034569,-1.302965,-1.268395
2,chr1,134234856,134234857,NM_001291928,Adora1,-,1329.644667,0.722,1292.148218,0.702,537.01003,0.292,chr1,-,136131433,136131434,-0.040528,-1.30603,-1.265503
3,chr1,134235457,134235458,NM_001008533,Adora1,-,1323.139261,0.718,1292.018637,0.701,536.407687,0.291,chr1,-,136132034,136132035,-0.034569,-1.302965,-1.268395
4,chr1,134235457,134235458,NM_001291930,Adora1,-,1323.139261,0.718,1292.018637,0.701,536.407687,0.291,chr1,-,136132034,136132035,-0.034569,-1.302965,-1.268395


In [25]:
# Look up enhancers for the first ten genes of the expression table.
# NOTE(review): emm.enh_gene is a project helper; presumably it returns the
# enhancer names mapped to the given genes in `gn2enh` -- confirm in
# Enh_Mut_Manip.
first_genes = list(expr.iloc[:10].index)
ts = emm.enh_gene(first_genes, gn2enh)

# Keep the enhancer records whose 'name' is among the returned values.
in_ts = enh_df['name'].isin(ts)
ts_en = enh_df[in_ts]

# Back to a BedTool, sorted.
tsen = pb.BedTool.from_dataframe(ts_en).sort()