In [34]:
import pybedtools as pb
import numpy as np
#import scipy
import pandas as pd
#import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
%load_ext autoreload 
%autoreload 2  

In [2]:
import os
from os.path import join 

# Project root. Defaults to the original location; set the TLX3_WORKDIR environment
# variable to run the notebook from another checkout without editing this cell.
WORKDIR = os.environ.get('TLX3_WORKDIR', '/home/sergio/Res_CIML/TLX3_project')
# Sub-directories resolved relative to the project root.
SCRIPTS = join(WORKDIR,'scripts')
DATADIR = join(WORKDIR,'data')

## Load enhancers 

In [35]:
# Enhancer intervals as a BedTool, plus a DataFrame view of the same intervals.
enh = pb.BedTool(join(DATADIR,'tracks/Enhancers_ChromHMM.bed'))
enh_df = enh.to_dataframe()

# Enh to genes table
# NOTE(review): with header=1 and explicit names=, pandas treats the second line of
# the file as the header row and replaces it, so the first two lines never reach the
# table — presumably a comment line plus a header line; confirm against the file.
enh2gn = pd.read_table(join(DATADIR,'tracks/Enhancers_ChromHMM_enh2genes.txt'), 
                        header=1, 
                        names=['enhancers','gene_name'])
# Gene to enh table (same layout with the two columns swapped; same header handling)
gn2enh = pd.read_table(join(DATADIR,'tracks/Enhancers_ChromHMM_genes2enh.txt'), 
                        header=1, 
                        names=['gene_name','enhancers'])

def enh_gene(genl,df):
    """Return the enhancers associated with one or more genes.

    Parameters
    ----------
    genl : str or iterable of str
        A single gene name, or a collection of gene names.
    df : pandas.DataFrame
        Gene-to-enhancer table with columns ``gene_name`` and ``enhancers``.
        Each ``enhancers`` cell is a space-separated string; only the tokens
        containing ``'enh'`` are treated as enhancer IDs.

    Returns
    -------
    list of str
        Unique enhancer IDs in first-seen order, so the result is reproducible
        from run to run. Genes that are absent from ``df`` contribute nothing.
    """
    if isinstance(genl, str):
        genl = [genl]

    # A dict doubles as an insertion-ordered set: de-duplicates while keeping
    # a deterministic order and avoids re-copying the accumulator each loop.
    found = {}
    for gen in genl:
        reg = df[df['gene_name'] == gen]
        if len(reg) == 0:
            continue
        # Only the first matching row is consulted.
        tokens = reg.iloc[0]['enhancers'].split(' ')
        found.update(dict.fromkeys(tok for tok in tokens if 'enh' in tok))
    return list(found)

def gene_enh(enhl,df):
    """Return the genes associated with one or more enhancers.

    Parameters
    ----------
    enhl : str or iterable of str
        A single enhancer ID, or a collection of enhancer IDs.
    df : pandas.DataFrame
        Enhancer-to-gene table with columns ``enhancers`` and ``gene_name``.
        Each ``gene_name`` cell is a ``', '``-separated string of entries; the
        gene name is the part of each entry before the first ``' ('``.

    Returns
    -------
    list of str
        Unique gene names in first-seen order, so the result is reproducible
        from run to run. Enhancers absent from ``df`` contribute nothing.
    """
    if isinstance(enhl, str):
        enhl = [enhl]

    # A dict doubles as an insertion-ordered set: de-duplicates while keeping
    # a deterministic order and avoids re-copying the accumulator each loop.
    found = {}
    for enh_id in enhl:
        reg = df[df['enhancers'] == enh_id]
        if len(reg) == 0:
            continue
        # Only the first matching row is consulted.
        entries = reg.iloc[0]['gene_name'].split(', ')
        found.update(dict.fromkeys(entry.split(' (')[0] for entry in entries))
    return list(found)


In [36]:
# Sample inputs for a quick check of both lookups; each helper accepts a single
# name or a list (gnn and enhs are defined here but not used by the calls below).
gens = ['Tm9sf2', 'Tle3']
gnn = 'Tm9sf2'
enhs = ['enh_168', 'enh_1683', 'enh_268']
enhh = 'enh_168'

# genes -> enhancers, then enhancer -> genes
print(enh_gene(gens,gn2enh))
print(gene_enh(enhh,enh2gn))



['enh_6659', 'enh_18785', 'enh_6657', 'enh_18786', 'enh_18790', 'enh_18777', 'enh_6658', 'enh_18779', 'enh_18781', 'enh_18787', 'enh_18783', 'enh_6661', 'enh_6660', 'enh_18791', 'enh_18789', 'enh_18793', 'enh_18778', 'enh_18780', 'enh_18792', 'enh_18794', 'enh_18784', 'enh_18788', 'enh_6662', 'enh_18782']
['Il1r2', 'Map4k4']


## Load peaks 

In [None]:
# TLX3 peaks. Resolved against DATADIR, like the enhancer tracks, so the cell does
# not depend on the kernel's current working directory.
tlx = pb.BedTool(join(DATADIR,'tracks/TLX3_TLX3_peaks.bed'))
# DataFrame view with explicit column names; 'score' is the column used for filtering.
tlx_df =tlx.to_dataframe(names=['chr','start','end','name','score'])

#shr = 100
#top = tlx_df[tlx_df['score']>shr].sort_values('score', axis=0, ascending=False)
#tlx_top = pb.BedTool.from_dataframe(top)

In [None]:
#(enh+tlx).head()

## Load TSS

In [None]:
# Gene annotation track. Resolved against DATADIR, like the enhancer tracks, so the
# cell does not depend on the kernel's current working directory.
genes = pb.BedTool(join(DATADIR,'tracks/annot_tracks/genes.bed'))

### == TSS region
# Flank size; same value as the default of gen_wth_peaks' `tss` argument, which is
# what gets passed to `slop(b=...)`.
tss =  3000

In [None]:
def gen_wth_peaks(en,enh2gn,tlx,genes,tss=3000):
    """Genes linked to enhancer(s) `en` that also appear in the flanked-gene/peak overlap.

    Parameters
    ----------
    en : str or list of str
        Enhancer ID(s), forwarded to ``gene_enh``.
    enh2gn : pandas.DataFrame
        Enhancer-to-gene table, forwarded to ``gene_enh``.
    tlx, genes : pybedtools.BedTool
        Peak intervals and gene intervals.
    tss : int
        Passed to ``genes.slop`` as ``b`` (genome ``'mm9'``).

    Returns
    -------
    list of str
        Names present both in ``gene_enh(en, enh2gn)`` and in the ``name``
        column of ``genes.slop(...) + tlx``; order is not defined.

    Notes
    -----
    The slop/overlap step does not depend on ``en`` and is redone on every call.
    NOTE(review): per the pybedtools docs ``a + b`` keeps the features of ``a``
    that overlap ``b`` — confirm for the installed version.
    """
    flanked_genes = genes.slop(b=tss, genome='mm9')
    overlap_df = (flanked_genes + tlx).to_dataframe()
    names_with_peak = set(list(overlap_df['name']))

    linked_genes = set(gene_enh(en, enh2gn))
    return list(linked_genes & names_with_peak)

In [None]:
# pybedtools' `+` operator: per the pybedtools docs this keeps the intervals of
# `enh` that overlap at least one interval of `tlx`.
enh_tlx = enh+tlx
enh_tlx_df = enh_tlx.to_dataframe()

In [None]:
import warnings
# Silences every warning for the rest of the kernel session.
warnings.filterwarnings('ignore')

# Names of the flanked genes that overlap a TLX3 peak. This set is the same for
# every enhancer, so build it once here instead of re-running slop + intersect
# for each row. Uses the notebook-level `tss` flank size.
gn_tss_tlx_names = set((genes.slop(b=tss, genome='mm9') + tlx).to_dataframe()['name'])

# For each enhancer: its linked genes (gene_enh) restricted to that set.
enh_tlx_df['gentlx'] = enh_tlx_df['name'].apply(
    lambda en: list(set(gene_enh(en, enh2gn)) & gn_tss_tlx_names)
)



In [None]:
# Length of each row's 'gentlx' list, computed on the column directly rather than
# by materialising every row with a row-wise apply.
enh_tlx_df['num_gentlx'] = enh_tlx_df['gentlx'].map(len)


In [None]:
# Keep rows with at least one gene in 'gentlx', largest counts first. Built as one
# chained expression so the sort returns a new frame instead of sorting a filtered
# slice in place (the pattern pandas flags as chained assignment).
enh_tlx_df_sort = (
    enh_tlx_df[enh_tlx_df['num_gentlx'] > 0]
    .sort_values('num_gentlx', axis=0, ascending=False)
)

In [None]:
#enh_tlx_df_sort.head()
# Number of rows left after the num_gentlx > 0 filter.
len(enh_tlx_df_sort)

## Load mutations

In [None]:
# Germline VCF. Resolved against DATADIR, like the enhancer tracks, so the cell
# does not depend on the kernel's current working directory.
ftlx = join(DATADIR,'tracks/WGS-WES/Germline/FERRIER_09_Germline.allchr.snpEff.p.SAL.SAL10_1.vcf')
var_b = pb.BedTool(ftlx)

In [None]:
#allTLX_list =list(all_tss_gene['name'])
#allTLX_list.sort()
#len(allTLX_list)

In [None]:
#a,b,c = plt.hist(np.log(tlx_df['score']), bins=100)
#a,b,c = plt.hist(tlx_df['score'], bins=100)

## Intersect enhancers and high score peaks

In [None]:
# Enhancers overlapping high-scoring TLX3 peaks. Later cells read `enh_tlx_top`,
# so its definition has to be live for a top-to-bottom run.
# NOTE(review): the cutoff and the filter are taken from the disabled lines in the
# peak-loading cell — confirm that 100 is still the intended threshold.
shr = 100
tlx_top = pb.BedTool.from_dataframe(
    tlx_df[tlx_df['score'] > shr].sort_values('score', axis=0, ascending=False)
)
enh_tlx_top = enh + tlx_top

In [None]:
#enh_tlx_top.head()

## Load WGS variants

In [None]:
# Germline variant directory, relative to DATADIR.
path = 'tracks/WGS-WES/Germline/'
# Resolved against DATADIR, like the enhancer tracks, so the cell does not depend
# on the kernel's current working directory.
ftlx = join(DATADIR, path, 'FERRIER_09_Germline.allchr.snpEff.p.SAL.SAL10_1.vcf')
var_b = pb.BedTool(ftlx)

## Find variants in enhancers with TLX3 peaks

In [None]:
# pybedtools' `+` operator: per the pybedtools docs this keeps the records of
# var_b that overlap an interval of enh_tlx_top.
var_ehn_tlx = var_b+enh_tlx_top

In [None]:
# Name the columns explicitly: the fixed VCF fields (CHROM ... FORMAT) followed by
# four per-sample columns, AC3812 to AC3815.
var_ehn_tlx_df = var_ehn_tlx.to_dataframe(names=['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','AC3812','AC3813','AC3814','AC3815'])

In [None]:
#var_ehn_tlx_df.head(10)
# Number of variant records kept by the intersection.
len(var_ehn_tlx_df)

In [None]:
# Enhancers Table manipulation
# Same enhancer-to-gene table that the "Load enhancers" cell reads; resolved against
# DATADIR so the cell does not depend on the kernel's current working directory.
enh2gn = pd.read_table(join(DATADIR,'tracks/Enhancers_ChromHMM_enh2genes.txt'), 
                        header=1, 
                        names=['enhancers','gene_name'])

In [None]:
# DataFrame view of the enh_tlx_top intervals (default pybedtools column names).
enh_tlx_top_df = enh_tlx_top.to_dataframe()

In [None]:
# Values of the 'name' column as a plain list; used to filter enh2gn by enhancer ID.
enh_nm = list(enh_tlx_top_df['name'])

In [None]:
# Rows of the enhancer-to-gene table whose enhancer ID is in enh_nm.
enh2gn_top = enh2gn[enh2gn['enhancers'].isin(enh_nm)]

In [None]:
# Flatten the per-enhancer gene strings into one list of gene names: each cell is a
# ', '-separated list of entries, and everything from ' (' onward is dropped from
# each entry. Built in a single pass (duplicates are kept, row order is preserved).
a = [
    entry.split(' (')[0]
    for cell in enh2gn_top['gene_name']
    for entry in cell.split(', ')
]


In [None]:
# Number of distinct gene names collected in `a`.
len(set(a))

# Enrichment

In [None]:

import EnrichRLib as erl
# Gene-set library names passed to erl.enrich_gs in the following cell.
gss = [ 
       'GO_Biological_Process_2018',
       'GO_Cellular_Component_2018',
       'GO_Molecular_Function_2018',
       'KEGG_2016',
       'Reactome_2016'
       ]

In [None]:
# Upper-case the gene names before the enrichment call — presumably the gene-set
# libraries use upper-case symbols; confirm against EnrichRLib.
a = [x.upper() for x in a]

# Enrichment of the gene list against the libraries listed in gss.
enr_a = erl.enrich_gs(a,gss)



In [None]:
# --- Plot ---
# Sort enr_a in place by 'p-Val' (ascending) and take the 20 leading rows.
enr_a.sort_values('p-Val', axis=0, inplace = True)
ds = enr_a.head(20)

f, ax = plt.subplots()
# Horizontal bars: one per index label of ds, length read from its '-log10(p-Val)'
# column (assumed to be provided by erl.enrich_gs — TODO confirm).
sns.barplot(y=ds.index,
            x='-log10(p-Val)',
            ax = ax, 
            color="Red", 
            data = ds)
ax.set_title('Genes regulated by enhancers with high Tlx3 peaks')

# Expression analysis

In [None]:
import RNA_expression_processing as rn
from os.path import join

In [None]:
# Expression results table. Resolved against DATADIR, like the enhancer tracks, so
# the cell does not depend on the kernel's current working directory.
tbl = pd.read_table(join(DATADIR, 'tracks', 'TLX3vsRAG-results_genesNames.txt'), index_col=0)
#tbl = tbl[(tbl.padj < 0.05)].dropna()

# === Pheno ==
# Group labels, one per sample column below: three A samples, then three B samples.
A,B = 'TLX3','RAG'
classes = [A]*3+[B]*3


# Gene name column followed by the six sample columns, in the order of `classes`.
cols = ['Gene_name', 'TLX3.1_1','TLX3.1_5','TLX3.1_P','R2.RAG1W.RAG1','RAGS.RAGZ','RAGZ']

tbn = tbl[cols]
# Index by gene name, upper-cased.
tbv = tbn.set_index(keys=tbn.columns[0])
tbv.index=tbv.index.str.upper()



In [None]:

### == UP analysis

# Keep the expression rows whose index label (upper-cased gene name) is in `a`;
# the boolean mask from isin() is applied positionally through iloc.
tbu = tbv.iloc[tbv.index.isin(a)]


In [None]:
# Project helper from RNA_expression_processing; the meaning of its three return
# values is defined there and is not visible in this notebook.
jj,kk,ll=rn.scatter_n(tbu, A, B, classes, n_top=3, ttl='') 

In [None]:
# n_top value handed to the project's cluster helper.
ntop=40
gr = rn.cluster(tbu, A, B, classes, n_top=ntop)
# The returned object exposes `ax_heatmap` (as a seaborn ClusterGrid does — TODO
# confirm the return type); title it with the two group labels.
gr.ax_heatmap.set_title('Cluster '+A+'/'+B)