In [2]:
import pybedtools as pb
import numpy as np
#import scipy
import pandas as pd
#import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

## Load enhancers 

In [None]:
# Open the ChromHMM enhancer track as a BedTool and keep a DataFrame copy.
# `enh` is used below for the interval intersections; `enh_df` is not
# referenced again in the visible cells.
enh = pb.BedTool('tracks/Enhancers_ChromHMM.bed')
enh_df = enh.to_dataframe()

## Load peaks and filter by score

In [None]:
# Read the TLX3 peak track and expose it as a DataFrame with explicit
# column names.
tlx_pk = pb.BedTool('tracks/TLX3_TLX3_peaks.bed')
tlx_df = tlx_pk.to_dataframe(names=['chr', 'start', 'end', 'name', 'score'])

# Score cutoff: only peaks scoring strictly above `shr` are kept.
shr = 100
# Selected peaks, ordered from highest to lowest score, converted back
# into a BedTool for the interval operations below.
top = tlx_df.loc[tlx_df['score'] > shr].sort_values(by='score', ascending=False)
tlx_top = pb.BedTool.from_dataframe(top)

In [None]:
# Histogram of the raw peak scores (100 bins), drawn on an explicit Axes so
# the figure carries its own labels and title.
# Log-scale alternative, kept for reference:
#a,b,c = plt.hist(np.log(tlx_df['score']), bins=100)
fig, ax = plt.subplots()
# a: per-bin counts, b: bin edges, c: the drawn patches (hist return values)
a, b, c = ax.hist(tlx_df['score'], bins=100)
ax.set_xlabel('peak score')
ax.set_ylabel('number of peaks')
ax.set_title('TLX3 peak score distribution');

## Intersect enhancers and high score peaks

In [None]:
# Combine the enhancer track with the high-score peaks using BedTool `+`.
# NOTE(review): pybedtools documents `a + b` as shorthand for
# `a.intersect(b, u=True)`, i.e. the enhancers that overlap at least one
# selected peak, each reported once — confirm for the installed version.
enh_tlx_top = enh+tlx_top 

In [None]:
# Quick visual check of the first intervals of the combined track.
enh_tlx_top.head()

## Load WGS variants

In [None]:
# Directory holding the germline WGS/WES variant calls.
path = 'tracks/WGS-WES/Germline/'
# Build the VCF location from `path` so the directory is written only once;
# the resulting string is identical to the previously hard-coded one.
ftlx = path + 'FERRIER_09_Germline.allchr.snpEff.p.SAL.SAL10_1.vcf'
# Open the VCF as a BedTool so it can be intersected with interval tracks.
var_b = pb.BedTool(ftlx)

## Find variants in enhancers with TLX3 peaks

In [None]:
# Keep the variants that fall in the enhancers retained above.
# NOTE(review): relies on BedTool `+` (documented by pybedtools as
# `intersect(..., u=True)`), so each overlapping variant should appear once.
var_ehn_tlx = var_b+enh_tlx_top

In [None]:
# Tabulate the selected variants: nine leading VCF-style columns followed by
# four per-sample columns (AC3812..AC3815).
var_ehn_tlx_df = var_ehn_tlx.to_dataframe(
    names=[
        'CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT',
        'AC3812', 'AC3813', 'AC3814', 'AC3815',
    ]
)

In [None]:
# Number of variants retained (rows of the table).
# Uncomment to preview the first rows instead:
#var_ehn_tlx_df.head(10)
var_ehn_tlx_df.shape[0]

In [None]:
# Enhancer -> gene lookup table (tab-separated), loaded with our own
# column names.
# NOTE(review): with `header=1` pandas treats the second line of the file as
# the header row (replaced by `names`) and skips everything up to it, so the
# first line is dropped too — confirm the file has a leading line before its
# header.
enh2gn = pd.read_csv(
    'tracks/Enhancers_ChromHMM_enh2genes.txt',
    sep='\t',
    header=1,
    names=['enhancers', 'gene_name'],
)

In [None]:
# DataFrame view of the enhancers that carry a high-score peak; its 'name'
# column is read in the next cell.
enh_tlx_top_df = enh_tlx_top.to_dataframe()

In [None]:
# Names of the retained enhancers, used to filter the enhancer -> gene table.
enh_nm = list(enh_tlx_top_df['name'])

In [None]:
# Rows of the enhancer -> gene table whose enhancer is among the retained ones.
enh2gn_top = enh2gn.loc[enh2gn['enhancers'].isin(enh_nm)]

In [None]:
# Flatten the 'gene_name' column into one list of gene symbols.
# Each cell is split on ', ' into entries, and only the text before the first
# ' (' of every entry is kept (presumably entries look like 'GENE (detail)' —
# verify against the table). Duplicates are preserved, as before.
# A single comprehension replaces the index loop with repeated `a = a + [...]`,
# which re-copied the growing list on every row.
a = [
    entry.split(' (')[0]
    for genes in enh2gn_top['gene_name']
    for entry in genes.split(', ')
]


In [None]:
# Number of distinct gene symbols collected (the list itself keeps duplicates).
len(set(a))

# Enrichment

In [None]:

import EnrichRLib as erl
# Gene-set libraries handed to `erl.enrich_gs` in the next cell
# (names presumably match Enrichr library identifiers — verify).
gss = [ 
       'GO_Biological_Process_2018',
       'GO_Cellular_Component_2018',
       'GO_Molecular_Function_2018',
       'KEGG_2016',
       'Reactome_2016'
       ]

In [None]:
# Upper-case every gene symbol; the expression table index is upper-cased the
# same way further down, so the two can be matched.
a = [gene.upper() for gene in a]

# Run the enrichment of the gene list against the selected libraries.
# NOTE(review): `a` may still contain duplicate symbols — confirm that
# `enrich_gs` is fine with that.
enr_a = erl.enrich_gs(a, gss)



In [None]:
# --- Plot ---
# Order the enrichment table by ascending p-value. The sort is done in place,
# so `enr_a` itself stays sorted for any later cell.
enr_a.sort_values(by='p-Val', inplace=True)
# The 20 best-ranked terms
ds = enr_a.iloc[:20]

# Horizontal bars: one per term (the table index), length = -log10(p-value)
f, ax = plt.subplots()
sns.barplot(data=ds, x='-log10(p-Val)', y=ds.index, color="Red", ax=ax)
ax.set_title('Genes regulated by enhancers with high Tlx3 peaks')

# Expression analysis

In [None]:
import RNA_expression_processing as rn
from os.path import join

In [None]:
# Differential-expression results, indexed by the file's first column.
tbl = pd.read_table(join('tracks', 'TLX3vsRAG-results_genesNames.txt'), index_col=0)
# Optional significance filter (disabled):
#tbl = tbl[(tbl.padj < 0.05)].dropna()

# === Pheno ==
# Condition labels: three A labels followed by three B labels, in the same
# order as the six sample columns selected below.
A, B = 'TLX3', 'RAG'
classes = [label for label in (A, B) for _ in range(3)]

# Gene name column plus the six sample columns.
cols = [
    'Gene_name',
    'TLX3.1_1', 'TLX3.1_5', 'TLX3.1_P',
    'R2.RAG1W.RAG1', 'RAGS.RAGZ', 'RAGZ',
]

tbn = tbl[cols]
# Index the samples by gene name (first selected column), upper-cased so it
# matches the upper-cased gene list `a`.
tbv = tbn.set_index(cols[0])
tbv.index = tbv.index.str.upper()



In [None]:

### == UP analysis
# Restrict the expression table to the genes linked to the retained enhancers
# (rows whose upper-cased gene name appears in `a`).
in_gene_list = tbv.index.isin(a)
tbu = tbv.loc[in_gene_list]


In [None]:
# Scatter plot of the selected genes for condition A vs B via the project
# helper `rn.scatter_n`, with `n_top=3` and an empty title.
# NOTE(review): the meaning of the three returned values (jj, kk, ll) and of
# `n_top` is defined in RNA_expression_processing — check there.
jj,kk,ll=rn.scatter_n(tbu, A, B, classes, n_top=3, ttl='') 

In [None]:
# Clustered heatmap of the selected genes via the project helper `rn.cluster`.
# `ntop` is forwarded as `n_top` (presumably the number of top genes shown —
# confirm in RNA_expression_processing).
ntop=40
gr = rn.cluster(tbu, A, B, classes, n_top=ntop)
# The returned object exposes `ax_heatmap`; title it with the two conditions.
gr.ax_heatmap.set_title('Cluster '+A+'/'+B)