In [None]:
'''
Calculate significantly enriched pathways of selected 91 CLUE compounds
'''
import pandas as pd
import os
from Enrichment import *
from collections import Counter
import numpy as np
import scipy.cluster.hierarchy as sch
from scipy.stats import pearsonr,spearmanr
from scipy.spatial.distance import cosine
# Set the working directory to the project "Results" folder. The Box share
# lives at a different location on each machine, so probe for the two Windows
# drive letters first and otherwise fall back to the '/Users/...' path.
if os.path.isdir('M:/'):
    # Raw string: '\B', '\J', '\I' and '\R' are not valid escape sequences, so
    # a plain literal triggers an "invalid escape sequence" warning on current
    # Python versions. The resulting path is unchanged.
    os.chdir(r'M:\Box\Jake-Jegga\IPF Drug Discovery\Results')
elif os.path.isdir('E:/'):
    os.chdir('E:/Box Sync/Jake-Jegga/IPF Drug Discovery/Results')
else:
    os.chdir('/Users/wano3m/Box Sync/Jake-Jegga/IPF Drug Discovery/Results')
# Enrichment engine built from the ontology folder (SEA comes from the
# project's Enrichment module).
enrich = SEA('../../../Ontology_Info/')

# Table read from TermID2TermName.txt, indexed by its first column.
termids = pd.read_table('../../../Lincs_data/TermID2TermName.txt', index_col=0)

# Gene conversion table, read as strings; rows without a human Entrez id are
# dropped and the remaining rows are keyed by that id.
master_gene_table = (
    pd.read_csv('../../../Ontology_Info/Master Gene Conversion Table.csv', dtype=str)
    .dropna(subset=['h_entrez_id'])
    .set_index('h_entrez_id')
)

# The IPF enrichment tables live in a sub-folder; move there unless the
# current directory path already mentions it.
if 'Enrichment analysis' not in os.getcwd():
    os.chdir('../Intermediate results/Enrichment analysis')


def _flagged_columns(row):
    """Return the sorted labels of the entries from position 4 onward that equal 1.

    The labels are joined with commas. NOTE(review): the columns past the
    first four are presumably per-gene membership flags -- confirm against
    the enrichment CSV layout.
    """
    flags = row[4:]
    return ','.join(sorted(flags.index[flags == 1]))


# Up- and down-regulated IPF enrichment results, indexed by the third CSV column.
ipf_up_terms = pd.read_csv('IPF metaanalysis median FC 06 enrichment.csv', index_col=2)
ipf_dn_terms = pd.read_csv('IPF metaanalysis median FC -06 enrichment.csv', index_col=2)
for _ipf_table in (ipf_up_terms, ipf_dn_terms):
    # One comma-separated string per term listing its flagged columns.
    _ipf_table['genelist'] = _ipf_table.apply(_flagged_columns, axis=1)

# Enrichment report for the compounds (one row per compound/term), re-keyed by
# "<original index>_<cell line>" so that each compound/cell-line pair forms
# one group of rows.
cmpd_enrichment = pd.read_table('../../Results/91+nintedanib enrichment by dose by cell type report.txt', index_col=0)
cmpd_enrichment['idx'] = cmpd_enrichment.index + '_' + cmpd_enrichment.cellline
cmpd_enrichment.set_index('idx', inplace=True)
cmpd_enrichment.dropna(subset=['Raw_p_value'], inplace=True)

# Represent enrichment p-values as signed log10(p-value): a positive sign for
# terms enriched among up-regulated genes and a negative sign for terms
# enriched among down-regulated genes. log10(p) is negative for p < 1, hence
# the -1 multiplier for 'up'. The vector is computed after the dropna above so
# it stays aligned with the remaining rows.
cmpd_enrichment_sign_vector = np.where(cmpd_enrichment['reg'] == 'up', -1, 1)

# `.ix` was removed from pandas; `.loc` performs the same label-based column
# selection. The explicit copy avoids writing into a view of the full report.
cmpd_enrichment = cmpd_enrichment.loc[:, ['Raw_p_value', 'Term_name', 'genelist']].copy()
cmpd_enrichment['Raw_p_value'] = cmpd_enrichment_sign_vector * np.log10(cmpd_enrichment['Raw_p_value'])

# Column sums of the BP, Pathway and MP tables held by the enrichment engine,
# stacked into one Series keyed by the tables' column labels.
# NOTE(review): presumably each table is a gene-by-term membership matrix, so
# each sum is the number of genes annotated to that term -- confirm in SEA.
# pd.concat replaces the removed Series.append and avoids starting from an
# untyped empty Series.
lib_size = pd.concat([enrich.dict_g2t[cat].sum() for cat in ['BP', 'Pathway', 'MP']])

# Keep only the entries whose sum is at most 1000.
lib_size = lib_size[lib_size <= 1000]
# Term names reported in BOTH the up and the down IPF enrichment tables; these
# are removed from both tables below. A set gives O(1) membership tests; NaN
# names are dropped from it so they are never treated as common.
_dn_term_names = set(ipf_dn_terms.Name.dropna())
cm_ipf_terms = [x for x in ipf_up_terms.Name.values if x in _dn_term_names]

# Build the IPF signature: one row per term name holding a signed log10
# p-value (positive for the up table, negative for the down table) and the
# term's gene list.
_ipf_parts = []
for tmp, reg in zip([ipf_up_terms, ipf_dn_terms], [1, -1]):
    # Keep terms whose index label survived the lib_size filter. A boolean
    # mask is used (rather than .loc with a label list) so duplicated index
    # labels cannot multiply rows.
    tmp = tmp[tmp.index.isin(lib_size.index)]
    tmp = tmp[~tmp.Name.isin(cm_ipf_terms)]
    tmp = tmp.drop_duplicates(subset=['Name']).set_index('Name')[['pValue', 'genelist']].copy()
    # log10(p) is negative for p < 1, so -reg makes the up table positive
    # (reg = 1) and the down table negative (reg = -1).
    tmp['pValue'] = -reg * np.log10(tmp['pValue'])
    _ipf_parts.append(tmp)

# pd.concat replaces the removed DataFrame.append.
ipf_enrichment = pd.concat(_ipf_parts)
    
# Output columns, in the order they are written to the spreadsheet.
# NOTE: 'Peasonr' is a misspelling of "Pearson r"; it is kept as-is because
# the label is part of the output file's schema.
_RESULT_COLUMNS = [
    'Peasonr', 'Spearr', 'cosine',
    'top-Reversed', 'reversed-cmpd-psum', 'reversed-ipf-psum',
    'ipf-up', 'ipf-dn', 'cmpd-up', 'cmpd-dn',
    'reversed-genes', 'reversed/total',
]


def _pooled_genes(terms):
    """Return the sorted, de-duplicated genes of ``terms.genelist``, comma-joined.

    Each genelist entry is itself a comma-separated string. An empty ``terms``
    frame yields the empty string.
    """
    return ','.join(sorted(set(','.join(terms.genelist).split(','))))


def _score_compound(cmpd, current_cmpd, ipf_enrichment):
    """Compare one compound's enrichment profile with the IPF signature.

    Parameters
    ----------
    cmpd : label of the compound, used only in the skip message.
    current_cmpd : DataFrame with columns 'Raw_p_value', 'Term_name' and
        'genelist' holding that compound's rows.
    ipf_enrichment : DataFrame indexed by term name with columns 'pValue'
        and 'genelist', in that order.

    Returns
    -------
    dict mapping result-column names to values, or None when the compound
    shares too few terms with the IPF signature. When no term has opposite
    signs in the two profiles, only the three similarity scores are returned.
    """
    # get rid of redundant terms, this is most likely a bug in ontology
    term_counts = current_cmpd.Term_name.value_counts()
    valid_terms = set(term_counts.index[term_counts == 1])
    cm_terms = [x for x in current_cmpd.Term_name
                if x in valid_terms and x in ipf_enrichment.index]
    if len(cm_terms) <= 2:
        # Message wording matches the `<= 2` condition above.
        print('{} has 2 or fewer terms in common with ipf enrichment signature'.format(cmpd))
        return None

    current_cmpd = current_cmpd.set_index('Term_name')

    # Compound profile laid out on the IPF term index: 0 for every term the
    # compound does not share, its own score and gene list for shared terms.
    # Explicit float/str defaults avoid assigning into an integer column.
    _cmpd_enrichment = pd.DataFrame({'pval': 0.0, 'genelist': ''},
                                    index=ipf_enrichment.index)
    _cmpd_enrichment.loc[cm_terms, 'pval'] = current_cmpd.loc[cm_terms, 'Raw_p_value'].values
    _cmpd_enrichment.loc[cm_terms, 'genelist'] = current_cmpd.loc[cm_terms, 'genelist'].values

    ipf_vector = ipf_enrichment.pValue
    cmpd_vector = _cmpd_enrichment.pval
    row = {
        'Peasonr': pearsonr(ipf_vector, cmpd_vector)[0],
        'Spearr': spearmanr(ipf_vector, cmpd_vector)[0],
        # scipy's `cosine` is a distance; 1 - distance is the similarity.
        'cosine': 1 - cosine(ipf_vector, cmpd_vector),
    }

    # "Reversed" terms: opposite sign in the IPF and compound profiles.
    reversed_terms = ipf_enrichment.index[(ipf_vector * cmpd_vector) < 0]
    if reversed_terms.shape[0] == 0:
        return row

    # Rank reversed terms by the mean absolute score of the two profiles so
    # that the ten reported terms really are the strongest ones.
    mean_strength = (ipf_vector.abs() + cmpd_vector.abs()) / 2
    top10_reversed_terms = (mean_strength.loc[reversed_terms]
                            .sort_values(ascending=False).index[:10])
    row['top-Reversed'] = ', '.join(top10_reversed_terms)

    # Each sum is taken from the profile its column name refers to.
    row['reversed-cmpd-psum'] = cmpd_vector.loc[reversed_terms].abs().sum()
    row['reversed-ipf-psum'] = ipf_vector.loc[reversed_terms].abs().sum()

    # Pool the genes of the reversed terms, split by profile and by sign of
    # the score (first column of each frame).
    for enrichment_lib, dataset in zip([ipf_enrichment, _cmpd_enrichment], ['ipf', 'cmpd']):
        enrichment_lib = enrichment_lib.loc[reversed_terms]
        scores = enrichment_lib.iloc[:, 0]
        row['-'.join([dataset, 'up'])] = _pooled_genes(enrichment_lib[scores > 0])
        row['-'.join([dataset, 'dn'])] = _pooled_genes(enrichment_lib[scores < 0])

    # Genes that are up in IPF but down under the compound, or vice versa.
    cmpd_dn_genes = set(row['cmpd-dn'].split(','))
    cmpd_up_genes = set(row['cmpd-up'].split(','))
    reversed_genes = [x for x in row['ipf-up'].split(',') if x in cmpd_dn_genes]
    reversed_genes += [x for x in row['ipf-dn'].split(',') if x in cmpd_up_genes]
    row['reversed-genes'] = ','.join(sorted(x for x in reversed_genes if x != ''))

    # Share of the compound's total absolute score that sits on reversed terms.
    row['reversed/total'] = row['reversed-cmpd-psum'] / current_cmpd.Raw_p_value.abs().sum()
    return row


_corr_rows = []
_corr_names = []
# One group per index label, in order of first appearance.
for cmpd, current_cmpd in cmpd_enrichment.groupby(level=0, sort=False):
    if len(current_cmpd) < 2:
        # A single-row compound cannot share enough terms; skip it silently.
        continue
    _row = _score_compound(cmpd, current_cmpd, ipf_enrichment)
    if _row is not None:
        _corr_names.append(cmpd)
        _corr_rows.append(_row)

# Rows that returned early simply have NaN in the columns they did not fill.
cmpd_ipf_corr = pd.DataFrame(_corr_rows, index=_corr_names, columns=_RESULT_COLUMNS)
cmpd_ipf_corr.to_excel('91 cmpd enrichment.xlsx')