# This is used for running data vs all human ensembl databases
### /nvme/bins/iPAGEv1.0/PAGE_DATA/ANNOTATIONS/human_ensembl*

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
import scipy.stats


In [2]:
###generate databases to loop through
cmd = "ls -d /nvme/bins/iPAGEv1.0/PAGE_DATA/ANNOTATIONS/human_ensembl*"
### save as list for looping
databases = os.popen(cmd).read().split('\n')
##format
tmp = []
for db in databases:
    tmp.append(db.split('/')[-1])
databases = tmp.copy()
databases = list(filter(None,databases))

### get ensembl genes represented in iPAGE database

In [3]:
%%bash 

cp /nvme/bins/iPAGEv1.0/PAGE_DATA/ANNOTATIONS/human_ensembl_msigdb_c6/human_ensembl_msigdb_c6_index.txt human_ensembl_index.txt
awk -F '\t' '{print $1}' human_ensembl_index.txt > ensembl_index.txt

In [4]:
databases

['human_ensembl',
 'human_ensembl_encode_tf',
 'human_ensembl_msigdb_c1',
 'human_ensembl_msigdb_c2',
 'human_ensembl_msigdb_c3',
 'human_ensembl_msigdb_c4',
 'human_ensembl_msigdb_c5',
 'human_ensembl_msigdb_c6',
 'human_ensembl_msigdb_c7',
 'human_ensembl_msigdb_full',
 'human_ensembl_msigdb_h',
 'human_ensembl_RBPs_all_gene_ids',
 'human_ensembl_RBPs_all_gene_names',
 'human_ensembl_RBPs_coding_gene_ids',
 'human_ensembl_RBPs_coding_gene_ids_by_3UTR',
 'human_ensembl_RBPs_coding_gene_ids_by_5UTR',
 'human_ensembl_RBPs_coding_gene_ids_by_coding_exons',
 'human_ensembl_RBPs_coding_gene_ids_by_introns',
 'human_ensembl_RBPs_DeepBind']

### Formatting
##### make sure that genes in file match the index 

In [5]:
####define index file to loop through
FCfilename = 'log2FC_3_4.tsv'
subpath = 'ipage_outs_3/'
####
os.system('mkdir '+subpath)
FCfile = pd.read_csv(FCfilename,sep='\t',index_col=0,header=None)
FCfile.columns = ['log10fc']
####get list of ENSG gene names
ENSG = list(set(pd.read_csv('ensembl_index.txt',index_col=0).index.tolist()))
####convert gene names to ENSG
##read in dict
ENSGdict = pd.read_csv('ENSG_geneID.dict',sep='\t',index_col=1).to_dict()['Gene stable ID']
##define dict function
def switch_names(lookup,dictionary):
    if lookup in dictionary:
        return dictionary[lookup]
    else:
        return np.nan
##convert to ENSG
FCfile.index = FCfile.apply(lambda row: switch_names(row.name,ENSGdict),axis=1)
##drop nans
FCfile = FCfile.loc[FCfile.index!=np.nan]
FCfile = FCfile.dropna()
##format columns
FCfile.columns = ['log2FoldChange']
FCfile.index.names = ['Gene']
##save formatted DF, one for each database
savefile = []
for i in range(len(databases)):
    ###make subfolder
    cmd = 'mkdir '+subpath+databases[i]
    os.system(cmd)
    ###save a copy of the FC file to subfolder
    tmpfile = subpath+databases[i]+'/'+FCfilename
    FCfile.to_csv(tmpfile,sep='\t',header=True)
    savefile.append(tmpfile)
print(FCfile.shape)

(14296, 1)


### Excecute the script

In [6]:
##get cwd
cwd = os.getcwd()+"/"
###write final iPAGE command to an ipage .sh file
for i in range(len(databases)):
    iPAGE_cmd = "perl /nvme/bins/iPAGEv1.0/page.pl --expfile "+ \
                cwd+savefile[i]+" --species="+databases[i]+ \
                " --exptype=continuous --ebins=9 \n"
    os.system(iPAGE_cmd)


In [7]:
databases

['human_ensembl',
 'human_ensembl_encode_tf',
 'human_ensembl_msigdb_c1',
 'human_ensembl_msigdb_c2',
 'human_ensembl_msigdb_c3',
 'human_ensembl_msigdb_c4',
 'human_ensembl_msigdb_c5',
 'human_ensembl_msigdb_c6',
 'human_ensembl_msigdb_c7',
 'human_ensembl_msigdb_full',
 'human_ensembl_msigdb_h',
 'human_ensembl_RBPs_all_gene_ids',
 'human_ensembl_RBPs_all_gene_names',
 'human_ensembl_RBPs_coding_gene_ids',
 'human_ensembl_RBPs_coding_gene_ids_by_3UTR',
 'human_ensembl_RBPs_coding_gene_ids_by_5UTR',
 'human_ensembl_RBPs_coding_gene_ids_by_coding_exons',
 'human_ensembl_RBPs_coding_gene_ids_by_introns',
 'human_ensembl_RBPs_DeepBind']

## then run the command from this folder:

nohup sh run.sh iPAGE_JY.sh &

In [None]:
len(databases[0])

In [None]:
####open file and loop through each line
###
iPAGE_sh = "iPAGE_JY.sh"
cwd = os.getcwd()+"/"
os.system("rm "+iPAGE_sh)
###
with open(index_file,'r') as rf:
    for line in rf:
        #####do this to each file
        ###read in file into pandas df
        FC_file = line.split('\n')[0]
        FC_df = pd.read_csv(FC_file,sep=',',index_col=0)
        FC_df.columns = ['log2FoldChange']
        FC_df.index.names = ['Gene']
        ###filter
        FC_df = FC_df.loc[[x for x in FC_df.index if x in genes_with_GOs_set]]
        ###save
        save_file = "fc_values/filt_"+line.split('\n')[0]
        FC_df.to_csv(save_file,sep='\t',header=True)
        ###write final iPAGE command to an ipage .sh file
        iPAGE_cmd = "perl /nvme/bins/iPAGEv1.0/page.pl --expfile "+cwd+save_file+" --species=human_ensembl_msigdb_c6 --exptype=continuous --ebins=9 \n"
        with open(iPAGE_sh, "a") as myfile:
            myfile.write(iPAGE_cmd)
