In [6]:
## Full backend analysis for the CREEDS and L1000 dataset 
import os
import pandas as pd
from pandas.compat import StringIO
import numpy as np
from numpy import loadtxt
import sys
import json
from pprint import pprint
import objectpath
import csv
import re
import matplotlib.pyplot as plt
import json, requests
from pprint import pprint
import itertools
import scipy
from scipy.spatial import distance
from sklearn.metrics.pairwise import pairwise_distances
from clustergrammer_widget import *
def get_geneset(df, indexer):
    df_ = df.loc[indexer, :]
    return list(df_[df_ == 1].index)

In [4]:
#### DRUG QUERY
## load in the pre-formed datasets from the L1000_Analysis and CREEDS_Analysis files.
## THIS TAKES A WHILE TO LOAD, SO ONLY LOAD THIS ONCE AND EARLY

# L1000 up and down gene loads for drug signatures
L1000_up_genes = pd.read_csv("L1000_up_genes.csv")
L1000_down_genes = pd.read_csv("L1000_down_genes.csv")

# CREEDS up and down genes for disease signatures
with open("disease_signatures-v1.0.json") as f:
    CREEDS_data = json.load(f)

# generate the up and down gene signatures
CREEDS_up_genes = {
    row['do_id']: row['up_genes']
    for row in CREEDS_data
}
CREEDS_down_genes = {
    row['do_id']: row['down_genes']
    for row in CREEDS_data
}

# load in the EMR Data (filtered > 200 in R code [Drug_diagnosis_test_code.R])
EMR_data = pd.read_csv("EMR_greater_200.csv")
## subset EMR data by the DOI and/or DrOI
EMR_data_df = pd.DataFrame(EMR_data)
#EMR_data
EMR_data_df.drop(EMR_data_df.columns[[0]], axis = 1, inplace = True) # remove the unecessary columns
#EMR_data_df

# implement the search from ICD9-do_id from the manual conversion
icd9_to_doid = pd.read_csv("ICD9_CREEDS_conversion.csv")
icd9_to_doid = pd.DataFrame(icd9_to_doid) # convert it to a data fram to drop unecessary rows
#icd9_to_doid # sanity check
icd9_to_doid_final = icd9_to_doid.drop(icd9_to_doid.columns[[0, 6, 7, 8, 9, 10, 11, 12, 13, 14]], axis = 1)
#icd9_to_doid_final # sanity check

## PRELOADED DATA
## L1000 conversion and analysis DRUG
L1000_down_extract = pd.DataFrame(L1000_down_genes)
L1000_up_extract = pd.DataFrame(L1000_up_genes)

## L1000 ANALYSIS -- FEEDS BACK INTO THE API to get additional signatures
metadata = pd.read_csv("L1000_metadata.csv")
#metadata ## same as LINC1000h5.row_metadata_df

EMR_Drug_Names = EMR_data_df['Drug_Name'] # this will be the selection for the dropdown menu
## get the subselection of drug names

unique_pert_ids = metadata['pert_id'].unique()
unique_drug_names = metadata['pert_desc'].unique()

possible_drug_inputs = set(unique_drug_names) & set(EMR_Drug_Names) # 28 possible drug inputs


In [None]:
### L1000
DrOI = "warfarin" # drug of interest. MAKE SURE TO SELECT THIS FROM "possible_drug_inputs"
####

DrOI_up_extract = L1000_up_extract[L1000_up_extract['Unnamed: 0'].apply(lambda s: bool(re.compile(DrOI, re.IGNORECASE).search(s)))]
DrOI_up_final = DrOI_up_extract.loc[:, (DrOI_up_extract != 0).any(axis=0)] # remove any genes without any expression in any of the results

DrOI_down_extract = L1000_down_extract[L1000_down_extract['Unnamed: 0'].apply(lambda s: bool(re.compile(DrOI, re.IGNORECASE).search(s)))]
DrOI_down_final = DrOI_down_extract.loc[:, (DrOI_down_extract != 0).any(axis=0)] # remove any genes without any expression in any of the results


L1000FWD_URL = 'http://amp.pharm.mssm.edu/L1000FWD/'

query_string = DrOI
response = requests.get(L1000FWD_URL + 'synonyms/' + query_string)
if response.status_code == 200:
	pprint(response.json())
	#json.dump(response.json(), open('api1_result.json', 'wb'), indent=4)
L1000_significant_query = response.json()
L1000_significant_query_df = pd.DataFrame(L1000_significant_query)
L1000_significant_pert_ids = L1000_significant_query_df["pert_id"]
L1000_summary_table = metadata[metadata["pert_id"].isin(L1000_significant_pert_ids)] # display this table!
L1000_drug_signatures = L1000_summary_table.rid

L1000_all_up_genes = []
L1000_all_down_genes = []

if len(L1000_significant_pert_ids) > 0:
    
    test=[]
    for q in L1000_significant_pert_ids:
        meta_doi = metadata[metadata["pert_id"].apply(lambda s: bool(re.compile(str(q), re.IGNORECASE).search(str(s))))]
        #print(q)
        meta_doi_ids = meta_doi.rid
        query = list(meta_doi_ids)
       # print(query)
        test.append(query)
    #test
    test1 = [x for x in test if x]
   #test1
    L1000_drug_signatures = list(itertools.chain.from_iterable(test1))
    
  
    if len(L1000_drug_signatures) > 0:
    ## L1000 iterations API   
        L1000FWD_URL = 'http://amp.pharm.mssm.edu/L1000FWD/'
        json_name_store = []
        drug_sig_store = []
        for x in L1000_drug_signatures:
            sig_id = x
            response = requests.get(L1000FWD_URL + 'sig/' + str(sig_id))
            if response.status_code == 200:
                #pprint(response.json())
                json.dump(response.json(), open(sig_id + '_api2_result.json', 'w'), indent=4)
                json_name = (sig_id + '_api2_result.json')
                print(json_name)
                json_name_store.append(json_name)
                
                L1000_query = response.json() 
                
                L1000_query_up_genes = L1000_query["up_genes"]
                L1000_all_up_genes.append(L1000_query_up_genes)
                
                L1000_query_down_genes = L1000_query["down_genes"]
                L1000_all_down_genes.append(L1000_query_down_genes)



    
        print("SIGNIFICANT L1000 SIGNATURES")
    else:
        print ("NO Significant signatures found in metadata")


        
else:
    print("NO SIGNIFICANT L1000 SIGNATURES FOR" + DrOI)

    
# generate gene matrices for the clustergrammer
L1000_all_up_genes_df = pd.DataFrame(L1000_all_up_genes)
L1000_all_up_index_names = [s + "_up_genes" for s in L1000_drug_signatures] # we will probably have to change L1000_drug_signatures to a variable with selected signatures.
L1000_all_up_genes_df.index = [L1000_all_up_index_names]
#L1000_all_up_genes_df

L1000_all_down_genes_df = pd.DataFrame(L1000_all_down_genes)
L1000_all_down_index_names = [s + "_down_genes" for s in L1000_drug_signatures]
L1000_all_down_genes_df.index = [L1000_all_down_index_names]
#L1000_all_down_genes_df


L1000_clustergrammer_combined_list = [L1000_all_up_genes_df
          ,L1000_all_down_genes_df]

L1000_clustergrammer_combined_df = pd.concat(L1000_clustergrammer_combined_list)
L1000_clustergrammer_combined_binary_table = pd.get_dummies(L1000_clustergrammer_combined_df,
              prefix_sep= "",
              prefix = "",
              )


##### Clustergrammer data preparation

### need to be able to select which signatures to compare against each other based on a user selection of the box

# should input L1000_clustergrammer_combined_binary_table

## build Jaccard index
jac_sim = 1- pairwise_distances(L1000_clustergrammer_combined_binary_table, metric = "hamming")

jac_sim = pd.DataFrame(jac_sim, index=L1000_clustergrammer_combined_binary_table.index.map(lambda s: s[:10]), columns=L1000_clustergrammer_combined_binary_table.index.map(lambda s: s[:10]))
jac_sim_clustergrammer = jac_sim.to_csv("L1000_test.tsv", sep = "\t")


## clustergrammer implementation
# load data into new network instance and cluster
net = Network(clustergrammer_widget)


#net.load_file('cars.tsv')
net.load_file("L1000_test.tsv")
net.cluster()

# view the results as a widget
net.widget()

In [59]:
L1000_clustergrammer_combined_binary_table

Unnamed: 0,ADD3,AIM1,ASB13,BPTF,CYP2J2,ELANE,FAM198B,FKBP14,FXYD3,FXYD6,...,BCL6,COQ9,KANK1,LTF,COL6A1,PPP3CA,FUT8,KIAA0528,CKAP4,BZW2
CPC004_A375_6H:BRD-A94756469-001-03-9:10_up_genes,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
CPC004_HA1E_24H:BRD-A94756469-001-03-9:10_up_genes,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CPC004_HA1E_6H:BRD-A94756469-001-03-9:10_up_genes,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CPC004_HCC515_24H:BRD-A94756469-001-03-9:10_up_genes,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CPC004_HCC515_6H:BRD-A94756469-001-03-9:10_up_genes,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CPC004_HEPG2_6H:BRD-A94756469-001-03-9:10_up_genes,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
CPC004_HT29_6H:BRD-A94756469-001-03-9:10_up_genes,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CPC004_MCF7_6H:BRD-A94756469-001-03-9:10_up_genes,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CPC004_PC3_24H:BRD-A94756469-001-03-9:10_up_genes,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CPC004_PC3_6H:BRD-A94756469-001-03-9:10_up_genes,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
L1000_clustergrammer_combined_list

[                                                         0        1    \
 CPC004_A375_6H:BRD-A94756469-001-03-9:10_up_genes     FKBP14    U2AF2   
 CPC004_HA1E_24H:BRD-A94756469-001-03-9:10_up_genes     ZNF43    ELANE   
 CPC004_HA1E_6H:BRD-A94756469-001-03-9:10_up_genes        LIF   PMAIP1   
 CPC004_HCC515_24H:BRD-A94756469-001-03-9:10_up_...      BPTF    PTGS2   
 CPC004_HCC515_6H:BRD-A94756469-001-03-9:10_up_g...     ELANE   PPAP2C   
 CPC004_HEPG2_6H:BRD-A94756469-001-03-9:10_up_genes      TGM2    TM2D3   
 CPC004_HT29_6H:BRD-A94756469-001-03-9:10_up_genes      PTGS2   TRIM13   
 CPC004_MCF7_6H:BRD-A94756469-001-03-9:10_up_genes       SSPN    C5AR1   
 CPC004_PC3_24H:BRD-A94756469-001-03-9:10_up_genes      PTGS2    TIMP3   
 CPC004_PC3_6H:BRD-A94756469-001-03-9:10_up_genes       PTGS2      LIF   
 CPC004_VCAP_6H:BRD-A94756469-001-03-9:10_up_genes    TRMT112     PECR   
 CPC017_A375_6H:BRD-A94756469-001-03-9:10_up_genes      SFTPD    PTGS2   
 CPC017_A549_24H:BRD-A94756469-001-03-

In [21]:
L1000_up_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,218,219,220,221,222,223,224,225,226,227
0,FXYD6,FAM198B,GNLY,SFTPB,NRCAM,HBB,P2RY13,CHMP2A,RASA4,MFAP2,...,RAB25,RPL38,PNP,MPPED2,OAZ2,KLHDC8A,COL6A1,ABCC5,FUCA1,EIF4B


In [55]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df1 = scaler.fit_transform(jac_sim)

df2 = pd.DataFrame(df1)


ValueError: Length mismatch: Expected axis has 96 elements, new values have 18972 elements

In [60]:
frames

[                                                         0        1    \
 CPC004_A375_6H:BRD-A94756469-001-03-9:10_up_genes     FKBP14    U2AF2   
 CPC004_HA1E_24H:BRD-A94756469-001-03-9:10_up_genes     ZNF43    ELANE   
 CPC004_HA1E_6H:BRD-A94756469-001-03-9:10_up_genes        LIF   PMAIP1   
 CPC004_HCC515_24H:BRD-A94756469-001-03-9:10_up_...      BPTF    PTGS2   
 CPC004_HCC515_6H:BRD-A94756469-001-03-9:10_up_g...     ELANE   PPAP2C   
 CPC004_HEPG2_6H:BRD-A94756469-001-03-9:10_up_genes      TGM2    TM2D3   
 CPC004_HT29_6H:BRD-A94756469-001-03-9:10_up_genes      PTGS2   TRIM13   
 CPC004_MCF7_6H:BRD-A94756469-001-03-9:10_up_genes       SSPN    C5AR1   
 CPC004_PC3_24H:BRD-A94756469-001-03-9:10_up_genes      PTGS2    TIMP3   
 CPC004_PC3_6H:BRD-A94756469-001-03-9:10_up_genes       PTGS2      LIF   
 CPC004_VCAP_6H:BRD-A94756469-001-03-9:10_up_genes    TRMT112     PECR   
 CPC017_A375_6H:BRD-A94756469-001-03-9:10_up_genes      SFTPD    PTGS2   
 CPC017_A549_24H:BRD-A94756469-001-03-

In [44]:
jac_sim

Unnamed: 0,CPC004_A375_6H:BRD-A94756469-001-03-9:10_up_genes,CPC004_HA1E_24H:BRD-A94756469-001-03-9:10_up_genes,CPC004_HA1E_6H:BRD-A94756469-001-03-9:10_up_genes,CPC004_HCC515_24H:BRD-A94756469-001-03-9:10_up_genes,CPC004_HCC515_6H:BRD-A94756469-001-03-9:10_up_genes,CPC004_HEPG2_6H:BRD-A94756469-001-03-9:10_up_genes,CPC004_HT29_6H:BRD-A94756469-001-03-9:10_up_genes,CPC004_MCF7_6H:BRD-A94756469-001-03-9:10_up_genes,CPC004_PC3_24H:BRD-A94756469-001-03-9:10_up_genes,CPC004_PC3_6H:BRD-A94756469-001-03-9:10_up_genes,...,CPC017_A375_6H:BRD-K23478508-001-03-7:10_down_genes,CPC017_A549_24H:BRD-K23478508-001-03-7:10_down_genes,CPC017_A549_6H:BRD-K23478508-001-03-7:10_down_genes,CPC017_ASC_24H:BRD-K23478508-001-03-7:10_down_genes,CPC017_HEPG2_6H:BRD-K23478508-001-03-7:10_down_genes,CPC017_HT29_6H:BRD-K23478508-001-03-7:10_down_genes,CPC017_MCF7_24H:BRD-K23478508-001-03-7:10_down_genes,CPC017_MCF7_6H:BRD-K23478508-001-03-7:10_down_genes,CPC017_NPC_24H:BRD-K23478508-001-03-7:10_down_genes,CPC017_PHH_24H:BRD-K23478508-001-03-7:10_down_genes
CPC004_A375_6H:BRD-A94756469-001-03-9:10_up_genes,0.000000,0.023877,0.026460,0.024984,0.025459,0.027198,0.025880,0.025090,0.024352,0.026407,...,0.024035,0.023192,0.022296,0.024668,0.023034,0.020398,0.024984,0.023561,0.024984,0.024352
CPC004_HA1E_24H:BRD-A94756469-001-03-9:10_up_genes,0.023877,0.000000,0.023983,0.022401,0.022876,0.024931,0.023297,0.022612,0.021769,0.024246,...,0.021558,0.020715,0.019819,0.022191,0.020557,0.017921,0.022507,0.021084,0.022507,0.021874
CPC004_HA1E_6H:BRD-A94756469-001-03-9:10_up_genes,0.026460,0.023983,0.000000,0.024984,0.024931,0.027198,0.025775,0.024879,0.024035,0.026197,...,0.024035,0.023192,0.022296,0.024668,0.023034,0.020398,0.024984,0.023561,0.024984,0.024352
CPC004_HCC515_24H:BRD-A94756469-001-03-9:10_up_genes,0.024984,0.022401,0.024984,0.000000,0.023877,0.025933,0.024299,0.023508,0.022876,0.025248,...,0.022454,0.021716,0.020820,0.023192,0.021558,0.018923,0.023508,0.022085,0.023403,0.022876
CPC004_HCC515_6H:BRD-A94756469-001-03-9:10_up_genes,0.025459,0.022876,0.024931,0.023877,0.000000,0.025669,0.024668,0.024088,0.023139,0.025828,...,0.023034,0.022191,0.021295,0.023666,0.022032,0.019397,0.023983,0.022560,0.023877,0.023350
CPC004_HEPG2_6H:BRD-A94756469-001-03-9:10_up_genes,0.027198,0.024931,0.027198,0.025933,0.025669,0.000000,0.026829,0.025933,0.025090,0.027672,...,0.024984,0.024141,0.023245,0.025617,0.023983,0.021242,0.025933,0.024510,0.025933,0.025300
CPC004_HT29_6H:BRD-A94756469-001-03-9:10_up_genes,0.025880,0.023297,0.025775,0.024299,0.024668,0.026829,0.000000,0.024404,0.023666,0.026038,...,0.023456,0.022612,0.021716,0.024088,0.022454,0.019819,0.024404,0.022981,0.024404,0.023666
CPC004_MCF7_6H:BRD-A94756469-001-03-9:10_up_genes,0.025090,0.022612,0.024879,0.023508,0.024088,0.025933,0.024404,0.000000,0.022876,0.025353,...,0.022665,0.021822,0.020926,0.023297,0.021664,0.019028,0.023614,0.022191,0.023508,0.022981
CPC004_PC3_24H:BRD-A94756469-001-03-9:10_up_genes,0.024352,0.021769,0.024035,0.022876,0.023139,0.025090,0.023666,0.022876,0.000000,0.024615,...,0.021927,0.021084,0.020188,0.022560,0.020926,0.018185,0.022876,0.021453,0.022770,0.022138
CPC004_PC3_6H:BRD-A94756469-001-03-9:10_up_genes,0.026407,0.024246,0.026197,0.025248,0.025828,0.027672,0.026038,0.025353,0.024615,0.000000,...,0.024404,0.023561,0.022665,0.025037,0.023403,0.020767,0.025353,0.023930,0.025248,0.024721


In [56]:
##### Clustergrammer data preparation

### need to be able to select which signatures to compare against each other based on a user selection of the box

# should input L1000_clustergrammer_combined_binary_table

## build Jaccard index
jac_sim = 1- pairwise_distances(df2, metric = "hamming")

jac_sim = pd.DataFrame(jac_sim, index=L1000_clustergrammer_combined_binary_table.index.map(lambda s: s[:10]), columns=L1000_clustergrammer_combined_binary_table.index.map(lambda s: s[:10]))
jac_sim_clustergrammer = jac_sim.to_csv("L1000_test.tsv", sep = "\t")



In [57]:

## clustergrammer implementation
# load data into new network instance and cluster
net = Network(clustergrammer_widget)


#net.load_file('cars.tsv')
net.load_file("L1000_test.tsv")
net.cluster()

# view the results as a widget
net.widget()

  tmp_df['mat'] = pd.read_table(file_buffer, index_col=row_arr)


clustergrammer_widget(network='{"row_nodes": [{"name": "CPC001_HA1E_24H:BRD-K23478508-001-03-7:10_down_genes",…

In [3]:
### l1000 vs creeds

process3 = pd.read_csv("combined_l1000_creeds.tsv", sep = "\t")
#process3.columns = ['_'.join(col).strip() for col in process3.columns.values]
#process3.index = ['_'.join(ind).strip() for ind in process3.index.values]

TypeError: can only join an iterable

In [4]:

## clustergrammer implementation
# load data into new network instance and cluster
net = Network(clustergrammer_widget)

net.load_file("combined_l1000_creeds.tsv")
net.cluster()

# view the results as a widget
net.widget()

  tmp_df['mat'] = pd.read_table(file_buffer, index_col=row_arr)


clustergrammer_widget(network='{"row_nodes": [{"name": "APECED - Autoimmune polyendocrinopathy-candidiasis-ect…

In [None]:



## clustergrammer implementation
# load data into new network instance and cluster
net = Network(clustergrammer_widget)


#net.load_file('cars.tsv')
net.load_file("jac_sim1.tsv")
net.cluster()

# view the results as a widget
net.widget()

In [189]:
### L1000 PRELOAD DATA DRUG CARD
with open ("L1000_up_lookup.json", 'r') as f:
    L1000_up_lookup = json.load(f)

with open ("L1000_down_lookup.json", 'r') as f:
    L1000_down_lookup = json.load(f)
    
metadata = pd.read_csv("L1000_metadata.csv")

# CREEDS up and down genes for disease signatures
with open("disease_signatures-v1.0.json") as f:
    CREEDS_data = json.load(f)

# generate the up and down gene signatures
CREEDS_up_genes = {
    row['do_id']: row['up_genes']
    for row in CREEDS_data
}
CREEDS_down_genes = {
    row['do_id']: row['down_genes']
    for row in CREEDS_data
}


# load in the EMR Data (filtered > 200 in R code [Drug_diagnosis_test_code.R])
EMR_data = pd.read_csv("EMR_greater_200.csv")
## subset EMR data by the DOI and/or DrOI
EMR_data_df = pd.DataFrame(EMR_data)
#EMR_data
EMR_data_df.drop(EMR_data_df.columns[[0]], axis = 1, inplace = True) # remove the unecessary columns
#EMR_data_df

# implement the search from ICD9-do_id from the manual conversion
icd9_to_doid = pd.read_csv("ICD9_CREEDS_conversion.csv")
icd9_to_doid = pd.DataFrame(icd9_to_doid) # convert it to a data fram to drop unecessary rows
#icd9_to_doid # sanity check
icd9_to_doid_final = icd9_to_doid.drop(icd9_to_doid.columns[[0, 6, 7, 8, 9, 10, 11, 12, 13, 14]], axis = 1)
#icd9_to_doid_final # sanity check


In [190]:
### CLUSTERGRAMMER UPDATED FOR DRUG INPUT

## DRUG SEARCH - USE WARFARIN FOR AN EXAMPLE

DrOI = "warfarin"

### LOAD IN DRUG MATRIX FILE
DrugMatrix ={}
for line in open('DrugMatrix.txt', 'r'):
    label, genelist = line.split('\t\t', maxsplit=1) 
    genelist_split = genelist.strip().split('\t')
    DrugMatrix[label] = genelist_split


DrugMatrix = {x.replace('/', '_'): v  
     for x, v in DrugMatrix.items()}
### DrugMatrix drug input


## generate a list of searchable keys to reduce dict
DrugMatrix_keys = pd.DataFrame(list(DrugMatrix.keys()))
DrugMatrix_keys.columns = ["sigs"]
Drug_Matrix_DrOI = DrugMatrix_keys[DrugMatrix_keys["sigs"].apply(lambda s: bool(re.compile(str(DrOI), re.IGNORECASE).search(str(s))))]

## reduce dict
Drug_matrix_sigs_reduced = list(Drug_Matrix_DrOI["sigs"])
#DrugMatrix_sigs = {k: DrugMatrix[k] for k in list(Drug_matrix_sigs_reduced["sigs"])} # total sigs

## up sigs
Drug_matrix_up_sigs_reduced = Drug_Matrix_DrOI[Drug_Matrix_DrOI["sigs"].apply(lambda s: bool(re.compile(str("-up"), re.IGNORECASE).search(str(s))))]
DrugMatrix_up_sigs= {k: DrugMatrix[k] for k in list(Drug_matrix_up_sigs_reduced["sigs"])}

for a in list(Drug_matrix_up_sigs_reduced["sigs"]):
    DrugMatrix_up_sigs_save = DrugMatrix_up_sigs[a]
    print(a)
    #with open(a + "_DrugMatrix_up_sig.json", "w") as f:
        #json.dump(DrugMatrix_up_sigs_save, f)
        
## down sigs
Drug_matrix_down_sigs_reduced = Drug_Matrix_DrOI[Drug_Matrix_DrOI["sigs"].apply(lambda s: bool(re.compile(str("-dn"), re.IGNORECASE).search(str(s))))]
DrugMatrix_down_sigs= {k: DrugMatrix[k] for k in list(Drug_matrix_down_sigs_reduced["sigs"])}

for b in list(Drug_matrix_down_sigs_reduced["sigs"]):
    DrugMatrix_down_sigs_save = DrugMatrix_down_sigs[b]
    print(b)
    #with open(b + "_DrugMatrix_down_sig.json", "w") as f:
        #json.dump(DrugMatrix_down_sigs_save, f)
        
        


### L1000 DRUG CARD DRUG INPUT

DrOI_df = metadata[metadata["pert_desc"] == DrOI]
DrOI_pert_ids = list(DrOI_df["pert_id"])

DrOI_up_signatures = {k: L1000_up_lookup[k] for k in (DrOI_pert_ids)}
DrOI_up_no_perts = {k: v for d in DrOI_up_signatures.values() for k, v in d.items()}
DrOI_up_drug_sigs = list(DrOI_up_no_perts.keys())

DrOI_down_signatures = {k: L1000_down_lookup[k] for k in (DrOI_pert_ids)}
DrOI_down_no_perts = {k: v for d in DrOI_down_signatures.values() for k, v in d.items()}
DrOI_down_drug_sigs = list(DrOI_down_no_perts.keys())

DrOI_all_sigs = set(DrOI_up_drug_sigs) & set (DrOI_down_drug_sigs)
DrOI_all_sigs_up = [s + "_up" for s in DrOI_all_sigs]
DrOI_all_sigs_down = [s + "_down" for s in DrOI_all_sigs]
######## NEW CODE
DrOI_all_sigs_display = [DrOI + "_" + s for s in list(DrOI_all_sigs)]
########


for a in DrOI_all_sigs:
    L1000_up_json_file = DrOI_up_no_perts[a]
    L1000_down_json_file = DrOI_down_no_perts[a]
    print(a)
    #with open(a + "_L1000_up_sig.json", "w") as f:
        #json.dump(L1000_up_json_file, f)
    #with open(a + "_L1000_down_sig.json", "w") as f:
        #json.dump(L1000_down_json_file, f)

        
### CREEDS DRUG CARD 

#for a in loop_iteration:
CREEDS_URL = 'http://amp.pharm.mssm.edu/CREEDS/'
CREEEDS_Drug_response = requests.get(CREEDS_URL + 'search', params={'q':DrOI})
if CREEEDS_Drug_response.status_code == 200:
    #pprint(CREEEDS_Drug_response.json())
    #json.dump(CREEEDS_Drug_response.json(), open(DrOI + '_api1_result.json', 'w'), indent=4)
    CREEDS_drug_output_df = pd.DataFrame(CREEEDS_Drug_response.json())
    CREEDS_drug_output_ids = list(CREEDS_drug_output_df["id"])
    
    CREEDS_drug_output_ids_up = ["CREEDS_" + s + "_up" for s in CREEDS_drug_output_ids]
    CREEDS_drug_output_ids_down = ["CREEDS_" + s + "_down" for s in CREEDS_drug_output_ids]
    
    CREEDS_all_down_genes = []
    CREEDS_all_up_genes = []
    CREEDS_desc = []
    for a in CREEDS_drug_output_ids:
        CREEDS_URL = 'http://amp.pharm.mssm.edu/CREEDS/'
        CREEDS_drug_sigs_response = requests.get(CREEDS_URL + 'api', params={'id':'drug:DM609'})
        if CREEDS_drug_sigs_response.status_code == 200:
            CREEDS_drug_sigs_response_json = CREEDS_drug_sigs_response.json()
            
            ## up genes
            CREEDS_drug_sigs_up_genes = CREEDS_drug_sigs_response_json['up_genes']
            CREEDS_drug_sigs_up_genes_df = pd.DataFrame(CREEDS_drug_sigs_up_genes) # this is the up genes dataframe
            CREEDS_drug_sigs_up_genes_df.columns = ["Genes", "Score"]
            filename1 = (a + "_CREEDS_drug_sig_up_genes.csv")
            #CREEDS_drug_sigs_up_genes_df.to_csv(filename1) # this saves the df as a csv
            desc = (a + "_" + DrOI + "_" + CREEDS_drug_sigs_response_json["geo_id"])
            CREEDS_desc.append(desc)
            CREEDS_all_up_genes.append(list(CREEDS_drug_sigs_up_genes_df["Genes"]))

            
            ## down genes
            CREEDS_drug_sigs_down_genes = CREEDS_drug_sigs_response_json['down_genes']
            CREEDS_drug_sigs_down_genes_df = pd.DataFrame(CREEDS_drug_sigs_down_genes)# this is the down genes dataframe
            CREEDS_drug_sigs_down_genes_df.columns = ["Genes", "Score"]
            filename2 = (a + "_CREEDS_drug_sig_down_genes.csv")
            CREEDS_all_down_genes.append(list(CREEDS_drug_sigs_down_genes_df["Genes"]))
            #CREEDS_drug_sigs_down_genes_df.to_csv(filename2)
            #CREEDS_drug_sigs_down_genes = CREEDS_drug_sigs_response_json['down_genes'] # this saves the df as a csv
            print(filename2)
            
            up_keys = ['up_genes']
            gene_dict_up = {x:CREEDS_drug_sigs_response_json[x] for x in up_keys}
            gene_dict_up = {"CREEDS_" + a + "_" + k: v for k, v in gene_dict_up.items()}
            
            down_keys = ['down_genes']
            gene_dict_down = {x:CREEDS_drug_sigs_response_json[x] for x in down_keys}
            gene_dict_down = {"CREEDS_" + a + "_" + k: v for k, v in gene_dict_down.items()}
            

            
### CREEDS DISEASE CARD (DRUG INPUT)

# RETURNS THE do_id, geo_id, and disease name in a dictionary
CREEDS_GSE = {
    row['id']: [row['geo_id'], row["disease_name"]]
    for row in CREEDS_data
}

## filter by DrOI need icd9 codes for proper conversion and query through CREEDS
droi_search =EMR_data_df[EMR_data_df['Drug_Name'].apply(lambda s: bool(re.compile(DrOI, re.IGNORECASE).search(s)))]
droi_search_top5 = droi_search[0:10]
EMR_top_disease_from_drug = droi_search_top5["ICD9"]
#top_disease_from_drug = EMR_top_disease_from_drug[0:5]

## build a datatable of all the ICD-9 CM diagnosis codes families (i.e no decimal points)
EMR_top_disease_from_drug_df = pd.DataFrame(EMR_top_disease_from_drug, columns=['ICD9'])
EMR_top_disease_from_drug_df['ICD9_wildcard'] = EMR_top_disease_from_drug_df['ICD9'].apply(lambda code: code.split('.')[0])
#EMR_top_disease_from_drug_df.head()
icd9_to_doid_final['ICD9_wildcard'] = icd9_to_doid_final['ICD9'].apply(lambda code: str(code).split('.')[0])
#icd9_to_doid_final.head()
df_joined = pd.merge(
    left=EMR_top_disease_from_drug_df, left_on='ICD9_wildcard',
    right=icd9_to_doid_final, right_on='ICD9_wildcard',
    how='inner',
    suffixes=(
        '_left',
        '_right',
    )
)

CREEDS_drug_ids = pd.DataFrame(set(df_joined.CREEDS_drug_id))
CREEDS_drug_ids_list = list(set(df_joined.CREEDS_drug_id))
#CREEDS_GSE.keys()
#CREEDS_drug_ids_list
CREEDS_Drug_Final = dict((k, CREEDS_GSE[k]) for k in CREEDS_drug_ids_list)
CREEDS_drug_final_df = pd.DataFrame(CREEDS_Drug_Final).T
CREEDS_drug_final_df.columns = ["GSE_ID", "DISEASE"]
#CREEDS_drug_final_df # DISPLAY THIS DATAFRAME



### CREEDS DISEASE CARD FROM DRUG INPUT API
CREEDS_drug_final_diseases = CREEDS_drug_final_df.DISEASE
CREEDS_drug_final_GSE_ID = CREEDS_drug_final_df.GSE_ID
## CREEDS DISEASE CARD FROM DISEASE QUERY 
CREEDS_disease_output_ids_up = ["CREEDS_" + s + "_up" for s in CREEDS_drug_final_diseases]
CREEDS_disease_output_ids_down = ["CREEDS_" + s + "_down" for s in CREEDS_drug_final_diseases]

loop_iteration = np.arange(0, len(CREEDS_drug_final_diseases))
loop_iteration = list(loop_iteration)

CREEDS_total_api_df = []
CREEDS_all_disease_up_genes = []
CREEDS_all_disease_down_genes = []
for a in loop_iteration:
    CREEDS_URL = 'http://amp.pharm.mssm.edu/CREEDS/'
    CREEEDS_Disease_response = requests.get(CREEDS_URL + 'api', params={'id':CREEDS_drug_ids_list[a]})
    if CREEEDS_Disease_response.status_code == 200:
        #pprint(CREEEDS_Disease_response.json())
        #json.dump(CREEEDS_Drug_response.json(), open(CREEDS_drug_final_GSE_ID[a] + '_api1_result.json', 'w'), indent=4)
        CREEEDS_Disease_response_json = CREEEDS_Disease_response.json()
        
        ## up genes
        CREEDS_disease_sigs_up_genes = CREEEDS_Disease_response_json['up_genes']
        CREEDS_disease_sigs_up_genes_df = pd.DataFrame(CREEDS_disease_sigs_up_genes) # this is the up genes dataframe
        CREEDS_disease_sigs_up_genes_df.columns = ["Genes", "Score"]
        #desc = (a + "_" + DrOI + "_" + CREEEDS_Disease_response["geo_id"])
        #CREEDS_desc.append(desc)
        CREEDS_all_disease_up_genes.append(list(CREEDS_disease_sigs_up_genes_df["Genes"]))
        
        filename1 = (str(CREEDS_drug_ids_list[a]) + "_CREEDS_disease_sig_up_genes.csv")
        #CREEDS_disease_sigs_up_genes_df.to_csv(filename1) # this saves the df as a csv
        
        
        ## down genes
        CREEDS_disease_sigs_down_genes = CREEEDS_Disease_response_json['down_genes']
        CREEDS_disease_sigs_down_genes_df = pd.DataFrame(CREEDS_disease_sigs_down_genes) # this is the up genes dataframe
        CREEDS_disease_sigs_down_genes_df.columns = ["Genes", "Score"]
        CREEDS_all_disease_down_genes.append(list(CREEDS_disease_sigs_down_genes_df["Genes"]))
        
        filename2 = (str(CREEDS_drug_ids_list[a]) + "_CREEDS_disease_sig_down_genes.csv")
        #CREEDS_disease_sigs_down_genes_df.to_csv(filename2) # this saves the df as a csv
        print(filename2)
        # entire json
        #json.dump(response.json(), open(a + '_CREEDS_Disease_sig.json', 'w'), indent=4) # if the user wants the entire json, they can download this
            
        
        #CREEEDS_Drug_response_df = pd.DataFrame(CREEEDS_Drug_response_json)
        #CREEEDS_Drug_response_df # This will be the dataframe to return
        #CREEDS_total_api_df.append(CREEEDS_Drug_response_df)
#CREEDS_total_api_df = pd.concat(CREEDS_total_api_df, axis =1)
#CREEDS_total_api_df.T ## display this datatable


### GENESHOT API and further integration
GENESHOT_URL = 'http://amp.pharm.mssm.edu/geneshot/api'
query_string = '/search/%s'
search_term = DrOI

# true query from geneshot
response = requests.get(
    GENESHOT_URL + query_string % (search_term)
 )
if not response.ok:
    raise Exception('Error during query')

data = json.loads(response.text)
#print(data)

## GENESHOT QUERY USING AutoRIF

GENESHOT_URL = 'http://amp.pharm.mssm.edu/geneshot/api'
query_string = '/search/auto/%s'
search_term = 'wound healing' # this will be the user input 
geneshot_response = requests.get(
    GENESHOT_URL + query_string % (search_term)
 )
if not geneshot_response.ok:
    raise Exception('Error during query')

geneshot_data = json.loads(geneshot_response.text)
#print(geneshot_data)
geneshot_gene_df = geneshot_data["gene_count"]
geneshot_gene_list = list(geneshot_gene_df.keys()) # this extracts the genes from the json. We can then resend this through the geneshot api
geneshot_gene_list_commas = ",".join(geneshot_gene_list) # can save this as a csv. 

geneshot_gene_df1 = pd.DataFrame(geneshot_gene_df).T
geneshot_gene_df1.columns = ["Pubmed Count", "Publication Count/Total Publications"]

#write the geneshot pubmed data
#geneshot_gene_df1.to_csv(search_term + "_geneshot_pubmed_counts.csv")



query_string = '/associate/%s/%s'
similarity_matrix = 'coexpression' # we can make this dynamic. Parameters: (generif, tagger, autorif, coexpression, enrichr)
gene_symbols = geneshot_gene_list_commas

coexpression_response = requests.get(
    GENESHOT_URL + query_string % (similarity_matrix, gene_symbols)
 )
if not coexpression_response.ok:
    raise Exception('Error during query')

coexpression_data = json.loads(coexpression_response.text) # this will be the coexpression json they can download
geneshot_coexp_ass = {"GENESHOT_coexpression":list(coexpression_data["association"].keys())}

query_string = '/associate/%s/%s'
similarity_matrix = 'generif' # we can make this dynamic. Parameters: (generif, tagger, autorif, coexpression, enrichr)
gene_symbols = geneshot_gene_list_commas

generif_response = requests.get(
    GENESHOT_URL + query_string % (similarity_matrix, gene_symbols)
 )
if not generif_response.ok:
    raise Exception('Error during query')

generif_data = json.loads(generif_response.text) # this will be the coexpression json they can download
geneshot_generif = {"GENESHOT_generif":list(generif_data["association"].keys())}



query_string = '/associate/%s/%s'
similarity_matrix = 'tagger' # we can make this dynamic. Parameters: (generif, tagger, autorif, coexpression, enrichr)
gene_symbols = geneshot_gene_list_commas

tagger_response = requests.get(
    GENESHOT_URL + query_string % (similarity_matrix, gene_symbols)
 )
if not tagger_response.ok:
    raise Exception('Error during query')

tagger_data = json.loads(tagger_response.text) # this will be the coexpression json they can download
geneshot_tagger = {"GENESHOT_tagger":list(tagger_data["association"].keys())}



query_string = '/associate/%s/%s'
similarity_matrix = 'tagger' # we can make this dynamic. Parameters: (generif, tagger, autorif, coexpression, enrichr)
gene_symbols = geneshot_gene_list_commas

autorif_response = requests.get(
    GENESHOT_URL + query_string % (similarity_matrix, gene_symbols)
 )
if not autorif_response.ok:
    raise Exception('Error during query')

autorif_data = json.loads(autorif_response.text) # this will be the coexpression json they can download
geneshot_autorif = {"GENESHOT_autorif":list(autorif_data["association"].keys())}

query_string = '/associate/%s/%s'
similarity_matrix = 'tagger' # we can make this dynamic. Parameters: (generif, tagger, autorif, coexpression, enrichr)
gene_symbols = geneshot_gene_list_commas

enrichr_response = requests.get(
    GENESHOT_URL + query_string % (similarity_matrix, gene_symbols)
 )
if not enrichr_response.ok:
    raise Exception('Error during query')

enrichr_data = json.loads(enrichr_response.text) # this will be the coexpression json they can download
geneshot_enrichr = {"GENESHOT_enrichr":list(enrichr_data["association"].keys())}





#### GMT formation from these datasets (NO X2K)

#### format = TITLE \t\ Description \t\ Genes
def merge_two_dicts(x, y):
    z = x.copy()   # start with x's keys and values
    z.update(y)    # modifies z with y's keys and values & returns None
    return z

DrOI_up_no_perts_cp = {"L1000FWD_" + k +"_up": v for k, v in DrOI_up_no_perts.items()}
DrOI_down_no_perts_cp = {"L1000FWD_" + k +"_down": v for k, v in DrOI_down_no_perts.items()}

## Genes
DrugMatrix_Drug_Genes = merge_two_dicts(DrugMatrix_up_sigs , DrugMatrix_down_sigs)
DrugMatrix_Drug_Genes = {"DrugMatrix_" + k: v for k, v in DrugMatrix_Drug_Genes.items()}
L1000_Drug_Genes = merge_two_dicts(DrOI_up_no_perts_cp,DrOI_down_no_perts_cp)

CREEDS_up_Genes = {
    CREEDS_drug_output_ids_up[a]: CREEDS_all_up_genes[a]
    for a in range(len(CREEDS_drug_output_ids_up))
}

CREEDS_down_Genes = {
    CREEDS_drug_output_ids_down[a]: CREEDS_all_down_genes[a]
    for a in range(len(CREEDS_drug_output_ids_up))
}

CREEDS_Disease_up_Genes = {
    CREEDS_disease_output_ids_up[a]: CREEDS_all_disease_up_genes[a]
    for a in range(len(CREEDS_drug_output_ids_up))
}

CREEDS_Disease_down_Genes = {
    CREEDS_disease_output_ids_down[a]: CREEDS_all_disease_down_genes[a]
    for a in range(len(CREEDS_drug_output_ids_up))
}


total_genes = merge_two_dicts(L1000_Drug_Genes, DrugMatrix_Drug_Genes)
total_genes =merge_two_dicts(total_genes, geneshot_coexp_ass)
total_genes =merge_two_dicts(total_genes, geneshot_generif)
total_genes =merge_two_dicts(total_genes, geneshot_tagger)
total_genes =merge_two_dicts(total_genes, geneshot_autorif)
total_genes =merge_two_dicts(total_genes, geneshot_enrichr)
total_genes =merge_two_dicts(total_genes, CREEDS_up_Genes)
total_genes =merge_two_dicts(total_genes, CREEDS_down_Genes)
total_genes =merge_two_dicts(total_genes, CREEDS_Disease_up_Genes)
total_genes =merge_two_dicts(total_genes, CREEDS_Disease_down_Genes)

## you will need to change this path file. 
with open ("/Users/maayanlab/Desktop/Andrew/Drug_Project/Code_base/Drug_discovery_application/Autoencoder_Embedding/data/"+DrOI+".gmt", "w") as file:
    for k in list(total_genes.keys()):
        file.write(k + '\t')
        #file.write('\t'+'na')
        file.write("\t".join(total_genes[k]))
        file.write('\n')
        


            

Warfarin-0.25_mg_kg_in_CMC-Rat-Liver-1d-up
Warfarin-0.25_mg_kg_in_CMC-Rat-Liver-3d-up
Warfarin-0.25_mg_kg_in_CMC-Rat-Liver-5d-up
Warfarin-0.25_mg_kg_in_CMC-Rat-Liver-1d-dn
Warfarin-0.25_mg_kg_in_CMC-Rat-Liver-3d-dn
Warfarin-0.25_mg_kg_in_CMC-Rat-Liver-5d-dn
CPC005_PC3_24H:BRD-A24514565-001-04-8:10
CPC020_VCAP_24H:BRD-A24514565-236-11-5:10
CPC005_HA1E_24H:BRD-A24514565-001-04-8:10
drug:DM2191_CREEDS_drug_sig_down_genes.csv
drug:DM2192_CREEDS_drug_sig_down_genes.csv
drug:DM2193_CREEDS_drug_sig_down_genes.csv
dz:412_CREEDS_disease_sig_down_genes.csv
dz:209_CREEDS_disease_sig_down_genes.csv
dz:856_CREEDS_disease_sig_down_genes.csv
dz:273_CREEDS_disease_sig_down_genes.csv
dz:179_CREEDS_disease_sig_down_genes.csv
dz:400_CREEDS_disease_sig_down_genes.csv
dz:258_CREEDS_disease_sig_down_genes.csv


In [177]:
CREEDS_all_disease_up_genes

[]

In [193]:
list(total_genes.keys())

['L1000FWD_CPC005_PC3_24H:BRD-A24514565-001-04-8:10_up',
 'L1000FWD_CPC005_HA1E_24H:BRD-A24514565-001-04-8:10_up',
 'L1000FWD_CPC020_VCAP_24H:BRD-A24514565-236-11-5:10_up',
 'L1000FWD_CPC005_PC3_24H:BRD-A24514565-001-04-8:10_down',
 'L1000FWD_CPC020_VCAP_24H:BRD-A24514565-236-11-5:10_down',
 'L1000FWD_CPC005_HA1E_24H:BRD-A24514565-001-04-8:10_down',
 'DrugMatrix_Warfarin-0.25_mg_kg_in_CMC-Rat-Liver-1d-up',
 'DrugMatrix_Warfarin-0.25_mg_kg_in_CMC-Rat-Liver-3d-up',
 'DrugMatrix_Warfarin-0.25_mg_kg_in_CMC-Rat-Liver-5d-up',
 'DrugMatrix_Warfarin-0.25_mg_kg_in_CMC-Rat-Liver-1d-dn',
 'DrugMatrix_Warfarin-0.25_mg_kg_in_CMC-Rat-Liver-3d-dn',
 'DrugMatrix_Warfarin-0.25_mg_kg_in_CMC-Rat-Liver-5d-dn',
 'geneshot_coexpression',
 'geneshot_generif',
 'geneshot_tagger',
 'geneshot_autorif',
 'geneshot_enrichr',
 'CREEDS_drug:DM2191_up',
 'CREEDS_drug:DM2192_up',
 'CREEDS_drug:DM2193_up',
 'CREEDS_drug:DM2191_down',
 'CREEDS_drug:DM2192_down',
 'CREEDS_drug:DM2193_down',
 'CREEDS_essential hypertensi

In [202]:
## you will need to change this path file. 
with open ("/Users/maayanlab/Desktop/Andrew/Drug_Project/Code_base/Drug_discovery_application/Autoencoder_Embedding/data/"+DrOI+".gmt", "w") as file:
    for k in list(total_genes.keys()):
        file.write(k + '\t')
        #file.write('\t'+'na')
        file.write("\t".join(total_genes[k]))
        file.write('\n')
        


In [79]:
list(total_genes.keys())

['Warfarin-0.25_mg_kg_in_CMC-Rat-Liver-1d-dn',
 'Warfarin-0.25_mg_kg_in_CMC-Rat-Liver-3d-dn',
 'Warfarin-0.25_mg_kg_in_CMC-Rat-Liver-5d-dn',
 'Warfarin-0.25_mg_kg_in_CMC-Rat-Liver-1d-up',
 'Warfarin-0.25_mg_kg_in_CMC-Rat-Liver-3d-up',
 'Warfarin-0.25_mg_kg_in_CMC-Rat-Liver-5d-up',
 'CPC005_PC3_24H:BRD-A24514565-001-04-8:10',
 'CPC020_VCAP_24H:BRD-A24514565-236-11-5:10',
 'CPC005_HA1E_24H:BRD-A24514565-001-04-8:10']

In [None]:

## TITLES
DrugMatrix_Drug_titles = (list(DrugMatrix_up_sigs.keys()) + list(DrugMatrix_down_sigs.keys()))
CREEDS_Drug_titles = CREEDS_drug_output_ids_up + CREEDS_drug_output_ids_down
L1000_Drug_titles = list(DrOI_all_sigs) #DrOI_all_sigs_up + DrOI_all_sigs_down
L1000_Drug_titles = ["L1000_" + s for s in L1000_Drug_titles]
total_titles= DrugMatrix_Drug_titles+L1000_Drug_titles

## Descriptions
L1000_Drug_description = DrOI_all_sigs_display
CREEDS_Drug_description = CREEDS_desc
DrugMatrix_Drug_description = (list(DrugMatrix_up_sigs.keys()) + list(DrugMatrix_down_sigs.keys()))
