In [9]:
## Full backend analysis for the CREEDS and L1000 dataset 
import os
import pandas as pd
import itertools
from pandas.compat import StringIO
import numpy as np
from numpy import loadtxt
import sys
import json
from pprint import pprint
import objectpath
import csv
import re
import matplotlib.pyplot as plt
import json, requests
from pprint import pprint
import scipy
from scipy.spatial import distance
from sklearn.metrics.pairwise import pairwise_distances
from clustergrammer_widget import *
def get_geneset(df, indexer):
    """Return the labels at which the selected slice of ``df`` equals 1.

    ``df.loc[indexer, :]`` is taken first; the result is then masked with
    ``== 1`` and the index of the masked object is returned as a list.
    With a single row label the slice is a Series, so the returned labels
    are the column names whose value in that row is 1.
    """
    selected = df.loc[indexer, :]
    is_member = selected == 1
    return list(selected[is_member].index)

In [7]:
## Load the pre-formed datasets written by the L1000_Analysis and CREEDS_Analysis files.
## THIS TAKES A WHILE TO LOAD, SO ONLY RUN THIS CELL ONCE AND EARLY

# --- L1000: up / down gene tables for the drug signatures ---
L1000_up_genes = pd.read_csv("L1000_up_genes.csv")
L1000_down_genes = pd.read_csv("L1000_down_genes.csv")

# --- CREEDS: disease signatures ---
with open("disease_signatures-v1.0.json") as creeds_file:
    CREEDS_data = json.load(creeds_file)

# Map each disease-ontology id to its up / down gene list.
# NOTE: when several records share a 'do_id', the last record wins.
CREEDS_up_genes = dict((row['do_id'], row['up_genes']) for row in CREEDS_data)
CREEDS_down_genes = dict((row['do_id'], row['down_genes']) for row in CREEDS_data)

# --- EMR data (filtered > 200 in R code [Drug_diagnosis_test_code.R]) ---
EMR_data = pd.read_csv("EMR_greater_200.csv")
# Working copy used to subset by the DOI and/or DrOI: same table minus its
# first column, which is not needed downstream.
EMR_data_df = EMR_data.drop(EMR_data.columns[[0]], axis=1)

# --- ICD9 -> do_id lookup built from the manual conversion ---
icd9_to_doid = pd.read_csv("ICD9_CREEDS_conversion.csv")
# Columns at positions 0 and 6..14 are not needed; drop them by position.
unneeded_icd9_columns = icd9_to_doid.columns[[0] + list(range(6, 15))]
icd9_to_doid_final = icd9_to_doid.drop(unneeded_icd9_columns, axis=1)

## PRELOADED DATA
## L1000 conversion and analysis DRUG
L1000_down_extract = pd.DataFrame(L1000_down_genes)
L1000_up_extract = pd.DataFrame(L1000_up_genes)

## L1000 ANALYSIS -- FEEDS BACK INTO THE API to get additional signatures
metadata = pd.read_csv("L1000_metadata.csv")  # same as LINC1000h5.row_metadata_df

# Drug names present in the EMR table; this is the selection for the dropdown menu.
EMR_Drug_Names = EMR_data_df['Drug_Name']

unique_pert_ids = metadata['pert_id'].unique()
unique_drug_names = metadata['pert_desc'].unique()

# Drugs that appear both in the L1000 metadata and in the EMR data
# (the original note recorded 28 possible drug inputs).
possible_drug_inputs = set(unique_drug_names).intersection(EMR_Drug_Names)


In [None]:
## X2K API integration 

# Import modules
import http.client
import json

##### Function to run X2K
### Input: a Python list of gene symbols
### Output: a dictionary containing the results of X2K, ChEA, G2N, KEA.


def run_X2K(input_genes, options={}):
    """Submit a gene list to the X2K web API and return the parsed results.

    Parameters
    ----------
    input_genes : iterable of str
        Gene symbols. They are newline-joined into the 'text-genes' form field.
    options : dict, optional
        Overrides for the default form fields listed below. Keys that are not
        among the defaults are ignored, and 'text-genes' can never be
        overridden. The argument is only read, never mutated.

    Returns
    -------
    dict
        The decoded JSON response. Every top-level value except 'input' is
        itself JSON-decoded; the 'ChEA', 'G2N', 'KEA' and 'X2K' entries are then
        replaced by their 'tfs', 'network', 'kinases' and 'network' members.

    Raises
    ------
    ValueError
        (``json.JSONDecodeError``) if the response body is not valid JSON.
    KeyError
        If the response lacks one of the expected result sections.
    """
    # Default form fields sent to the service.
    form_fields = {'text-genes': '\n'.join(input_genes),
                   'included_organisms': 'both',
                   'TF-target gene background database used for enrichment': 'ChEA & ENCODE Consensus',
                   'sort transcription factors by': 'p-value',
                   'min_network_size': 10,
                   'number of top TFs': 10,
                   'path_length': 2,
                   'min_number_of_articles_supporting_interaction': 0,
                   'max_number_of_interactions_per_protein': 200,
                   'max_number_of_interactions_per_article': 100,
                   'enable_BioGRID': True,
                   'enable_IntAct': True,
                   'enable_MINT': True,
                   'enable_ppid': True,
                   'enable_Stelzl': True,
                   'kinase interactions to include': 'kea 2018',
                   'sort kinases by': 'p-value'}

    # Apply caller overrides, but only for known fields and never for the gene list.
    for key, value in options.items():
        if key in form_fields and key != 'text-genes':
            form_fields[key] = value

    # Build the multipart/form-data body: one part per field, then the closing boundary.
    boundary = "----WebKitFormBoundary7MA4YWxkTrZu0gW"
    part_template = '--' + boundary + '\r\nContent-Disposition: form-data; name="{key}"\r\n\r\n{value}\r\n'
    payload = ''.join(
        part_template.format(key=key, value=value)
        for key, value in form_fields.items()) + '--' + boundary + '--'

    headers = {
        'content-type': "multipart/form-data; boundary=" + boundary,
        'cache-control': "no-cache",
    }

    # Always release the socket, even if the request or the read fails;
    # this function is called repeatedly in a loop.
    conn = http.client.HTTPConnection("amp.pharm.mssm.edu")
    try:
        conn.request("POST", "/X2K/api", payload, headers)
        res = conn.getresponse()
        data = res.read().decode('utf-8')
    finally:
        conn.close()

    # The service double-encodes its results: each top-level value (except the
    # echoed 'input') is a JSON string that has to be decoded again.
    x2k_results = {key: json.loads(value) if key != 'input' else value
                   for key, value in json.loads(data).items()}

    # Unwrap each section down to the member callers actually use.
    x2k_results['ChEA'] = x2k_results['ChEA']['tfs']
    x2k_results['G2N'] = x2k_results['G2N']['network']
    x2k_results['KEA'] = x2k_results['KEA']['kinases']
    x2k_results['X2K'] = x2k_results['X2K']['network']

    return x2k_results

############################################################################################################################################################

### try the input
DrOI = "DIGOXIN" # drug of interest. MAKE SURE TO SELECT THIS FROM "possible_drug_inputs"
####

# Match the drug name as literal text (escaped so names containing regex
# metacharacters cannot break or widen the search), ignoring case.
DrOI_pattern = re.escape(DrOI)

# Select the rows whose 'Unnamed: 0' label contains the drug name; missing
# labels count as "no match" instead of raising.
DrOI_up_extract = L1000_up_extract[L1000_up_extract['Unnamed: 0'].str.contains(DrOI_pattern, case=False, na=False)]
DrOI_up_final = DrOI_up_extract.loc[:, (DrOI_up_extract != 0).any(axis=0)] # remove any genes without any expression in any of the results

DrOI_down_extract = L1000_down_extract[L1000_down_extract['Unnamed: 0'].str.contains(DrOI_pattern, case=False, na=False)]
DrOI_down_final = DrOI_down_extract.loc[:, (DrOI_down_extract != 0).any(axis=0)] # remove any genes without any expression in any of the results


L1000FWD_URL = 'http://amp.pharm.mssm.edu/L1000FWD/'

# Look up the perturbagens L1000FWD associates with this drug name.
query_string = DrOI
response = requests.get(L1000FWD_URL + 'synonyms/' + query_string)
if response.status_code != 200:
    # Stop here rather than trying to parse an error page as a result set.
    raise RuntimeError("L1000FWD synonym lookup for " + DrOI +
                       " failed with HTTP status " + str(response.status_code))
L1000_significant_query = response.json()
pprint(L1000_significant_query)

L1000_significant_query_df = pd.DataFrame(L1000_significant_query)
if "pert_id" in L1000_significant_query_df.columns:
    L1000_significant_pert_ids = L1000_significant_query_df["pert_id"]
else:
    # An empty result has no 'pert_id' column; use an empty Series so the
    # "no signatures" branch below is reached instead of a KeyError.
    L1000_significant_pert_ids = pd.Series([], dtype=object, name="pert_id")
L1000_summary_table = metadata[metadata["pert_id"].isin(L1000_significant_pert_ids)] # display this table!
L1000_drug_signatures = L1000_summary_table.rid


if len(L1000_significant_pert_ids) > 0:

    # Collect the signature ids (metadata 'rid') of every metadata row whose
    # pert_id contains one of the returned pert_ids (case-insensitive),
    # grouped in the order the pert_ids were returned.
    metadata_pert_ids = metadata["pert_id"].astype(str)  # loop-invariant, computed once
    L1000_drug_signatures = []
    for q in L1000_significant_pert_ids:
        pert_id_hits = metadata_pert_ids.str.contains(re.escape(str(q)), case=False, na=False)
        L1000_drug_signatures.extend(metadata.loc[pert_id_hits, "rid"])

    if len(L1000_drug_signatures) > 0:
        ## L1000 iterations API
        json_name_store = []  # names of the raw signature JSON files written below
        drug_sig_store = []   # kept for later cells; nothing is appended to it here
        for x in L1000_drug_signatures:
            sig_id = str(x)  # used both in the URL and in the output file names
            L1000_gene_response = requests.get(L1000FWD_URL + 'sig/' + sig_id)
            if L1000_gene_response.status_code != 200:
                print("Skipping signature " + sig_id + ": HTTP status " +
                      str(L1000_gene_response.status_code))
                continue

            L1000_query = L1000_gene_response.json()

            # Keep a copy of the raw signature on disk; 'with' closes the file.
            json_name = sig_id + '_api2_result.json'
            with open(json_name, 'w') as json_file:
                json.dump(L1000_query, json_file, indent=4)
            json_name_store.append(json_name)

            L1000_query_up_genes = L1000_query["up_genes"]
            L1000_query_down_genes = L1000_query["down_genes"]

            ### UNIQUE X2K CODE

            # L1000 up genes -> X2K network nodes
            L1000_X2K_up_genes = run_X2K(L1000_query_up_genes)["X2K"]
            L1000_X2K_up_genes_df = pd.DataFrame(L1000_X2K_up_genes['nodes'])
            filename_up = sig_id + "_X2K_up_genes.csv"
            L1000_X2K_up_genes_df.to_csv(filename_up) # THIS IS THE FILE THEY SHOULD BE ABLE TO DOWNLOAD

            # L1000 down genes -> X2K network nodes
            L1000_X2K_down_genes = run_X2K(L1000_query_down_genes)["X2K"]
            L1000_X2K_down_genes_df = pd.DataFrame(L1000_X2K_down_genes['nodes'])
            filename_down = sig_id + "_X2K_down_genes.csv"
            L1000_X2K_down_genes_df.to_csv(filename_down) # THIS IS THE FILE THEY SHOULD BE ABLE TO DOWNLOAD

        print("SIGNIFICANT L1000 SIGNATURES")
    else:
        print ("NO Significant signatures found in metadata")

else:
    print("NO SIGNIFICANT L1000 SIGNATURES FOR " + DrOI)


In [33]:
# Display the up-regulated gene list left over from the signature loop in the
# previous cell, i.e. the last signature that was fetched successfully.
L1000_query_up_genes


['FXYD6',
 'FAM198B',
 'GNLY',
 'SFTPB',
 'NRCAM',
 'HBB',
 'P2RY13',
 'CHMP2A',
 'RASA4',
 'MFAP2',
 'UQCRH',
 'GGPS1',
 'S100A10',
 'APLP2',
 'S100A12',
 'PAWR',
 'LDOC1',
 'RREB1',
 'RPL28',
 'EML3',
 'FGR',
 'ADAM28',
 'GRWD1',
 'MARCKS',
 'NAPA',
 'PCBP2',
 'MBNL2',
 'CTSH',
 'TWF1',
 'IFITM1',
 'WDR1',
 'AQP1',
 'CTSB',
 'LEFTY2',
 'KIAA0226L',
 'AQP9',
 'MTUS1',
 'TNFRSF21',
 'USP48',
 'NUPL1',
 'TSPAN5',
 'SH3BGRL',
 'CTSS',
 'GATA3',
 'FAM65B',
 'DDX17',
 'KCTD12',
 'SNRPD2',
 'LST1',
 'PTPN12',
 'SPARCL1',
 'GRB14',
 'MSRA',
 'N4BP2L2',
 'ENC1',
 'TLR4',
 'TRIP6',
 'TREM1',
 'CDKN1C',
 'CD3E',
 'ATP6V1D',
 'RPL37A',
 'SLC7A8',
 'TCF4',
 'GRIA2',
 'PCM1',
 'RPL27A',
 'GPM6B',
 'DCLK1',
 'FCN1',
 'FGF9',
 'POLR1D',
 'SEMA4A',
 'TMEM111',
 'ARID4B',
 'MAFB',
 'XIST',
 'NEBL',
 'SLC11A1',
 'EGR1',
 'PALLD',
 'THBS1',
 'NPC2',
 'CEACAM5',
 'NGRN',
 'CEACAM6',
 'IGHM',
 'CLDN3',
 'NPEPPS',
 'PSME4',
 'CLPTM1',
 'CNR1',
 'GSPT1',
 'JAM3',
 'ASAH1',
 'PARP2',
 'HIST1H2BD',
 'HIST1H2B

In [None]:
### L1000 input into X2k

L1000_up_input_genes