In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from bravado.client import SwaggerClient
from lifelines import KaplanMeierFitter
import requests
import json
from lifelines import CoxPHFitter
from sksurv.linear_model import CoxPHSurvivalAnalysis
import itertools

In [3]:
# Map each cancer type to the cBioPortal study ids that are analysed for it.
cancer_and_study_id = {
    'gbm': [
        'gbm_tcga_pub',
        'gbm_tcga_pub2013',
        'gbm_tcga_pan_can_atlas_2018',
        'gbm_tcga',
        'gbm_mayo_pdx_sarkaria_2019',
        'gbm_columbia_2019',
    ],
}


In [5]:
# `genes` starts out as the whitespace-separated block of gene symbols; turn it
# into a list of symbols. The type guard keeps this cell safe to re-run: once
# `genes` is already a list there is nothing left to split.
if isinstance(genes, str):
    genes=genes.split()

Fetch the molecular data of all MT genes (more than 1000) and reshape it into a table with one row per patient ID.
Then add the clinical data to that table, matched on the same patient IDs. The final table is what we will analyze with Cox regression.

In [None]:
#Take the molecular data of all MT genes (more than 1000) and turn it into a table whose rows are patient ids. Afterwards add the clinical data to
#the table holding these patient ids. In the end we will have a table in the format I shared over Gmail, and we will run a Cox analysis on it.

In [9]:
def get_all_MTgenes_and_IDs(genes):
    """Resolve gene symbols to Entrez gene ids using the cBioPortal gene catalogue.

    Parameters
    ----------
    genes : iterable of str
        Hugo gene symbols to look up.

    Returns
    -------
    dict
        ``{entrezGeneId: hugoGeneSymbol}`` for every requested symbol that is
        present in the catalogue. Symbols that are not found are left out.

    Raises
    ------
    requests.HTTPError
        If the catalogue request does not succeed.
    """
    response=requests.get('https://www.cbioportal.org/api/genes?direction=ASC&pageNumber=0&pageSize=10000000&projection=SUMMARY')
    response.raise_for_status()
    # Index the catalogue once by symbol instead of re-scanning it for every
    # requested gene. If a symbol occurs more than once, the last entry wins.
    id_by_symbol={gene_info['hugoGeneSymbol']: gene_info['entrezGeneId'] for gene_info in response.json()}
    MT_ID_and_gene={} # entrez gene id -> gene symbol
    for mt_gene in genes:
        if mt_gene in id_by_symbol:
            MT_ID_and_gene[id_by_symbol[mt_gene]]=mt_gene
    return MT_ID_and_gene
#MT_gene_and_ID=get_all_MTgenes_and_IDs()
def get_sample_lists_in_studies(studies):
    """Return the id of the "All samples" sample list of each study.

    Parameters
    ----------
    studies : iterable of str
        cBioPortal study ids.

    Returns
    -------
    list of str
        One ``sampleListId`` per study that has a sample list named
        "All samples", in the order of ``studies``. A study without such a
        list contributes nothing, so the result can be shorter than ``studies``.

    Raises
    ------
    requests.HTTPError
        If a sample-list request does not succeed.
    """
    url='https://www.cbioportal.org/api/studies/{0}/sample-lists?direction=ASC&pageNumber=0&pageSize=10000000&projection=SUMMARY'
    sample_lists=[]
    for study in studies:
        response=requests.get(url.format(study))
        response.raise_for_status()
        for sam_list in response.json():
            if sam_list['name']=="All samples":
                sample_lists.append(sam_list['sampleListId'])
                break # only the first matching list of a study is used
    return sample_lists
def get_eachMTgene_study_and_mol_data(MT_ID_and_gene,studies):
    """Download the molecular data of every gene for every study.

    Parameters
    ----------
    MT_ID_and_gene : dict
        ``{entrezGeneId: gene name}``.
    studies : iterable of str
        cBioPortal study ids. For each one the profile
        ``<study_id>_mrna_median_all_sample_Zscores`` and the sample list
        ``<study_id>_all`` are queried.

    Returns
    -------
    dict
        ``{gene name: {study_id: decoded JSON payload}}``. Studies whose payload
        is empty are left out. Any non-empty payload is stored unchanged, so
        callers still have to check its type before using it.
    """
    # The template previously had a stray '"' after "_Zscores", which ended up
    # inside every requested URL.
    mol_data_url=('https://www.cbioportal.org/api/molecular-profiles/{study_id}_mrna_median_all_sample_Zscores'
                  '/molecular-data?entrezGeneId={geneid}&projection=SUMMARY&sampleListId={study_id}_all')
    gene_studies_moldata={}
    for gene_id, gene_name in MT_ID_and_gene.items():
        per_study={}
        gene_studies_moldata[gene_name]=per_study
        for study_id in studies:
            mol_data=requests.get(mol_data_url.format(study_id=study_id,geneid=gene_id)).json()
            if len(mol_data)==0:
                continue
            per_study[study_id]=mol_data
    return gene_studies_moldata


In [151]:
# Resolve the gene symbols in `genes` via cBioPortal; the result maps Entrez gene id -> gene symbol.
MT_ID_and_gene=get_all_MTgenes_and_IDs(genes)

In [12]:
# Look up the "All samples" sample-list id of every GBM study.
sample_lists=get_sample_lists_in_studies(cancer_and_study_id['gbm'])
sample_lists

['gbm_tcga_pub_all',
 'gbm_tcga_pub2013_all',
 'gbm_tcga_pan_can_atlas_2018_all',
 'gbm_tcga_all',
 'gbm_mayo_pdx_sarkaria_2019_all',
 'gbm_columbia_2019_all']

In [158]:

# URL template of the molecular-data endpoint: the
# "<study_id>_rna_seq_v2_mrna_median_all_sample_Zscores" profile, restricted to
# one gene and to the "<study_id>_all" sample list.
mol_data_url=('https://www.cbioportal.org/api/molecular-profiles/{study_id}_rna_seq_v2_mrna_median_all_sample_Zscores'
              '/molecular-data?entrezGeneId={geneid}&projection=SUMMARY&sampleListId={study_id}_all')
studies=cancer_and_study_id['gbm']
MAX_GENES=100 # fetch the molecular data of at most 100 genes (fetching more crashes)
mol_data={} # gene id -> {study id -> decoded JSON payload}
# islice stops after exactly MAX_GENES genes; the earlier hand-rolled counter
# broke out one gene early and fetched only 99.
for gene in itertools.islice(MT_ID_and_gene,MAX_GENES):#MT_genes
    mol_data[gene]={}
    for study_id in studies:
        # Every payload is stored as returned, without checking whether the request succeeded.
        mol_data[gene][study_id]=requests.get(mol_data_url.format(study_id=study_id,geneid=gene)).json()
        



In [None]:
# Build the patient-level table: flag the expression values, flatten them to
# per-patient entries, then add the clinical columns.
add_exp_feature_mol_data(mol_data,studies) #Adding overexpression values to the mol data
gene_patient_id_exp_and_val=get_only_patiend_id_and_expression(mol_data,studies) #extracting necessary data from mol data
# Freeze the ids into a sorted list so that every later step sees the patients
# in the same, reproducible order (a set has no guaranteed order).
unique_patient_ids=sorted(get_unique_patient_ids(gene_patient_id_exp_and_val))#finding how many unique patiend ids exist
df=create_df_patientid_and_gene_exp(unique_patient_ids,gene_patient_id_exp_and_val,MT_ID_and_gene)
df=add_new_features_to_df_using_clinicaldata(df,unique_patient_ids,studies)

In [187]:
def add_new_features_to_df_using_clinicaldata(df,unique_patient_ids,studies):
    """Add OS_MONTHS, AGE and SEX columns taken from cBioPortal patient clinical data.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to extend. The columns are added to this object in place.
    unique_patient_ids : iterable of str
        Patient ids, one per row of ``df``. The new columns are assigned by
        position, so the ids must be in the same order as the rows of ``df``.
    studies : iterable of str
        cBioPortal study ids whose PATIENT-level clinical data is read. When a
        patient has the same attribute in several studies, the last study wins.

    Returns
    -------
    pandas.DataFrame
        ``df`` with three extra columns: ``OS_MONTHS`` and ``AGE`` as numbers
        (NaN when missing or not numeric) and ``SEX`` as ``True`` for 'Male',
        ``False`` for any other reported value and NaN when not reported.

    Raises
    ------
    requests.HTTPError
        If a clinical-data request does not succeed.
    """
    url = 'https://www.cbioportal.org/api/studies/{0}/clinical-data?clinicalDataType={1}&direction=ASC&pageNumber=0&pageSize=10000000&projection=SUMMARY'
    #Add ,'IDH1_MUTATION' to the attributes somehow, i think its not returned by api
    attributes = ['AGE', 'SEX', 'OS_MONTHS']
    patientClinicalValues = {}
    for studyId in studies:
        response=requests.get(url.format(studyId,'PATIENT'))
        response.raise_for_status()
        for row in response.json():
            attrName = row['clinicalAttributeId']
            if attrName not in attributes:
                continue
            values = patientClinicalValues.setdefault(row['patientId'], dict.fromkeys(attributes))
            values[attrName] = row['value']

    missing = float('nan')
    patient_ids = list(unique_patient_ids)

    def clinical_value(patientId, attrName):
        # NaN for patients without clinical data and for attributes that were never reported.
        value = patientClinicalValues.get(patientId, {}).get(attrName)
        return missing if value is None else value

    for attrName in ('OS_MONTHS', 'AGE'):
        # Coerce to numbers so the columns can go straight into a regression;
        # anything that is not numeric becomes NaN.
        df[attrName]=pd.to_numeric([clinical_value(patientId, attrName) for patientId in patient_ids], errors='coerce')
    sex_values = [patientClinicalValues.get(patientId, {}).get('SEX') for patientId in patient_ids]
    df['SEX']=[missing if sex is None else sex=='Male' for sex in sex_values]
    # Missing values are deliberately left as NaN: the earlier argument-less
    # fillna() call raised, and how to impute is the caller's decision.
    return df

In [None]:
#keep genes that appear in at least one study
def keeps_genes_appearing_in_atleast_1study(mol_data,studies):
    """Return the genes for which at least one study holds molecular data.

    Parameters
    ----------
    mol_data : dict
        ``{gene: {study_id: payload}}``. A payload counts as data only when it
        is a non-empty list; a dict payload (error message) or an empty list
        does not count.
    studies : iterable of str
        Study ids to check for each gene.

    Returns
    -------
    list
        The genes with data in at least one of ``studies``, in the order in
        which they appear in ``mol_data``.
    """
    genes_ids_that_appear_in_atleast_one_study=[]
    for gene, per_study in mol_data.items():
        for study_id in studies:
            study_data=per_study.get(study_id)
            if isinstance(study_data,list) and len(study_data)>0:
                genes_ids_that_appear_in_atleast_one_study.append(gene)
                break # one study with data is enough
    return genes_ids_that_appear_in_atleast_one_study
# Find the genes that have data in some study, then restrict mol_data to exactly those genes.
genes_we_want=keeps_genes_appearing_in_atleast_1study(mol_data,studies)
new_mol_data={
    gene_id: per_study
    for gene_id, per_study in mol_data.items()
    if gene_id in genes_we_want
}
#   now we know all the genes that have at least 1 appearance in studies:
#   I want to just get the data for each gene, and add together and add the clinical data(if present)
#   have a huge database

In [172]:
# Spot check: show the stored entry for gene id 25890 and patient 'TCGA-02-0047'.
gene_patient_id_exp_and_val[25890]['TCGA-02-0047']

0.196

In [177]:
def create_df_patientid_and_gene_exp(unique_patient_ids,patientID_exp_and_val,MT_ID_and_gene):
    """Build a table with one row per patient and one expression-flag column per gene.

    Parameters
    ----------
    unique_patient_ids : iterable of str
        Patient ids; they become the ``PatientId`` column, in iteration order.
    patientID_exp_and_val : dict
        ``{gene: {patientId: entry}}``. The first element of each entry is
        used as the cell value.
    MT_ID_and_gene : dict
        ``{gene: gene name}``, used to name the columns.

    Returns
    -------
    pandas.DataFrame
        Column ``PatientId`` followed by one ``'<gene name> overexpressed'``
        column for every gene that has an entry for at least one of the
        patients. Patients without an entry for a gene get NaN in its column.

    Raises
    ------
    KeyError
        If a gene with patient data is missing from ``MT_ID_and_gene``.
    """
    patient_ids=list(unique_patient_ids) # fix the row order once
    missing=float('nan')
    columns={'PatientId': patient_ids}
    for gene, entry_by_patient in patientID_exp_and_val.items():
        if not any(patientId in entry_by_patient for patientId in patient_ids):
            continue # no patient has this gene: no column
        # One value per patient, aligned with patient_ids. Assigning a single
        # value to the whole column would give every row the same number.
        columns[MT_ID_and_gene[gene]+' overexpressed']=[
            entry_by_patient[patientId][0] if patientId in entry_by_patient else missing
            for patientId in patient_ids
        ]
    return pd.DataFrame(columns)



In [None]:
def add_exp_feature_mol_data(mol_data,studies): #-1 if underexpressed
    """Flag every molecular-data record as over-, under- or normally expressed, in place.

    Each record gets an ``'overexpressed'`` key computed from its ``'value'``:
    ``1`` when the value is >= 1, ``-1`` when it is <= -1, and ``0`` in between.

    Parameters
    ----------
    mol_data : dict
        ``{gene: {study_id: payload}}``. Payloads that are dicts (the API
        answered with a message instead of data) and empty payloads are left
        untouched.
    studies : iterable of str
        Study ids to process for each gene.

    Returns
    -------
    dict
        The same ``mol_data`` object, after modification.
    """
    for gene in mol_data.keys():
        for study_id in studies:
            records=mol_data[gene][study_id]
            if type(records)==dict:
                # a dict means the study has no result for this gene
                continue
            for record in records:
                exp=record['value']
                if exp>=1:
                    record['overexpressed']=1
                elif exp<=-1:
                    record['overexpressed']=-1
                else: # strictly between -1 and 1
                    record['overexpressed']=0
    return mol_data
def get_only_patiend_id_and_expression(mol_data,study_id): #from sample_list_and_mol_data_for_genes_in_all_studies[gene][samplelistid[i]]
    """Reduce the molecular data to ``(overexpressed, value)`` per gene and patient.

    Parameters
    ----------
    mol_data : dict
        ``{gene: {study_id: payload}}`` whose list payloads hold records with
        the keys ``'patientId'``, ``'overexpressed'`` and ``'value'``.
    study_id : iterable of str, or str
        The study ids to read. Despite the singular name this is normally the
        whole list of studies; a single id given as a string is accepted too.

    Returns
    -------
    dict
        ``{gene: {patientId: (overexpressed, value)}}``. Payloads that are not
        lists are skipped. When a patient occurs in several studies, the entry
        from the last study wins.
    """
    # Use the argument that was passed in rather than a global variable.
    study_ids=[study_id] if isinstance(study_id,str) else study_id
    gene_patient_id_exp_and_val={}
    for gene in mol_data.keys():
        by_patient={}
        gene_patient_id_exp_and_val[gene]=by_patient
        for one_study_id in study_ids:
            records=mol_data[gene][one_study_id]
            if type(records)!=list: # a list stores data, a dict stores an error message
                continue
            for one_mol_data in records:
                by_patient[one_mol_data['patientId']]=(one_mol_data['overexpressed'],one_mol_data['value'])
    return gene_patient_id_exp_and_val
def get_unique_patient_ids(patientID_exp_and_val): #from patientID_and_overexpression_genes_in_allstudies
    """Return the set of all patient ids that occur under any gene.

    ``patientID_exp_and_val`` maps each gene to a mapping keyed by patient id;
    the result is the union of those keys.
    """
    return {
        patient_id
        for per_patient in patientID_exp_and_val.values()
        for patient_id in per_patient
    }

In [166]:
genes="""
ABI3BP
ADIPOQ
AEBP1
AGRN
AMBN
AMELX
AMELY
BGLAP
BMPER
BSPH1
CDCP2
CILP
CILP2
COCH
COLQ
COMP
CRELD1
CRELD2
CRIM1
CRISPLD1
CRISPLD2
CTGF
CTHRC1
CYR61
DDX26B
DMBT1
DMP1
DPT
DSPP
ECM1
ECM2
EDIL3
EFEMP1
EFEMP2
EGFLAM
ELN
ELSPBP1
EMID1
EMILIN1
EMILIN2
EMILIN3
EYS
FBLN1
FBLN2
FBLN5
FBLN7
FBN1
FBN2
FBN3
FGA
FGB
FGG
FGL1
FGL2
FN1
FNDC1
FNDC7
FNDC8
FRAS1
GAS6
GLDN
HMCN1
HMCN2
IBSP
IGFALS
IGFBP1
IGFBP2
IGFBP3
IGFBP4
IGFBP5
IGFBP6
IGFBP7
IGFBPL1
IGSF10
KAL1
KCP
LAMA1
LAMA2
LAMA3
LAMA4
LAMA5
LAMB1
LAMB2
LAMB3
LAMB4
LAMC1
LAMC2
LAMC3
LGI1
LGI2
LGI3
LGI4
LRG1
LTBP1
LTBP2
LTBP3
LTBP4
MATN1
MATN2
MATN3
MATN4
MEPE
MFAP1
MFAP2
MFAP3
MFAP4
MFAP5
MFGE8
MGP
MMRN1
MMRN2
MXRA5
NDNF
NELL1
NELL2
NID1
NID2
NOV
NPNT
NTN1
NTN3
NTN4
NTN5
NTNG1
NTNG2
OIT3
OTOG
OTOL1
PAPLN
PCOLCE
PCOLCE2
POMZP3
POSTN
PXDN
PXDNL
RELN
RSPO1
RSPO2
RSPO3
RSPO4
SBSPON
SLIT1
SLIT2
SLIT3
SMOC1
SMOC2
SNED1
SPARC
SPARCL1
SPON1
SPON2
SPP1
SRPX
SRPX2
SSPO
SVEP1
TECTA
TECTB
TGFBI
THBS1
THBS2
THBS3
THBS4
THSD4
TINAG
TINAGL1
TNC
TNFAIP6
TNN
TNR
TNXB
TSKU
TSPEAR
VIT
VTN
VWA1
VWA2
VWA3A
VWA3B
VWA5A
VWA5B1
VWA5B2
VWA7
VWA9
VWCE
VWDE
VWF
WISP1
WISP2
WISP3
ZP1
ZP2
ZP3
ZP4
ZPLD1
COL10A1
COL11A1
COL11A2
COL12A1
COL13A1
COL14A1
COL15A1
COL16A1
COL17A1
COL18A1
COL19A1
COL1A1
COL1A2
COL20A1
COL21A1
COL22A1
COL23A1
COL24A1
COL25A1
COL26A1
COL27A1
COL28A1
COL2A1
COL3A1
COL4A1
COL4A2
COL4A3
COL4A4
COL4A5
COL4A6
COL5A1
COL5A2
COL5A3
COL6A1
COL6A2
COL6A3
COL6A5
COL6A6
COL7A1
COL8A1
COL8A2
COL9A1
COL9A2
COL9A3
ACAN
ASPN
BCAN
BGN
CHAD
CHADL
DCN
EPYC
ESM1
FMOD
HAPLN1
HAPLN2
HAPLN3
HAPLN4
HSPG2
IMPG1
IMPG2
KERA
LUM
NCAN
NYX
OGN
OMD
OPTC
PODN
PODNL1
PRELP
PRG2
PRG3
PRG4
SPOCK1
SPOCK2
SPOCK3
SRGN
VCAN
ANXA1
ANXA10
ANXA11
ANXA13
ANXA2
ANXA3
ANXA4
ANXA5
ANXA6
ANXA7
ANXA8
ANXA8L1
ANXA9
C1QA
C1QB
C1QC
C1QL1
C1QL2
C1QL3
C1QL4
C1QTNF1
C1QTNF2
C1QTNF3
C1QTNF4
C1QTNF5
C1QTNF6
C1QTNF7
C1QTNF8
C1QTNF9
CD209
CLC
CLEC10A
CLEC11A
CLEC12A
CLEC12B
CLEC14A
CLEC17A
CLEC18A
CLEC18B
CLEC18C
CLEC19A
CLEC1A
CLEC1B
CLEC2A
CLEC2B
CLEC2D
CLEC2L
CLEC3A
CLEC3B
CLEC4A
CLEC4C
CLEC4D
CLEC4E
CLEC4F
CLEC4G
CLEC4M
CLEC5A
CLEC6A
CLEC7A
CLEC9A
COLEC10
COLEC11
COLEC12
CSPG4
CSPG5
ELFN1
ELFN2
EMCN
FCN1
FCN2
FCN3
FREM1
FREM2
FREM3
GPC1
GPC2
GPC3
GPC4
GPC5
GPC6
GREM1
GRIFIN
HPX
HSPC159
ITLN1
ITLN2
LGALS1
LGALS12
LGALS13
LGALS14
LGALS16
LGALS2
LGALS3
LGALS4
LGALS7
LGALS8
LGALS9
LGALS9B
LGALS9C
LMAN1
LMAN1L
MBL2
MUC1
MUC12
MUC13
MUC15
MUC16
MUC17
MUC19
MUC2
MUC20
MUC21
MUC22
MUC3A
MUC4
MUC5AC
MUC5B
MUC6
MUC7
MUC8
MUCL1
OVGP1
PARM1
PLXDC1
PLXDC2
PLXNA1
PLXNA2
PLXNA3
PLXNA4
PLXNB1
PLXNB2
PLXNB3
PLXNC1
PLXND1
PROL1
REG1A
REG1B
REG3A
REG3G
REG4
SDC1
SDC2
SDC3
SDC4
SEMA3A
SEMA3B
SEMA3C
SEMA3D
SEMA3E
SEMA3F
SEMA3G
SEMA4A
SEMA4B
SEMA4C
SEMA4D
SEMA4F
SEMA4G
SEMA5A
SEMA5B
SEMA6A
SEMA6B
SEMA6C
SEMA6D
SEMA7A
SFTA2
SFTA3
SFTPA1
SFTPA2
SFTPB
SFTPC
SFTPD
A2M
A2ML1
ADAM10
ADAM11
ADAM12
ADAM15
ADAM17
ADAM18
ADAM19
ADAM2
ADAM20
ADAM21
ADAM22
ADAM23
ADAM28
ADAM29
ADAM30
ADAM32
ADAM33
ADAM7
ADAM8
ADAM9
ADAMDEC1
ADAMTS1
ADAMTS10
ADAMTS12
ADAMTS13
ADAMTS14
ADAMTS15
ADAMTS16
ADAMTS17
ADAMTS18
ADAMTS19
ADAMTS2
ADAMTS20
ADAMTS3
ADAMTS4
ADAMTS5
ADAMTS6
ADAMTS7
ADAMTS8
ADAMTS9
ADAMTSL1
ADAMTSL2
ADAMTSL3
ADAMTSL4
ADAMTSL5
AGT
AMBP
ASTL
BMP1
C17orf58
CD109
CELA1
CELA2A
CELA2B
CELA3A
CELA3B
CPAMD8
CPN2
CST1
CST11
CST2
CST3
CST4
CST5
CST6
CST7
CST8
CST9
CST9L
CSTA
CSTB
CSTL1
CTSA
CTSB
CTSC
CTSD
CTSE
CTSF
CTSG
CTSH
CTSK
CTSL
CTSO
CTSS
CTSV
CTSW
CTSZ
EGLN1
EGLN2
EGLN3
ELANE
F10
F12
F13A1
F13B
F2
F7
F9
FAM20A
FAM20B
FAM20C
HABP2
HMSD
HPSE
HPSE2
HRG
HTRA1
HTRA3
HTRA4
HYAL1
HYAL2
HYAL3
HYAL4
ITIH1
ITIH2
ITIH3
ITIH4
ITIH5
ITIH6
KAZALD1
KNG1
KY
LEPRE1
LEPREL1
LEPREL2
LOX
LOXL1
LOXL2
LOXL3
LOXL4
LPA
MASP1
MASP2
MEP1A
MEP1B
MMP1
MMP10
MMP11
MMP12
MMP13
MMP14
MMP15
MMP16
MMP17
MMP19
MMP2
MMP20
MMP21
MMP23B
MMP24
MMP25
MMP26
MMP27
MMP28
MMP3
MMP7
MMP8
MMP9
NGLY1
OGFOD1
OGFOD2
P4HA1
P4HA2
P4HA3
P4HTM
PAMR1
PAPPA
PAPPA2
PCSK5
PCSK6
PI3
PLAT
PLAU
PLG
PLOD1
PLOD2
PLOD3
PRSS1
PRSS12
PRSS2
PRSS3
PZP
SERPINA1
SERPINA10
SERPINA11
SERPINA12
SERPINA2
SERPINA3
SERPINA4
SERPINA5
SERPINA6
SERPINA7
SERPINA9
SERPINB1
SERPINB10
SERPINB11
SERPINB12
SERPINB13
SERPINB2
SERPINB3
SERPINB4
SERPINB5
SERPINB6
SERPINB7
SERPINB8
SERPINB9
SERPINC1
SERPIND1
SERPINE1
SERPINE2
SERPINE3
SERPINF1
SERPINF2
SERPING1
SERPINH1
SERPINI1
SERPINI2
SLPI
SPAM1
ST14
SULF1
SULF2
TGM1
TGM2
TGM3
TGM4
TGM5
TGM6
TGM7
TIMP1
TIMP2
TIMP3
TIMP4
TLL1
TLL2
TMPRSS15
AMH
ANGPT1
ANGPT2
ANGPT4
ANGPTL1
ANGPTL2
ANGPTL3
ANGPTL4
ANGPTL5
ANGPTL6
ANGPTL7
AREG
ARTN
BDNF
BMP10
BMP15
BMP2
BMP3
BMP4
BMP5
BMP6
BMP7
BMP8A
BMP8B
BRINP2
BRINP3
BTC
C1QTNF9B
CBLN1
CBLN2
CBLN3
CBLN4
CCBE1
CCL1
CCL11
CCL13
CCL14
CCL15
CCL16
CCL17
CCL18
CCL19
CCL2
CCL20
CCL21
CCL22
CCL23
CCL24
CCL25
CCL26
CCL27
CCL28
CCL3
CCL3L3
CCL4
CCL4L1
CCL4L2
CCL5
CCL7
CCL8
CFC1
CFC1B
CHRD
CHRDL1
CHRDL2
CLCF1
CNTF
CRHBP
CRLF1
CRLF3
CRNN
CSF1
CSF2
CSF3
CSH1
CSH2
CSHL1
CTF1
CX3CL1
CXCL1
CXCL10
CXCL11
CXCL12
CXCL13
CXCL14
CXCL2
CXCL3
CXCL5
CXCL6
CXCL8
CXCL9
DHH
EBI3
EDA
EGF
EGFL6
EGFL7
EGFL8
EPGN
EPO
EREG
FASLG
FGF1
FGF10
FGF11
FGF12
FGF13
FGF14
FGF16
FGF17
FGF18
FGF19
FGF2
FGF20
FGF21
FGF22
FGF23
FGF3
FGF4
FGF5
FGF6
FGF7
FGF8
FGF9
FGFBP1
FGFBP2
FGFBP3
FIGF
FLG
FLG2
FLT3LG
FRZB
FST
FSTL1
FSTL3
GDF1
GDF10
GDF11
GDF15
GDF2
GDF3
GDF5
GDF6
GDF7
GDF9
GDNF
GH1
GH2
HBEGF
HCFC1
HCFC2
HGF
HGFAC
HHIP
HRNR
IFNA1
IFNA10
IFNA13
IFNA14
IFNA16
IFNA17
IFNA2
IFNA21
IFNA4
IFNA5
IFNA6
IFNA7
IFNA8
IFNB1
IFNE
IFNG
IFNK
IFNW1
IGF1
IGF2
IHH
IL10
IL11
IL12A
IL12B
IL13
IL15
IL16
IL17A
IL17B
IL17C
IL17D
IL17F
IL18
IL19
IL1A
IL1B
IL1F10
IL1F5
IL1F6
IL1F7
IL1F8
IL1F9
IL1RN
IL2
IL20
IL22
IL23A
IL24
IL25
IL26
IL3
IL34
IL4
IL5
IL6
IL7
IL9
INHA
INHBA
INHBB
INHBC
INHBE
INS
INS-IGF2
INSL3
INSL5
INSL6
ISM1
ISM2
KITLG
LEFTY1
LEFTY2
LEP
LIF
LTA
LTB
MDK
MEGF10
MEGF11
MEGF6
MEGF8
MEGF9
MST1
MST1L
MSTN
NGF
NODAL
NRG1
NRG2
NRG3
NRG4
NRTN
NTF3
NTF4
OSM
PDGFA
PDGFB
PDGFC
PDGFD
PF4
PF4V1
PGF
PIK3IP1
PPBP
PRL
PSPN
PTN
RPTN
S100A1
S100A10
S100A11
S100A12
S100A13
S100A14
S100A16
S100A2
S100A3
S100A4
S100A5
S100A6
S100A7
S100A7A
S100A7L2
S100A8
S100A9
S100B
S100G
S100P
S100Z
SCUBE1
SCUBE2
SCUBE3
SFRP1
SFRP2
SFRP4
SFRP5
SHH
TCHH
TCHHL1
TDGF1
TGFA
TGFB1
TGFB2
TGFB3
THPO
TNF
TNFSF10
TNFSF11
TNFSF12
TNFSF13
TNFSF13B
TNFSF14
TNFSF15
TNFSF18
TNFSF4
TNFSF8
TNFSF9
TPO
VEGFA
VEGFB
VEGFC
VWC2
VWC2L
WFIKKN1
WFIKKN2
WIF1
WNT1
WNT10A
WNT10B
WNT11
WNT16
WNT2
WNT2B
WNT3
WNT3A
WNT4
WNT5A
WNT5B
WNT6
WNT7A
WNT7B
WNT8A
WNT8B
WNT9A
WNT9B
XCL1
XCL2
ZFP91
ADAM1A
ADAM21P1
ADAM3A
ADAM3B
ADAM5
ADAM6
ANXA2P2
ANXA8L2
BPIFA4P
C17orf101
COL6A4P1
COL6A4P2
CST9L2
CTF2P
CTSL3P
CTSLP3
CTSLP6
CTSLP7
DMBT1P1
EGFEM1P
FGF7P2
gene_A6NLB4
HYALP1
KGFLP1
KGFLP2
LOC400696
LOC728715
LPAL2
MBL1P
MST1P2
NCRNA00083
NEPNP
NTF6B
OVOS1
OVOS2
PPBPP1
PRSS3P1
SERPINA13P
TNFSF12-TNFSF13
TNXA
UNQ5830
AXL
CD44
CHEK1
DACH1
DDR1
DKK1
DLL1
DLL4
DNMT1
ERBB2
GSK3B
HDAC1
ID1
IKBKB
ITGA2
ITGA4
ITGA6
ITGB1
JAG1
JAK2
KIT
KITLG
LATS1
LIN28A
LIN28B
MAML1
MERTK
MYC
MYCN
NANOG
NFKB1
NOS2
NOTCH2
PLAUR
POU5F1
PROM1
PTCH1
PTPRC
SAV1
SIRT1
SMO
SNAI1
SOX2
STAT3
TAZ
TGFBR1
THY1
TWIST1
TWIST2
WEE1
WNT1
WWC1
YAP1
ZEB1
ZEB2
ACTN1
ADGRE5
AKT1
ANKRD2
ANO1
ARHGAP24
ARHGEF2
ARHGEF40
ASIC1
ASIC2
ATOH1
BAG3
BCAR1
CAMKK2
CAPN2
CAV1
CD47
CD55
CDH1
CDH5
CIB2
CIB3
CITED2
CLRN1
CTNNA1
CXCR1
DTNA
EDN1
EGFR
EHD2
FLNA
FLT4
FSCN1
G3BP2
GNA11
GNAQ
GP1BA
GPSM2
HOTAIR
HOXA5
ILK
ITGA2B
ITGA5
ITGB3
JCAD
KCNK10
KCNK2
KCNK4
KCNMA1
KLF2
KLF3
MAPK1
MAPK14
MAPK3
MAPK8
MIR103A1
MIR146A
MIR181B1
MIR181B2
MIR195
MIR21
MRTFA
MYL2
MYO7A
OLR1
PANX1
PDLIM5
PDLIM7
PECAM1
PIEZO1
PIEZO1P1
PIEZO2
PKD2
PKHD1
PLPP3
POU4F3
PPFIA1
PPP1R12A
PRKAA1
PRKCA
PTGS2
PTK2
PXN
RAP2A
RELA
RHOA
ROCK1
RUNX2
SCNN1A
SELE
SMAD3
SOST
SRC
TAC1
TAOK3
TGFB1
TMC1
TMC2
TMEM150C
TNF
TRPA1
TRPC1
TRPC6
TRPM7
TRPV1
TRPV4
UNC50
VCL
VHL
WWTR1"""

In [None]:
#remove_empty_studies
gene_and_study_to_remove={}
new_mol_data={}
for gene in mol_data.keys():
    for study_id in mol_data[gene].keys():
        if type(mol_data[gene][study_id])==list:
            continue
        #if list(mol_data[gene][study_id].keys())[0]=='message':
        else:
            if gene in gene_and_study_to_remove:
                gene_and_study_to_remove[gene].append(study_id)
            else:
                gene_and_study_to_remove[gene]=[]