# Monkeypox KG

# Package installation is required only once

In [None]:
#pip install -r packages.txt

In [30]:
import pybel
from tqdm import tqdm
import pandas as p
from pybel.dsl import Protein
from pybel.dsl import Abundance
from pybel.dsl import Pathology
from pybel.dsl import BiologicalProcess
from pybel.dsl import Population
from pybel.dsl import Gene
from pybel.dsl import Rna
from pybel.dsl import MicroRna
from pybel.dsl import Fragment
import chembl_webresource_client
import openpyxl
import networkx as nx
from pybel.io.jupyter import to_jupyter
import matplotlib.pyplot as plt
from chembl_webresource_client.new_client import new_client
import pubchempy
import pickle
import re
from ByCovidGraph import *
import pubchempy as pcp
import urllib
from urllib.parse import urlparse
import io


# Drugs against monkeypox from https://www.jglobalbiosecurity.com/articles/10.31646/gbio.12 + chembl database

In [2]:
mpox_chembl = 'CHEMBL152 CHEMBL203321 CHEMBL1257073 CHEMBL220610 CHEMBL221054 CHEMBL1257073 CHEMBL375768 CHEMBL220150'.split()


# Reading file from pubchem for monkeypox 
# https://pubchem.ncbi.nlm.nih.gov/taxonomy/10244

In [3]:

mpox_pchem = pd.read_csv('data/normalized_data/TaxID_10244_bioactivity.csv')
mpox_pchem = mpox_pchem['cid']
mpox_pchem = list(set(mpox_pchem))


In [4]:
pchem2chembl_list = cid2chembl(mpox_pchem)

Pubchem: 134406 Converted to: CHEMBL321500
Pubchem: 44420748 Converted to: CHEMBL375768
Pubchem: 101879053 Converted to: CHEMBL3357216
Pubchem: 101879054 Converted to: CHEMBL3357215
Pubchem: 16124688 Converted to: CHEMBL1257073
Pubchem: 11243799 Converted to: CHEMBL220150
Pubchem: 155545375 Converted to: CHEMBL4565849
Pubchem: 11773602 Converted to: CHEMBL220610
Pubchem: 73124 Converted to: CHEMBL494759
Pubchem: 37542 Converted to: CHEMBL1643
Pubchem: 44341289 Converted to: CHEMBL114867
Pubchem: 44341811 Converted to: CHEMBL113930
Pubchem: 44341812 Converted to: CHEMBL326139
Pubchem: 118895042 Converted to: CHEMBL3753637
Pubchem: 118895044 Converted to: CHEMBL3753489
Pubchem: 60613 Converted to: CHEMBL152
Pubchem: 11301322 Converted to: CHEMBL220098
Pubchem: 189926 Converted to: CHEMBL112159
Pubchem: 5271015 Converted to: CHEMBL360506
Pubchem: 11452011 Converted to: CHEMBL219691
Pubchem: 11463022 Converted to: CHEMBL221054
Pubchem: 155521397 Converted to: CHEMBL4449833
Pubchem: 4511103

In [5]:
z = set(mpox_chembl).intersection(pchem2chembl_list)
z

{'CHEMBL1257073',
 'CHEMBL152',
 'CHEMBL220150',
 'CHEMBL220610',
 'CHEMBL221054',
 'CHEMBL375768'}

In [6]:
chemblid = mpox_chembl+pchem2chembl_list
len(chemblid)

32

In [7]:
chemblid = set(chemblid)
chemblid = list(chemblid)
len(chemblid)

25

In [8]:
chembl2mech = RetMech(chemblid)

[]
[{'mechanism_of_action': 'Envelope phospholipase F13 (p37) inhibitor', 'target_chembl_id': 'CHEMBL4296170'}]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[{'mechanism_of_action': 'Human herpesvirus 1 DNA polymerase inhibitor', 'target_chembl_id': 'CHEMBL1872'}]
[]
[]
[]
[]
[{'mechanism_of_action': 'DNA polymerase inhibitor', 'target_chembl_id': 'CHEMBL3988586'}]
[]
[]
[]
[{'mechanism_of_action': "Inosine-5'-monophosphate dehydrogenase 1 inhibitor", 'target_chembl_id': 'CHEMBL1822'}, {'mechanism_of_action': 'Hepatitis C virus NS5B RNA-dependent RNA polymerase inhibitor', 'target_chembl_id': 'CHEMBL5375'}, {'mechanism_of_action': 'RNA inhibitor', 'target_chembl_id': 'CHEMBL2363073'}]
[]
[]
[]


In [9]:
chembl2dis = RetDrugInd(chemblid)

[]
[{'mesh_heading': 'Virus Diseases'}, {'mesh_heading': 'Smallpox'}]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[{'mesh_heading': 'Virus Diseases'}, {'mesh_heading': 'Infections'}, {'mesh_heading': 'Kidney Diseases'}, {'mesh_heading': 'Sarcoma'}, '...(remaining elements truncated)...']
[]
[]
[]
[]
[{'mesh_heading': 'Cytomegalovirus Infections'}, {'mesh_heading': 'Hemorrhagic Fever, Ebola'}, {'mesh_heading': 'Adenovirus Infections, Human'}, {'mesh_heading': 'Adenoviridae Infections'}, '...(remaining elements truncated)...']
[]
[]
[]
[{'mesh_heading': 'Leukemia, Myeloid, Acute'}, {'mesh_heading': 'Hepatitis C, Chronic'}, {'mesh_heading': 'Hepatitis C'}, {'mesh_heading': 'Hepatitis B, Chronic'}, '...(remaining elements truncated)...']
[]
[]
[]


In [10]:
chembl2act = RetAct(chemblid,0)

1
[]
2
[]
3
[]
4
[]
5
[{'assay_chembl_id': 'CHEMBL3294635', 'assay_type': 'B', 'bao_label': 'single protein format', 'pchembl_value': '7.62', 'target_chembl_id': 'CHEMBL2664', 'target_organism': 'Homo sapiens', 'type': 'IC50', 'value': '24.0'}, {'assay_chembl_id': 'CHEMBL3294636', 'assay_type': 'B', 'bao_label': 'cell-based format', 'pchembl_value': '8.03', 'target_chembl_id': 'CHEMBL2664', 'target_organism': 'Homo sapiens', 'type': 'IC50', 'value': '9.4'}]
6
[]
7
[]
8
[]
9
[]
10
[]
11
[{'assay_chembl_id': 'CHEMBL3757541', 'assay_type': 'B', 'bao_label': 'single protein format', 'pchembl_value': '8.49', 'target_chembl_id': 'CHEMBL3751646', 'target_organism': 'Homo sapiens', 'type': 'IC50', 'value': '3.2'}]
12
[]
13
[{'assay_chembl_id': 'CHEMBL746888', 'assay_type': 'F', 'bao_label': 'cell-based format', 'pchembl_value': '6.30', 'target_chembl_id': 'CHEMBL614200', 'target_organism': 'Homo sapiens', 'type': 'IC50', 'value': '0.5'}, {'assay_chembl_id': 'CHEMBL689980', 'assay_type': 'F', '

In [11]:
prtn_as_chembl = Ret_chembl_protein(chembl2act) + Ret_chembl_protein(chembl2mech)
prtn_as_chembl = set(prtn_as_chembl)
prtn_as_chembl = list(prtn_as_chembl)
prtn_as_chembl

['CHEMBL3751646',
 'CHEMBL2664',
 'CHEMBL1293235',
 'CHEMBL340',
 'CHEMBL614149',
 'CHEMBL1872',
 'CHEMBL613888',
 'CHEMBL2363073',
 'CHEMBL5375',
 'CHEMBL3988586',
 'CHEMBL1822',
 'CHEMBL614200',
 'CHEMBL4296170']

In [12]:
chembl2uprot = chembl2uniprot(prtn_as_chembl,0)

old 9
newLen 7
2


In [13]:
chembl2uprot

{'CHEMBL3751646': [{'xref_id': 'R-HSA-112043',
   'xref_name': 'PLC beta mediated events.',
   'xref_src_db': 'Reactome'},
  {'xref_id': 'R-HSA-1489509',
   'xref_name': 'DAG and IP3 signaling.',
   'xref_src_db': 'Reactome'},
  {'xref_id': 'R-HSA-2029485',
   'xref_name': 'Role of phospholipids in phagocytosis.',
   'xref_src_db': 'Reactome'},
  {'xref_id': 'R-HSA-2871809',
   'xref_name': 'FCERI mediated Ca+2 mobilization.',
   'xref_src_db': 'Reactome'},
  {'xref_id': 'R-HSA-422356',
   'xref_name': 'Regulation of insulin secretion.',
   'xref_src_db': 'Reactome'},
  {'xref_id': 'R-HSA-5218921',
   'xref_name': 'VEGFR2 mediated cell proliferation.',
   'xref_src_db': 'Reactome'},
  {'xref_id': 'R-HSA-5578775',
   'xref_name': 'Ion homeostasis.',
   'xref_src_db': 'Reactome'},
  {'xref_id': 'R-HSA-5607763',
   'xref_name': 'CLEC7A (Dectin-1) induces NFAT activation.',
   'xref_src_db': 'Reactome'},
  {'xref_id': 'R-HSA-9664323',
   'xref_name': 'FCGR3A-mediated IL10 synthesis.',
   '

In [14]:
#update chembl protein nodes with gene symbol
#use previously created variable that stores activity info
chembl2act = chembl2gene2path(chembl2uprot,chembl2act)

In [15]:
chembl2act

{'CHEMBL3290663': [{'assay_chembl_id': 'CHEMBL3294635',
   'assay_type': 'B',
   'bao_label': 'single protein format',
   'pchembl_value': '7.62',
   'target_chembl_id': 'CHEMBL2664',
   'target_organism': 'Homo sapiens',
   'type': 'IC50',
   'value': '24.0',
   'Protein': 'AHCY'},
  {'assay_chembl_id': 'CHEMBL3294636',
   'assay_type': 'B',
   'bao_label': 'cell-based format',
   'pchembl_value': '8.03',
   'target_chembl_id': 'CHEMBL2664',
   'target_organism': 'Homo sapiens',
   'type': 'IC50',
   'value': '9.4',
   'Protein': 'AHCY'}],
 'CHEMBL3753637': [{'assay_chembl_id': 'CHEMBL3757541',
   'assay_type': 'B',
   'bao_label': 'single protein format',
   'pchembl_value': '8.49',
   'target_chembl_id': 'CHEMBL3751646',
   'target_organism': 'Homo sapiens',
   'type': 'IC50',
   'value': '3.2',
   'Protein': 'AHCYL1'}],
 'CHEMBL152': [{'assay_chembl_id': 'CHEMBL746888',
   'assay_type': 'F',
   'bao_label': 'cell-based format',
   'pchembl_value': '6.30',
   'target_chembl_id': 'CH

In [16]:
mpox_graph = pybel.BELGraph(name='Monkeypox Graph')

In [None]:
chembl2uprot

In [17]:
def chem2gene2path_rel(named_chem2geneList,itmpGraph):
    for item in named_chem2geneList:
        #print(item)
        itemLen = len(named_chem2geneList[item])-1
        #print(itemLen)
        for j in range(itemLen-1):
            #print(j)
            #print('yo',named_chem2geneList)
            #if 'accession' not in named_chem2geneList[j]:
                #continue
                
            itmpGraph.add_association(Protein(namespace='HP', name=named_chem2geneList[item][itemLen]['component_synonym']),
                                      BiologicalProcess(namespace='Pathway',name=named_chem2geneList[item][j]['xref_name']),
                                      citation='ChEMBL database', evidence='ChEMBL query',
                                      Reactome=named_chem2geneList[item][j]['xref_id'])

    return(itmpGraph)


In [20]:
mpox_graph = chem2moa_rel_2(chembl2mech,mpox_graph)
mpox_graph = chem2dis_rel(chembl2dis,mpox_graph)
mpox_graph = chem2act_rel_2(chembl2act,mpox_graph)
mpox_graph = chem2gene2path_rel(chembl2uprot,mpox_graph)
to_jupyter(mpox_graph)

<IPython.core.display.Javascript object>

In [None]:
# filename = ('data/normalized_data/monkeypox_pickle2')
# outfile = open(filename,'wb')
# pickle.dump(mpox_graph,outfile)

In [None]:
#import monkey pox graph
infile = open('data/normalized_data/monkeypox_pickle2','rb')
mpox_graph = pickle.load(infile)
infile.close()

In [None]:
to_jupyter(mpox_graph)

In [21]:
#Export Uprot ids from the current graph
chemblProt = []
for item in chembl2uprot:
    for j in range(len(chembl2uprot[item])):
        if('accession' in chembl2uprot[item][j]):
            chemblProt.append(chembl2uprot[item][j]['accession'])

chemblProt

['O43865', 'P23526', 'P02545', 'P08684', 'Q8JXU8', 'P06856', 'P20839']

In [None]:
chemblProt = ['O43865', 'P20839', 'Q8JXU8', 'P02545', 'P23526', 'P06856','P08684']

# Working with monkeypox proteins from uniprot

In [22]:
mpox_prot_df = pd.read_excel('data/normalized_data/uniprot-taxonomy Monkeypox+virus+[10244] -filtered-reviewed yes.xlsx')
mpox_prot= list(mpox_prot_df['Entry'])
mpox_prot

  warn("Workbook contains no default style, apply openpyxl's default")


['Q8V4Y0',
 'Q8V4S4',
 'Q8V571',
 'P04363',
 'Q8V4V3',
 'Q8V4U9',
 'Q8V4T3',
 'Q8V518',
 'Q8V4V4',
 'Q8V4T7',
 'Q8V566']

# Genes from Malacards

In [23]:
mpox_genes_df = pd.read_excel('data/normalized_data/malacardsGenes.xlsx')

In [24]:
mpox_genes = list(mpox_genes_df['Entry'])
mpox_genes

['P0C0L4',
 'P0C0L5',
 'Q9Y258',
 'Q07444',
 'P01730',
 'P01732',
 'Q8NH81',
 'P26718',
 'P08174',
 'P00387',
 'P15529',
 'Q14653',
 'P01562']

In [25]:
uprots = mpox_prot+mpox_genes+chemblProt

In [26]:
uprots

['Q8V4Y0',
 'Q8V4S4',
 'Q8V571',
 'P04363',
 'Q8V4V3',
 'Q8V4U9',
 'Q8V4T3',
 'Q8V518',
 'Q8V4V4',
 'Q8V4T7',
 'Q8V566',
 'P0C0L4',
 'P0C0L5',
 'Q9Y258',
 'Q07444',
 'P01730',
 'P01732',
 'Q8NH81',
 'P26718',
 'P08174',
 'P00387',
 'P15529',
 'Q14653',
 'P01562',
 'O43865',
 'P23526',
 'P02545',
 'P08684',
 'Q8JXU8',
 'P06856',
 'P20839']

In [27]:
def ExtractFromUniProt(uniprot_id):
    from bioservices import UniProt
    Uniprot_Dict = []
    # Make a link to the UniProt webservice
    service = UniProt()

    for id in uniprot_id:

        # create URL for each uniprot id
        url = 'https://www.uniprot.org/uniprot/' + id + '.txt'
        print(url)

        #Retrieve data for id in text format if found in uniprot
        
        try:
            ret_uprot = urllib.request.urlopen(url)
        except urllib.request.HTTPError:
            #uniprot_id.remove(id)
            continue
        
        
#         with urllib.request.urlopen(url) as response:
#            ret_uprot = response.read()

        print(id)
        id_copy = id
        i = 0
        j = 0
        k = 0
        id = {}
        id['Disease'] = {}
        id['Reactome'] = {}
        id['Function'] = {}
        id['BioProcess'] = {}
        id['Gene'] = {}
        #id['Gene'] = {}
        # print(id)

        # parse each line looking for info about disease, pathway, funcn, bp and so on
        for line in ret_uprot:

            line = line.decode('utf-8')

            # parse lines with disease and extract disease names and omim ids
            if '-!- DISEASE:' in line:
                if ('[MIM:' in line):
                    dis = line.split(':')
                    # dis returns list of splitted text, [1] = name of dis, [2] = OMIM ID, extra chars need cleaning
                    # print(dis[1][1:-5])
                    # print(dis[2][:-1])
                    id['Disease'].update({dis[1][1:-5]: dis[2][:-1]})

            # extract reactome ids and names
            if 'Reactome;' in line:
                ract = line.split(';')
                # ract returns list with reactome id and name, needs cleaning
                id['Reactome'].update({ract[2][1:-2]: ract[1][1:]})
                # print(ract[1][1:])
                # print(ract[2][1:-2])

            # look for functions
            if ' F:' in line:
                if j < 5:
                    # take only first 5 entries for now
                    # print(j)
                    fn = line.split(';')
                    # fn returns list with GO ids and names
                    id['Function'].update({fn[2][3:]: fn[1][1:]})
                    # print(fn[1][1:])
                    # print(fn[2][3:])
                    j += 1

            # look for biological processes
            if ' P:' in line:
                if i < 5:
                    # take only first 5 entries for now
                    # print(i)
                    bp = line.split(';')
                    # bp returns list with GO ids and names
                    id['BioProcess'].update({bp[2][3:]: bp[1][1:]})
                    #print(bp[1][1:])
                    #print(bp[2][3:])
                    i += 1
                    
            if 'GN   Name' in line:
                #print(line)
                
                if k == 0:
                    gene = line.split('=')
                    print(gene)
                    gene = gene[1].split(' ')
                    #print(gene[0])
                    if ';' in gene[0]:
                        gene=gene[0].split(';')
                        #id['Gene'].update({gene[0]})
                        gene = {'Gene': gene[0]}
                        #id.update(gene)
                    else:
                        gene = {'Gene':gene[0]}
                        #id.update(gene)
                    id.update(gene)
                    #print(id['Gene'])    
                    k +=1
            
            #else:
                #id.update({'Gene': ''})
                    
            #print(id['Gene'])
            
        Uniprot_Dict.append(id)

    Uniprot_Dict = dict(zip(uniprot_id, Uniprot_Dict))

    return(Uniprot_Dict)
    

In [31]:
uprots_ext = ExtractFromUniProt(uprots)

https://www.uniprot.org/uniprot/Q8V4Y0.txt
Q8V4Y0
https://www.uniprot.org/uniprot/Q8V4S4.txt
Q8V4S4
['GN   Name', 'B4R {ECO:0000312|EMBL:AAL40623.1};\n']
{'Gene': 'B4R'}
https://www.uniprot.org/uniprot/Q8V571.txt
Q8V571
['GN   Name', 'p28; ORFNames', 'D5R;\n']
{'Gene': 'p28'}
https://www.uniprot.org/uniprot/P04363.txt
P04363
['GN   Name', 'TK; ORFNames', 'L2R;\n']
{'Gene': 'TK'}
https://www.uniprot.org/uniprot/Q8V4V3.txt
Q8V4V3
['GN   Name', 'RPO132; ORFNames', 'A25R;\n']
{'Gene': 'RPO132'}
https://www.uniprot.org/uniprot/Q8V4U9.txt
Q8V4U9
https://www.uniprot.org/uniprot/Q8V4T3.txt
Q8V4T3
https://www.uniprot.org/uniprot/Q8V518.txt
Q8V518
https://www.uniprot.org/uniprot/Q8V4V4.txt
Q8V4V4
['GN   Name', 'VITF3L; ORFNames', 'A24R;\n']
{'Gene': 'VITF3L'}
https://www.uniprot.org/uniprot/Q8V4T7.txt
Q8V4T7
https://www.uniprot.org/uniprot/Q8V566.txt
Q8V566
https://www.uniprot.org/uniprot/P0C0L4.txt
P0C0L4
['GN   Name', 'C4A; Synonyms', 'CO4, CPAMD2;\n']
{'Gene': 'C4A'}
https://www.uniprot.org/u

In [None]:
uprots_ext = ExtractFromUniProt(['Q8V4Y0','Q8V4S4','P01562','P01730'])

In [None]:
uprots_ext

In [None]:
#a = 'GN   Name=B4R {ECO:0000312|EMBL:AAL40623.1};'
a = 'GN   Name=IFNA1;'
a = a.split('=')
a = a[1].split(" ")
a[0]


In [None]:
if ';' in a[0]:
    a=a[0].split(";")
    


In [None]:
a[0]

In [None]:
def uniprot_rel(named_uprotList,itmpGraph):
    
    for item in named_uprotList:
        #print(named_uprotList[item]['Function'].keys())
        fun=list(named_uprotList[item]['Function'].keys())
        bp = list(named_uprotList[item]['BioProcess'].keys())
        for f in fun:
            if str(named_uprotList[item]['Gene']) != 'nan' and not isinstance(named_uprotList[item]['Gene'],dict) :
                itmpGraph.add_association(Protein(namespace='MP',name=named_uprotList[item]['Gene']),MicroRna(namespace='Function',name=f),
                                         citation='UniProt database',evidence='UniProt query')
            else:
                itmpGraph.add_association(Protein(namespace='MP',name=item),MicroRna(namespace='Function',name=f),
                                         citation='UniProt database',evidence='UniProt query')
                
        
        for b in bp:
            if str(named_uprotList[item]['Gene']) != 'nan' and not isinstance(named_uprotList[item]['Gene'],dict):
                itmpGraph.add_association(Protein(namespace='MP',name=named_uprotList[item]['Gene']),BiologicalProcess(namespace='BioPro',name=b),
                                         citation='UniProt database',evidence='UniProt query')
            else:
                itmpGraph.add_association(Protein(namespace='MP',name=item),MicroRna(namespace='Function',name=b),
                                         citation='UniProt database',evidence='UniProt query')
        
    return(itmpGraph)
        

In [32]:
mpox_graph = uniprot_rel(uprots_ext,mpox_graph)
to_jupyter(mpox_graph)

<IPython.core.display.Javascript object>

In [33]:
# filename = 'data/normalized_data/monkeypox_pickle_15thJuly'
# outfile = open(filename,'wb')
# pickle.dump(mpox_graph,outfile)

In [None]:
#import monkey pox graph
infile = open('data/normalized_data/monkeypox_pickle_15thJuly','rb')
mpox_graph = pickle.load(infile)
infile.close()

In [34]:
to_jupyter(mpox_graph)

<IPython.core.display.Javascript object>

In [None]:

def _get_target_data(protein_list: list, organism: str):
    """Get chemical for target data from ChEMBL"""
    df_data = []

    target = new_client.target
    activity = new_client.activity

    for protein in protein_list:
        if pd.isna(protein):
            continue
        try:
            prot_data = [target.search(protein)[0]]

            # Search for protein with same synonym
            if prot_data == [None]:
                prot_data = target.filter(
                    target_synonym__icontains=protein, target_organism__istartswith=organism
                ).only(['target_chembl_id', 'target_pref_name', 'molecule_chembl_id', 'molecule_pref_name'])
        except HttpBadRequest:
            print(f'Cannot search for {protein} due to chembl error')
            continue

        # No results found
        if not prot_data:
            continue

        for prot in tqdm(prot_data, f'Analying data for {protein}'):
            # Absence of chembl id
            if not prot['target_chembl_id']:
                continue

            prot_activity_data = activity.filter(
                target_chembl_id=prot['target_chembl_id'],
                assay_type_iregex='(B|F)',
            ).only([
                'pchembl_value', 'molecule_chembl_id', 'activity_id', 'target_pref_name', 'molecule_pref_name'
            ])

            if len(prot_activity_data) < 1:
                continue

            for i in prot_activity_data:
                tmp = {}

                if i['pchembl_value'] is None:
                    continue

                pchembl_val = i['pchembl_value']

                if float(pchembl_val) < 6:
                    tmp['activity'] = 'inhibitor'
                else:
                    tmp['activity'] = 'activator'

                tmp['protein_symbol'] = protein
                tmp['protein_name'] = i['target_pref_name']
                tmp['aid'] = str(i['activity_id'])
                tmp['chembl_id'] = i['molecule_chembl_id']
                tmp['compound_name'] = i['molecule_pref_name'].capitalize() if i['molecule_pref_name'] else ''
                df_data.append(tmp)

    # Merge duplicated protein-chemical entries into one
    df = pd.DataFrame()

    for idx, row in tqdm(enumerate(df_data), total=len(df_data), desc='Preparing data'):
        if idx == 0:
            df = df.append(row, ignore_index=True)
        else:
            _in_df = df.loc[
                (df['protein_symbol'] == row['protein_symbol']) & (df['chembl_id'] == row['chembl_id'])
            ]

            if _in_df.empty:
                df = df.append(row, ignore_index=True)
            else:
                row_index = _in_df.index

                # Check existing citations
                existing_assays = set(df.loc[row_index, 'aid'].values[0].split(' | '))
                old_count = len(existing_assays)
                existing_assays.add(row['aid'])
                new_count = len(existing_assays)

                # Check if new citation added, if yes - add respective data
                if old_count < new_count:
                    df.loc[row_index, 'aid'] = ' | '.join(existing_assays)
    df = df[['activity', 'protein_symbol', 'protein_name', 'aid', 'chembl_id', 'compound_name']]
    return df


In [None]:
def target_list_to_chemical(
    proteins: list,
    organism: str = 'Homo sapiens',
    output_dir: str = ''
) -> None:
    """Extract chemical information on list of targets
    Usage:
    >> target_list_to_chemical(proteins=['RIPK'])
    """

    df = _get_target_data(protein_list=proteins, organism=organism)
    #os.makedirs(output_dir, exist_ok=True)
    #df.to_csv(os.path.join(output_dir, 'chemical_annotated.csv'), sep='\t', index=False)
    return(df)


In [None]:
uprot2chem = target_list_to_chemical(uprots)

In [None]:
uprot2chem

In [None]:
# filename = 'data/normalized_data/uprot2chembl'
# outfile = open(filename,'wb')
# pickle.dump(uprot2chem,outfile)

In [35]:
infile = open('data/normalized_data/uprot2chembl','rb')
uprot2chem = pickle.load(infile)
infile.close()

In [36]:
uprot2chem

Unnamed: 0,activity,protein_symbol,protein_name,aid,chembl_id,compound_name
0,activator,P01730,T-cell surface antigen CD4,105411,CHEMBL506605,Isocomplestatin
1,activator,P01730,T-cell surface antigen CD4,106547,CHEMBL525803,Chloropeptin
2,activator,P01730,T-cell surface antigen CD4,106550,CHEMBL3143464,
3,inhibitor,P01730,T-cell surface antigen CD4,346232,CHEMBL3138360,
4,inhibitor,P01730,T-cell surface antigen CD4,346237,CHEMBL3138345,
...,...,...,...,...,...,...
53593,inhibitor,P08684,Cytochrome P450 3A4,22926906,CHEMBL4791586,
53594,inhibitor,P08684,Cytochrome P450 3A4,22953965,CHEMBL4649457,Bms-986166
53595,inhibitor,P08684,Cytochrome P450 3A4,22982169,CHEMBL4784058,
53596,inhibitor,P08684,Cytochrome P450 3A4,22991278 | 22991276,CHEMBL4787795,


In [38]:
active = uprot2chem.loc[uprot2chem['activity'].str.contains('activator',na=False)]
active = active.reset_index(drop=True)

In [None]:
active

In [None]:
active['compound_name'][3] == ''

In [39]:
active = active[active['compound_name'] != '']
active = active.reset_index(drop=True)

In [40]:
chem = active['chembl_id']

In [None]:
len(chem)

In [None]:
chem = ['CHEMBL506605',
 'CHEMBL525803',
 'CHEMBL3143464',
 'CHEMBL3138360',
 'CHEMBL3138345',
 'CHEMBL3138187',
 'CHEMBL3138374',
 'CHEMBL3138095',
 'CHEMBL3138378',
 'CHEMBL3138119',
 'CHEMBL231224',
 'CHEMBL267655',
 'CHEMBL388209',
 'CHEMBL230799',
 'CHEMBL267658',
 'CHEMBL388208',
 'CHEMBL267657',
 'CHEMBL230798',
 'CHEMBL388207',
 'CHEMBL230796',
 'CHEMBL1852155',
 'CHEMBL1819181',
 'CHEMBL4783030',
 'CHEMBL4755141',
 'CHEMBL4783761',
 'CHEMBL4750160',
 'CHEMBL4743003',
 'CHEMBL4782080',
 'CHEMBL4746252',
 'CHEMBL4747494',
 'CHEMBL4758659',
 'CHEMBL4750983',
 'CHEMBL4786809',
 'CHEMBL4748787',
 'CHEMBL4785236',
 'CHEMBL4791666',
 'CHEMBL4525466',
 'CHEMBL4471585']

In [None]:
chem2dis = RetDrugInd(chem)

In [None]:
chem2mech = RetMech(chem)

In [None]:
chem2act = RetAct(chem,0)

In [None]:
chembl2actmechdis = {}
chembl2actmechdis['activity'] = chem2act
chembl2actmechdis['mechanism'] = chem2mech
chembl2actmechdis['disease']= chem2dis

In [None]:
# filename = 'data/normalized_data/chembl2actmechdis'
# outfile = open(filename,'wb')
# pickle.dump(chembl2actmechdis,outfile)

In [41]:
infile = open('data/normalized_data/chembl2actmechdis','rb')
chembl2actmechdis = pickle.load(infile)
infile.close()

In [42]:
chem2act = chembl2actmechdis['activity']
chem2mech = chembl2actmechdis['mechanism']
chem2dis = chembl2actmechdis['disease']

In [43]:
chemid2 = Ret_chembl_protein(chem2act) + Ret_chembl_protein(chem2mech)
chemid2

['CHEMBL1827',
 'CHEMBL232',
 'CHEMBL2034',
 'CHEMBL4878',
 'CHEMBL5514',
 'CHEMBL614335',
 'CHEMBL1075322',
 'CHEMBL275',
 'CHEMBL614177',
 'CHEMBL3746',
 'CHEMBL3437',
 'CHEMBL614274',
 'CHEMBL203',
 'CHEMBL223',
 'CHEMBL2095173',
 'CHEMBL1947',
 'CHEMBL3622',
 'CHEMBL4523454',
 'CHEMBL614925',
 'CHEMBL1907589',
 'CHEMBL3155',
 'CHEMBL2414',
 'CHEMBL614721',
 'CHEMBL1293237',
 'CHEMBL3305',
 'CHEMBL4235',
 'CHEMBL614075',
 'CHEMBL1951',
 'CHEMBL5463',
 'CHEMBL4302',
 'CHEMBL1871',
 'CHEMBL1916',
 'CHEMBL1867',
 'CHEMBL1791',
 'CHEMBL4429',
 'CHEMBL2722',
 'CHEMBL614685',
 'CHEMBL1781',
 'CHEMBL2094124',
 'CHEMBL2002',
 'CHEMBL220',
 'CHEMBL4305',
 'CHEMBL2326',
 'CHEMBL254',
 'CHEMBL612250',
 'CHEMBL4685',
 'CHEMBL613835',
 'CHEMBL614127',
 'CHEMBL1978',
 'CHEMBL3772',
 'CHEMBL383',
 'CHEMBL612554',
 'CHEMBL2094251',
 'CHEMBL1821',
 'CHEMBL1293226',
 'CHEMBL2039',
 'CHEMBL3251',
 'CHEMBL1899',
 'CHEMBL4036',
 'CHEMBL2095186',
 'CHEMBL4040',
 'CHEMBL392',
 'CHEMBL1875',
 'CHEMBL107513

In [47]:
chem2uni = chembl2uniprot(chemid2,0)

old 372
newLen 367
5


In [None]:
chem2uni

In [None]:
protList = []
for key in chem2uni.keys():
    #print(chem2uni[key])
    for item in chem2uni[key]:
        if 'accession' in item:
            protList.append(item['accession'])
            #print(item['accession'])

In [None]:
len(protList)

In [None]:
protList_ext = ExtractFromUniProt(protList[:100])

In [None]:
protList_ext_1 = ExtractFromUniProt(protList[100:200])

In [None]:
protList_ext_2 = ExtractFromUniProt(protList[200:310])

In [None]:
allprot = protList_ext | protList_ext_1 | protList_ext_2

In [None]:
allprot

In [None]:
# filename = 'data/normalized_data/allprot_fromchembl'
# outfile = open(filename,'wb')
# pickle.dump(allprot,outfile)

In [44]:
infile = open('data/normalized_data/allprot_fromchembl','rb')
allprot = pickle.load(infile)
infile.close()

In [None]:
mpox_graph = pybel.BELGraph(name='Monkeypox Graph')

In [48]:
mpox_graph = uniprot_rel(allprot,mpox_graph)
#to_jupyter(mpox_graph)

In [49]:
pchem_act_new= chembl2gene2path(chem2uni,chem2act)

In [None]:
chem2uni

In [None]:
def chem2gene2path_rel(named_chem2geneList,itmpGraph):
    for item in named_chem2geneList:
        #print(item)
        itemLen = len(named_chem2geneList[item])-1
        #print(itemLen)
        for j in range(itemLen-1):
            #print(named_chem2geneList)
            itmpGraph.add_association(MicroRna(namespace='HP', name=named_chem2geneList[item][itemLen]['component_synonym']),
                                      Rna(namespace='Pathway',name=named_chem2geneList[item][j]['xref_name']),
                                      citation='ChEMBL database', evidence='ChEMBL query',
                                      Reactome=named_chem2geneList[item][j]['xref_id'])

    return(itmpGraph)

In [50]:
mpox_graph = chem2act_rel_2(pchem_act_new,mpox_graph)
mpox_graph = chem2gene2path_rel(chem2uni,mpox_graph)
#to_jupyter(mpox_graph)

In [None]:
chem2uni['CHEMBL2094125'][2]['component_synonym']

In [51]:
# filename = 'data/normalized_data/monkeypox_pickle_final_15thJuly'
# outfile = open(filename,'wb')
# pickle.dump(mpox_graph,outfile)
# outfile.close()

In [52]:
infile = open('data/normalized_data/monkeypox_pickle_final_15thJuly','rb')
mpox_graph = pickle.load(infile)
infile.close()

In [53]:
mpox_graph.summarize()

---------------------  ---------------
Name                   Monkeypox Graph
Version
Number of Nodes        3905
Number of Namespaces   8
Number of Edges        13320
Number of Annotations  0
Number of Citations    2
Number of Authors      0
Number of Components   3
Network Density        8.74E-04
---------------------  ---------------

Type (4)             Count  Example
-----------------  -------  --------------------------------------------
BiologicalProcess     2068  bp(GOMF:"beta-adrenergic receptor activity")
Abundance             1420  a(ChEMBLAssay:CHEMBL647811)
Protein                381  p(HP:CHEMBL614530)
Pathology               36  path(Disease:"Uterine Cervical Dysplasia")

Prefix (8)    Name                        Count  Example
------------  ------------------------  -------  -------------------------------------------------------
ChEMBLAssay                                 862  a(ChEMBLAssay:CHEMBL682298)
GOBP          Gene Ontology                 774  bp(GOBP:"cellul