Radboud uses CHEMBL.  Need to go to PubChem CIDs

In [1]:
require 'rest-client'
require 'json'

def map_chembl_to_cid(chembl_id)
  url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/#{URI.encode_www_form_component(chembl_id)}/cids/JSON"
  warn url
  response = RestClient.get(url)
  
  if response.code == 200
    data = JSON.parse(response.body)
    cid = data.dig('IdentifierList', 'CID', 0)
    { chembl_id: chembl_id, cid: cid || 'No CID found' }
  else
    { chembl_id: chembl_id, error: "No CID found (Status: #{response.code})" }
  end
rescue RestClient::ExceptionWithResponse => e
  { chembl_id: chembl_id, error: "Error: #{e.message}" }
end

def get_more_metadata(cid)
  url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/#{cid}/JSON"
  warn url
  response = RestClient.get(url)
  
  if response.code == 200
    data = JSON.parse(response.body)
    name = data.dig('Record', 'RecordTitle')
    { label: name, cid: cid || 'No CID found' }
  end
rescue RestClient::ExceptionWithResponse => e
  { cid: cid, error: "Error: #{e.message}" }
end
  

:get_more_metadata

In [2]:
# test
a = map_chembl_to_cid("CHEMBL1200656")
a.merge! get_more_metadata(a[:cid])
puts a


https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/CHEMBL1200656/cids/JSON
https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/5284447/JSON


{:chembl_id=>"CHEMBL1200656", :cid=>5284447, :label=>"Natamycin"}


In [3]:
require 'csv'

filelist = ["drug-disease.csv","drug-drugtype.csv","drug-gene.csv"]

druglist = {}
filelist.each do |csvfile|
  CSV.foreach("./rawdata/#{csvfile}", headers: true) do |row|
    next if row.size < 3

    source = row['source']
    abort unless source
    druglist[source] = 1
  end
end
puts druglist.length  

1966


In [None]:
require 'csv'
f = File.open("./mappings/drugs.map", "w")
f.write CSV.generate_line(["chembl","label","CID","IUPACname"])
e = File.open("./mappings/drugs-errors.txt", "w")

druglist.each_key do |chembl|
  a = map_chembl_to_cid(chembl)
  unless a[:cid]
    warn "FAILED CID lookup for #{chembl}\n"
    e.write "FAILED CID lookup for #{chembl}\n"
    next
  end
  a.merge! get_more_metadata(a[:cid])
  
  warn a
  if a[:label].is_a? Array
    label = a[:label][1]
  else
    label = a[:label]
  end
  f.write CSV.generate_line([chembl,chembl,a[:cid],a[:label]])
end
f.close
e.close
puts "done!"


https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/CHEMBL1200656/cids/JSON
https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/5284447/JSON
{:chembl_id=>"CHEMBL1200656", :cid=>5284447, :label=>"Natamycin"}
https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/CHEMBL1201746/cids/JSON
https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/148121/JSON
{:chembl_id=>"CHEMBL1201746", :cid=>148121, :label=>"Pralatrexate"}
https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/CHEMBL1231/cids/JSON
https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/4634/JSON
{:chembl_id=>"CHEMBL1231", :cid=>4634, :label=>"Oxybutynin"}
https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/CHEMBL2023898/cids/JSON
https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/25154714/JSON
{:chembl_id=>"CHEMBL2023898", :cid=>25154714, :label=>"Daclatasvir"}
https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/CHEMBL1091/cids/JSON
https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/da

In [31]:
def map_chembl_to_name(chembl_id)
  url = "https://www.ebi.ac.uk/chembl/api/data/molecule/#{chembl_id}.json"
  response = RestClient.get(url)
  
  if response.code == 200
    data = JSON.parse(response.body)
    name = data.dig('molecule_properties', 'pref_name') || data.dig('molecule_synonyms')&.find { |s| s['syn_type'] == 'TRADE_NAME' }&.dig('molecule_synonym') || chembl_id
    { chembl_id: chembl_id, name: name }
  else
    { chembl_id: chembl_id, error: "No name found (Status: #{response.code})" }
  end
rescue RestClient::ExceptionWithResponse => e
  { chembl_id: chembl_id, error: "Error: #{e.message}" }
end

def map_name_to_sid(name)
  url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance/name/#{URI.encode_www_form_component(name)}/sids/JSON"
  response = RestClient.get(url)
  
  if response.code == 200
    data = JSON.parse(response.body)
    sid = data.dig('IdentifierList', 'SID', 0)
    { name: name, sid: sid || 'No SID found' }
  else
    { name: name, error: "No SID found (Status: #{response.code})" }
  end
rescue RestClient::ExceptionWithResponse => e
  { name: name, error: "Error: #{e.message}" }
end

# Don't bother - just stick to the substance id
# def map_sid_to_cid(sid)
#   url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug/substance/sid/#{sid}/cids/JSON"
#   warn url
#   response = RestClient.get(url)
  
#   if response.code == 200
#     data = JSON.parse(response.body)
#     cid = data.dig('IdentifierList', 'CID', 0)
#     { sid: sid, cid: cid || 'No CID found' }
#   else
#     { sid: sid, error: "No CID found (Status: #{response.code})" }
#   end
# rescue RestClient::ExceptionWithResponse => e
#   { sid: sid, error: "Error: #{e.message}" }
# end

:map_name_to_sid

In [32]:
# test

puts map_chembl_to_name("CHEMBL1201460")
res = map_chembl_to_name("CHEMBL1201460")
puts map_name_to_sid(res[:name])
res.merge! map_name_to_sid(res[:name])

puts res

{:chembl_id=>"CHEMBL1201460", :name=>"Fragmin"}
{:name=>"Fragmin", :sid=>49975203}
{:chembl_id=>"CHEMBL1201460", :name=>"Fragmin", :sid=>49975203}


In [34]:
e.close
f.close

# Try to get biologics substance ids

f = File.open("./mappings/drugs.map", "a")
r = File.open("./mappings/drugs-errors.txt")
e = File.open("./mappings/drugs-biologics-errors.txt", "w")

r.each_line do |line|  # go over all errors
  chembl = line.match(/.*(CHEMBL\d+)/)[1]
  warn "runnign chembl #{chembl}"
  res = map_chembl_to_name(chembl)
  warn res
  res.merge! map_name_to_sid(res[:name])
  warn res
  unless res[:sid]
    warn "failed again for #{chembl}"
    e.write "failed again for #{chembl}\n"
    next
  end
  warn "write"
  f.write CSV.generate_line([chembl,chembl,res[:sid],a[:name]])
end
f.close
e.close
r.close



runnign chembl CHEMBL1201460
{:chembl_id=>"CHEMBL1201460", :name=>"Fragmin"}
{:chembl_id=>"CHEMBL1201460", :name=>"Fragmin", :sid=>49975203}
write
runnign chembl CHEMBL1201666
{:chembl_id=>"CHEMBL1201666", :name=>"Refludan"}
{:chembl_id=>"CHEMBL1201666", :name=>"Refludan", :sid=>85342099}
write
runnign chembl CHEMBL1742996
{:chembl_id=>"CHEMBL1742996", :name=>"Kyntheum"}
{:chembl_id=>"CHEMBL1742996", :name=>"Kyntheum", :sid=>187051841}
write
runnign chembl CHEMBL2108222
{:chembl_id=>"CHEMBL2108222", :name=>"Palaprin fte"}
{:chembl_id=>"CHEMBL2108222", :name=>"Palaprin fte", :error=>"Error: 404 Not Found"}
failed again for CHEMBL2108222
runnign chembl CHEMBL1201476
{:chembl_id=>"CHEMBL1201476", :name=>"Clexane"}
{:chembl_id=>"CHEMBL1201476", :name=>"Clexane", :sid=>53787364}
write
runnign chembl CHEMBL2108147
{:chembl_id=>"CHEMBL2108147", :name=>"Kabikinase"}
{:chembl_id=>"CHEMBL2108147", :name=>"Kabikinase", :sid=>49988401}
write
runnign chembl CHEMBL1201833
{:chembl_id=>"CHEMBL1201833

{:chembl_id=>"CHEMBL1396", :error=>"Error: 404 Not Found", :name=>nil}
failed again for CHEMBL1396
runnign chembl CHEMBL1201662
{:chembl_id=>"CHEMBL1201662", :name=>"Iprivask"}
{:chembl_id=>"CHEMBL1201662", :name=>"Iprivask", :sid=>50064254}
write
runnign chembl CHEMBL1201825
{:chembl_id=>"CHEMBL1201825", :name=>"Lucentis"}
{:chembl_id=>"CHEMBL1201825", :name=>"Lucentis", :sid=>50070212}
write
runnign chembl CHEMBL1237026
{:chembl_id=>"CHEMBL1237026", :name=>"CHEMBL1237026"}
{:chembl_id=>"CHEMBL1237026", :name=>"CHEMBL1237026", :sid=>104253157}
write
runnign chembl CHEMBL1201561
{:chembl_id=>"CHEMBL1201561", :name=>"Peg-Intron"}
{:chembl_id=>"CHEMBL1201561", :name=>"Peg-Intron", :sid=>50069692}
write
runnign chembl CHEMBL1201593
{:chembl_id=>"CHEMBL1201593", :name=>"Actilyse"}
{:chembl_id=>"CHEMBL1201593", :name=>"Actilyse", :sid=>50018415}
write
runnign chembl CHEMBL33
{:chembl_id=>"CHEMBL33", :name=>"Floxacin"}
{:chembl_id=>"CHEMBL33", :name=>"Floxacin", :sid=>12014081}
write
runnign

write
runnign chembl CHEMBL4594244
{:chembl_id=>"CHEMBL4594244", :name=>"CHEMBL4594244"}
{:chembl_id=>"CHEMBL4594244", :name=>"CHEMBL4594244", :sid=>440234800}
write
runnign chembl CHEMBL1200557
{:chembl_id=>"CHEMBL1200557", :name=>"CHEMBL1200557"}
{:chembl_id=>"CHEMBL1200557", :name=>"CHEMBL1200557", :sid=>103770540}
write
runnign chembl CHEMBL1201474
{:chembl_id=>"CHEMBL1201474", :name=>"Colestid"}
{:chembl_id=>"CHEMBL1201474", :name=>"Colestid", :sid=>49955761}
write
runnign chembl CHEMBL1201831
{:chembl_id=>"CHEMBL1201831", :name=>"Cimzia"}
{:chembl_id=>"CHEMBL1201831", :name=>"Cimzia", :sid=>135347437}
write
runnign chembl CHEMBL2108791
{:chembl_id=>"CHEMBL2108791", :name=>"Metalyse"}
{:chembl_id=>"CHEMBL2108791", :name=>"Metalyse", :sid=>134223915}
write
runnign chembl CHEMBL2109624
{:chembl_id=>"CHEMBL2109624", :name=>"Cablivi"}
{:chembl_id=>"CHEMBL2109624", :name=>"Cablivi", :sid=>315661208}
write
runnign chembl CHEMBL3137343
{:chembl_id=>"CHEMBL3137343", :name=>"Keytruda"}
{:c

write
runnign chembl CHEMBL2108728
{:chembl_id=>"CHEMBL2108728", :name=>"Asparlas"}
{:chembl_id=>"CHEMBL2108728", :name=>"Asparlas", :error=>"Error: 404 Not Found"}
failed again for CHEMBL2108728
runnign chembl CHEMBL1201464
{:chembl_id=>"CHEMBL1201464", :name=>"Ovidrel"}
{:chembl_id=>"CHEMBL1201464", :name=>"Ovidrel", :sid=>50070186}
write
runnign chembl CHEMBL2108681
{:chembl_id=>"CHEMBL2108681", :name=>"Ilumetri"}
{:chembl_id=>"CHEMBL2108681", :name=>"Ilumetri", :sid=>249565773}
write
runnign chembl CHEMBL2109027
{:chembl_id=>"CHEMBL2109027", :name=>"Ananase"}
{:chembl_id=>"CHEMBL2109027", :name=>"Ananase", :sid=>49895175}
write
runnign chembl CHEMBL414804
{:chembl_id=>"CHEMBL414804", :name=>"Eloxatin"}
{:chembl_id=>"CHEMBL414804", :name=>"Eloxatin", :sid=>53790063}
write
runnign chembl CHEMBL3833393
{:chembl_id=>"CHEMBL3833393", :name=>"Hemlibra"}
{:chembl_id=>"CHEMBL3833393", :name=>"Hemlibra", :sid=>354702217}
write
runnign chembl CHEMBL3833405
{:chembl_id=>"CHEMBL3833405", :erro

{:chembl_id=>"CHEMBL1201608", :name=>"Orthoclone okt3", :error=>"Error: 404 Not Found"}
failed again for CHEMBL1201608
runnign chembl CHEMBL2109016
{:chembl_id=>"CHEMBL2109016", :name=>"Hyperdrol"}
{:chembl_id=>"CHEMBL2109016", :name=>"Hyperdrol", :sid=>175267407}
write
runnign chembl CHEMBL2109152
{:chembl_id=>"CHEMBL2109152", :name=>"Alpha keri"}
{:chembl_id=>"CHEMBL2109152", :name=>"Alpha keri", :error=>"Error: 404 Not Found"}
failed again for CHEMBL2109152
runnign chembl CHEMBL3545189
{:chembl_id=>"CHEMBL3545189", :name=>"Takhzyro"}
{:chembl_id=>"CHEMBL3545189", :name=>"Takhzyro", :sid=>315661179}
write
runnign chembl CHEMBL1201835
{:chembl_id=>"CHEMBL1201835", :name=>"Stelara"}
{:chembl_id=>"CHEMBL1201835", :name=>"Stelara", :sid=>135302166}
write
runnign chembl CHEMBL1201481
{:chembl_id=>"CHEMBL1201481", :name=>"Photobarr"}
{:chembl_id=>"CHEMBL1201481", :name=>"Photobarr", :sid=>103771429}
write
runnign chembl CHEMBL1201668
{:chembl_id=>"CHEMBL1201668", :name=>"Natrecor"}
{:chemb

write
runnign chembl CHEMBL1201477
{:chembl_id=>"CHEMBL1201477", :name=>"Estradurin"}
{:chembl_id=>"CHEMBL1201477", :name=>"Estradurin", :sid=>49965785}
write
runnign chembl CHEMBL1201607
{:chembl_id=>"CHEMBL1201607", :name=>"Tysabri"}
{:chembl_id=>"CHEMBL1201607", :name=>"Tysabri", :sid=>50069162}
write
runnign chembl CHEMBL1201563
{:chembl_id=>"CHEMBL1201563", :name=>"Betaseron"}
{:chembl_id=>"CHEMBL1201563", :name=>"Betaseron", :sid=>509364}
write
runnign chembl CHEMBL1743087
{:chembl_id=>"CHEMBL1743087", :name=>"Entyvio"}
{:chembl_id=>"CHEMBL1743087", :name=>"Entyvio", :sid=>135277804}
write
runnign chembl CHEMBL2108558
{:chembl_id=>"CHEMBL2108558", :name=>"Califig"}
{:chembl_id=>"CHEMBL2108558", :name=>"Califig", :error=>"Error: 404 Not Found"}
failed again for CHEMBL2108558
runnign chembl CHEMBL4297535
{:chembl_id=>"CHEMBL4297535", :name=>"Omegaven"}
{:chembl_id=>"CHEMBL4297535", :name=>"Omegaven", :sid=>135310895}
write
runnign chembl CHEMBL1201567
{:chembl_id=>"CHEMBL1201567", 