In [1]:
# Show a long-format listing of the rawdata directory.
puts(%x(ls -l rawdata))

total 7716
-rw-rw-r-- 1 osboxes osboxes 3678593 Jul 15 10:40 disease-gene.csv
-rw-rw-r-- 1 osboxes osboxes 1419434 Jul 15 10:40 disease-therapeuticarea.csv
-rw-rw-r-- 1 osboxes osboxes   50286 Jul 15 10:40 drug-disease.csv
-rw-rw-r-- 1 osboxes osboxes   97134 Jul 15 10:40 drug-drugtype.csv
-rw-rw-r-- 1 osboxes osboxes  148014 Jul 15 10:40 drug-gene.csv
-rw-rw-r-- 1 osboxes osboxes     576 Jul 15 10:40 edges.csv
-rw-rw-r-- 1 osboxes osboxes  201313 Jul 15 10:40 gene-reactome.csv
-rw-rw-r-- 1 osboxes osboxes 2282825 Jul 15 10:40 Graph_edge_info.xlsx


In [2]:
# Print the header row and the first data row of the drug-disease input.
puts(%x(head -2 ./rawdata/drug-disease.csv))

"source","source_type","target","target_type"
"CHEMBL1200656","drug","EFO_1001888","disease"


In [3]:
# Show a long-format listing of the mappings directory.
puts(%x(ls -l mappings))

total 1052
drwxrwxr-x 2 osboxes osboxes   4096 Aug  8 14:52 DEPRECATED
-rw-rw-r-- 1 osboxes osboxes 378534 Jul 15 10:40 diseases-errors.txt
-rw-rw-r-- 1 osboxes osboxes 656312 Jul 15 10:40 diseases.map
-rw-rw-r-- 1 osboxes osboxes   1320 Aug 11 12:48 drugs-biologics-errors.txt
-rw-rw-r-- 1 osboxes osboxes  11009 Aug  8 15:18 drugs-errors.txt
-rw-rw-r-- 1 osboxes osboxes  88654 Aug 11 12:48 drugs.map
-rw-rw-r-- 1 osboxes osboxes 135160 Aug  8 14:45 genes.map


In [4]:
# Print the header row and the first data row of the drug mapping file.
puts(%x(head -2 ./mappings/drugs.map))

chembl,label,CID,IUPACname
CHEMBL1200656,CHEMBL1200656,5284447,Natamycin


In [5]:
# Print the header row and the first data row of the disease mapping file.
puts(%x(head -2 ./mappings/diseases.map))

source,snomedct,orpha,prefname
DOID_7551,http://purl.bioontology.org/ontology/SNOMEDCT/15628003,https://fake.orphanet/not-found,gonorrhea


In [1]:
require 'linkeddata'
require 'rdf/nquads'
require 'csv'

# Convert ./rawdata/drug-disease.csv into N-Quads.
#
# Each CSV row names a drug ('source') and a disease ('target'). Both ids are
# resolved through the mapping tables in ./mappings; rows whose drug or disease
# cannot be resolved are skipped and the failing id is reported once, both on
# stderr and in ./graph/drug-disease-errors.txt. Every resolved row yields one
# named graph (urn:simpathic:context:<drug>_<disease>) holding the association,
# labels, types and original ids, plus one provenance quad in the shared
# all_metadata graph.
#
# Fixes relative to the previous version of this cell:
# * the output file is opened ONCE; it used to be reopened in 'w' mode for
#   every row, which truncated it each time and left only the last row's quads;
# * mapping lookups use hash indexes instead of a linear scan per CSV row;
# * both output files are closed even when an exception aborts the run.

# Define namespaces
SIMPATHIC = RDF::Vocabulary.new('urn:simpathic:')
RDFS = RDF::Vocabulary.new('http://www.w3.org/2000/01/rdf-schema#')

# Read input files
drug_mappings = CSV.read('./mappings/drugs.map', headers: true)
disease_mappings = CSV.read('./mappings/diseases.map', headers: true)

# Index the mapping rows by their lookup column. `||=` keeps the FIRST row for
# a duplicated key, which is exactly what the earlier `find` call returned.
# chembl,label,CID,IUPACname
drugs_by_chembl = drug_mappings.each_with_object({}) do |mapping, index|
  index[mapping['chembl']] ||= mapping
end
# source,snomedct,orpha,prefname
diseases_by_source = disease_mappings.each_with_object({}) do |mapping, index|
  index[mapping['source']] ||= mapping
end

# Ids (drug or disease) that have already been reported as unresolvable.
failures = {}

# refresh
# NOTE(review): this empties drug-disease.nq while the quads are written to
# drug-disease.nq.large. Both paths are kept exactly as they were so existing
# consumers see the same files - confirm which name is actually intended.
File.open('./graph/drug-disease.nq', 'w') { |_f| }

# Terms that are identical for every row are built once, outside the loop.
pubchem_type = RDF::URI.new("http://semanticscience.org/resource/CHEMINF_000302")
pubchem_core_type = RDF::URI.new("https://w3id.org/biolink/vocab/Drug")
pubchem_type_label = RDF::Literal.new("PubChem")
pubchem_core_label = RDF::Literal.new("Drug")
snomed_type = RDF::URI.new("https://bioportal.bioontology.org/ontologies/SNOMEDCT")
snomed_core_type = RDF::URI.new("https://w3id.org/biolink/vocab/Disease")
snomed_label = RDF::Literal.new("SNOMED Term")
skg_source = RDF::Literal.new("Radboud")
general_context = RDF::URI.new("urn:simpathic:context:all_metadata")
associated_with = SIMPATHIC['associated-with']
original_id = SIMPATHIC['original-id']

# Number of CSV rows that were successfully converted and written.
recordcount = 0

File.open('./graph/drug-disease-errors.txt', 'w') do |graphing_errors|
  # Report an unresolvable id once; repeats of the same id stay silent.
  report_failure = lambda do |kind, id|
    next if failures[id]

    failures[id] = 1
    warn "#{kind} lookup failed #{id}"
    graphing_errors.write "#{kind} lookup failed #{id}\n"
  end

  # 'w' truncates the output a single time at the start of the run; every row
  # below appends to this one handle.
  File.open('./graph/drug-disease.nq.large', 'w') do |quads_out|
    CSV.foreach('./rawdata/drug-disease.csv', col_sep: ",", quote_char: '"',
                liberal_parsing: true, headers: true) do |row|
      # "source","source_type","target","target_type"
      # "CHEMBL1200656","drug","EFO_1001888","disease"
      drug_id = row['source']
      disease_id = row['target']

      drug = drugs_by_chembl[drug_id]
      disease = diseases_by_source[disease_id]

      # A missing drug takes precedence over a missing disease.
      unless drug
        report_failure.call('drug', drug_id)
        next
      end
      unless disease
        report_failure.call('disease', disease_id)
        next
      end

      # Extract relevant IDs and labels
      # chembl,label,CID,IUPACname
      # CHEMBL1200656,CHEMBL1200656,5284447,Natamycin
      pubchem_uri = RDF::URI.new("https://pubchem.ncbi.nlm.nih.gov/compound/#{drug['CID']}")
      drug_label = RDF::Literal.new(drug['label'])
      iupac_drug_label = RDF::Literal.new(drug['IUPACname'])
      original_drug = RDF::Literal.new(drug['chembl'])

      # source,snomedct,orpha,prefname
      # DOID_7551,http://purl.bioontology.org/ontology/SNOMEDCT/15628003,https://fake.orphanet/not-found,gonorrhea
      snomed_uri = RDF::URI.new(disease['snomedct'])
      orphanet = RDF::URI.new(disease['orpha'])
      disease_label = RDF::Literal.new(disease['prefname'])
      original_disease = RDF::Literal.new(disease['source'])

      # Create context URI
      context_uri = RDF::URI.new("urn:simpathic:context:#{drug_id}_#{disease_id}")

      # A fresh repository per row keeps memory flat: the input has a very
      # large number of lines and one accumulated graph would not fit in memory.
      graph = RDF::Repository.new

      # Association, stated in both directions.
      graph << RDF::Statement.new(pubchem_uri, associated_with, snomed_uri, graph_name: context_uri)
      graph << RDF::Statement.new(snomed_uri, associated_with, pubchem_uri, graph_name: context_uri)

      # Drug description.
      graph << RDF::Statement.new(pubchem_uri, RDFS.label, drug_label, graph_name: context_uri)
      graph << RDF::Statement.new(pubchem_uri, RDFS.label, iupac_drug_label, graph_name: context_uri)
      graph << RDF::Statement.new(pubchem_uri, RDF.type, pubchem_type, graph_name: context_uri)
      graph << RDF::Statement.new(pubchem_uri, RDF.type, pubchem_core_type, graph_name: context_uri)
      graph << RDF::Statement.new(pubchem_type, RDFS.label, pubchem_type_label, graph_name: context_uri)
      graph << RDF::Statement.new(pubchem_core_type, RDFS.label, pubchem_core_label, graph_name: context_uri)
      graph << RDF::Statement.new(pubchem_uri, original_id, original_drug, graph_name: context_uri)

      # Disease description.
      graph << RDF::Statement.new(snomed_uri, RDFS.label, disease_label, graph_name: context_uri)
      graph << RDF::Statement.new(snomed_uri, RDF.type, snomed_type, graph_name: context_uri)
      graph << RDF::Statement.new(snomed_uri, RDF.type, snomed_core_type, graph_name: context_uri)
      graph << RDF::Statement.new(snomed_type, RDFS.label, snomed_label, graph_name: context_uri)
      graph << RDF::Statement.new(snomed_uri, SIMPATHIC['orphanet'], orphanet, graph_name: context_uri)
      graph << RDF::Statement.new(snomed_uri, original_id, original_disease, graph_name: context_uri)

      # Provenance of this context, kept in the shared metadata graph.
      graph << RDF::Statement.new(context_uri, SIMPATHIC['skg-source'], skg_source, graph_name: general_context)
      #   graph << RDF::Statement.new(context_uri, SIMPATHIC['skg-evidence'], RDF::Literal.new("TBD"), graph_name: context_uri)

      # Serialize this row's quads onto the already-open output handle.
      RDF::Writer.for(:nquads).new(quads_out) do |writer|
        writer << graph
      end
      recordcount += 1
    end
  end
end

puts "RDF quads written"

disease lookup failed EFO_1001888
disease lookup failed EFO_0000211
disease lookup failed HP_0000103
disease lookup failed EFO_1000781
disease lookup failed EFO_0004220
disease lookup failed EFO_0009552
disease lookup failed EFO_0005752
disease lookup failed EFO_0003894
disease lookup failed HP_0012735
disease lookup failed HP_0001742
disease lookup failed HP_0100607
disease lookup failed EFO_0010072
disease lookup failed EFO_0007214
disease lookup failed EFO_0000180
disease lookup failed EFO_0004239
disease lookup failed HP_0004419
disease lookup failed EFO_0003907
disease lookup failed EFO_0003956
disease lookup failed EFO_0004274
disease lookup failed EFO_1001896
drug lookup failed CHEMBL2108222
disease lookup failed EFO_0003929
disease lookup failed EFO_1001069
disease lookup failed EFO_1001485
disease lookup failed HP_0002069
disease lookup failed EFO_1000760
disease lookup failed EFO_0005687
disease lookup failed EFO_0007149
disease lookup failed EFO_1001459
disease lookup failed

RDF quads written
