In [1]:
# Show the raw input files available to the notebook.
puts %x(ls -l raw-data)

total 484
-rw-rw-r-- 1 osboxes osboxes  34777 Aug 12 17:45 Demokritos-KG-information.xlsx
-rw-rw-r-- 1 osboxes osboxes 207331 Aug 12 17:45 Disease-Therapeutic_Area.tsv
-rw-rw-r-- 1 osboxes osboxes  36869 Aug 12 17:45 Drug-Diseasetriples.tsv
-rw-rw-r-- 1 osboxes osboxes 111643 Aug 12 17:45 Drug-Drug_type.tsv
-rw-rw-r-- 1 osboxes osboxes  32600 Aug 12 17:45 Drug-Genetriples.tsv
-rw-rw-r-- 1 osboxes osboxes  51800 Aug 12 17:45 Gene-Diseasetriples.tsv
-rw-rw-r-- 1 osboxes osboxes   7965 Aug 12 17:45 Gene-Pathwaytriples.tsv


In [4]:
# Preview the header and first rows of the drug mapping file.
puts %x(head -5 ./mappings/drug-mappings.map)

demokritosid,xref,demokritos_label,pubchem_cid,IUPACname
C0613621,http://purl.bioontology.org/ontology/MESH/C030536,"2,2-dichloro-1,1-difluoroethyl difluoromethyl ether",https://pubchem.ncbi.nlm.nih.gov/compound/152803,"2,2-dichloro-1,1-difluoroethyl%20difluoromethyl%20ether"
C0042291,http://purl.bioontology.org/ontology/SNOMEDCT/387080000,Valproic Acid,https://pubchem.ncbi.nlm.nih.gov/compound/3121,Valproic%20acid
C0059747,http://purl.bioontology.org/ontology/SNOMEDCT/83298009,ethyl acetate,https://pubchem.ncbi.nlm.nih.gov/compound/8857,Ethyl%20acetate
C0059747,http://purl.bioontology.org/ontology/MESH/C007650,ethyl acetate,https://pubchem.ncbi.nlm.nih.gov/compound/8857,ethyl%20acetate


In [5]:
# Preview the header and first rows of the disease mapping file.
puts %x(head -5 ./mappings/disease-mappings.map)

source,snomedct,orpha,prefname
C0019247,http://purl.bioontology.org/ontology/SNOMEDCT/32895009,https://fake.orpha.net/not-found,Hereditary disease
C0019247,http://purl.bioontology.org/ontology/SNOMEDCT/782964007,https://fake.orpha.net/not-found,Genetic disease
C0007760,http://purl.bioontology.org/ontology/SNOMEDCT/223176004,https://fake.orpha.net/not-found,Cerebellar disorder
C0007760,http://purl.bioontology.org/ontology/SNOMEDCT/224186005,https://fake.orpha.net/not-found,Cerebellar deficiency syndrome


In [6]:
require 'linkeddata'
require 'csv'

# Build the drug-disease graph and serialise it as N-Quads.
#
# Inputs (relative to the working directory):
#   ./raw-data/Drug-Diseasetriples.tsv - tab-separated; the Drug_id and
#                                        Disease_id columns are read
#   ./mappings/drug-mappings.map       - CSV with header
#                                        demokritosid,xref,demokritos_label,pubchem_cid,IUPACname
#   ./mappings/disease-mappings.map    - CSV with header
#                                        source,snomedct,orpha,prefname
# Outputs:
#   ./graph/drug-disease.nq.large      - the graph in N-Quads format
#   ./graph/drug-disease-errors.txt    - one line per id that has no mapping
#                                        (the same lines are sent to stderr)
graphing_errors = File.open('./graph/drug-disease-errors.txt', 'w')

begin
  # Define namespaces
  SIMPATHIC = RDF::Vocabulary.new('urn:simpathic:')
  RDFS = RDF::Vocabulary.new('http://www.w3.org/2000/01/rdf-schema#')

  # Read input files
  entity_relations = CSV.read('./raw-data/Drug-Diseasetriples.tsv', col_sep: "\t", headers: true)
  drug_mappings = CSV.read('./mappings/drug-mappings.map', headers: true)
  disease_mappings = CSV.read('./mappings/disease-mappings.map', headers: true)

  # Index the mapping tables once instead of scanning them for every relation.
  # `||=` keeps the FIRST row seen for an id, which is the row a linear
  # `find` over the table would return.
  drug_by_id = drug_mappings.each_with_object({}) do |mapping, index|
    index[mapping['demokritosid']] ||= mapping
  end
  disease_by_id = disease_mappings.each_with_object({}) do |mapping, index|
    index[mapping['source']] ||= mapping
  end

  # Create RDF graph
  graph = RDF::Repository.new

  # Ids already reported as unmapped. Drug and disease ids share this table,
  # so each id is reported at most once overall.
  failures = {}

  # Terms that are the same for every relation, built once.
  pubchem_type = RDF::URI.new("http://semanticscience.org/resource/CHEMINF_000302")
  pubchem_core_type = RDF::URI.new("https://w3id.org/biolink/vocab/Drug")
  pubchem_type_label = RDF::Literal.new("PubChem")
  pubchem_core_type_label = RDF::Literal.new("Drug")
  snomed_type = RDF::URI.new("https://bioportal.bioontology.org/ontologies/SNOMEDCT")
  snomed_core_type = RDF::URI.new("https://w3id.org/biolink/vocab/Disease")
  snomed_label = RDF::Literal.new("SNOMED Term")
  skg_source = RDF::Literal.new("DEMOKRITOS")
  general_context = RDF::URI.new("urn:simpathic:context:all_metadata")

  # Report an unmapped id on stderr and in the error file, once per id.
  report_failure = lambda do |kind, id|
    next if failures[id]

    failures[id] = 1
    message = "#{kind} lookup failed #{id}"
    warn message
    graphing_errors.write "#{message}\n"
  end

  # Add one quad to the repository.
  add_quad = lambda do |subject, predicate, object, context|
    graph << RDF::Statement.new(subject, predicate, object, graph_name: context)
  end

  # Process each entity relation
  entity_relations.each do |row|
    drug_id = row['Drug_id']
    disease_id = row['Disease_id']

    # Find corresponding mappings
    drug = drug_by_id[drug_id]
    disease = disease_by_id[disease_id]

    # A missing drug takes precedence: when both lookups fail only the drug
    # is reported for this row.
    unless drug
      report_failure.call('drug', drug_id)
      next
    end
    unless disease
      report_failure.call('disease', disease_id)
      next
    end

    # Drug side. Mapping columns:
    #   demokritosid,xref,demokritos_label,pubchem_cid,IUPACname
    #   C0613621,http://purl.bioontology.org/ontology/MESH/C030536,"2,2-dichloro-1,1-difluoroethyl difluoromethyl ether",https://pubchem.ncbi.nlm.nih.gov/compound/152803,"2,2-dichloro-1,1-difluoroethyl%20difluoromethyl%20ether"
    pubchem_uri = RDF::URI.new(drug['pubchem_cid'])
    human_drug_label = RDF::Literal.new(drug['demokritos_label'])
    iupac_drug_label = RDF::Literal.new(drug['IUPACname'])

    # Disease side. Mapping columns:
    #   source,snomedct,orpha,prefname
    #   C0019247,http://purl.bioontology.org/ontology/SNOMEDCT/32895009,https://fake.orpha.net/not-found,Hereditary disease
    snomed_uri = RDF::URI.new(disease['snomedct'])
    orphanet = RDF::URI.new(disease['orpha'])
    disease_label = RDF::Literal.new(disease['prefname'])
    original_disease = RDF::Literal.new(disease_id)

    # Each drug/disease pair gets its own named graph.
    context_uri = RDF::URI.new("urn:simpathic:context:#{drug_id}_#{disease_id}")

    # The association is asserted in both directions.
    add_quad.call(pubchem_uri, SIMPATHIC['associated-with'], snomed_uri, context_uri)
    add_quad.call(snomed_uri, SIMPATHIC['associated-with'], pubchem_uri, context_uri)

    add_quad.call(pubchem_uri, RDFS.label, human_drug_label, context_uri)
    add_quad.call(pubchem_uri, RDFS.label, iupac_drug_label, context_uri)
    add_quad.call(pubchem_uri, RDF.type, pubchem_type, context_uri)
    add_quad.call(pubchem_uri, RDF.type, pubchem_core_type, context_uri)
    add_quad.call(pubchem_type, RDFS.label, pubchem_type_label, context_uri)
    add_quad.call(pubchem_core_type, RDFS.label, pubchem_core_type_label, context_uri)
    add_quad.call(pubchem_uri, SIMPATHIC['original-id'], RDF::Literal.new("#{drug_id}"), context_uri)

    add_quad.call(snomed_uri, RDFS.label, disease_label, context_uri)
    add_quad.call(snomed_uri, RDF.type, snomed_type, context_uri)
    add_quad.call(snomed_uri, RDF.type, snomed_core_type, context_uri)
    add_quad.call(snomed_type, RDFS.label, snomed_label, context_uri)
    add_quad.call(snomed_uri, SIMPATHIC['orphanet'], orphanet, context_uri)
    add_quad.call(snomed_uri, SIMPATHIC['original-id'], original_disease, context_uri)

    # Provenance of the pair's named graph lives in the shared metadata graph.
    add_quad.call(context_uri, SIMPATHIC['skg-source'], skg_source, general_context)
    #   add_quad.call(context_uri, SIMPATHIC['skg-evidence'], RDF::Literal.new("TBD"), context_uri)
  end

  # Write RDF to file in N-Quads format
  File.open('./graph/drug-disease.nq.large', 'w') do |f|
    RDF::Writer.for(:nquads).new(f) do |writer|
      writer << graph
    end
  end
ensure
  # Close (and thereby flush) the error log even when a step above raises.
  graphing_errors.close
end

puts "RDF quads written"

disease lookup failed C0524851
drug lookup failed C0026457
drug lookup failed C1567285
drug lookup failed C0723712
drug lookup failed C3541953
drug lookup failed C0008425
drug lookup failed C0006507
drug lookup failed C0591139
drug lookup failed C0002062
drug lookup failed C0024808
drug lookup failed C0060794
drug lookup failed C0056077
drug lookup failed C3492623
drug lookup failed C0042672
drug lookup failed C2825610
drug lookup failed C0041044
drug lookup failed C0070971
drug lookup failed C0699680
drug lookup failed C0082384
disease lookup failed C0679225
disease lookup failed C0742034
disease lookup failed C0694564
disease lookup failed C0262405
drug lookup failed C0017973
drug lookup failed C0073590
drug lookup failed C0772257
drug lookup failed C0242402
disease lookup failed C0031117
disease lookup failed C0004096
drug lookup failed C0595726
drug lookup failed C0001617
drug lookup failed C0045550
drug lookup failed C0939898
drug lookup failed C0968917
drug lookup failed C0074332

RDF quads written


In [18]:
# Close the error-log handle opened by the graph-building cell. That cell
# already closes the handle itself, so this call is a repeat.
# NOTE(review): IO#close on an already-closed File is documented as a no-op
# on Ruby >= 2.3 — confirm the kernel's Ruby version.
graphing_errors.close