In [2]:
# Show a long listing of the mappings directory.
puts %x(ls -l mappings)

total 17924
-rw-rw-r-- 1 osboxes osboxes        0 May 19 11:26 diseaseID-errors.txt
-rw-rw-r-- 1 osboxes osboxes   519140 May 19 11:56 diseaseID-mappings.csv
-rw-rw-r-- 1 osboxes osboxes        0 May 19 11:50 disease-therapeuticarea-errors.txt
-rw-rw-r-- 1 osboxes osboxes  3880446 May 20 11:13 disease-therapeuticarea-mappings.csv
-rw-rw-r-- 1 osboxes osboxes        0 May 22 14:57 drug-disease-drugtype-errors.txt
-rw-rw-r-- 1 osboxes osboxes        0 May 20 13:03 drug-disease-errors.txt
-rw-rw-r-- 1 osboxes osboxes   166289 May 22 16:15 drug-disease-mappings.csv
-rw-rw-r-- 1 osboxes osboxes   431330 May 20 20:17 drug-drugtype-mappings.csv
-rw-rw-r-- 1 osboxes osboxes   402206 May 20 13:03 drug-gene-mappings.csv
-rw-rw-r-- 1 osboxes osboxes      120 May 19 01:57 ENSG-UP-errors.txt
-rw-rw-r-- 1 osboxes osboxes 12704440 May 20 12:31 ENSG-UP-mappings.csv
-rw-rw-r-- 1 osboxes osboxes   215231 May 22 09:46 gene-reactome.csv
-rw-rw-r-- 1 osboxes osboxes        0 May 22 09:33 gene-reactome-erro

In [3]:
# Print the header row plus the first four records of the drug-disease mapping file.
puts %x(head -5 ./mappings/drug-disease-mappings.csv)

radboudsource,sourceguid,sourcelabel,radboudtarget,targetguid,targetlabel
CHEMBL1200656,http://rdf.ebi.ac.uk/resource/chembl/molecule/CHEMBL1200656,NATAMYCIN,EFO_1001888,http://www.ebi.ac.uk/efo/EFO_1001888,eye infection
CHEMBL1201746,http://rdf.ebi.ac.uk/resource/chembl/molecule/CHEMBL1201746,PRALATREXATE,EFO_0000211,http://www.ebi.ac.uk/efo/EFO_0000211,unspecified peripheral T-cell lymphoma
CHEMBL1231,http://rdf.ebi.ac.uk/resource/chembl/molecule/CHEMBL1231,OXYBUTYNIN,HP_0000103,http://purl.obolibrary.org/obo/HP_0000103,Polyuria
CHEMBL1231,http://rdf.ebi.ac.uk/resource/chembl/molecule/CHEMBL1231,OXYBUTYNIN,EFO_1000781,http://www.ebi.ac.uk/efo/EFO_1000781,overactive bladder


In [6]:
# Dump the complete disease -> Orphanet/SNOMED map file.
puts %x(cat ./maps/2025-biovista-disease-snomed.map)

biovista_meshid,orphanet,snomed,name
C0027126,http://www.orpha.net/ORDO/Orphanet_273,http://purl.bioontology.org/ontology/SNOMEDCT/77956009,MYOTONIC DYSTROPHY TYPE 1
C0349653,http://www.orpha.net/ORDO/Orphanet_79318,http://purl.bioontology.org/ontology/SNOMEDCT/459063003,PMM2-CDG
C0023264,http://www.orpha.net/ORDO/Orphanet_506,http://purl.bioontology.org/ontology/SNOMEDCT/29570005,LEIGH SYNDROME
C0268467,http://www.orpha.net/ORDO/Orphanet_2102,http://purl.bioontology.org/ontology/SNOMEDCT/23447005,GTPCH DEFICIENCY
C0268631,http://www.orpha.net/ORDO/Orphanet_22,http://purl.bioontology.org/ontology/SNOMEDCT/49748000,SSADH DEFICIENCY
C0043459,http://www.orpha.net/ORDO/Orphanet_912,http://purl.bioontology.org/ontology/SNOMEDCT/88469006,ZELLWEGER SYNDROME
C0751882,http://www.orpha.net/ORDO/Orphanet_590,http://purl.bioontology.org/ontology/SNOMEDCT/230672006,CONGENITAL MYASTHENIC SYNDROME


In [None]:
require 'linkeddata'
require 'csv'

# Build the drug-disease graph and serialise it as N-Quads.
#
# Inputs:
#   ./maps/2025-biovista-drugs.map           columns: biovista_meshid,biovista_label,CID,IUPACname
#   ./maps/2025-biovista-disease-snomed.map  columns: biovista_meshid,orphanet,snomed,name
#   ./raw_data/bv-kg-20250225.large          tab-separated relations; the columns read here are
#                                            type_1, type_2, id_1 and id_2
# Outputs:
#   ./graph/drug-disease.nq          one named graph (context) per mapped drug/disease pair
#   ./graph/drug-disease-errors.txt  one line per identifier that could not be mapped
#
# The handle stays in a top-level local so later cells can still refer to it;
# the ensure clause below guarantees it is closed even if a step raises.
graphing_errors = File.open('./graph/drug-disease-errors.txt', 'w')

begin
  # Define namespaces
  SIMPATHIC = RDF::Vocabulary.new('urn:simpathic:')
  RDFS = RDF::Vocabulary.new('http://www.w3.org/2000/01/rdf-schema#')

  # Read input files
  drug_mappings = CSV.read('./maps/2025-biovista-drugs.map', headers: true)
  disease_mappings = CSV.read('./maps/2025-biovista-disease-snomed.map', headers: true)

  # Index both tables by identifier so each relation costs one hash lookup
  # instead of a scan over the whole table. `||=` keeps the first row seen
  # for a duplicated identifier, which is the row a linear `find` would return.
  drug_index = drug_mappings.each_with_object({}) do |mapping, index|
    index[mapping['biovista_meshid']] ||= mapping
  end
  disease_index = disease_mappings.each_with_object({}) do |mapping, index|
    index[mapping['biovista_meshid']] ||= mapping
  end

  # Create RDF graph
  graph = RDF::Repository.new

  # Identifiers already reported as unmappable (shared by drugs and diseases),
  # so each one is logged only once.
  failures = {}

  # Report a failed lookup once per identifier, both on stderr and in the error file.
  report_failure = lambda do |kind, id|
    unless failures[id]
      failures[id] = 1
      message = "#{kind} lookup failed #{id}"
      warn message
      graphing_errors.write "#{message}\n"
    end
  end

  # Terms that are identical for every relation; built once rather than per row.
  pubchem_type = RDF::URI.new('http://semanticscience.org/resource/CHEMINF_000302')
  pubchem_core_type = RDF::URI.new('https://w3id.org/biolink/vocab/Drug')
  # NOTE(review): the label emitted for pubchem_type is "PubChem"; the previous revision
  # also built an unused "PubChem Identifier" literal - confirm which label is wanted.
  pubchem_type_label = RDF::Literal.new('PubChem')
  pubchem_core_label = RDF::Literal.new('Drug')
  snomed_type = RDF::URI.new('https://bioportal.bioontology.org/ontologies/SNOMEDCT')
  snomed_core_type = RDF::URI.new('https://w3id.org/biolink/vocab/Disease')
  snomed_label = RDF::Literal.new('SNOMED Term')
  skg_source = RDF::Literal.new('DEMOKRATIS')
  skg_evidence = RDF::Literal.new('TBD')

  # Process each entity relation. Entity types named by the original author:
  # Disease, Pathway, Drug, Human Phenotype, Gene. Only rows pairing a Drug
  # with a Disease (in either column order) are kept.
  CSV.foreach('./raw_data/bv-kg-20250225.large', col_sep: "\t", quote_char: '"',
                                                  liberal_parsing: true, headers: true) do |row|
    types = [row['type_1'], row['type_2']]
    next unless types.include?('Drug') && types.include?('Disease')

    drug_id, disease_id =
      if row['type_1'] == 'Drug'
        [row['id_1'], row['id_2']]
      else
        [row['id_2'], row['id_1']]
      end

    # Find corresponding mappings; the drug is checked first, so a row whose
    # drug is unmapped never reports its disease.
    drug = drug_index[drug_id]
    unless drug
      report_failure.call('drug', drug_id)
      next
    end

    disease = disease_index[disease_id]
    unless disease
      report_failure.call('disease', disease_id)
      next
    end

    # Drug terms, taken from the drug map row
    # (biovista_meshid,biovista_label,CID,IUPACname).
    pubchem_uri = RDF::URI.new("https://pubchem.ncbi.nlm.nih.gov/compound/#{drug['CID']}")
    drug_label = RDF::Literal.new(drug['biovista_label'])
    iupac_drug_label = RDF::Literal.new(drug['IUPACname'])
    original_drug = RDF::Literal.new(drug['biovista_meshid'])

    # Disease terms, taken from the disease map row
    # (biovista_meshid,orphanet,snomed,name).
    snomed_uri = RDF::URI.new(disease['snomed'])
    orphanet = RDF::URI.new(disease['orphanet'])
    disease_label = RDF::Literal.new(disease['name'])
    original_disease = RDF::Literal.new(disease['biovista_meshid'])

    # Every statement for this pair goes into its own named graph.
    context_uri = RDF::URI.new("urn:simpathic:context:#{drug_id}_#{disease_id}")

    triples = [
      [pubchem_uri,       SIMPATHIC['associated-with'], snomed_uri],

      [pubchem_uri,       RDFS.label,                   drug_label],
      [pubchem_uri,       RDFS.label,                   iupac_drug_label],
      [pubchem_uri,       RDF.type,                     pubchem_type],
      [pubchem_uri,       RDF.type,                     pubchem_core_type],
      [pubchem_type,      RDFS.label,                   pubchem_type_label],
      [pubchem_core_type, RDFS.label,                   pubchem_core_label],
      [pubchem_uri,       SIMPATHIC['original-id'],     original_drug],

      [snomed_uri,        RDFS.label,                   disease_label],
      [snomed_uri,        RDF.type,                     snomed_type],
      [snomed_uri,        RDF.type,                     snomed_core_type],
      [snomed_type,       RDFS.label,                   snomed_label],
      [snomed_uri,        SIMPATHIC['orphanet'],        orphanet],
      [snomed_uri,        SIMPATHIC['original-id'],     original_disease],

      [context_uri,       SIMPATHIC['skg-source'],      skg_source],
      [context_uri,       SIMPATHIC['skg-evidence'],    skg_evidence]
    ]

    # Add quads to graph using RDF::Statement
    triples.each do |subject, predicate, object|
      graph << RDF::Statement.new(subject, predicate, object, graph_name: context_uri)
    end
  end

  # Write RDF to file in N-Quads format
  File.open('./graph/drug-disease.nq', 'w') do |f|
    RDF::Writer.for(:nquads).new(f) do |writer|
      writer << graph
    end
  end
ensure
  # Flush and release the error log whether or not the run succeeded.
  graphing_errors.close
end

puts "RDF quads written"

drug lookup failed D013482
drug lookup failed D008110
drug lookup failed C545824
drug lookup failed 4901d7c1b4e2aef6913d880bfc91240e
drug lookup failed D011113
drug lookup failed D007252
drug lookup failed D000068258
drug lookup failed D016756
drug lookup failed D002266
drug lookup failed D007371
drug lookup failed D022242
drug lookup failed D000068818
drug lookup failed D000069283
drug lookup failed D000086663
drug lookup failed C522181
drug lookup failed D000068800
drug lookup failed D019904
drug lookup failed D002364
drug lookup failed D009113
drug lookup failed D003176
drug lookup failed D013835
drug lookup failed D013838
drug lookup failed C065640
drug lookup failed D019690
drug lookup failed D001905
drug lookup failed C044447
drug lookup failed D019274
drug lookup failed D000069896
drug lookup failed D007372
drug lookup failed C091590
drug lookup failed C059659
drug lookup failed D002793
drug lookup failed C079420
drug lookup failed D024502
drug lookup failed D050759
drug lookup 

In [18]:
# Close the error-log file handle held in `graphing_errors` (opened on
# ./graph/drug-disease-errors.txt by the graph-building cell), e.g. after that
# cell was interrupted before reaching its own close.
# NOTE(review): on Ruby >= 2.3 closing an already-closed File does not raise - confirm the runtime version.
graphing_errors.close