In [4]:
# Show the contents of the notebook's working directory (shells out to ls).
puts %x(ls -l)

total 620
-rw-rw-r-- 1 osboxes osboxes 259423 May 29 19:23 biovista-diseases-2025.ipynb
-rw-rw-r-- 1 osboxes osboxes  66941 Jun  4 15:25 biovista-drug-2025.ipynb
-rw-rw-r-- 1 osboxes osboxes  49944 May 29 17:37 biovista-gene-2025.ipynb
-rw-rw-r-- 1 osboxes osboxes 145025 Jun  4 17:16 biovista-pathways-2025.ipynb
-rw-rw-r-- 1 osboxes osboxes  31822 Jun  4 15:38 biovista-phenotypes.ipynb
-rw-rw-r-- 1 osboxes osboxes  18570 Jul  1 10:51 BV Drug-Disease Graphing.ipynb
-rw-rw-r-- 1 osboxes osboxes  26990 Jul  1 11:46 BV Drug-Gene Graphing.ipynb
drwxrwxr-x 2 osboxes osboxes   4096 Jun  4 15:37 deprecated
-rw-rw-r-- 1 osboxes osboxes   1282 May 29 21:19 drug mapping strategy
drwxrwxr-x 2 osboxes osboxes   4096 Jun  4 16:05 maps
drwxrwxr-x 2 osboxes osboxes   4096 May 22 16:58 maps backup
drwxrwxr-x 2 osboxes osboxes   4096 Jun  4 17:07 raw_data


In [6]:
# Peek at the header row and first record of the raw relation file.
puts %x(head -2 ./raw_data/bv-kg-20250225.large)

source_1	id_1	type_1	name_1	source_2	id_2	type_2	name_2	score	url
UMLS:Disease or Syndrome:MSH	C0268631	Disease	SSADH Deficiency	HP:human_phenotype	HP:0001263	Human Phenotype	Global developmental delay	0.0501869	https://www.biovista.com/db/link/%5B%5B%22Disease%7CSSADH%20Deficiency%22%5D,%20%5B%22Human%20Phenotype%7CGlobal%20developmental%20delay%22%5D%5D?strength-weight-map=%257B%2522MEDLINE_STRENGTH_AB%2522:1.0,%2522HPO%2522:100.0%257D


In [9]:
require 'csv'

# Survey the raw relation file: collect every distinct value that appears in
# the type_1 column and in the type_2 column, then print both lists
# separated by a blank line.
drugs = {} # NOTE(review): not used anywhere in this cell
sourcesleft = {}
sourcesright = {}

tsv_options = { col_sep: "\t", quote_char: '"', liberal_parsing: true, headers: true }
CSV.foreach('./raw_data/bv-kg-20250225.large', **tsv_options) do |record|
  # The hashes are used as sets; the stored value 1 is just a placeholder.
  sourcesleft.store(record["type_1"], 1)
  sourcesright.store(record["type_2"], 1)
end

puts sourcesleft.keys
puts
puts sourcesright.keys


Disease
Pathway
Drug
Human Phenotype
Gene

Human Phenotype
Gene
Pathway
Drug


In [12]:
# Peek at the header and first rows of the drug mapping table.
puts %x(head -5 ./maps/2025-biovista-drugs.map)

biovista_meshid,biovista_label,CID,IUPACname
http://purl.bioontology.org/ontology/MESH/D005680,gamma-Aminobutyric Acid,119,4-aminobutanoic acid
http://purl.bioontology.org/ontology/MESH/D012978,Sodium Oxybate,23663870,sodium;4-hydroxybutanoate
http://purl.bioontology.org/ontology/MESH/D020888,Vigabatrin,5665,4-aminohex-5-enoic acid
http://purl.bioontology.org/ontology/MESH/C066471,NCS 382,3613485,"sodium;2-(5-hydroxy-5,7,8,9-tetrahydrobenzo[7]annulen-6-ylidene)acetate"


# Genes have two sources

biovista-genes are from NCBI and UniProt

and

biovista-mesh (these are mostly gene categories, not genes...)

The biovista-mesh entries are handled separately.

In [1]:
# Peek at the header and first rows of the gene mapping table.
puts %x(head -5 ./maps/2025-biovista-genes.map)

bv_geneid,bv_label,geneid,protein,recommended_full,taxon
11758,GPx,http://purl.uniprot.org/geneid/11758,http://purl.uniprot.org/uniprot/D3Z0Y2,Peroxiredoxin-6,http://purl.uniprot.org/taxonomy/10090
11758,GPx,http://purl.uniprot.org/geneid/11758,http://purl.uniprot.org/uniprot/O08709,Peroxiredoxin-6,http://purl.uniprot.org/taxonomy/10090
11758,GPx,http://purl.uniprot.org/geneid/11758,http://purl.uniprot.org/uniprot/Q6GT24,Peroxiredoxin-6,http://purl.uniprot.org/taxonomy/10090
1213,HC,http://purl.uniprot.org/geneid/1213,http://purl.uniprot.org/uniprot/A0A087WVQ6,Clathrin heavy chain,http://purl.uniprot.org/taxonomy/9606


In [2]:
require 'linkeddata'
require 'csv'

# Build the BioVista drug-gene association graph.
#
# Inputs:
#   ./maps/2025-biovista-drugs.map   (biovista_meshid,biovista_label,CID,IUPACname)
#   ./maps/2025-biovista-genes.map   (bv_geneid,bv_label,geneid,protein,recommended_full,taxon)
#   ./raw_data/bv-kg-20250225.large  (tab-separated relations)
# Outputs:
#   ./graph/drug-gene.nq             (N-Quads, one context per drug/gene pair)
#   ./graph/drug-gene-errors.txt     (one line per id that could not be mapped)
#
# Only rows that pair a Drug with a Gene are used, and only when the gene id
# contains no letter; ids containing a letter are MeSH ids and are skipped.

# Both output files live in ./graph, so make sure it exists first.
Dir.mkdir('./graph') unless Dir.exist?('./graph')

graphing_errors = File.open('./graph/drug-gene-errors.txt', 'w')

begin
  # Define namespaces
  SIMPATHIC = RDF::Vocabulary.new('urn:simpathic:')
  RDFS = RDF::Vocabulary.new('http://www.w3.org/2000/01/rdf-schema#')

  # Read input files
  drug_mappings = CSV.read('./maps/2025-biovista-drugs.map', headers: true)
  gene_mappings = CSV.read('./maps/2025-biovista-genes.map', headers: true)

  # Index the mapping tables once so each relation row is a hash lookup
  # instead of a scan over the whole table. `||=` keeps the FIRST row for a
  # key, which is what a `find` over the table would return.
  drug_by_mesh_uri = {}
  drug_mappings.each { |d| drug_by_mesh_uri[d['biovista_meshid']] ||= d }
  # NOTE(review): the gene map can hold several rows (one per protein) for the
  # same bv_geneid; only the first is used here - confirm that is intended.
  gene_by_id = {}
  gene_mappings.each { |g| gene_by_id[g['bv_geneid']] ||= g }

  # Quad store. An RDF::Repository is used rather than an RDF::Graph because
  # a repository keeps the graph_name of each statement, and the N-Quads
  # output below relies on those per-pair contexts.
  graph = RDF::Repository.new

  # Terms that are identical for every row, built once.
  pubchem_type = RDF::URI.new("http://semanticscience.org/resource/CHEMINF_000302")
  pubchem_core_type = RDF::URI.new("https://w3id.org/biolink/vocab/Drug")
  gene_type = RDF::URI.new("http://edamontology.org/data_1027")
  gene_core_type = RDF::URI.new("https://w3id.org/biolink/vocab/Gene")
  protein_type = RDF::URI.new("http://edamontology.org/data_2291")
  protein_core_type = RDF::URI.new("https://w3id.org/biolink/vocab/Protein")

  # Ids already reported, so each failing id is logged only once.
  failures = {}
  report_failure = lambda do |kind, id|
    return if failures[id]

    failures[id] = 1
    message = "#{kind} lookup failed #{id}"
    warn message
    graphing_errors.write "#{message}\n"
  end

  # Process each entity relation
  CSV.foreach('./raw_data/bv-kg-20250225.large', col_sep: "\t", quote_char: '"',
              liberal_parsing: true, headers: true) do |row|
    # Entity types seen in the file: Disease, Pathway, Drug, Human Phenotype, Gene.
    # Keep only Drug<->Gene rows, in either column order.
    if row['type_1'] == 'Drug' && row['type_2'] == 'Gene'
      drug_id = row['id_1']
      gene_id = row['id_2'] # gene ids from NCBI are only numerical, from MeSH have a letter
    elsif row['type_1'] == 'Gene' && row['type_2'] == 'Drug'
      drug_id = row['id_2']
      gene_id = row['id_1']
    else
      next
    end

    # MeSH is going to be different from NCBI: an id containing a letter is a
    # MeSH id, so we can't deal with it here. The test must look for a letter
    # specifically - \w also matches digits and would skip every numeric id.
    next if gene_id.to_s.match?(/[A-Za-z]/)

    # Find corresponding mappings
    drug = drug_by_mesh_uri["http://purl.bioontology.org/ontology/MESH/#{drug_id}"]
    gene = gene_by_id[gene_id]

    unless drug
      report_failure.call('drug', drug_id)
      next
    end
    unless gene
      report_failure.call('gene', gene_id)
      next
    end

    # Drug terms, from: biovista_meshid,biovista_label,CID,IUPACname
    # e.g. http://purl.bioontology.org/ontology/MESH/D005680,gamma-Aminobutyric Acid,119,4-aminobutanoic acid
    # NOTE(review): CID is a bare number in the map file (e.g. "119"), so this
    # is a relative URI; confirm whether a PubChem base URI should be prefixed.
    pubchem_uri = RDF::URI.new(drug['CID'])
    biovista_drug_label = RDF::Literal.new(drug['biovista_label'])
    iupac_drug_label = RDF::Literal.new(drug['IUPACname'])

    # Gene / protein terms, from: bv_geneid,bv_label,geneid,protein,recommended_full,taxon
    gene_uri = RDF::URI.new(gene['geneid'])
    biovista_gene_label = RDF::Literal.new(gene['bv_label'])
    protein_uri = RDF::URI.new(gene['protein'])
    biovista_protein_label = RDF::Literal.new(gene['recommended_full'])
    taxon = RDF::URI.new(gene['taxon'])

    # Create context URI: one named graph per drug/gene pair.
    context_uri = RDF::URI.new("urn:simpathic:context:#{drug_id}_#{gene_id}")

    # Every statement below goes into that context.
    [
      [pubchem_uri, SIMPATHIC['associated-with'], gene_uri],

      [pubchem_uri, RDFS.label, biovista_drug_label],
      [pubchem_uri, RDFS.label, iupac_drug_label],
      [pubchem_uri, RDF.type, pubchem_type],
      [pubchem_uri, RDF.type, pubchem_core_type],
      [pubchem_type, RDFS.label, RDF::Literal.new("PubChem")],
      [pubchem_core_type, RDFS.label, RDF::Literal.new("Drug")],
      [pubchem_uri, SIMPATHIC['original-id'], RDF::Literal.new(drug_id.to_s)],

      [gene_uri, RDFS.label, biovista_gene_label],
      [gene_uri, RDF.type, gene_type],
      [gene_uri, RDF.type, gene_core_type],
      [gene_type, RDFS.label, RDF::Literal.new("NCBI Gene")],
      [gene_core_type, RDFS.label, RDF::Literal.new("Gene")],
      [gene_uri, SIMPATHIC['original-id'], RDF::Literal.new(gene_id.to_s)],
      [gene_uri, SIMPATHIC['in-taxon'], taxon],

      [pubchem_uri, SIMPATHIC['associated-with'], protein_uri],

      [protein_uri, RDFS.label, biovista_protein_label],
      [protein_uri, RDF.type, protein_type],
      [protein_uri, RDF.type, protein_core_type],
      [protein_type, RDFS.label, RDF::Literal.new("UniProt")],
      [protein_core_type, RDFS.label, RDF::Literal.new("Protein")],
      # The protein carries the BioVista gene id as its original id.
      [protein_uri, SIMPATHIC['original-id'], RDF::Literal.new(gene_id.to_s)],
      [protein_uri, SIMPATHIC['in-taxon'], taxon],

      [context_uri, SIMPATHIC['skg-source'], RDF::Literal.new("BioVista")],
      [context_uri, SIMPATHIC['skg-evidence'], RDF::Literal.new("TBD")]
    ].each do |subject, predicate, object|
      graph << RDF::Statement.new(subject, predicate, object, graph_name: context_uri)
    end
  end

  # Write RDF to file in N-Quads format
  File.open('./graph/drug-gene.nq', 'w') do |f|
    RDF::Writer.for(:nquads).new(f) do |writer|
      writer << graph
    end
  end
ensure
  # Close the error log even if mapping or serialisation raised.
  graphing_errors.close
end

puts "RDF quads written"

RDF quads written


In [18]:
# Close the error-log file handle opened by the graph-building cell.
# NOTE(review): that cell already closes the handle itself, so this call looks
# redundant - confirm whether it is still needed.
graphing_errors.close