In [1]:
# Show a long-format listing of the notebook's working directory.
puts %x(ls -l)

total 776
-rw-rw-r-- 1 osboxes osboxes 263521 Jul 15 10:40 biovista-diseases-2025.ipynb
-rw-rw-r-- 1 osboxes osboxes  66941 Jul 15 10:40 biovista-drug-2025.ipynb
-rw-rw-r-- 1 osboxes osboxes 206070 Jul 15 10:40 biovista-gene-2025.ipynb
-rw-rw-r-- 1 osboxes osboxes 145025 Jul 15 10:40 biovista-pathways-2025.ipynb
-rw-rw-r-- 1 osboxes osboxes  31822 Jul 15 10:40 biovista-phenotypes.ipynb
-rw-rw-r-- 1 osboxes osboxes     72 Aug 11 14:33 BV Disease-Gene Graphing.ipynb
-rw-rw-r-- 1 osboxes osboxes  13191 Jul 15 10:40 BV Drug-Disease Graphing.ipynb
-rw-rw-r-- 1 osboxes osboxes  22163 Jul 15 10:40 BV Drug-Gene Graphing.ipynb
drwxrwxr-x 2 osboxes osboxes   4096 Jul 15 10:40 deprecated
-rw-rw-r-- 1 osboxes osboxes   1282 Jul 15 10:40 drug mapping strategy
drwxrwxr-x 2 osboxes osboxes   4096 Jul 15 10:40 graph
drwxrwxr-x 2 osboxes osboxes   4096 Jul 15 10:40 maps
drwxrwxr-x 2 osboxes osboxes   4096 Jul 15 10:40 maps backup
drwxrwxr-x 2 osboxes osboxes   4096 Jul 15 10:40 raw_data


In [5]:
# Peek at the header row and first record of the raw BioVista relation dump.
puts %x(head -2 ./raw_data/bv-kg-20250225.large)

source_1	id_1	type_1	name_1	source_2	id_2	type_2	name_2	score	url
UMLS:Disease or Syndrome:MSH	C0268631	Disease	SSADH Deficiency	HP:human_phenotype	HP:0001263	Human Phenotype	Global developmental delay	0.0501869	https://www.biovista.com/db/link/%5B%5B%22Disease%7CSSADH%20Deficiency%22%5D,%20%5B%22Human%20Phenotype%7CGlobal%20developmental%20delay%22%5D%5D?strength-weight-map=%257B%2522MEDLINE_STRENGTH_AB%2522:1.0,%2522HPO%2522:100.0%257D


In [2]:
# Peek at the header row and first four records of the gene mapping table.
puts %x(head -5 ./maps/2025-biovista-genes.map)

bv_geneid,bv_label,geneid,protein,recommended_full,taxon
11758,GPx,http://purl.uniprot.org/geneid/11758,http://purl.uniprot.org/uniprot/D3Z0Y2,Peroxiredoxin-6,http://purl.uniprot.org/taxonomy/10090
11758,GPx,http://purl.uniprot.org/geneid/11758,http://purl.uniprot.org/uniprot/O08709,Peroxiredoxin-6,http://purl.uniprot.org/taxonomy/10090
11758,GPx,http://purl.uniprot.org/geneid/11758,http://purl.uniprot.org/uniprot/Q6GT24,Peroxiredoxin-6,http://purl.uniprot.org/taxonomy/10090
1213,HC,http://purl.uniprot.org/geneid/1213,http://purl.uniprot.org/uniprot/A0A087WVQ6,Clathrin heavy chain,http://purl.uniprot.org/taxonomy/9606


In [3]:
# Print the whole disease mapping table (BioVista id -> Orphanet / SNOMED).
puts %x(cat ./maps/2025-biovista-disease-snomed.map)

biovista_meshid,orphanet,snomed,name
C0027126,http://www.orpha.net/ORDO/Orphanet_273,http://purl.bioontology.org/ontology/SNOMEDCT/77956009,MYOTONIC DYSTROPHY TYPE 1
C0349653,http://www.orpha.net/ORDO/Orphanet_79318,http://purl.bioontology.org/ontology/SNOMEDCT/459063003,PMM2-CDG
C0023264,http://www.orpha.net/ORDO/Orphanet_506,http://purl.bioontology.org/ontology/SNOMEDCT/29570005,LEIGH SYNDROME
C0268467,http://www.orpha.net/ORDO/Orphanet_2102,http://purl.bioontology.org/ontology/SNOMEDCT/23447005,GTPCH DEFICIENCY
C0268631,http://www.orpha.net/ORDO/Orphanet_22,http://purl.bioontology.org/ontology/SNOMEDCT/49748000,SSADH DEFICIENCY
C0043459,http://www.orpha.net/ORDO/Orphanet_912,http://purl.bioontology.org/ontology/SNOMEDCT/88469006,ZELLWEGER SYNDROME
C0751882,http://www.orpha.net/ORDO/Orphanet_590,http://purl.bioontology.org/ontology/SNOMEDCT/230672006,CONGENITAL MYASTHENIC SYNDROME


In [1]:
require 'linkeddata'
require 'csv'

# Build the BioVista gene-disease graph and serialise it as N-Quads.
#
# Reads:
#   ./maps/2025-biovista-genes.map           CSV, looked up by `bv_geneid`
#   ./maps/2025-biovista-disease-snomed.map  CSV, looked up by `biovista_meshid`
#   ./raw_data/bv-kg-20250225.large          tab-separated entity relations
# Writes:
#   ./graph/disease-gene.nq.large            the generated quads
#   ./graph/gene-disease-errors.txt          one line per identifier that has no mapping
#   ./graph/disease-gene.nq                  truncated to zero bytes (see NOTE(review) below)
#
# Every Gene<->Disease relation gets its own named graph
# (urn:simpathic:context:<gene_id>_<disease_id>); the source, evidence URL and
# score of each relation are recorded in urn:simpathic:context:all_metadata.
graphing_errors = File.open('./graph/gene-disease-errors.txt', 'w')

# begin/ensure (rather than the block form of File.open) guarantees the error
# log is closed even if something raises, while keeping every local below in
# the top-level scope of the cell.
begin
  # Define namespaces
  SIMPATHIC = RDF::Vocabulary.new('urn:simpathic:')
  RDFS = RDF::Vocabulary.new('http://www.w3.org/2000/01/rdf-schema#')

  # Read input files
  gene_mappings = CSV.read('./maps/2025-biovista-genes.map', headers: true)
  disease_mappings = CSV.read('./maps/2025-biovista-disease-snomed.map', headers: true)

  # Index both tables by their BioVista key so each relation costs one hash
  # lookup instead of a scan of the whole table. `||=` keeps the FIRST row seen
  # for a key, which is exactly what a linear `find` would return.
  # NOTE(review): the gene map can hold several rows (one per protein) for the
  # same bv_geneid; only the first one is ever used - confirm this is intended.
  gene_index = gene_mappings.each_with_object({}) do |mapping, index|
    index[mapping['bv_geneid']] ||= mapping
  end
  disease_index = disease_mappings.each_with_object({}) do |mapping, index|
    index[mapping['biovista_meshid']] ||= mapping
  end

  # Create RDF graph
  graph = RDF::Repository.new

  # Identifiers whose lookup already failed, so each one is reported only once.
  failures = {}

  # Warn about, and log, a failed lookup the first time a given id is seen.
  report_failure = lambda do |kind, id|
    next if failures[id]

    failures[id] = 1
    warn "#{kind} lookup failed #{id}"
    graphing_errors.write "#{kind} lookup failed #{id}\n"
  end

  # refresh
  # NOTE(review): this empties disease-gene.nq, but the quads are written to
  # disease-gene.nq.large further down - confirm which file is meant here.
  File.open('./graph/disease-gene.nq', 'w') {}

  # Terms that are identical for every relation; built once, outside the loop.
  rdf_type        = RDF.type
  label           = RDFS.label
  associated_with = SIMPATHIC['associated-with']
  original_id     = SIMPATHIC['original-id']
  in_taxon        = SIMPATHIC['in-taxon']
  orphanet_link   = SIMPATHIC['orphanet']

  snomed_type      = RDF::URI.new("https://bioportal.bioontology.org/ontologies/SNOMEDCT")
  snomed_core_type = RDF::URI.new("https://w3id.org/biolink/vocab/Disease")
  snomed_label     = RDF::Literal.new("SNOMED Term")

  gene_type            = RDF::URI.new("http://edamontology.org/data_1027")
  gene_core_type       = RDF::URI.new("https://w3id.org/biolink/vocab/Gene")
  gene_type_label      = RDF::Literal.new("NCBI Gene")
  gene_core_type_label = RDF::Literal.new("Gene")

  protein_type            = RDF::URI.new("http://edamontology.org/data_2291")
  protein_core_type       = RDF::URI.new("https://w3id.org/biolink/vocab/Protein")
  protein_type_label      = RDF::Literal.new("UniProt")
  protein_core_type_label = RDF::Literal.new("Protein")

  general_context = RDF::URI.new("urn:simpathic:context:all_metadata")
  skg_source      = RDF::Literal.new("BioVista")

  # Process each entity relation
  CSV.foreach('./raw_data/bv-kg-20250225.large', col_sep: "\t", quote_char: '"',
              liberal_parsing: true, headers: true) do |row|
    # Entity types seen in the input: Disease, Pathway, Drug, Human Phenotype, Gene.
    # Only relations joining a Gene with a Disease (in either column order) are graphed.
    types = [row['type_1'], row['type_2']]
    next unless types.include?("Gene") && types.include?("Disease")

    gene_id, disease_id =
      if row['type_1'] == "Gene"
        [row['id_1'], row['id_2']]
      else
        [row['id_2'], row['id_1']]
      end

    score = row['score']
    evidence = row['url']

    # MeSH is going to be different from NCBI: an id containing a capital
    # letter is a MeSH id, which cannot be dealt with here.
    next if /[A-Z]/.match?(gene_id.to_s)

    # Find corresponding mappings. The gene is checked first, so a relation
    # with an unknown gene never reports its disease.
    gene = gene_index[gene_id]
    unless gene
      report_failure.call('gene', gene_id)
      next
    end

    disease = disease_index[disease_id]
    unless disease
      report_failure.call('disease', disease_id)
      next
    end

    #   biovista_meshid,orphanet,snomed,name
    #   C0027126,http://www.orpha.net/ORDO/Orphanet_273,http://purl.bioontology.org/ontology/SNOMEDCT/77956009,MYOTONIC DYSTROPHY TYPE 1
    snomed_uri = RDF::URI.new(disease['snomed'])
    orphanet = RDF::URI.new(disease['orphanet'])
    disease_label = RDF::Literal.new(disease['name'])
    original_disease = RDF::Literal.new(disease['biovista_meshid'])

    #   bv_geneid,bv_label,geneid,protein,recommended_full,taxon
    #   11758,GPx,http://purl.uniprot.org/geneid/11758,http://purl.uniprot.org/uniprot/D3Z0Y2,Peroxiredoxin-6,http://purl.uniprot.org/taxonomy/10090
    gene_uri = RDF::URI.new(gene['geneid'])
    biovista_gene_label = RDF::Literal.new(gene['bv_label'])
    protein_uri = RDF::URI.new(gene['protein'])
    biovista_protein_label = RDF::Literal.new(gene['recommended_full'])
    taxon = RDF::URI.new(gene['taxon'])
    original_gene = RDF::Literal.new(gene_id.to_s)

    # Create context URI: one named graph per gene/disease pair
    context_uri = RDF::URI.new("urn:simpathic:context:#{gene_id}_#{disease_id}")

    # Statements describing the relation itself, all in the pair's own graph.
    [
      [snomed_uri, associated_with, protein_uri],
      [snomed_uri, associated_with, gene_uri],

      [snomed_uri, label, disease_label],
      [snomed_uri, rdf_type, snomed_type],
      [snomed_uri, rdf_type, snomed_core_type],
      [snomed_type, label, snomed_label],
      [snomed_uri, orphanet_link, orphanet],
      [snomed_uri, original_id, original_disease],

      [gene_uri, label, biovista_gene_label],
      [gene_uri, rdf_type, gene_type],
      [gene_uri, rdf_type, gene_core_type],
      [gene_type, label, gene_type_label],
      [gene_core_type, label, gene_core_type_label],
      [gene_uri, original_id, original_gene],
      [gene_uri, in_taxon, taxon],

      [protein_uri, label, biovista_protein_label],
      [protein_uri, rdf_type, protein_type],
      [protein_uri, rdf_type, protein_core_type],
      [protein_type, label, protein_type_label],
      [protein_core_type, label, protein_core_type_label],
      [protein_uri, original_id, original_gene],
      [protein_uri, in_taxon, taxon]
    ].each do |subject, predicate, object|
      graph << RDF::Statement.new(subject, predicate, object, graph_name: context_uri)
    end

    # Provenance of the relation, kept in the shared metadata graph.
    [
      [context_uri, SIMPATHIC['skg-source'], skg_source],
      [context_uri, SIMPATHIC['evidence'], RDF::URI.new(evidence)],
      [context_uri, SIMPATHIC['score'], RDF::Literal.new(score)]
    ].each do |subject, predicate, object|
      graph << RDF::Statement.new(subject, predicate, object, graph_name: general_context)
    end
  end

  # Write RDF to file in N-Quads format
  File.open('./graph/disease-gene.nq.large', 'w') do |f|
    RDF::Writer.for(:nquads).new(f) do |writer|
      writer << graph
    end
  end
ensure
  graphing_errors.close
end

puts "RDF quads written"

gene lookup failed 72448059
gene lookup failed 104897615
gene lookup failed 3878934
gene lookup failed 17
gene lookup failed 887216
gene lookup failed 123142229
gene lookup failed 497258
gene lookup failed 9272629
gene lookup failed 106390267
gene lookup failed 41930
gene lookup failed 542841
gene lookup failed 809587
gene lookup failed 60899174
gene lookup failed 3880727
gene lookup failed 3878171
gene lookup failed 100275660
gene lookup failed 4328196
gene lookup failed 3639254
gene lookup failed 100129518
gene lookup failed 4326236
gene lookup failed 3879135
gene lookup failed 816871
gene lookup failed 100765829
gene lookup failed 100125585
gene lookup failed 3641731
gene lookup failed 100728057
gene lookup failed 886729
gene lookup failed 100529264
gene lookup failed 595111
gene lookup failed 6629976
gene lookup failed 125765250
gene lookup failed 125768785
gene lookup failed 4337437
gene lookup failed 100009596
gene lookup failed 104889455
gene lookup failed 1280455
gene lookup fa

RDF quads written
