In [1]:
require 'json'
require 'rest-client'
require 'linkeddata'
require 'sparql/client'
require 'csv'
require '../Lookups/metadata_functions.rb'

# Tab-separated BioVista export that the cells below parse for gene identifiers.
CSVFILE = "./raw_data/bv-kg-20250225.large".freeze
# Destination of the gene id -> UniProt protein CSV written by the SPARQL lookup cell.
OUTPUT = "./maps/2025-biovista-genes.map".freeze
# puts `ls`
# Preview the header line and first record. The path is taken from CSVFILE so the
# preview always shows the same file that is parsed later.
puts `head -2 #{CSVFILE}`



source_1	id_1	type_1	name_1	source_2	id_2	type_2	name_2	score	url
UMLS:Disease or Syndrome:MSH	C0268631	Disease	SSADH Deficiency	HP:human_phenotype	HP:0001263	Human Phenotype	Global developmental delay	0.0501869	https://www.biovista.com/db/link/%5B%5B%22Disease%7CSSADH%20Deficiency%22%5D,%20%5B%22Human%20Phenotype%7CGlobal%20developmental%20delay%22%5D%5D?strength-weight-map=%257B%2522MEDLINE_STRENGTH_AB%2522:1.0,%2522HPO%2522:100.0%257D


In [None]:
# Collect the distinct gene identifiers found on either side of each relation.
# Both hashes are used as sets: the key is the identifier, the value is always 1.
ncbi = {}
mesh = {}

# Column names for the two entities described by a row: [type, source, id].
entity_columns = [
  ['type_1', 'source_1', 'id_1'],
  ['type_2', 'source_2', 'id_2']
]

CSV.foreach(CSVFILE, headers: true, col_sep: "\t") do |row|
  entity_columns.each do |type_col, source_col, id_col|
    next unless row[type_col] == "Gene"

    origin = row[source_col]
    case origin
    when /NCBI/, /UniProt/
      # UniProt-sourced genes are pooled with the NCBI ones.
      ncbi[row[id_col]] = 1
    when /MeSH/
      mesh[row[id_col]] = 1
    else
      # Unknown source vocabulary: stop and report it.
      abort origin
    end
  end
end
puts "done"


In [None]:
# Builds the bodies of SPARQL VALUES clauses for a list of gene ids.
#
# Every id is wrapped as <http://purl.uniprot.org/geneid/ID>; the ids are then
# grouped batch_size at a time so each query carries a bounded number of values.
# Prints the total number of ids to stdout as progress output.
#
# @param idlist [Enumerable] gene ids to look up
# @param batch_size [Integer] maximum number of ids per batch (default 20)
# @return [Array<String>] one space-separated string of URIs per batch;
#   empty when idlist is empty
def format_ncbi_values_clause(idlist:, batch_size: 20)
  puts idlist.size
  base_uri = "<http://purl.uniprot.org/geneid/%s>"
  idlist.each_slice(batch_size).map do |batch|
    batch.map { |id| base_uri % id }.join(' ')
  end
end

In [None]:
# SPARQL template sent to the UniProt endpoint. For every gene URI bound to
# ?geneid it selects the protein that links to it via rdfs:seeAlso, together
# with the protein's organism and the full form of its recommended name.
# The |||VALUES||| token is a placeholder: it is substituted with one batch of
# gene URIs before each query is sent.
genequery = "
PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
PREFIX uniprotkb: <http://purl.uniprot.org/uniprot/>
PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
PREFIX up: <http://purl.uniprot.org/core/>
SELECT distinct ?geneid ?protein ?recommended_full ?taxon
WHERE
{
        VALUES ?geneid {|||VALUES|||}
        ?protein a up:Protein .
  		?protein rdfs:seeAlso ?geneid . # seeAlso http://purl.uniprot.org/geneid/939976
  		?protein up:organism ?taxon .
  		?protein up:recommendedName ?rname .
  		?rname up:fullName ?recommended_full .
}"
# Prints an empty line; presumably here so the notebook does not echo the query text.
puts

In [None]:
# Look up every collected gene id in UniProt, one batch of ids per SPARQL query,
# and write the matching rows to OUTPUT as CSV.

# puts genes.keys.size

sparql = SPARQL::Client.new("https://sparql.uniprot.org/sparql/")

batches = format_ncbi_values_clause(idlist: ncbi.keys)

# Block form guarantees the file is flushed and closed, including when a batch
# aborts or raises part-way through.
File.open(OUTPUT, "w") do |out|
  out.write "geneid,protein,recommended_full,taxon\n"

  batches.each do |batch|
    retry_attempts = 0
    begin
      result = sparql.query(genequery.gsub("|||VALUES|||", batch))
    rescue StandardError => e
      retry_attempts += 1
      if retry_attempts < 10
        # Back off a little longer after each failure instead of hammering the
        # public endpoint with immediate retries.
        sleep retry_attempts
        retry
      else
        puts e.inspect
        abort
      end
    end
    puts "FOUND: #{result.size}"

    # Sanity check: expect at least one row per id in this batch. Comparing with
    # the batch's own size (not a fixed 20) keeps the final, shorter batch from
    # aborting the run.
    # NOTE(review): this still aborts when any batch returns fewer rows than ids;
    # confirm that every id is really expected to resolve in UniProt.
    expected_rows = batch.split.size
    abort "#{batch}\n#{result.inspect}" unless result.size >= expected_rows

    result.each do |res|
      puts "#{res['geneid']},#{res['protein']},#{res['recommended_full']},#{res['taxon']}"
      out.write CSV.generate_line([res["geneid"], res["protein"], res["recommended_full"], res["taxon"]])
    end
  end
end

puts "done ncbi"


In [None]:
# For mesh, we will do an OBO lookup on each one.
# Each MeSH id is turned into a BioPortal MESH URI, resolved through
# ontology_annotations (not defined in this notebook; presumably loaded from
# metadata_functions.rb), and written as one CSV row.

OUTFILE = "./maps/2025-biovista-mesh.map".freeze

# Block form closes the file when the loop finishes or fails, so buffered rows
# reach disk even though the notebook kernel keeps running afterwards.
File.open(OUTFILE, "w") do |out|
  mesh.each_key do |m|
    uri = "http://purl.bioontology.org/ontology/MESH/#{m}"
    term = ontology_annotations(uri: uri)
    # NOTE(review): every MeSH entry is written with taxonomy 9606 - confirm that
    # all MeSH-sourced genes in the input are meant to carry that taxon.
    puts "#{uri},#{term},http://purl.uniprot.org/taxonomy/9606"
    out.write CSV.generate_line([uri, term, "http://purl.uniprot.org/taxonomy/9606"])
  end
end

puts "done mesh"