In [2]:
# Show the contents of the BioVista download directory (shells out to `ls`).
listing = %x(ls /tmp/biovista)
puts listing


biovistakg-2025.tsv


In [8]:
require 'json'
require 'rest-client'
require '../OBO_Lookup/obo.rb'
require 'linkeddata'
require 'sparql/client'

# Tab-separated BioVista knowledge-graph export that the cells below read.
CSVFILE = '/tmp/biovista/biovistakg-2025.tsv'.freeze
# Zero-based index of the column whose value is looked up in UniProt.
COLUMN = 0
# File that receives the generated label -> protein mappings.
OUTPUT = './maps/2025-biovista-genes.map'.freeze



"./maps/2025-biovista-genes.map"

In [4]:
# SPARQL template that finds reviewed human (taxon 9606) UniProt proteins whose
# encoding gene has a preferred or alternative label equal (case-insensitively)
# to the value substituted for each |||LABEL||| placeholder.
#
# The altLabel pattern is OPTIONAL so that a gene matched through its
# preferred label is still returned when it has no synonyms; ?alt is simply
# unbound in that row. When ?alt is unbound the right-hand side of the FILTER
# raises a SPARQL error, and `true || error` still evaluates to true.
#
# The heredoc is single-quoted so Ruby performs no interpolation or escape
# processing on the query text.
genequery = <<~'SPARQL'
  PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
  PREFIX uniprotkb: <http://purl.uniprot.org/uniprot/>
  PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
  PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
  PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
  PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
  PREFIX up: <http://purl.uniprot.org/core/>
  #SELECT ?protein ?fullName ?prefLabel ?recommended ?gene
  SELECT DISTINCT ?protein ?recommended_full ?pref ?alt
  WHERE
  {
    ?protein a up:Protein .
    ?protein up:reviewed true .
    # ?protein rdfs:seeAlso ?other .
    # ?other rdfs:comment ?alt_name .
    ?protein up:organism taxon:9606 .
    ?protein up:recommendedName ?rname .
    ?rname up:fullName ?recommended_full .
    ?protein up:encodedBy ?gene .
    ?gene skos:prefLabel ?pref .
    OPTIONAL { ?gene skos:altLabel ?alt . }
    FILTER (lcase(str(?pref)) = lcase('|||LABEL|||') || lcase(str(?alt)) = lcase('|||LABEL|||'))
  }
SPARQL
# Bare puts keeps the notebook from echoing the whole query as the cell value.
puts




In [5]:
# SPARQL template that finds reviewed human (taxon 9606) UniProt proteins whose
# recommended full name contains (case-insensitively) the value substituted
# for the |||LABEL||| placeholder.
#
# The altLabel pattern is OPTIONAL so that a protein whose gene has no
# synonyms is still returned; ?alt is simply unbound in that row.
#
# The heredoc is single-quoted so Ruby performs no interpolation or escape
# processing on the query text.
labelquery = <<~'SPARQL'
  PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
  PREFIX uniprotkb: <http://purl.uniprot.org/uniprot/>
  PREFIX taxon: <http://purl.uniprot.org/taxonomy/>
  PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
  PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
  PREFIX skos: <http://www.w3.org/2004/02/skos/core#>
  PREFIX up: <http://purl.uniprot.org/core/>
  #SELECT ?protein ?fullName ?prefLabel ?recommended ?gene
  SELECT DISTINCT ?protein ?recommended_full ?pref ?alt
  WHERE
  {
    ?protein a up:Protein .
    ?protein up:reviewed true .
    # ?protein rdfs:seeAlso ?other .
    # ?other rdfs:comment ?alt_name .
    ?protein up:organism taxon:9606 .
    ?protein up:recommendedName ?rname .
    ?rname up:fullName ?recommended_full .
    ?protein up:encodedBy ?gene .
    ?gene skos:prefLabel ?pref .
    OPTIONAL { ?gene skos:altLabel ?alt . }
    FILTER CONTAINS(lcase(str(?recommended_full)), lcase('|||LABEL|||'))
  }
SPARQL
# Bare puts keeps the notebook from echoing the whole query as the cell value.
puts





In [10]:
# Survey the distinct values found in column 2 of the TSV, in first-seen
# order. The header row is not skipped, so the column title is listed too.
types = File.foreach(CSVFILE, chomp: true).each_with_object({}) do |record, seen|
  fields = record.split("\t")
  seen[fields[2]] = 1
end
puts "left side"
puts types.keys
  


type_1
Disease
Pathway
Drug
Human Phenotype
Gene


In [11]:
# Survey the distinct values found in column 6 of the TSV, in first-seen
# order. The header row is not skipped, so the column title is listed too.
types = File.foreach(CSVFILE, chomp: true).each_with_object({}) do |record, seen|
  fields = record.split("\t")
  seen[fields[6]] = 1
end
puts "right side"
puts types.keys
  


right side
type_2
Human Phenotype
Gene
Pathway
Drug


In [12]:
# Survey the distinct (column 2, column 6) combinations in the TSV, in
# first-seen order. The header row is not skipped, so it is listed too.
types = File.foreach(CSVFILE, chomp: true).each_with_object({}) do |record, seen|
  fields = record.split("\t")
  pair = "#{fields[2]} #{fields[6]}"
  seen[pair] = 1
end
puts "Pairs"
puts types.keys
  


Pairs
type_1 type_2
Disease Human Phenotype
Disease Gene
Disease Pathway
Disease Drug
Pathway Human Phenotype
Pathway Gene
Pathway Pathway
Pathway Drug
Drug Drug
Drug Gene
Drug Human Phenotype
Human Phenotype Human Phenotype
Human Phenotype Gene
Gene Gene


In [None]:
# Build the gene mapping file: for every distinct value in column COLUMN of a
# "Gene" row in CSVFILE, query the UniProt SPARQL endpoint and write the
# matching "label, protein, recommended name" lines (or a "No URI found" line)
# to OUTPUT.

# Escape a raw value so it can sit inside a single-quoted SPARQL string
# literal: backslashes and single quotes get a leading backslash.
def sparql_escape(text)
  text.gsub(/[\\']/) { |char| "\\#{char}" }
end

# Substitute an escaped label for every |||LABEL||| placeholder in a template.
def fill_label(template, label)
  escaped = sparql_escape(label)
  # Block form on purpose: a replacement *string* would have its backslash
  # sequences (e.g. \' or \1) interpreted by gsub.
  template.gsub("|||LABEL|||") { escaped }
end

# Run a query, retrying after a short, growing pause when it raises.
# After max_attempts failures, print "Timeout error" and abort the process
# (same attempt budget as before: nine tries in total).
def query_with_retry(client, query, max_attempts: 9)
  attempts = 0
  begin
    attempts += 1
    client.query(query)
  rescue StandardError => e
    warn "query failed (attempt #{attempts}/#{max_attempts}): #{e.class}: #{e.message}"
    if attempts < max_attempts
      sleep(attempts) # back off instead of re-hitting the endpoint immediately
      retry
    end
    puts "Timeout error"
    abort
  end
end

# Print one mapping line and append it to the output file.
def write_mapping(out, label, solution)
  entry = "#{label}, #{solution["protein"]}, #{solution["recommended_full"]}"
  puts entry
  out.write "#{entry}\n"
end

# Write the mappings for one looked-up value: the preferred label of the first
# solution, then every alternative label present in the result set; or a
# "No URI found" line when the query returned nothing.
def record_matches(out, solutions, value)
  primary = solutions.first
  unless primary
    out.write "No URI found for #{value}\n"
    return
  end

  write_mapping(out, primary["pref"], primary)
  solutions.each do |solution|
    write_mapping(out, solution["alt"], solution) if solution["alt"]
  end
end

@seen = {}

# One client for the whole run rather than one per row.
sparql = SPARQL::Client.new("https://sparql.uniprot.org/sparql/")

# Block form guarantees the output file is flushed and closed, even on abort.
File.open(OUTPUT, "w") do |out|
  File.foreach(CSVFILE, chomp: true) do |line|
    row = line.split("\t")
    # NOTE(review): the exploratory cells read the entity types from columns 2
    # and 6, whereas this filter tests column COLUMN + 1 -- confirm that this
    # index really is the type column for the value in COLUMN.
    next unless row[COLUMN + 1] == "Gene"

    value = row[COLUMN].to_s.strip
    next if @seen[value] # don't hammer UniProt :-)

    warn "looking for #{value}"
    if value.empty?
      warn "no value found in the column"
      next
    end

    if value.length <= 8
      # Short values are treated as gene symbols.
      warn "gene name #{value}"
      template = genequery
    else
      # Longer than 8 characters, so assume it is a functional (protein) name.
      warn "function name #{value}"
      template = labelquery
    end

    solutions = query_with_retry(sparql, fill_label(template, value))
    record_matches(out, solutions, value)

    @seen[value] = 1
  end
end

puts "done"