In [4]:
# Preview the header row and first record of the gene mapping file.
puts %x(head -2 ./mappings/gene-mappings.map)

source,label,geneid,protein,recommended_full,taxon
C1421313,UCP1,http://purl.uniprot.org/geneid/7350,http://purl.uniprot.org/uniprot/P25874,uncoupling protein 1,http://purl.uniprot.org/taxonomy/9606


In [5]:
# Preview the header row and first record of the disease mapping file.
puts %x(head -2 ./mappings/disease-mappings.map)

source,snomedct,orpha,prefname
C0019247,http://purl.bioontology.org/ontology/SNOMEDCT/32895009,https://fake.orpha.net/not-found,Hereditary disease


In [3]:
# Preview the header row and first record of the raw gene-disease TSV.
puts %x(head -2 ./raw-data/Gene-Diseasetriples.tsv)

﻿Gene	Gene_id	Disease	Disease_id
WASF1 gene	C1421479	Hereditary Diseases	C0019247


In [8]:
require 'linkeddata'
require 'rdf/nquads'
require 'csv'

# Build the disease-gene association graph and serialise it as N-Quads.
#
# Inputs
#   ./mappings/disease-mappings.map     CSV: source,snomedct,orpha,prefname
#   ./mappings/gene-mappings.map        CSV: source,label,geneid,protein,recommended_full,taxon
#   ./raw-data/Gene-Diseasetriples.tsv  TSV: Gene, Gene_id, Disease, Disease_id
# Outputs
#   ./graph/disease-gene.nq.large       one named graph per (disease, gene) input row
#   ./graph/disease-gene-errors.txt     one line per identifier that could not be mapped
output_path = './graph/disease-gene.nq.large'
errors_path = './graph/disease-gene-errors.txt'

# Define namespaces
SIMPATHIC = RDF::Vocabulary.new('urn:simpathic:')
RDFS = RDF::Vocabulary.new('http://www.w3.org/2000/01/rdf-schema#')

# Index a mapping file by its 'source' column so each input row costs one hash
# lookup instead of a scan of the whole table. The first row for a given source
# wins, which is what the earlier `find`-based lookup returned.
index_by_source = lambda do |path|
  CSV.foreach(path, headers: true).each_with_object({}) do |mapping, index|
    index[mapping['source']] ||= mapping
  end
end

# Read input files
disease_mappings = index_by_source.call('./mappings/disease-mappings.map')
gene_mappings = index_by_source.call('./mappings/gene-mappings.map')
failures = {} # identifiers already reported, so each one is logged only once

# Kept from the original script: leave ./graph/disease-gene.nq present and empty.
# NOTE(review): the quads themselves go to the .nq.large file; presumably the
# plain .nq file is produced from it by a later step -- confirm.
File.write('./graph/disease-gene.nq', '')

# Terms that are identical for every row are built once, outside the loop.
rdfs_label        = RDFS.label
associated_with   = SIMPATHIC['associated-with']
original_id       = SIMPATHIC['original-id']
in_taxon          = SIMPATHIC['in-taxon']
orphanet_property = SIMPATHIC['orphanet']
skg_source        = SIMPATHIC['skg-source']

snomed_type       = RDF::URI.new("https://bioportal.bioontology.org/ontologies/SNOMEDCT")
snomed_core_type  = RDF::URI.new("https://w3id.org/biolink/vocab/Disease")
snomed_label      = RDF::Literal.new("SNOMED Term")
gene_type         = RDF::URI.new("http://edamontology.org/data_2610")
gene_core_type    = RDF::URI.new("https://w3id.org/biolink/vocab/Gene")
protein_type      = RDF::URI.new("http://edamontology.org/data_2291")
protein_core_type = RDF::URI.new("https://w3id.org/biolink/vocab/Protein")
general_context   = RDF::URI.new("urn:simpathic:context:all_metadata")

# Block-form File.open closes both files even if a row raises part-way through.
File.open(errors_path, 'w') do |graphing_errors|
  # Log a failed lookup to stderr and to the error file, once per identifier.
  report_failure = lambda do |kind, id|
    next if failures[id]

    failures[id] = 1
    message = "#{kind} lookup failed #{id}"
    warn message
    graphing_errors.write "#{message}\n"
  end

  # Mode 'w' truncates the file that is actually written to, so re-running the
  # cell no longer appends a second copy of every quad; the file is opened once
  # instead of once per input row.
  File.open(output_path, 'w') do |out|
    CSV.foreach('./raw-data/Gene-Diseasetriples.tsv', col_sep: "\t", quote_char: '"',
                liberal_parsing: true, headers: true) do |row|
      # Gene	Gene_id	Disease	Disease_id
      # WASF1 gene	C1421479	Hereditary Diseases	C0019247
      disease_id = row['Disease_id']
      gene_id = row['Gene_id']

      disease = disease_mappings[disease_id]
      gene = gene_mappings[gene_id]

      unless disease
        report_failure.call('disease', disease_id)
        next
      end
      unless gene
        report_failure.call('gene', gene_id)
        next
      end

      #   source,snomedct,orpha,prefname
      #   C0019247,http://purl.bioontology.org/ontology/SNOMEDCT/32895009,https://fake.orpha.net/not-found,Hereditary disease
      snomed_uri = RDF::URI.new(disease['snomedct'])
      orphanet = RDF::URI.new(disease['orpha'])
      disease_label = RDF::Literal.new(disease['prefname'])
      original_disease = RDF::Literal.new(disease_id)

      #   source,label,geneid,protein,recommended_full,taxon
      #   C1421313,UCP1,http://purl.uniprot.org/geneid/7350,http://purl.uniprot.org/uniprot/P25874,uncoupling protein 1,http://purl.uniprot.org/taxonomy/9606
      gene_uri = RDF::URI.new(gene['geneid'])
      protein_uri = RDF::URI.new(gene['protein'])
      # Gene and protein are both labelled with the recommended full name.
      human_gene_label = RDF::Literal.new(gene['recommended_full'])
      human_protein_label = RDF::Literal.new(gene['recommended_full'])
      original_gene = RDF::Literal.new("#{gene_id}")
      taxon = RDF::URI.new(gene['taxon'])

      # Create context URI: every input row gets its own named graph
      context_uri = RDF::URI.new("urn:simpathic:context:#{disease_id}_#{gene_id}")

      # A fresh repository per row (there are hundreds of thousands of lines,
      # and a single graph for all of them gets too big for memory).
      graph = RDF::Repository.new

      [
        # disease <-> gene association, stated in both directions
        [snomed_uri, associated_with, gene_uri],
        [gene_uri, associated_with, snomed_uri],

        # disease description
        [snomed_uri, rdfs_label, disease_label],
        [snomed_uri, RDF.type, snomed_type],
        [snomed_uri, RDF.type, snomed_core_type],
        [snomed_type, rdfs_label, snomed_label],
        [snomed_uri, orphanet_property, orphanet],
        [snomed_uri, original_id, original_disease],

        # gene description
        [gene_uri, rdfs_label, human_gene_label],
        [gene_uri, RDF.type, gene_type],
        [gene_uri, RDF.type, gene_core_type],
        [gene_type, rdfs_label, RDF::Literal.new("NCBI Gene")],
        [gene_core_type, rdfs_label, RDF::Literal.new("Gene")],
        [gene_uri, original_id, original_gene],
        [gene_uri, in_taxon, taxon],

        # disease <-> protein association, stated in both directions
        [snomed_uri, associated_with, protein_uri],
        [protein_uri, associated_with, snomed_uri],

        # protein description
        [protein_uri, rdfs_label, human_protein_label],
        [protein_uri, RDF.type, protein_type],
        [protein_uri, RDF.type, protein_core_type],
        [protein_type, rdfs_label, RDF::Literal.new("UniProt")],
        [protein_core_type, rdfs_label, RDF::Literal.new("Protein")],
        [protein_uri, original_id, original_gene],
        [protein_uri, in_taxon, taxon]
      ].each do |subject, predicate, object|
        graph << RDF::Statement.new(subject, predicate, object, graph_name: context_uri)
      end

      # Provenance of the row's graph is recorded in the shared metadata graph.
      graph << RDF::Statement.new(context_uri, skg_source, RDF::Literal.new("DEMOKRITOS"),
                                  graph_name: general_context)
      #   graph << RDF::Statement.new(context_uri, SIMPATHIC['evidence'], RDF::URI.new(evidence))
      #   graph << RDF::Statement.new(context_uri, SIMPATHIC['score'], RDF::Literal.new(score))

      # Write this row's quads to the already-open output in N-Quads format
      RDF::Writer.for(:nquads).new(out) do |writer|
        writer << graph
      end
    end
  end
  warn "completed graph building"
end

puts "RDF quads written"

gene lookup failed C1883559
gene lookup failed C0678941
gene lookup failed C1705240
gene lookup failed C1537998
disease lookup failed C0524851
disease lookup failed C0678236
disease lookup failed C0751870
disease lookup failed C0751670
gene lookup failed C1705304
gene lookup failed C3811119
gene lookup failed C3539618
gene lookup failed C0678933
gene lookup failed C0002085
gene lookup failed C1417173
gene lookup failed C1708726
gene lookup failed C1422767
gene lookup failed C3539643
gene lookup failed C1819716
gene lookup failed C3813713
gene lookup failed C3538705
gene lookup failed C1419834
gene lookup failed C3811222
gene lookup failed C1517495
gene lookup failed C1704924
disease lookup failed C0740279
gene lookup failed C0282641
disease lookup failed C0742034
disease lookup failed C0814120
disease lookup failed C0262405
disease lookup failed C0376415
gene lookup failed C3812695
gene lookup failed C1563761
disease lookup failed C0031117
disease lookup failed C0004096
disease lookup 

RDF quads written
