In [1]:
# Preview the gene mapping table: print its header line and first record.
puts %x(head -2 ./mappings/genes.map)

sourceid,label,geneid,protein,recommended_full,taxon
ENSG00000012779,ENSG00000012779,http://purl.uniprot.org/geneid/240,http://purl.uniprot.org/uniprot/P09917,Polyunsaturated fatty acid 5-lipoxygenase,http://purl.uniprot.org/taxonomy/9606


In [2]:
# Preview the disease mapping table: print its header line and first record.
puts %x(head -2 ./mappings/diseases.map)

source,snomedct,orpha,prefname
DOID_7551,http://purl.bioontology.org/ontology/SNOMEDCT/15628003,https://fake.orphanet/not-found,gonorrhea


In [3]:
# Preview the raw disease-gene association file: header line and first record.
puts %x(head -2 ./rawdata/disease-gene.csv)

"source","source_type","target","target_type"
"DOID_7551","disease","ENSG00000058085","gene"


In [4]:
require 'linkeddata'
require 'rdf/nquads'
require 'csv'

# Build the disease / gene / protein association graph and serialise it as N-Quads.
#
# Inputs
#   ./mappings/diseases.map     columns: source,snomedct,orpha,prefname
#   ./mappings/genes.map        columns: sourceid,label,geneid,protein,recommended_full,taxon
#   ./rawdata/disease-gene.csv  columns: source,source_type,target,target_type
# Outputs
#   ./graph/disease-gene.nq.large    one named graph per disease/gene pair
#   ./graph/disease-gene-errors.txt  one line per identifier that has no mapping
#
# Each identifier that cannot be mapped is reported once (stderr + error file)
# and every row that uses it is skipped.

errors_path = './graph/disease-gene-errors.txt'
output_path = './graph/disease-gene.nq.large'
# NOTE(review): the original script reset this path but wrote its quads to
# output_path, so the real output was never cleared and re-runs appended
# duplicates. Both files are reset now; confirm whether anything downstream
# still relies on this empty file being (re)created here.
reset_path = './graph/disease-gene.nq'

# Define namespaces
SIMPATHIC = RDF::Vocabulary.new('urn:simpathic:')
RDFS = RDF::Vocabulary.new('http://www.w3.org/2000/01/rdf-schema#')

# Read input files
disease_mappings = CSV.read('./mappings/diseases.map', headers: true)
gene_mappings = CSV.read('./mappings/genes.map', headers: true)
failures = {}

# Index the mapping tables once so each input row is a hash lookup instead of
# a linear scan. `||=` keeps the first row for a given key, matching `find`.
disease_index = disease_mappings.each_with_object({}) { |m, idx| idx[m['source']] ||= m }
gene_index = gene_mappings.each_with_object({}) { |m, idx| idx[m['sourceid']] ||= m }

# Terms that are identical for every record are built once, outside the loop.
associated_with = SIMPATHIC['associated-with']
original_id = SIMPATHIC['original-id']
in_taxon = SIMPATHIC['in-taxon']

snomed_type = RDF::URI.new("https://bioportal.bioontology.org/ontologies/SNOMEDCT")
snomed_core_type = RDF::URI.new("https://w3id.org/biolink/vocab/Disease")
snomed_label = RDF::Literal.new("SNOMED Term")

gene_type = RDF::URI.new("http://edamontology.org/data_1027")
gene_core_type = RDF::URI.new("https://w3id.org/biolink/vocab/Gene")
gene_type_label = RDF::Literal.new("NCBI Gene")
gene_core_type_label = RDF::Literal.new("Gene")

protein_type = RDF::URI.new("http://edamontology.org/data_2291")
protein_core_type = RDF::URI.new("https://w3id.org/biolink/vocab/Protein")
protein_type_label = RDF::Literal.new("UniProt")
protein_core_type_label = RDF::Literal.new("Protein")

general_context = RDF::URI.new("urn:simpathic:context:all_metadata")
skg_source = RDF::Literal.new("Radboud")

# refresh
File.open(reset_path, 'w') {}

# Number of input rows that were written to the graph file.
# (The original declared this counter but never incremented it.)
recordcount = 0

# Block forms guarantee both files are closed even if a row raises.
# Mode 'w' truncates the output, so a re-run starts from an empty file.
File.open(errors_path, 'w') do |graphing_errors|
  # Report an unmapped identifier exactly once, on stderr and in the error file.
  report_failure = lambda do |kind, id|
    return if failures[id]

    failures[id] = 1
    warn "#{kind} lookup failed #{id}"
    graphing_errors.write "#{kind} lookup failed #{id}\n"
  end

  File.open(output_path, 'w') do |out|
    CSV.foreach('./rawdata/disease-gene.csv', col_sep: ",", quote_char: '"',
                liberal_parsing: true, headers: true) do |row|
      # "source","source_type","target","target_type"
      # "DOID_7551","disease","ENSG00000058085","gene"
      disease_id = row['source']
      gene_id = row['target']

      disease = disease_index[disease_id]
      gene = gene_index[gene_id]

      unless disease
        report_failure.call('disease', disease_id)
        next
      end
      unless gene
        report_failure.call('gene', gene_id)
        next
      end

      # source,snomedct,orpha,prefname
      snomed_uri = RDF::URI.new(disease['snomedct'])
      orphanet = RDF::URI.new(disease['orpha'])
      disease_label = RDF::Literal.new(disease['prefname'])
      original_disease = RDF::Literal.new(disease['source'])

      # sourceid,label,geneid,protein,recommended_full,taxon
      gene_uri = RDF::URI.new(gene['geneid'])
      gene_label = RDF::Literal.new(gene['label'])
      protein_uri = RDF::URI.new(gene['protein'])
      human_protein_label = RDF::Literal.new(gene['recommended_full'])
      taxon = RDF::URI.new(gene['taxon'])
      original_gene = RDF::Literal.new(gene_id.to_s)

      # Every disease/gene pair gets its own named graph.
      context_uri = RDF::URI.new("urn:simpathic:context:#{disease_id}_#{gene_id}")

      # Subject / predicate / object triples placed in the pair's named graph.
      # The order matches the original insertion order.
      triples = [
        [snomed_uri, associated_with, protein_uri],
        [protein_uri, associated_with, snomed_uri],

        [snomed_uri, RDFS.label, disease_label],
        [snomed_uri, RDF.type, snomed_type],
        [snomed_uri, RDF.type, snomed_core_type],
        [snomed_type, RDFS.label, snomed_label],
        [snomed_uri, SIMPATHIC['orphanet'], orphanet],
        [snomed_uri, original_id, original_disease],

        [gene_uri, RDFS.label, gene_label],
        [gene_uri, RDF.type, gene_type],
        [gene_uri, RDF.type, gene_core_type],
        [gene_type, RDFS.label, gene_type_label],
        [gene_core_type, RDFS.label, gene_core_type_label],
        [gene_uri, original_id, original_gene],
        [gene_uri, in_taxon, taxon],

        [snomed_uri, associated_with, gene_uri],
        [gene_uri, associated_with, snomed_uri],

        [protein_uri, RDFS.label, human_protein_label],
        [protein_uri, RDF.type, protein_type],
        [protein_uri, RDF.type, protein_core_type],
        [protein_type, RDFS.label, protein_type_label],
        [protein_core_type, RDFS.label, protein_core_type_label],
        [protein_uri, original_id, original_gene],
        [protein_uri, in_taxon, taxon]
      ]

      # A fresh repository per record: there are hundreds of thousands of
      # lines, and a single accumulated graph gets too big for memory.
      graph = RDF::Repository.new
      triples.each do |subject, predicate, object|
        graph << RDF::Statement.new(subject, predicate, object, graph_name: context_uri)
      end

      # Provenance of the named graph itself lives in the shared metadata graph.
      graph << RDF::Statement.new(context_uri, SIMPATHIC['skg-source'], skg_source, graph_name: general_context)

      # Serialise this record's quads onto the already-open output file.
      RDF::Writer.for(:nquads).new(out) do |writer|
        writer << graph
      end
      recordcount += 1
    end
  end
end
warn "completed graph building"

puts "RDF quads written"

disease lookup failed HP_0000857
disease lookup failed HP_0012076
disease lookup failed HP_0000520
disease lookup failed HP_0000802
disease lookup failed HP_0001591
disease lookup failed HP_0002035
disease lookup failed HP_0008909
disease lookup failed EFO_0000713
disease lookup failed HP_0009816
disease lookup failed HP_0010865
disease lookup failed HP_0012042
disease lookup failed HP_0002571
disease lookup failed HP_0004308
disease lookup failed HP_0012390
disease lookup failed HP_0012410
disease lookup failed HP_0100727
disease lookup failed EFO_0001675
disease lookup failed HP_0000327
disease lookup failed HP_0006789
disease lookup failed HP_0030834
disease lookup failed EFO_0000174
disease lookup failed EFO_0003073
disease lookup failed HP_0000200
disease lookup failed EFO_0003075
disease lookup failed HP_0002904
disease lookup failed EFO_0000595
disease lookup failed HP_0002153
disease lookup failed EFO_0002689
disease lookup failed HP_0000876
disease lookup failed HP_0003076
dis

disease lookup failed HP_0003323
disease lookup failed HP_0030838
disease lookup failed HP_0003547
disease lookup failed HP_0011096
disease lookup failed EFO_0000350
disease lookup failed EFO_0003029
disease lookup failed HP_0010307
disease lookup failed HP_0000863
disease lookup failed EFO_0000335
disease lookup failed HP_0001007
disease lookup failed HP_0009908
disease lookup failed HP_0011451
disease lookup failed EFO_0000224
disease lookup failed HP_0002313
disease lookup failed HP_0002616
disease lookup failed HP_0002575
disease lookup failed HP_0003489
disease lookup failed HP_0010750
disease lookup failed HP_0012317
disease lookup failed EFO_0000432
disease lookup failed HP_0000155
disease lookup failed HP_0000414
disease lookup failed HP_0002474
disease lookup failed HP_0002999
disease lookup failed HP_0005387
disease lookup failed HP_0030757
disease lookup failed EFO_0002614
disease lookup failed HP_0002194
disease lookup failed HP_0002280
disease lookup failed HP_0100257
dise

disease lookup failed EFO_0010977
disease lookup failed EFO_1000179
disease lookup failed EFO_1002027
disease lookup failed EFO_0005242
disease lookup failed EFO_0005556
disease lookup failed EFO_0005585
disease lookup failed EFO_0006862
disease lookup failed EFO_0009065
disease lookup failed EFO_0010580
disease lookup failed EFO_1001341
disease lookup failed EFO_1001410
disease lookup failed EFO_0003958
disease lookup failed EFO_0004610
disease lookup failed EFO_0004795
disease lookup failed EFO_0009485
disease lookup failed EFO_0010143
disease lookup failed EFO_0010267
disease lookup failed EFO_0010698
disease lookup failed EFO_1000243
disease lookup failed EFO_1000244
disease lookup failed EFO_1000249
disease lookup failed EFO_1001248
disease lookup failed EFO_1001829
disease lookup failed EFO_1001973
disease lookup failed EFO_0009005
disease lookup failed EFO_1000376
disease lookup failed EFO_1001379
disease lookup failed EFO_0006347
disease lookup failed EFO_0006519
disease lookup

disease lookup failed EFO_0005846
disease lookup failed EFO_0009582
disease lookup failed EFO_0010661
disease lookup failed EFO_0005526
disease lookup failed EFO_0006926
disease lookup failed EFO_0006995
disease lookup failed EFO_1000129
disease lookup failed EFO_1000945
disease lookup failed EFO_0005529
disease lookup failed EFO_0006475
disease lookup failed EFO_0007336
disease lookup failed EFO_0009008
disease lookup failed EFO_1000318
disease lookup failed EFO_1001351
disease lookup failed EFO_0003911
disease lookup failed EFO_0006865
disease lookup failed EFO_0009573
disease lookup failed EFO_0009964
disease lookup failed EFO_1000254
disease lookup failed EFO_1001226
disease lookup failed EFO_1001826
disease lookup failed EFO_0004318
disease lookup failed EFO_0005555
disease lookup failed EFO_0005924
disease lookup failed EFO_1000322
disease lookup failed EFO_0004875
disease lookup failed EFO_0007135
disease lookup failed EFO_0007443
disease lookup failed EFO_1000138
disease lookup

disease lookup failed EFO_0007298
disease lookup failed EFO_0009144
disease lookup failed EFO_1000368
disease lookup failed EFO_1000540
disease lookup failed EFO_1001017
disease lookup failed EFO_0009545
disease lookup failed EFO_1000332
disease lookup failed EFO_0007220
disease lookup failed EFO_0009478
disease lookup failed EFO_1000569
disease lookup failed EFO_1001486
disease lookup failed EFO_0007585
disease lookup failed EFO_0007887
disease lookup failed EFO_0009677
disease lookup failed EFO_0010738
disease lookup failed EFO_1000084
disease lookup failed EFO_0005623
disease lookup failed EFO_0007243
disease lookup failed EFO_0010820
disease lookup failed EFO_1001054
disease lookup failed EFO_1001114
disease lookup failed EFO_0007386
disease lookup failed EFO_0007496
disease lookup failed EFO_0007549
disease lookup failed EFO_0008501
disease lookup failed EFO_0008586
disease lookup failed EFO_0009846
disease lookup failed EFO_1000256
disease lookup failed EFO_1000259
disease lookup

disease lookup failed EFO_0008556
disease lookup failed EFO_0009687
disease lookup failed EFO_1000075
disease lookup failed EFO_1000486
disease lookup failed EFO_1000738
disease lookup failed EFO_1001812
disease lookup failed EFO_0003924
disease lookup failed EFO_0004220
disease lookup failed EFO_0004997
disease lookup failed EFO_0005783
disease lookup failed EFO_1000461
disease lookup failed EFO_1001129
disease lookup failed EFO_0007404
disease lookup failed EFO_0007535
disease lookup failed EFO_0003939
disease lookup failed EFO_0007183
disease lookup failed EFO_0009018
disease lookup failed EFO_0009130
disease lookup failed EFO_1000215
disease lookup failed EFO_1001013
disease lookup failed EFO_1000087
disease lookup failed EFO_1000303
disease lookup failed EFO_1000409
disease lookup failed EFO_1000786
disease lookup failed EFO_1001935
disease lookup failed EFO_0007368
disease lookup failed EFO_0009016
disease lookup failed EFO_0009715
disease lookup failed EFO_1000550
disease lookup

disease lookup failed EFO_1001078
disease lookup failed EFO_1001186
disease lookup failed EFO_1001831
disease lookup failed EFO_0005761
disease lookup failed EFO_0009058
disease lookup failed EFO_1000607
disease lookup failed EFO_1000824
disease lookup failed EFO_1000852
disease lookup failed EFO_1001233
disease lookup failed EFO_1001411
disease lookup failed EFO_1001963
disease lookup failed EFO_1002032
disease lookup failed EFO_0007493
disease lookup failed EFO_0009019
disease lookup failed EFO_0009067
disease lookup failed EFO_0007214
disease lookup failed EFO_0008602
disease lookup failed EFO_0009840
disease lookup failed EFO_0009952
disease lookup failed EFO_0010268
disease lookup failed EFO_0010638
disease lookup failed EFO_0010660
disease lookup failed EFO_1000039
disease lookup failed EFO_1000054
disease lookup failed EFO_1000389
disease lookup failed EFO_1001029
disease lookup failed EFO_1001048
disease lookup failed EFO_1001333
disease lookup failed EFO_1001401
disease lookup

disease lookup failed EFO_0010832
disease lookup failed EFO_1000082
disease lookup failed EFO_1000209
disease lookup failed EFO_1000691
disease lookup failed EFO_1001207
disease lookup failed EFO_1001329
disease lookup failed EFO_1001353
disease lookup failed EFO_1001941
disease lookup failed EFO_0004540
disease lookup failed EFO_0008507
disease lookup failed EFO_0009659
disease lookup failed EFO_0010256
disease lookup failed EFO_0010646
disease lookup failed EFO_1000495
disease lookup failed EFO_1000557
disease lookup failed EFO_1000836
disease lookup failed EFO_1001345
disease lookup failed EFO_1001769
disease lookup failed EFO_0004242
disease lookup failed EFO_0007940
disease lookup failed EFO_0009004
disease lookup failed EFO_1000085
disease lookup failed EFO_1000573
disease lookup failed EFO_1000860
disease lookup failed EFO_0009153
disease lookup failed EFO_0009361
disease lookup failed EFO_0009618
disease lookup failed EFO_1000013
disease lookup failed EFO_1000204
disease lookup

disease lookup failed Orphanet_206599
disease lookup failed Orphanet_269505
disease lookup failed Orphanet_276399
disease lookup failed Orphanet_2768
disease lookup failed Orphanet_2819
disease lookup failed Orphanet_2876
disease lookup failed Orphanet_2334
disease lookup failed Orphanet_247585
disease lookup failed Orphanet_313838
disease lookup failed Orphanet_329228
disease lookup failed Orphanet_369902
disease lookup failed Orphanet_397964
disease lookup failed Orphanet_50811
disease lookup failed Orphanet_2064
disease lookup failed Orphanet_216873
disease lookup failed Orphanet_2373
disease lookup failed Orphanet_238459
disease lookup failed Orphanet_2460
disease lookup failed Orphanet_254854
disease lookup failed Orphanet_2789
disease lookup failed Orphanet_314802
disease lookup failed Orphanet_352709
disease lookup failed Orphanet_36383
disease lookup failed Orphanet_79156
disease lookup failed Orphanet_165994
disease lookup failed Orphanet_284139
disease lookup failed Orphanet_

disease lookup failed Orphanet_93352
disease lookup failed Orphanet_98765
disease lookup failed Orphanet_98959
disease lookup failed Orphanet_79445
disease lookup failed Orphanet_93951
disease lookup failed Orphanet_79477
disease lookup failed Orphanet_98673
disease lookup failed Orphanet_98773
disease lookup failed Orphanet_83620
disease lookup failed Orphanet_99939
disease lookup failed Orphanet_79503
disease lookup failed Orphanet_93160
disease lookup failed Orphanet_98870
disease lookup failed Orphanet_88917
disease lookup failed Orphanet_98769
disease lookup failed Orphanet_90001
disease lookup failed Orphanet_93114
disease lookup failed Orphanet_85179
disease lookup failed Orphanet_98994
disease lookup failed Orphanet_85448
disease lookup failed Orphanet_99955
disease lookup failed Orphanet_89844
disease lookup failed Orphanet_93608
disease lookup failed Orphanet_93974
disease lookup failed Orphanet_79506
disease lookup failed Orphanet_85203
disease lookup failed Orphanet_93589
d

RDF quads written
