In [6]:
# Show the raw input files (sizes and timestamps) that the later cells read.
puts %x(ls -l raw-data)

total 484
-rw-rw-r-- 1 osboxes osboxes  34777 Aug 12 17:45 Demokritos-KG-information.xlsx
-rw-rw-r-- 1 osboxes osboxes 207331 Aug 12 17:45 Disease-Therapeutic_Area.tsv
-rw-rw-r-- 1 osboxes osboxes  36869 Aug 12 17:45 Drug-Diseasetriples.tsv
-rw-rw-r-- 1 osboxes osboxes 111643 Aug 12 17:45 Drug-Drug_type.tsv
-rw-rw-r-- 1 osboxes osboxes  32600 Aug 12 17:45 Drug-Genetriples.tsv
-rw-rw-r-- 1 osboxes osboxes  51800 Aug 12 17:45 Gene-Diseasetriples.tsv
-rw-rw-r-- 1 osboxes osboxes   7965 Aug 12 17:45 Gene-Pathwaytriples.tsv


In [7]:
# Preview the header and first rows of the tab-separated Drug-Gene relation file.
puts %x(head -5 ./raw-data/Drug-Genetriples.tsv)

﻿Drug	Drug_id	Gene	Gene_id
Dopamine Hydrochloride	C0282151	UCP1 gene	C1421313
Dopamine Hydrochloride	C0282151	PPARA gene	C1418776
Dopamine Hydrochloride	C0282151	PPP1R12A gene	C1417581
Dopamine Hydrochloride	C0282151	NLRP1 wt Allele	C3538756


In [8]:
# Preview the header and first rows of the comma-separated drug mapping file.
puts %x(head -5 ./mappings/drug-mappings.map)

demokritosid,xref,demokritos_label,pubchem_cid,IUPACname
C0613621,http://purl.bioontology.org/ontology/MESH/C030536,"2,2-dichloro-1,1-difluoroethyl difluoromethyl ether",https://pubchem.ncbi.nlm.nih.gov/compound/152803,"2,2-dichloro-1,1-difluoroethyl%20difluoromethyl%20ether"
C0042291,http://purl.bioontology.org/ontology/SNOMEDCT/387080000,Valproic Acid,https://pubchem.ncbi.nlm.nih.gov/compound/3121,Valproic%20acid
C0059747,http://purl.bioontology.org/ontology/SNOMEDCT/83298009,ethyl acetate,https://pubchem.ncbi.nlm.nih.gov/compound/8857,Ethyl%20acetate
C0059747,http://purl.bioontology.org/ontology/MESH/C007650,ethyl acetate,https://pubchem.ncbi.nlm.nih.gov/compound/8857,ethyl%20acetate


In [9]:
# Preview the header and first rows of the comma-separated gene mapping file.
puts %x(head -5 ./mappings/gene-mappings.map)

source,label,geneid,protein,recommended_full,taxon
C1421313,UCP1,http://purl.uniprot.org/geneid/7350,http://purl.uniprot.org/uniprot/P25874,uncoupling protein 1,http://purl.uniprot.org/taxonomy/9606
C1418776,PPARA,http://purl.uniprot.org/geneid/5465,http://purl.uniprot.org/uniprot/Q07869,peroxisome proliferator activated receptor alpha,http://purl.uniprot.org/taxonomy/9606
C1417581,PPP1R12A,http://purl.uniprot.org/geneid/4659,http://purl.uniprot.org/uniprot/O14974,protein phosphatase 1 regulatory subunit 12A,http://purl.uniprot.org/taxonomy/9606
C1418482,CFP,http://purl.uniprot.org/geneid/5199,http://purl.uniprot.org/uniprot/P27918,complement factor properdin,http://purl.uniprot.org/taxonomy/9606


In [14]:
require 'linkeddata'
require 'csv'

# Builds the drug-gene association graph as N-Quads.
#
# Inputs:
#   ./raw-data/Drug-Genetriples.tsv  tab-separated; columns Drug, Drug_id, Gene, Gene_id
#   ./mappings/drug-mappings.map     CSV; columns demokritosid, xref, demokritos_label,
#                                    pubchem_cid, IUPACname
#   ./mappings/gene-mappings.map     CSV; columns source, label, geneid, protein,
#                                    recommended_full, taxon
# Outputs:
#   ./graph/drug-gene.nq.large       one named graph per drug/gene pair, plus a shared
#                                    "all_metadata" graph recording each pair's source
#   ./graph/drug-gene-errors.txt     one line per ID that has no mapping row
#
# A relation row is skipped when either its drug or its gene has no mapping.

# Define namespaces
SIMPATHIC = RDF::Vocabulary.new('urn:simpathic:')
RDFS = RDF::Vocabulary.new('http://www.w3.org/2000/01/rdf-schema#')

graphing_errors = File.open('./graph/drug-gene-errors.txt', 'w')

# Create RDF graph
graph = RDF::Repository.new

# IDs whose lookup failure has already been reported (each is logged once only).
failures = {}

# Terms that are identical for every row, built once instead of per iteration.
associated_with   = SIMPATHIC['associated-with']
original_id       = SIMPATHIC['original-id']
in_taxon          = SIMPATHIC['in-taxon']
skg_source        = SIMPATHIC['skg-source']
pubchem_type      = RDF::URI.new("http://semanticscience.org/resource/CHEMINF_000302")
pubchem_core_type = RDF::URI.new("https://w3id.org/biolink/vocab/Drug")
gene_type         = RDF::URI.new("http://edamontology.org/data_2610")
gene_core_type    = RDF::URI.new("https://w3id.org/biolink/vocab/Gene")
protein_type      = RDF::URI.new("http://edamontology.org/data_2291")
protein_core_type = RDF::URI.new("https://w3id.org/biolink/vocab/Protein")
general_context   = RDF::URI.new("urn:simpathic:context:all_metadata")

# Logs a failed lookup to stderr and to the error file, once per ID.
report_failure = lambda do |kind, id|
  next if failures[id]

  failures[id] = 1
  message = "#{kind} lookup failed #{id}"
  warn message
  graphing_errors.write "#{message}\n"
end

# Adds one quad to the repository.
add_quad = lambda do |subject, predicate, object, context|
  graph << RDF::Statement.new(subject, predicate, object, graph_name: context)
end

begin
  # Read input files. 'bom|utf-8' strips the byte-order mark that the TSV starts
  # with, so the first header is "Drug" rather than "\uFEFFDrug".
  entity_relations = CSV.read('./raw-data/Drug-Genetriples.tsv',
                              col_sep: "\t", headers: true, encoding: 'bom|utf-8')
  drug_mappings = CSV.read('./mappings/drug-mappings.map', headers: true, encoding: 'bom|utf-8')
  gene_mappings = CSV.read('./mappings/gene-mappings.map', headers: true, encoding: 'bom|utf-8')

  # Index the mappings by ID so each lookup is O(1) instead of a scan of the table.
  # `||=` keeps the FIRST row per ID, matching what a linear `find` would return.
  # NOTE(review): drug-mappings.map holds several rows for some IDs (different
  # xref / IUPACname); only the first is used - confirm that is intended.
  drug_index = drug_mappings.each_with_object({}) { |m, idx| idx[m['demokritosid']] ||= m }
  gene_index = gene_mappings.each_with_object({}) { |m, idx| idx[m['source']] ||= m }

  # Process each entity relation
  entity_relations.each do |row|
    drug_id = row['Drug_id']
    gene_id = row['Gene_id']
    warn "Drug #{drug_id} -- Gene #{gene_id}"

    # Find corresponding mappings
    drug = drug_index[drug_id]
    gene = gene_index[gene_id]

    # Check both sides independently so that an unmapped gene is still reported
    # when its drug is unmapped too.
    report_failure.call('drug', drug_id) unless drug
    report_failure.call('gene', gene_id) unless gene
    next unless drug && gene

    # Drug terms
    # demokritosid,xref,demokritos_label,pubchem_cid,IUPACname
    # C0613621,http://purl.bioontology.org/ontology/MESH/C030536,"2,2-dichloro-1,1-difluoroethyl difluoromethyl ether",https://pubchem.ncbi.nlm.nih.gov/compound/152803,"2,2-dichloro-1,1-difluoroethyl%20difluoromethyl%20ether"
    pubchem_uri      = RDF::URI.new(drug['pubchem_cid'])
    human_drug_label = RDF::Literal.new(drug['demokritos_label'])
    iupac_drug_label = RDF::Literal.new(drug['IUPACname'])

    # Gene and protein terms
    #   source,label,geneid,protein,recommended_full,taxon
    #   C1421313,UCP1,http://purl.uniprot.org/geneid/7350,http://purl.uniprot.org/uniprot/P25874,uncoupling protein 1,http://purl.uniprot.org/taxonomy/9606
    gene_uri         = RDF::URI.new(gene['geneid'])
    human_gene_label = RDF::Literal.new(gene['recommended_full'])

    protein_uri = RDF::URI.new(gene['protein'])
    # The protein is labelled with the same recommended name as its gene.
    human_protein_label = RDF::Literal.new(gene['recommended_full'])

    taxon = RDF::URI.new(gene['taxon'])

    # Create context URI: one named graph per drug/gene pair
    context_uri = RDF::URI.new("urn:simpathic:context:#{drug_id}_#{gene_id}")

    # Drug <-> gene association, asserted in both directions
    add_quad.call(pubchem_uri, associated_with, gene_uri, context_uri)
    add_quad.call(gene_uri, associated_with, pubchem_uri, context_uri)

    # Drug description
    add_quad.call(pubchem_uri, RDFS.label, human_drug_label, context_uri)
    add_quad.call(pubchem_uri, RDFS.label, iupac_drug_label, context_uri)
    add_quad.call(pubchem_uri, RDF.type, pubchem_type, context_uri)
    add_quad.call(pubchem_uri, RDF.type, pubchem_core_type, context_uri)
    add_quad.call(pubchem_type, RDFS.label, RDF::Literal.new("PubChem"), context_uri)
    add_quad.call(pubchem_core_type, RDFS.label, RDF::Literal.new("Drug"), context_uri)
    add_quad.call(pubchem_uri, original_id, RDF::Literal.new(drug_id.to_s), context_uri)

    # Gene description
    add_quad.call(gene_uri, RDFS.label, human_gene_label, context_uri)
    add_quad.call(gene_uri, RDF.type, gene_type, context_uri)
    add_quad.call(gene_uri, RDF.type, gene_core_type, context_uri)
    add_quad.call(gene_type, RDFS.label, RDF::Literal.new("NCBI Gene"), context_uri)
    add_quad.call(gene_core_type, RDFS.label, RDF::Literal.new("Gene"), context_uri)
    add_quad.call(gene_uri, original_id, RDF::Literal.new(gene_id.to_s), context_uri)
    add_quad.call(gene_uri, in_taxon, taxon, context_uri)

    # Drug <-> protein association, asserted in both directions
    add_quad.call(pubchem_uri, associated_with, protein_uri, context_uri)
    add_quad.call(protein_uri, associated_with, pubchem_uri, context_uri)

    # Protein description
    add_quad.call(protein_uri, RDFS.label, human_protein_label, context_uri)
    add_quad.call(protein_uri, RDF.type, protein_type, context_uri)
    add_quad.call(protein_uri, RDF.type, protein_core_type, context_uri)
    add_quad.call(protein_type, RDFS.label, RDF::Literal.new("UniProt"), context_uri)
    add_quad.call(protein_core_type, RDFS.label, RDF::Literal.new("Protein"), context_uri)
    # The protein carries the gene's source ID as its original-id.
    add_quad.call(protein_uri, original_id, RDF::Literal.new(gene_id.to_s), context_uri)
    add_quad.call(protein_uri, in_taxon, taxon, context_uri)

    # Provenance of the pair's named graph, stored in the shared metadata graph
    add_quad.call(context_uri, skg_source, RDF::Literal.new("DEMOKRITOS"), general_context)
    # add_quad.call(context_uri, SIMPATHIC['skg-evidence'], RDF::Literal.new("TBD"), general_context)
  end
ensure
  # Flush and release the error log even if reading or graph building raised.
  graphing_errors.close
end

# Write RDF to file in N-Quads format
File.open('./graph/drug-gene.nq.large', 'w') do |f|
  RDF::Writer.for(:nquads).new(f) do |writer|
    writer << graph
  end
end

puts "RDF quads written"

Drug C0282151 -- Gene C1421313
Drug C0282151 -- Gene C1418776
Drug C0282151 -- Gene C1417581
Drug C0282151 -- Gene C3538756
gene lookup failed C3538756
Drug C0282151 -- Gene C3813713
gene lookup failed C3813713
Drug C0282151 -- Gene C1418482
Drug C0282151 -- Gene C1825598
Drug C0282151 -- Gene C1333250
Drug C0282151 -- Gene C1538301
Drug C0282151 -- Gene C1420115
Drug C0282151 -- Gene C1538300
Drug C0013030 -- Gene C1418482
Drug C0013030 -- Gene C3538756
Drug C0013030 -- Gene C1883559
gene lookup failed C1883559
Drug C0013030 -- Gene C1420870
Drug C0013030 -- Gene C1425023
Drug C0013030 -- Gene C2239937
gene lookup failed C2239937
Drug C0013030 -- Gene C1422657
Drug C0013030 -- Gene C3813713
Drug C0013030 -- Gene C1415287
Drug C0013030 -- Gene C1823437
Drug C0013030 -- Gene C1415299
Drug C0013030 -- Gene C1415300
Drug C0013030 -- Gene C1420214
Drug C0013030 -- Gene C1366488
Drug C0013030 -- Gene C1413104
Drug C0013030 -- Gene C1415321
Drug C0013030 -- Gene C1337109
Drug C0013030 -- Gen

drug lookup failed C0001617
Drug C0001617 -- Gene C0919442
Drug C0033497 -- Gene C1332830
Drug C0033497 -- Gene C1413132
Drug C0022180 -- Gene C1417647
drug lookup failed C0022180
Drug C2698764 -- Gene C0694888
Drug C0171023 -- Gene C1413290
drug lookup failed C0171023
Drug C0171023 -- Gene C1420401
Drug C3541407 -- Gene C1415009
drug lookup failed C3541407
Drug C0070302 -- Gene C1418674
drug lookup failed C0070302
Drug C0070302 -- Gene C0002085
Drug C0038636 -- Gene C1414168
drug lookup failed C0038636
Drug C0038636 -- Gene C1823521
Drug C0016564 -- Gene C1414085
drug lookup failed C0016564
Drug C0016564 -- Gene C1823521
Drug C0016564 -- Gene C1826968
Drug C0016564 -- Gene C1416832
Drug C0022614 -- Gene C1424250
Drug C0022614 -- Gene C3538774
gene lookup failed C3538774
Drug C0022614 -- Gene C1333250
Drug C0026056 -- Gene C1414475
gene lookup failed C1414475
Drug C0026056 -- Gene C1412756
Drug C0085208 -- Gene C1414874
Drug C0074710 -- Gene C1414148
Drug C0004057 -- Gene C1425330
Drug

Drug C0034261 -- Gene C1538715
Drug C0359916 -- Gene C1823140
drug lookup failed C0359916
Drug C0030873 -- Gene C1417963
drug lookup failed C0030873
Drug C0085196 -- Gene C1412627
Drug C0085196 -- Gene C1412688
Drug C0070066 -- Gene C1823521
Drug C0070066 -- Gene C1417055
Drug C0070066 -- Gene C1418486
Drug C0070066 -- Gene C1425023
Drug C0034392 -- Gene C1335892
Drug C0005117 -- Gene C1417651
drug lookup failed C0005117
Drug C0006388 -- Gene C1416740
drug lookup failed C0006388
Drug C0085228 -- Gene C1823290
Drug C0001963 -- Gene C1417656
Drug C0013900 -- Gene C1412727
drug lookup failed C0013900
Drug C2343853 -- Gene C1412268
drug lookup failed C2343853
Drug C0062068 -- Gene C1419598
drug lookup failed C0062068
Drug C0360714 -- Gene C1414685
drug lookup failed C0360714
Drug C0373704 -- Gene C1414575
drug lookup failed C0373704
Drug C0373704 -- Gene C1426084
Drug C0373704 -- Gene C1413554
Drug C0083955 -- Gene C1334522
drug lookup failed C0083955
Drug C0018312 -- Gene C1332126
drug lo

drug lookup failed C0163557
Drug C0066646 -- Gene C1417679
drug lookup failed C0066646
Drug C0002333 -- Gene C1707163
drug lookup failed C0002333
Drug C0348029 -- Gene C1413554
drug lookup failed C0348029
Drug C0242546 -- Gene C1417819
drug lookup failed C0242546
Drug C0242546 -- Gene C1418858
Drug C0720523 -- Gene C1418674
drug lookup failed C0720523
Drug C0037556 -- Gene C1415353
drug lookup failed C0037556
Drug C0037556 -- Gene C1422359
Drug C0024742 -- Gene C1334074
Drug C0024742 -- Gene C1416788
Drug C0024742 -- Gene C1426768
Drug C0024742 -- Gene C3715044
Drug C0024742 -- Gene C1418674
Drug C0024742 -- Gene C1367477
Drug C0024742 -- Gene C1426190
Drug C0024742 -- Gene C2239937
Drug C0024742 -- Gene C1417244
Drug C0968642 -- Gene C1413555
drug lookup failed C0968642
Drug C1456750 -- Gene C1439351
drug lookup failed C1456750
Drug C1456750 -- Gene C1424950
Drug C1699861 -- Gene C1427005
drug lookup failed C1699861
Drug C1876226 -- Gene C1412268
drug lookup failed C1876226
Drug C0007

RDF quads written


In [18]:
# Close the error-log handle opened by the graph-building cell. That cell already
# closes it at the end of a successful run, so this matters only when the cell was
# interrupted part-way; closing an already-closed File does not raise on Ruby >= 2.3.
graphing_errors.close