In [1]:
# Show the contents of the notebook's working directory (long listing format).
puts %x(ls -l)

total 2956
-rw-rw-r-- 1 osboxes osboxes     504 Jun 30 10:49 Drug-Disease Graphing.ipynb
-rw-rw-r-- 1 osboxes osboxes  138187 Jun 27 15:44 drug-mapping.ipynb
-rw-rw-r-- 1 osboxes osboxes 2778831 Jun 19 13:41 gene disease.ipynb
drwxrwxr-x 2 osboxes osboxes    4096 Jun 30 10:50 mappings
drwxrwxr-x 2 osboxes osboxes    4096 Jun 27 15:13 raw-data
-rw-rw-r-- 1 osboxes osboxes   90846 Jun 20 10:59 therapeutic_area.ipynb


In [2]:
# Preview the first five lines (header included) of the raw drug-gene relation table.
puts %x(head -5 ./raw-data/Drug-Genetriples.tsv)

﻿Drug	Drug_id	Gene	Gene_id
Dopamine Hydrochloride	C0282151	UCP1 gene	C1421313
Dopamine Hydrochloride	C0282151	PPARA gene	C1418776
Dopamine Hydrochloride	C0282151	PPP1R12A gene	C1417581
Dopamine Hydrochloride	C0282151	NLRP1 wt Allele	C3538756


In [13]:
# Preview the first five lines (header included) of the drug ID -> PubChem mapping table.
puts %x(head -5 ./mappings/drug-mappings.csv)

demokratisid,xref,demokratis_label,pubchem_cid,IUPACname
C0613621,http://purl.bioontology.org/ontology/MESH/C030536,"2,2-dichloro-1,1-difluoroethyl difluoromethyl ether",https://pubchem.ncbi.nlm.nih.gov/compound/152803,"2,2-dichloro-1,1-difluoroethyl%20difluoromethyl%20ether"
C0042291,http://purl.bioontology.org/ontology/SNOMEDCT/387080000,Valproic Acid,https://pubchem.ncbi.nlm.nih.gov/compound/3121,Valproic%20acid
C0059747,http://purl.bioontology.org/ontology/SNOMEDCT/83298009,ethyl acetate,https://pubchem.ncbi.nlm.nih.gov/compound/8857,Ethyl%20acetate
C0059747,http://purl.bioontology.org/ontology/MESH/C007650,ethyl acetate,https://pubchem.ncbi.nlm.nih.gov/compound/8857,ethyl%20acetate


In [1]:
# Preview the first five lines (header included) of the gene ID -> Ensembl/UniProt mapping table.
puts %x(head -5 ./mappings/gene-mappings.csv)

source,ensembl,uniprot,taxon,prefname
C1421479,http://purl.uniprot.org/opentargets/ENSG00000007047,http://purl.uniprot.org/uniprot/Q96L34,http://purl.uniprot.org/taxonomy/9606,MAP/microtubule affinity-regulating kinase 4
C1418504,http://purl.uniprot.org/opentargets/ENSG00000013375,http://purl.uniprot.org/uniprot/O95394,http://purl.uniprot.org/taxonomy/9606,Phosphoacetylglucosamine mutase
C1425023,http://purl.uniprot.org/opentargets/ENSG00000033011,http://purl.uniprot.org/uniprot/Q9BT22,http://purl.uniprot.org/taxonomy/9606,Chitobiosyldiphosphodolichol beta-mannosyltransferase
C1421146,http://purl.uniprot.org/opentargets/ENSG00000039650,http://purl.uniprot.org/uniprot/Q96T60,http://purl.uniprot.org/taxonomy/9606,Bifunctional polynucleotide phosphatase/kinase


In [10]:
require 'linkeddata'
require 'csv'

# Build the drug-gene knowledge graph and serialise it as N-Quads.
#
# Inputs (paths relative to the working directory):
#   ./raw-data/Drug-Genetriples.tsv - tab-separated relations; the Drug_id and
#                                     Gene_id columns are read
#   ./mappings/drug-mappings.csv    - demokratisid -> pubchem_cid, labels
#   ./mappings/gene-mappings.csv    - source -> ensembl, uniprot, taxon, prefname
# Outputs:
#   ./graph/drug-gene.nq            - one named graph (context) per relation
#   ./graph/drug-gene-errors.txt    - one line per distinct ID that had no mapping
#
# Top-level names (SIMPATHIC, RDFS, graph, failures, graphing_errors, ...) are
# kept so that other notebook cells can keep referring to them.

# Define namespaces (guarded so re-running the cell does not re-assign constants)
SIMPATHIC = RDF::Vocabulary.new('urn:simpathic:') unless defined?(SIMPATHIC)
RDFS = RDF::Vocabulary.new('http://www.w3.org/2000/01/rdf-schema#') unless defined?(RDFS)

# Read input files. 'bom|utf-8' drops the byte-order mark that precedes the
# first header of the TSV so that every column name is usable as-is.
entity_relations = CSV.read('./raw-data/Drug-Genetriples.tsv', col_sep: "\t", headers: true, encoding: 'bom|utf-8')
drug_mappings = CSV.read('./mappings/drug-mappings.csv', headers: true)
gene_mappings = CSV.read('./mappings/gene-mappings.csv', headers: true)

# Index the mapping tables by ID once instead of scanning them for every
# relation row. `||=` keeps the FIRST row per ID (the mapping files contain
# repeated IDs), which is the row a linear `find` would have returned.
drug_by_id = drug_mappings.each_with_object({}) { |mapping, index| index[mapping['demokratisid']] ||= mapping }
gene_by_id = gene_mappings.each_with_object({}) { |mapping, index| index[mapping['source']] ||= mapping }

# Quad store. An RDF::Repository is used rather than an RDF::Graph because a
# Graph models a single graph and replaces the graph_name of every inserted
# statement with its own, which would discard the per-relation context.
graph = RDF::Repository.new

# Terms that are identical for every relation are built once.
pubchem_type      = RDF::URI.new("http://semanticscience.org/resource/CHEMINF_000302")
pubchem_core_type = RDF::URI.new("https://w3id.org/biolink/vocab/Drug")
gene_type         = RDF::URI.new("http://edamontology.org/data_2610")
gene_core_type    = RDF::URI.new("https://w3id.org/biolink/vocab/Gene")
protein_type      = RDF::URI.new("http://edamontology.org/data_2291")
protein_core_type = RDF::URI.new("https://w3id.org/biolink/vocab/Protein")

# The output directory must exist before either output file can be opened.
Dir.mkdir('./graph') unless Dir.exist?('./graph')

graphing_errors = File.open('./graph/drug-gene-errors.txt', 'w')

# Lookup failures already reported, keyed by [kind, id] so that a drug and a
# gene sharing the same ID string are each reported once.
failures = {}

# Report a failed lookup on stderr and in the error log, once per kind and ID.
report_failure = lambda do |kind, id|
  key = [kind, id]
  next if failures[key]

  failures[key] = 1
  message = "#{kind} lookup failed #{id}"
  warn message
  graphing_errors.write "#{message}\n"
end

begin
  # Process each entity relation
  entity_relations.each do |row|
    drug_id = row['Drug_id']
    gene_id = row['Gene_id']

    # Find corresponding mappings; a relation is skipped when either side is
    # unmapped (the drug is checked first, so only it is reported in that case).
    drug = drug_by_id[drug_id]
    gene = gene_by_id[gene_id]

    unless drug
      report_failure.call(:drug, drug_id)
      next
    end
    unless gene
      report_failure.call(:gene, gene_id)
      next
    end

    # Drug side.
    # Columns: demokratisid,xref,demokratis_label,pubchem_cid,IUPACname
    # NOTE(review): IUPACname values in the mapping file are percent-encoded
    # (e.g. "Valproic%20acid") and are emitted as labels verbatim - confirm
    # whether they should be decoded first.
    pubchem_uri = RDF::URI.new(drug['pubchem_cid'])
    demokratis_drug_label = RDF::Literal.new(drug['demokratis_label'])
    iupac_drug_label = RDF::Literal.new(drug['IUPACname'])

    # Gene and protein side; both carry the same preferred name.
    # Columns: source,ensembl,uniprot,taxon,prefname
    gene_uri = RDF::URI.new(gene['ensembl'])
    demokratis_gene_label = RDF::Literal.new(gene['prefname'])
    protein_uri = RDF::URI.new(gene['uniprot'])
    demokratis_protein_label = RDF::Literal.new(gene['prefname'])
    taxon = RDF::URI.new(gene['taxon'])

    # Every statement about this relation lives in its own named graph.
    context_uri = RDF::URI.new("urn:simpathic:context:#{drug_id}_#{gene_id}")

    [
      [pubchem_uri, SIMPATHIC['associated-with'], gene_uri],

      [pubchem_uri,       RDFS.label, demokratis_drug_label],
      [pubchem_uri,       RDFS.label, iupac_drug_label],
      [pubchem_uri,       RDF.type,   pubchem_type],
      [pubchem_uri,       RDF.type,   pubchem_core_type],
      [pubchem_type,      RDFS.label, RDF::Literal.new("PubChem")],
      [pubchem_core_type, RDFS.label, RDF::Literal.new("Drug")],
      [pubchem_uri,       SIMPATHIC['original-id'], RDF::Literal.new("#{drug_id}")],

      [gene_uri,       RDFS.label, demokratis_gene_label],
      [gene_uri,       RDF.type,   gene_type],
      [gene_uri,       RDF.type,   gene_core_type],
      [gene_type,      RDFS.label, RDF::Literal.new("ENSEMBL")],
      [gene_core_type, RDFS.label, RDF::Literal.new("Gene")],
      [gene_uri,       SIMPATHIC['original-id'], RDF::Literal.new("#{gene_id}")],
      [gene_uri,       SIMPATHIC['in-taxon'], taxon],

      [pubchem_uri, SIMPATHIC['associated-with'], protein_uri],

      [protein_uri,       RDFS.label, demokratis_protein_label],
      [protein_uri,       RDF.type,   protein_type],
      [protein_uri,       RDF.type,   protein_core_type],
      [protein_type,      RDFS.label, RDF::Literal.new("UniProt")],
      [protein_core_type, RDFS.label, RDF::Literal.new("Protein")],
      [protein_uri,       SIMPATHIC['original-id'], RDF::Literal.new("#{gene_id}")],
      [protein_uri,       SIMPATHIC['in-taxon'], taxon],

      # Provenance of the relation itself.
      [context_uri, SIMPATHIC['skg-source'],   RDF::Literal.new("DEMOKRATIS")],
      [context_uri, SIMPATHIC['skg-evidence'], RDF::Literal.new("TBD")]
    ].each do |subject, predicate, object|
      graph << RDF::Statement.new(subject, predicate, object, graph_name: context_uri)
    end
  end

  # Write RDF to file in N-Quads format
  File.open('./graph/drug-gene.nq', 'w') do |f|
    RDF::Writer.for(:nquads).new(f) do |writer|
      writer << graph
    end
  end
ensure
  # Flush and release the error log even if graph construction raised.
  graphing_errors.close unless graphing_errors.closed?
end

puts "RDF quads written"

gene lookup failed C1421313
gene lookup failed C1418776
gene lookup failed C1417581
gene lookup failed C3538756
gene lookup failed C3813713
gene lookup failed C1418482
gene lookup failed C1538300
gene lookup failed C1883559
gene lookup failed C1420870
gene lookup failed C2239937
gene lookup failed C1422657
gene lookup failed C1415287
gene lookup failed C1823437
gene lookup failed C1415299
gene lookup failed C1415300
gene lookup failed C1366488
gene lookup failed C1413104
gene lookup failed C1337109
gene lookup failed C1332830
gene lookup failed C1416903
drug lookup failed C0040815
drug lookup failed C0056077
drug lookup failed C0110177
drug lookup failed C0127400
gene lookup failed C1420220
gene lookup failed C1423842
gene lookup failed C1414988
gene lookup failed C1417565
gene lookup failed C3540116
gene lookup failed C1413900
gene lookup failed C1419027
gene lookup failed C2829972
gene lookup failed C1412113
gene lookup failed C1413949
gene lookup failed C1539487
drug lookup failed C

drug lookup failed C0030049
drug lookup failed C1120149
drug lookup failed C2698875
drug lookup failed C0729511
drug lookup failed C0010194
drug lookup failed C1096766
drug lookup failed C0034283
drug lookup failed C0171368
drug lookup failed C0058185
drug lookup failed C0025627
drug lookup failed C0085170
drug lookup failed C0008931
drug lookup failed C0138666
drug lookup failed C0061223
drug lookup failed C0128170
drug lookup failed C0039623
drug lookup failed C0534301
drug lookup failed C0048255
drug lookup failed C0068897
drug lookup failed C0054863
drug lookup failed C0039654
drug lookup failed C0024002
drug lookup failed C0754659
drug lookup failed C1176316
drug lookup failed C0023556
drug lookup failed C0084563
drug lookup failed C0163557
drug lookup failed C0066646
drug lookup failed C0002333
drug lookup failed C0348029
drug lookup failed C0242546
drug lookup failed C0720523
drug lookup failed C0037556
gene lookup failed C1334074
gene lookup failed C1416788
gene lookup failed C

RDF quads written


In [18]:
# Make sure the lookup-error log is closed. The graph-building cell already
# closes it when it finishes, so only close here if the handle is still open
# (e.g. that cell was interrupted part-way through).
graphing_errors.close unless graphing_errors.closed?