In [3]:
# Show the raw input files (sizes and timestamps) that feed the graph build.
puts %x(ls -l rawdata)

total 7716
-rw-rw-r-- 1 osboxes osboxes 3678593 Jul 15 10:40 disease-gene.csv
-rw-rw-r-- 1 osboxes osboxes 1419434 Jul 15 10:40 disease-therapeuticarea.csv
-rw-rw-r-- 1 osboxes osboxes   50286 Jul 15 10:40 drug-disease.csv
-rw-rw-r-- 1 osboxes osboxes   97134 Jul 15 10:40 drug-drugtype.csv
-rw-rw-r-- 1 osboxes osboxes  148014 Jul 15 10:40 drug-gene.csv
-rw-rw-r-- 1 osboxes osboxes     576 Jul 15 10:40 edges.csv
-rw-rw-r-- 1 osboxes osboxes  201313 Jul 15 10:40 gene-reactome.csv
-rw-rw-r-- 1 osboxes osboxes 2282825 Jul 15 10:40 Graph_edge_info.xlsx


In [4]:
# Peek at the header and first record of the drug-gene edge list.
puts %x(head -2 ./rawdata/drug-gene.csv)

"source","source_type","target","target_type"
"CHEMBL1201746","drug","ENSG00000228716","gene"


In [5]:
# Show the identifier mapping files and their error logs.
puts %x(ls -l mappings)

total 1052
drwxrwxr-x 2 osboxes osboxes   4096 Aug  8 14:52 DEPRECATED
-rw-rw-r-- 1 osboxes osboxes 378534 Jul 15 10:40 diseases-errors.txt
-rw-rw-r-- 1 osboxes osboxes 656312 Jul 15 10:40 diseases.map
-rw-rw-r-- 1 osboxes osboxes   1320 Aug 11 12:48 drugs-biologics-errors.txt
-rw-rw-r-- 1 osboxes osboxes  11009 Aug  8 15:18 drugs-errors.txt
-rw-rw-r-- 1 osboxes osboxes  88654 Aug 11 12:48 drugs.map
-rw-rw-r-- 1 osboxes osboxes 135160 Aug  8 14:45 genes.map


In [6]:
# Peek at the header and first record of the drug mapping.
puts %x(head -2 ./mappings/drugs.map)

chembl,label,CID,IUPACname
CHEMBL1200656,CHEMBL1200656,5284447,Natamycin



genes.map was lost during the large-file fiasco and had to be recreated, along with the code that generates it.

The recreated file should be identical to the previous version, but this is an assumption and has not been verified.


In [7]:
# Peek at the header and first record of the gene mapping.
puts %x(head -2 ./mappings/genes.map)

sourceid,label,geneid,protein,recommended_full,taxon
ENSG00000012779,ENSG00000012779,http://purl.uniprot.org/geneid/240,http://purl.uniprot.org/uniprot/P09917,Polyunsaturated fatty acid 5-lipoxygenase,http://purl.uniprot.org/taxonomy/9606


In [1]:
require 'linkeddata'
require 'rdf/nquads'
require 'csv'

# Build the drug-gene graph as N-Quads.
#
# Every row of ./rawdata/drug-gene.csv whose drug and gene both resolve in the
# mapping files is written as a small set of quads in its own named graph
# (urn:simpathic:context:<drug id>_<gene id>). Identifiers that cannot be
# resolved are reported once each, on stderr and in the errors file.
errors_path = './graph/drug-gene-errors.txt'
# This is the file the quads are written to. It is opened once in 'w' mode so
# that it is truncated at the start of every run; previously a *different* file
# (./graph/drug-gene.nq) was truncated while this one was appended to, so
# re-running the cell duplicated the whole graph.
output_path = './graph/drug-gene.nq.large'

# Define namespaces
SIMPATHIC = RDF::Vocabulary.new('urn:simpathic:')
RDFS = RDF::Vocabulary.new('http://www.w3.org/2000/01/rdf-schema#')

# Read input files
drug_mappings = CSV.read('./mappings/drugs.map', headers: true)
gene_mappings = CSV.read('./mappings/genes.map', headers: true)

# Index the mappings by their lookup key so each input row costs one hash
# lookup instead of a scan of the whole table. `||=` keeps the first row for a
# duplicated key, which is the row a linear `find` would have returned.
drugs_by_chembl = drug_mappings.each_with_object({}) { |mapping, index| index[mapping['chembl']] ||= mapping }
genes_by_sourceid = gene_mappings.each_with_object({}) { |mapping, index| index[mapping['sourceid']] ||= mapping }

# IDs (drug or gene) whose lookup failure has already been reported.
failures = {}

# Terms that are identical for every row.
pubchem_type = RDF::URI.new("http://semanticscience.org/resource/CHEMINF_000302")
pubchem_core_type = RDF::URI.new("https://w3id.org/biolink/vocab/Drug")
gene_type = RDF::URI.new("http://edamontology.org/data_1027")
gene_core_type = RDF::URI.new("https://w3id.org/biolink/vocab/Gene")
protein_type = RDF::URI.new("http://edamontology.org/data_2291")
protein_core_type = RDF::URI.new("https://w3id.org/biolink/vocab/Protein")
general_context = RDF::URI.new("urn:simpathic:context:all_metadata")
associated_with = SIMPATHIC['associated-with']
original_id = SIMPATHIC['original-id']
in_taxon = SIMPATHIC['in-taxon']

# Block forms guarantee both files are closed even if a row raises.
File.open(errors_path, 'w') do |graphing_errors|
  # Report a failed lookup once per ID, on stderr and in the errors file.
  report_failure = lambda do |kind, id|
    unless failures[id]
      failures[id] = 1
      message = "#{kind} lookup failed #{id}"
      warn message
      graphing_errors.write "#{message}\n"
    end
  end

  File.open(output_path, 'w') do |out|
    CSV.foreach('./rawdata/drug-gene.csv', col_sep: ",", quote_char: '"',
                liberal_parsing: true, headers: true) do |row|
      # "source","source_type","target","target_type"
      # "CHEMBL1201746","drug","ENSG00000228716","gene"
      drug_id = row['source']
      gene_id = row['target']

      drug = drugs_by_chembl[drug_id]
      unless drug
        report_failure.call('drug', drug_id)
        next
      end

      gene = genes_by_sourceid[gene_id]
      unless gene
        report_failure.call('gene', gene_id)
        next
      end

      # Drug mapping columns: chembl,label,CID,IUPACname
      #   CHEMBL1200656,CHEMBL1200656,5284447,Natamycin
      pubchem_uri = RDF::URI.new("https://pubchem.ncbi.nlm.nih.gov/compound/#{drug['CID']}")
      drug_label = RDF::Literal.new(drug['label'])
      iupac_drug_label = RDF::Literal.new(drug['IUPACname'])

      # Gene mapping columns: sourceid,label,geneid,protein,recommended_full,taxon
      #   ENSG00000091831,ENSG00000091831,http://purl.uniprot.org/geneid/2099,http://purl.uniprot.org/uniprot/P03372,Estrogen receptor,http://purl.uniprot.org/taxonomy/9606
      gene_uri = RDF::URI.new(gene['geneid'])
      gene_label = RDF::Literal.new(gene['label'])
      protein_uri = RDF::URI.new(gene['protein'])
      biovista_protein_label = RDF::Literal.new(gene['recommended_full'])
      taxon = RDF::URI.new(gene['taxon'])

      # One named graph per drug/gene pair.
      context_uri = RDF::URI.new("urn:simpathic:context:#{drug_id}_#{gene_id}")

      # A fresh repository per row: there are hundreds of thousands of rows and
      # a single accumulated graph gets too big for memory.
      graph = RDF::Repository.new

      # Drug <-> gene association
      graph << RDF::Statement.new(pubchem_uri, associated_with, gene_uri, graph_name: context_uri)
      graph << RDF::Statement.new(gene_uri, associated_with, pubchem_uri, graph_name: context_uri)

      # Drug description
      graph << RDF::Statement.new(pubchem_uri, RDFS.label, drug_label, graph_name: context_uri)
      graph << RDF::Statement.new(pubchem_uri, RDFS.label, iupac_drug_label, graph_name: context_uri)
      graph << RDF::Statement.new(pubchem_uri, RDF.type, pubchem_type, graph_name: context_uri)
      graph << RDF::Statement.new(pubchem_uri, RDF.type, pubchem_core_type, graph_name: context_uri)
      graph << RDF::Statement.new(pubchem_type, RDFS.label, RDF::Literal.new("PubChem"), graph_name: context_uri)
      graph << RDF::Statement.new(pubchem_core_type, RDFS.label, RDF::Literal.new("Drug"), graph_name: context_uri)
      graph << RDF::Statement.new(pubchem_uri, original_id, RDF::Literal.new("#{drug_id}"), graph_name: context_uri)

      # Gene description
      graph << RDF::Statement.new(gene_uri, RDFS.label, gene_label, graph_name: context_uri)
      graph << RDF::Statement.new(gene_uri, RDF.type, gene_type, graph_name: context_uri)
      graph << RDF::Statement.new(gene_uri, RDF.type, gene_core_type, graph_name: context_uri)
      graph << RDF::Statement.new(gene_type, RDFS.label, RDF::Literal.new("NCBI Gene"), graph_name: context_uri)
      graph << RDF::Statement.new(gene_core_type, RDFS.label, RDF::Literal.new("Gene"), graph_name: context_uri)
      graph << RDF::Statement.new(gene_uri, original_id, RDF::Literal.new("#{gene_id}"), graph_name: context_uri)
      graph << RDF::Statement.new(gene_uri, in_taxon, taxon, graph_name: context_uri)

      # Drug <-> protein association
      graph << RDF::Statement.new(pubchem_uri, associated_with, protein_uri, graph_name: context_uri)
      graph << RDF::Statement.new(protein_uri, associated_with, pubchem_uri, graph_name: context_uri)

      # Protein description. The protein's original-id is the gene ID from the
      # raw data, because that is the identifier the protein was mapped from.
      graph << RDF::Statement.new(protein_uri, RDFS.label, biovista_protein_label, graph_name: context_uri)
      graph << RDF::Statement.new(protein_uri, RDF.type, protein_type, graph_name: context_uri)
      graph << RDF::Statement.new(protein_uri, RDF.type, protein_core_type, graph_name: context_uri)
      graph << RDF::Statement.new(protein_type, RDFS.label, RDF::Literal.new("UniProt"), graph_name: context_uri)
      graph << RDF::Statement.new(protein_core_type, RDFS.label, RDF::Literal.new("Protein"), graph_name: context_uri)
      graph << RDF::Statement.new(protein_uri, original_id, RDF::Literal.new("#{gene_id}"), graph_name: context_uri)
      graph << RDF::Statement.new(protein_uri, in_taxon, taxon, graph_name: context_uri)

      # Provenance of this context lives in the shared metadata graph.
      graph << RDF::Statement.new(context_uri, SIMPATHIC['skg-source'], RDF::Literal.new("Radboud"), graph_name: general_context)
      #   graph << RDF::Statement.new(context_uri, SIMPATHIC['evidence'], RDF::URI.new(evidence))
      #   graph << RDF::Statement.new(context_uri, SIMPATHIC['score'], RDF::Literal.new(score))

      # Serialize this row's quads onto the already-open output file.
      RDF::Writer.for(:nquads).new(out) do |writer|
        writer << graph
      end
    end
  end
end
warn "completed graph building"

puts "RDF quads written"

drug lookup failed CHEMBL2108222
drug lookup failed CHEMBL3991432
drug lookup failed CHEMBL2362016
gene lookup failed ENSG00000123201
drug lookup failed CHEMBL1201630
drug lookup failed CHEMBL3301581
gene lookup failed ENSG00000106123
drug lookup failed CHEMBL1201572
drug lookup failed CHEMBL1201629
drug lookup failed CHEMBL2107857
drug lookup failed CHEMBL1396
drug lookup failed CHEMBL2108278
drug lookup failed CHEMBL1201608
drug lookup failed CHEMBL2107869
drug lookup failed CHEMBL1201580
drug lookup failed CHEMBL3039583
drug lookup failed CHEMBL1201657
drug lookup failed CHEMBL1742990
drug lookup failed CHEMBL4297762
drug lookup failed CHEMBL2103749
drug lookup failed CHEMBL1201661
drug lookup failed CHEMBL2354773
gene lookup failed ENSG00000211891
completed graph building


RDF quads written


# Experiments below

In [6]:
# Experiment: write a single hand-built quad to ./graph/drug-gene.nq to check
# that N-Quads serialization of a named graph works.

# Create context URI
context_uri = RDF::URI.new("urn:simpathic:context:123_456")

# Create RDF graph
graph = RDF::Repository.new

# Add quads to graph using RDF::Statement
graph << RDF::Statement.new(RDF::URI.new("https://example.org"), RDFS.label, RDF::Literal.new("PubChem"), graph_name: context_uri)

warn "graph #{context_uri} built"
# Write RDF to file in N-Quads format. Opening once in 'w' mode with a block
# both truncates the file and guarantees the handle is closed; the earlier
# bare File.open(..., 'w') truncated the file but leaked its handle.
File.open('./graph/drug-gene.nq', 'w') do |f|
  RDF::Writer.for(:nquads).new(f) do |writer|
    writer << graph
  end
end
warn "end graph writing"


graph urn:simpathic:context:123_456 built
end graph writing
