turn the Jaspar experiments  https://github.com/TomConlin/Jaspar_FA into RDF to load

In [1]:
import csv
import hashlib

In [2]:
# return a deterministic digest of input
# the 'b' is an experiment forcing the first char to be
# non numeric but valid hex
# which is in no way required for RDF
# but can help when using the identifier in other contexts
def digest_id(wordage):
    return 'b' + hashlib.sha1(wordage.encode('utf-8')).hexdigest()[1:20]

In [3]:
# stand in till I expand the curises to full IRI
def write_triple(sub, prd, obj):
    triples.append(sub + ' ' + prd + ' ' + obj + ' .\n')


In [4]:
regions = {    
    '1k' : {
        'f1' : 'gene_motifsetsig_1k.tab'
    }, 
    '2k' : {
        'f1' : 'gene_motifsetsig_1k.tab'
    },
    '5k' : {
        'f1' : 'gene_motifsetsig_1k.tab'
    }
}

The triples to produce form these files are:

    # <NCBIGene_123><SO:adjacent_to><BNODE:gene_upstream_region>
    # <BNODE:gene_upstream_region><rdfs:label><gene_upstream_region>
    # <BNODE:gene_upstream_region><GENO:has_extent><1000 (region extent bp)>
    # <BNODE:gene_upstream_region><rdf:type><SO:five_prime_flanking_region>
    # <BNODE:gene_upstream_region><rdf:comment><Candidate SO:TF_binding_sites>
    # <BNODE:motif_set><rdfs:label><motif_set_sig>
    # <BNODE:motif_set><rdf:type><SIO:collection>


In [5]:
triples = []

for extent in regions:
    # print(extent)
    # print(regions[extent])
    for fname in regions[extent]:
       # print(fname)
       # print(regions[extent][fname])    
        with open(regions[extent][fname], 'r') as tabfile:
            filereader = csv.reader(tabfile, delimiter='\t')
            line_counter = 0
            for row in filereader:
                line_counter += 1     
                (entrezid, motifsetsig, count) = row
                gene = 'NCBIGene:' + str(entrezid)
                region_label = gene + '_upstream_' + extent
                region = '_:' + digest_id(region_label)
                motifset = '_:' + digest_id(motifsetsig)
    
                # using the triple templates extracted from the target model
    
                # <NCBIGene_123><SO:adjacent_to><BNODE:gene_upstream_region>
                write_triple(gene, 'SO:adjecent_to', region)
                # <BNODE:gene_upstream_region><rdfs:label><gene_upstream_region>
                write_triple(region, 'rdfs:label', region_label)
                # <BNODE:gene_upstream_region><GENO:has_extent><1000>
                write_triple(region, 'GENO:has_extent', extent)
                # <BNODE:gene_upstream_region><rdf:type><SO:five_prime_flanking_region>
                write_triple(region, 'rdf:type', 'SO:five_prime_flanking_region')
                # <BNODE:motif_set><rdfs:label><motif_set_sig>
                write_triple(motifset, 'rdfs:label', motifsetsig)
                # <BNODE:motif_set><rdf:type><SIO:collection>
                write_triple(motifset, 'rdf:type', 'SIO:collection')

In [6]:
len(triples)

237186

In [7]:
print(triples[1])
print(triples[len(triples)-1])

_:b8824ba57445ddf26fae rdfs:label NCBIGene:85460_upstream_1k .

_:b6589fc6ab0dc82cf120 rdf:type SIO:collection .



splitting the motifset count out b/c it is independent of the gene/regions

    # <BNODE:motif_set><rdf:value><3>
    
in `motifsetsig_count.tab`

In [8]:
with open('motifsetsig_count.tab', 'r') as tabfile:
    filereader = csv.reader(tabfile, delimiter='\t')
    for row in filereader:
        line_counter += 1     
        (motifsetsig, count) = row
        motifset = '_:' + digest_id(motifsetsig)
        # <BNODE:motif_set><rdf:value><3>
        write_triple(motifset, 'rdf:value', count)

In [9]:
print(len(triples))
print(triples[len(triples)-1])

243211
_:b6589fc6ab0dc82cf120 rdf:value -1 .



    # <BNODE:motif_set><OIO:subset><BNODE:motif_set>

comes from `motifsetsig_subset.tab`

In [10]:
with open('motifsetsig_subset.tab', 'r') as tabfile:
    filereader = csv.reader(tabfile, delimiter='\t')
    for row in filereader:     
        (motifsetsig, subsetsig) = row
        motifset = '_:' + digest_id(motifsetsig)
        subset = '_:' + digest_id(subsetsig)
        # <BNODE:motif_set><OIO:subset><BNODE:motif_set>
        write_triple(motifset, 'OIO:subset', subset)

In [11]:
print(len(triples))
print(triples[len(triples)-1])

250890
_:b1d5781111d84f7b3fe4 OIO:subset _:ba4b9237bacccdf19c07 .



    # <BNODE:motif_set><OIO:hasdbxref><http:JASPAR:motif>
    # <BNODE:motif_set><RO:has member><JASPAR:motif> 
    
will just do the first with data from `motifsetsig_motif.tab`

In [13]:
with open('motifsetsig_motif.tab', 'r') as tabfile:
    filereader = csv.reader(tabfile, delimiter='\t')
    for row in filereader:     
        (motifsetsig, motif) = row
        motifset = '_:' + digest_id(motifsetsig)
        # <BNODE:motif_set><OIO:hasdbxref><http:JASPAR:motif>
        write_triple(motifset, 'OIO:hasdbxref', "JASPAR:" + motif)

In [14]:
print(len(triples))
print(triples[len(triples)-1])

302150
_:b093da02f1d652201da3 OIO:hasdbxref JASPAR:MA0940.1 .



    # <BNODE:gene_upstream_region><RO:member of><BNODE:gene_jaccard_value>
    # <BNODE:gene2_upstream_region><RO:member of><BNODE:gene_jaccard_value>
    
    # <BNODE:gene_jaccard_value><rdfs:label><gene1_region gene2_region>
    # <BNODE:gene_jaccard_value><SWO:Similarity score>	<0.73>
    # <BNODE:gene_jaccard_value><rdf:type><SWO:Jaccard’s index> 
    
 previously created    
    gene = 'NCBIGene:' + str(entrezid)
    region_label = gene + '_upstream_' + extent
    region = '_:' + digest_id(region_label)
    
 will need   
    (extent)    gene1    gene2    jaccard
    
 maybe also 
    
    (extent)    gene    dimotifsig
    
    
    
    
    