In [24]:
from grakn.client import *
import pandas as pd
import time
import collections
import numpy as np

# The purpose of this script is to create a report of which elements of a schema have data present.
KEYSPACE = "biograkn_V2prod"
URI = "localhost:1729"


# originally using grakn2prod, holoviews
# scikit-image, 0.18.1 - version required -    
# holoviews                 1.14.3            
# bokeh                     2.3.1    
##grakn 2.01 env graknprodv2_1

def report_values(URI, KEYSPACE, query_in, ret_var, attribute):
    """return a list from a match query, reports on one or more attrubuted for the type listed"""

    t1=time.time()
    lookup = dict(zip(ret_var, attribute))

    with Grakn.core_client(URI) as client:
        with client.session(KEYSPACE, SessionType.DATA) as session:
            #with session.transaction(TransactionType.WRITE) as tx:
            with session.transaction(TransactionType.READ) as tx:
                iterator=tx.query().match(query_in)
                a=(list(iterator))
                #answers = [ans.get('virus-name') for ans in iterator]
                #result = [ answer.value() for answer in answers ]
                live_df = pd.DataFrame()
                
                # this pulles attributes out of the answer object
                for answer in a:
                    row={lookup[i]: [answer.get(i).get_value()] for i in ret_var}
                    live_df = live_df.append(pd.DataFrame(row))
                live_df.reset_index(drop=True, inplace=True)
                
                #remove - , this seems to break the dash implementation
                live_df.columns = [i.replace('-','_') for i in live_df.columns]
   
    print(f"elapsed time {time.time()-t1}")            
    return live_df 

## HOW MANY VIRUSES ARE THERE IN THE DB
#there are 15 viruses in the database they appear to be relatives of covid.

def count_values(URI, KEYSPACE, type, limit):
    """ count_values(URI, KEYSPACE, virus, 10) """

    with Grakn.core_client(URI) as client:
        with client.session(KEYSPACE, SessionType.DATA) as session:
            #with session.transaction(TransactionType.WRITE) as tx:
            with session.transaction(TransactionType.READ) as tx:
                query_in=f"match $d isa {type}; limit {limit}; count;"
                ans=tx.query().match_aggregate(query_in)
                print(ans.get().as_int())

#get the set of viruses on each line and make a new colunm
#vir_prot_vir_rt.vs_set=
def virus_set(x):
    """ this takes in the whole dataframe, pass N columns in next iteration"""
    x['vs_set'] = set([x['genbank_id1'],x['uniprot_entry_name'], x['genbank_id2']])     
    x['vs_set_str'] = str(x['vs_set']) 
    return x

In [19]:
print('loading fresh data from the graph checking virus objects')
count_values(URI, KEYSPACE, 'virus', 100)

loading fresh data from the graph checking virus objects
15


## Collect the data from the graph and save it to the REPO. 
## Add a push the REPO


In [23]:

# virus level data - attributes plus discovery-orign and organism
## get all the virus names in the notebook, add thier relatedness attribute0.
# -- funciton: get attribute list from type

attribute= ['virus-name', 'genbank-id', 'identity-percentage', 'organism-name', 'country-name' ] #, 
ret_var=['vn', 'gid','idprc', 'on', 'cn'] #,'gva'

query_in=f"""match $v isa virus, has virus-name $vn, has genbank-id $gid, has identity-percentage $idprc;
$ova(hosting-organism: $o, hosted-virus: $v) isa organism-virus-hosting;
$o isa organism, has organism-name $on;
$discv (discovered-virus: $v, discovering-location: $c) isa discovery;
$c isa country, has country-name $cn;
"""     

#$g isa gene, has gene-symbol $gs, has entrez-id $entid;
#$gva(associated-virus-gene: $g, associated-virus: $v) isa gene-virus-association;

VirAttrib=report_values(URI, KEYSPACE, query_in, ret_var, attribute)
print(VirAttrib.shape)

VirAttrib['organism_name'] = VirAttrib['organism_name'].replace(['Homo sapiens (Human)'],'Human')

### not sure why duplicates are coming back from this query
### also some of the original viruses are not in the data set?
## this is related to the requirement for the gene virus association
## removing that updates teh list

VirAttrib=VirAttrib.drop_duplicates()
print(VirAttrib.shape)

############### write out the CSVs #############
VirAttrib.to_csv('virus_attributes.csv', index=False)

#if VirAttrib[organism-name] == Homo sapiens (Human)
#VirAttrib.where('Homo sapiens (Human)', 'Human', inplace=True)
# round trips
# virus1 > protien1 > virus2. - neighborhood analysis

attribute= ['virus-name1', 'uniprot-name', 'function-description', 'uniprot-entry-name',  'virus-name2', 'genbank-id1', 'genbank-id2'] #, , 'pathway-name', 'pathway-id'
ret_var=['vn1', 'un', 'fdesc', 'unpn', 'vn2', 'gid1', 'gid2'] #,'gva' , 'pwn', 'pwid'

query_in=f"""
match $v1 isa virus, has genbank-id $gid1, has virus-name $vn1; 
$p isa protein, has uniprot-name $un, has function-description $fdesc, has uniprot-entry-name $unpn;
(hosting-virus-protein: $p, associated-virus: $v1) isa protein-virus-association; 
$v2 isa virus, has genbank-id $gid2, has virus-name $vn2;
(hosting-virus-protein: $p, associated-virus: $v2) isa protein-virus-association; 
"""

vir_prot_vir_rt=report_values(URI, KEYSPACE, query_in, ret_var, attribute)
print(vir_prot_vir_rt.shape)

### THIS DOES NOT HAVE duplicates are coming back from this query
### maybe the round trips are reciprical
df=vir_prot_vir_rt.drop_duplicates()
print(df.shape)

# remove self referential paths - this might be a mistake since it will remove proteins that are onlu linked to one virus - we want all unique paths including terminal paths

#df=df[df['virus-name1'] != df['virus-name2']]
#print(f'dropped self referential paths, shape: {df.shape}')
###remove redundant relationships## i.e a->b == b->a

df = df.apply(virus_set, axis=1)
df = df.drop_duplicates('vs_set_str', keep='last')

# # subset by virus 1 and 2 (a or b) 
print(f'dropped reverse paths shape accounting for redundant IDs: {df.shape}')
df.head()

df.to_csv('vir_prot_vir_rt.csv', index=False)
vir_prot_vir_rt=df
vir_prot_vir_rt


elapsed time 0.550957202911377
(36, 5)
(19, 5)
elapsed time 2.6177940368652344
(695, 7)
(694, 7)


dropped reverse paths shape accounting for redundant IDs: (133, 9)


Unnamed: 0,virus_name1,uniprot_name,function_description,uniprot_entry_name,virus_name2,genbank_id1,genbank_id2,vs_set,vs_set_str
6,MHV,Heterogeneous nuclear ribonucleoprotein Q (hnR...,FUNCTION: Heterogenous nuclear ribonucleoprote...,HNRPQ_HUMAN,MHV,AC 000192,AC 000192,"{AC 000192, HNRPQ_HUMAN}","{'AC 000192', 'HNRPQ_HUMAN'}"
7,HCoV-229E,Aminopeptidase N (AP-N) (hAPN) (EC 3.4.11.2) (...,FUNCTION: Broad specificity aminopeptidase whi...,AMPN_HUMAN,HCoV-229E,MN306046,MN306046,"{MN306046, AMPN_HUMAN}","{'MN306046', 'AMPN_HUMAN'}"
8,HCoV-NL63,Angiotensin-converting enzyme 2 (EC 3.4.17.23)...,FUNCTION: Essential counter-regulatory carboxy...,ACE2_HUMAN,HCoV-NL63,MG772808,MG772808,"{MG772808, ACE2_HUMAN}","{'MG772808', 'ACE2_HUMAN'}"
11,SARS-CoV-2,Angiotensin-converting enzyme 2 (EC 3.4.17.23)...,FUNCTION: Essential counter-regulatory carboxy...,ACE2_HUMAN,HCoV-NL63,AY390556,MG772808,"{AY390556, MG772808, ACE2_HUMAN}","{'AY390556', 'MG772808', 'ACE2_HUMAN'}"
13,HCoV-229E,Fibroleukin (Fibrinogen-like protein 2) (pT49),FUNCTION: May play a role in physiologic lymph...,FGL2_HUMAN,HCoV-229E,MN306046,MN306046,"{MN306046, FGL2_HUMAN}","{'MN306046', 'FGL2_HUMAN'}"
...,...,...,...,...,...,...,...,...,...
682,SARS-CoV-2,Induced myeloid leukemia cell differentiation ...,FUNCTION: Involved in the regulation of apopto...,MCL1_HUMAN,SARS-CoV[GZ02],AY390556,AY390556,"{AY390556, MCL1_HUMAN}","{'AY390556', 'MCL1_HUMAN'}"
685,SARS-CoV-2,Caveolin-1,FUNCTION: May act as a scaffolding protein wit...,CAV1_HUMAN,SARS-CoV[GZ02],AY390556,AY390556,"{AY390556, CAV1_HUMAN}","{'AY390556', 'CAV1_HUMAN'}"
688,SARS-CoV-2,Proteasome subunit alpha type-2 (EC 3.4.25.1) ...,FUNCTION: Component of the 20S core proteasome...,PSA2_HUMAN,SARS-CoV[GZ02],AY390556,AY390556,"{PSA2_HUMAN, AY390556}","{'PSA2_HUMAN', 'AY390556'}"
691,SARS-CoV-2,Intraflagellar transport-associated protein (P...,FUNCTION: Seems to play a role in ciliary BBSo...,IFTAP_HUMAN,SARS-CoV[GZ02],AY390556,AY390556,"{IFTAP_HUMAN, AY390556}","{'IFTAP_HUMAN', 'AY390556'}"


In [None]:

## Get the data from 
- virus graph github 
- or local file and parse.

In [17]:
VirAttrib.head()

Unnamed: 0,virus_name,genbank_id,identity_percentage,organism_name,country_name
0,IBV,KY421672,48.38,Avian,China
1,MHV,AC 000192,50.23,Mouse,USA
2,SARS-CoV[NS-1],AY508724,79.69,Human,China
4,SARS-COV[GD01],AY278489,79.65,Human,China
6,SARS,AY390556,79.7,Human,China
