# Instructions

The assignment consists of three tasks:

- Run the T-test for the means of two independent samples underlying the statement "IRE binding activity was significantly reduced in failing hearts" (originally published by Haddad et al. in https://doi.org/10.1093/eurheartj/ehw333) using the following example data.

| non-failing heart (NF) | failing heart (F) |
| ---------------------- | ----------------- |
| 95 | 50 |
| 103 | 35 |
| 99 | 21 | 
| &nbsp; | 15 | 
| &nbsp; | 7 | 
| &nbsp; | 40 |

- Describe the statistical hypothesis test in machine readable form following the [statistical methods ontology concept for "two sample t-test with unequal variance"](https://www.ebi.ac.uk/ols/ontologies/stato/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FSTATO_0000304) using Semantic Web technologies, namely the Resource Description Framework (RDF)
- Process the resulting machine readable description using Semantic Web technologies, namely the SPARQL Protocol and RDF Query Language.

Please return the assignment with all outputs visible (i.e., do not clear the outputs).

Good luck!

In [2]:
!pip install rdflib pandas scipy numpy 

Collecting rdflib
  Downloading https://files.pythonhosted.org/packages/3c/fe/630bacb652680f6d481b9febbb3e2c3869194a1a5fc3401a4a41195a2f8f/rdflib-4.2.2-py3-none-any.whl (344kB)
Collecting isodate (from rdflib)
  Downloading https://files.pythonhosted.org/packages/9b/9f/b36f7774ff5ea8e428fdcfc4bb332c39ee5b9362ddd3d40d9516a55221b2/isodate-0.6.0-py2.py3-none-any.whl (45kB)
Installing collected packages: isodate, rdflib
Successfully installed isodate-0.6.0 rdflib-4.2.2


In [3]:
# Import all required libraries (some are missing)
from rdflib import Graph, URIRef
from rdflib.namespace import RDF
import io
import re
import requests
import numpy as np
import pandas as pd
from  urllib.parse import quote
from scipy.stats import ttest_ind
from rdflib import Graph, URIRef, BNode, Literal
from rdflib.namespace import RDF, XSD, RDFS
from rdflib.plugins.sparql.results.csvresults import CSVResultSerializer

In [4]:
# Run the T-test for the means of two independent samples using the example data

# labels = ...
labels = ['non-failing heart (NF)', 'failing heart (F)']
#Data=...
data = [(99, 52), (96, 40), (100, 38), (105, 18), 
        (np.nan, 11), (np.nan, 5), (np.nan, 42), 
        (np.nan, 55), (np.nan, 53), (np.nan, 39),
        (np.nan, 42), (np.nan, 50)]

df = pd.DataFrame.from_records(data, columns=labels)
tt = ttest_ind(df['non-failing heart (NF)'], 
               df['failing heart (F)'], 
               equal_var=False, nan_policy='omit')

# Print the p-value

tt.pvalue


1.3111247517411591e-08

In [5]:
obo = dict()

obo['STATO_0000304'] = URIRef('http://purl.obolibrary.org/obo/STATO_0000304')
obo['OBI_0000299'] = URIRef('http://purl.obolibrary.org/obo/OBI_0000299')
obo['OBI_0000175'] = URIRef('http://purl.obolibrary.org/obo/OBI_0000175')
obo['OBI_0000293'] = URIRef('http://purl.obolibrary.org/obo/OBI_0000293')
obo['STATO_0000251'] = URIRef('http://purl.obolibrary.org/obo/STATO_0000251')
obo['BFO_0000051'] = URIRef('http://purl.obolibrary.org/obo/BFO_0000051')
obo['IAO_0000032'] = URIRef('http://purl.obolibrary.org/obo/IAO_0000032')
obo['OBI_0001938'] = URIRef('http://purl.obolibrary.org/obo/OBI_0001938')
obo['OBI_0001933'] = URIRef('http://purl.obolibrary.org/obo/OBI_0001933')
obo['OBI_0001931'] = URIRef('http://purl.obolibrary.org/obo/OBI_0001931')
obo['OBI_0002135'] = URIRef('http://purl.obolibrary.org/obo/OBI_0002135')
obo['OBI_0001937'] = URIRef('http://purl.obolibrary.org/obo/OBI_0001937')
obo['OBI_0000751'] = URIRef('http://purl.obolibrary.org/obo/OBI_0000751')
obo['STATO_0000205'] = URIRef('http://purl.obolibrary.org/obo/STATO_0000205')
obo['STATO_0000019'] = URIRef('http://purl.obolibrary.org/obo/STATO_0000019')
obo['STATO_0000129'] = URIRef('http://purl.obolibrary.org/obo/STATO_0000129')
obo['GO_0030350'] = URIRef('http://purl.obolibrary.org/obo/GO_0030350')

def create_ttest_description(df, tt):
    g = Graph()

    g.bind('obo', 'http://purl.obolibrary.org/obo/')
    g . bind('ex', 'http://example.org/')

    n1 = URIRef('https://doi.org/10.1093/eurheartj/ehw333')
    n2 = BNode()
    n3 = BNode()
    n4 = BNode()
    n5 = BNode()
    n6 = BNode()
    
    g.add((obo['STATO_0000304'], RDFS.label, Literal('two sample t-test with unequal variance')))
    g.add((obo['OBI_0000299'], RDFS.label, Literal('has specified output')))
    g.add((obo['OBI_0000293'], RDFS.label, Literal('has specified input')))
    g.add((obo['OBI_0000175'], RDFS.label, Literal('p-value')))
    g.add((obo['OBI_0001938'], RDFS.label, Literal('has value specification')))
    g.add((obo['OBI_0001931'], RDFS.label, Literal('scalar value specification')))
    g.add((obo['OBI_0001937'], RDFS.label, Literal('has specified numeric value')))
    g.add((obo['GO_0030350'], RDFS.label, Literal('iron-responsive element binding')))
   
    
    g.add((n1, RDF.type, obo['STATO_0000304']))
    g.add((n1, obo['OBI_0000299'], n2))
    g.add((n2, RDF.type, obo['OBI_0000175']))
    g.add((n2, obo['OBI_0001938'], n3))
    g.add((n3, RDF.type, obo['OBI_0001931']))
    g.add((n3, obo['OBI_0001937'], Literal(tt.pvalue, datatype=XSD.double)))
    g.add((n1, obo['OBI_0000293'], n4))
    g.add((n4, RDF.type, obo['OBI_0000751']))
    g.add((n4, RDF.type, obo['GO_0030350']))
    
    for c in list(df):
        b1 = BNode()
        g.add((n1, obo['OBI_0000293'], b1))
        g.add((b1, RDF.type, obo['STATO_0000251']))
        for v in df[c]:
            if np.isnan(v): 
                continue
            b2 = BNode()
            b3 = BNode()
            g.add((b1, RDFS.label, Literal(c)))
            g.add((b1, obo['BFO_0000051'], b2))
            g.add((b2, obo['OBI_0001938'], b3))
            g.add((b3, RDF.type, obo['OBI_0001931']))
            g.add((b3, obo['OBI_0001937'], Literal(str(v), datatype=XSD.double)))

    return g

def query(g, q):
    serializer = CSVResultSerializer(g.query(q))
    output = io.BytesIO()
    serializer.serialize(output)
    return pd.read_csv(io.StringIO(output.getvalue().decode('utf-8')))

g = create_ttest_description(df, tt)

print(g.serialize(format='turtle').decode('utf-8'))

@prefix ex: <http://example.org/> .
@prefix obo: <http://purl.obolibrary.org/obo/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

obo:OBI_0000293 rdfs:label "has specified input" .

obo:OBI_0000299 rdfs:label "has specified output" .

obo:OBI_0001937 rdfs:label "has specified numeric value" .

obo:OBI_0001938 rdfs:label "has value specification" .

<https://doi.org/10.1093/eurheartj/ehw333> a obo:STATO_0000304 ;
    obo:OBI_0000293 [ a obo:GO_0030350,
                obo:OBI_0000751 ],
        [ a obo:STATO_0000251 ;
            rdfs:label "failing heart (F)" ;
            obo:BFO_0000051 [ obo:OBI_0001938 [ a obo:OBI_0001931 ;
                            obo:OBI_0001937 1.8e+01 ] ],
                [ obo:OBI_0001938 [ a obo:OBI_0001931 ;
                            obo:OBI_0001937 5.5e+01 ] ],
                

In [6]:
# Process the machine readable statistical hypothesis test by completing the following SPARQL query that returns the p-value
q = """
PREFIX obo: <http://purl.obolibrary.org/obo/>
SELECT ?pvalue WHERE {
  ?r a obo:STATO_0000304 .
  ?r obo:OBI_0000293 [ a obo:GO_0030350 ] .
  ?r obo:OBI_0000299 [
      a obo:OBI_0000175;
      obo:OBI_0001938[
          a obo:OBI_00019131;
          obo:OBI_0001937 ?pvalue
      ]
  ]
   ?r obo:OBI_0000293[
       a obo:OBI_0000751, obo:GO_0030350;
           ]   ]
  FILTER (?pvalue<0.005)
    
}
"""

#for qs in g.query(q):
print(q[0])



