# Datasets

A `Dataset` is a specialization of a `Resource` designed to register (upload) files together with their metadata.

In [1]:
from kgforge.core import KnowledgeGraphForge

In [2]:
import getpass

In [3]:
# Prompt for the access token interactively so it is never stored in the notebook.
token = getpass.getpass()

In [4]:
# Open the forge session from the demo Nexus configuration file, passing the token entered above.
forge = KnowledgeGraphForge("../../configurations/demo-forge-nexus.yml", token=token)

## Imports

In [5]:
from kgforge.core import Resource
from kgforge.specializations.resources import Dataset
import pandas as pd

## Creation with files

In [6]:
# List only the regular files of the data directory: `ls -p` appends "/" to directory
# names and grep drops those lines. Plain `grep` is used because `egrep` is obsolescent
# and the pattern needs no extended regular expression syntax.
! ls -p ../../data | grep -v /$

associations.tsv
my_data.xwz
persons.csv


In [7]:
jane = Resource(type="Person", name="Jane Doe")

In [8]:
# Create a Dataset named "Interesting Persons"; the forge session is passed to its constructor.
persons = Dataset(forge, name="Interesting Persons")

In [9]:
# Add the CSV file to the dataset. Once registered, it is printed as a DataDownload
# entry under `hasPart` (content size, content URL, digest, encoding format, name).
persons.add_files("../../data/persons.csv")

In [10]:
# Record Jane as a contributor. The dataset then prints a `contribution` of type
# Contribution whose `agent` wraps the Person resource.
persons.add_contribution(jane)

In [11]:
forge.register(persons)

<action> _register_one
<succeeded> True


In [12]:
print(persons)

{
    context: http://context.example.org
    id: https://bbp.epfl.ch/dke/kgforge/e2c922c6-afec-4f52-91c2-7617735ce52f
    type: Dataset
    contribution:
    {
        type: Contribution
        agent:
        {
            id:
            {
                type: Person
                name: Jane Doe
            }
            type: Agent
        }
    }
    hasPart:
    {
        type: DataDownload
        contentSize:
        {
            unitCode: bytes
            value: 163
        }
        contentUrl: https://staging.nexus.ocp.bbp.epfl.ch/v1/files/dke/kgforge/75c5b07c-7531-4f1c-8ee0-134041c0df9c
        digest:
        {
            algorithm: SHA-256
            value: 4ea2f323c33e1fde5b7e48ab14fda50490e56e55dd6b1b369113fc3615c4e7ea
        }
        encodingFormat: text/csv
        name: persons.csv
    }
    name: Interesting Persons
}


In [13]:
associations = Dataset(forge, name="Associations data")

In [14]:
associations.add_files("../../data/associations.tsv")

In [15]:
# Declare that this dataset derives from the already registered `persons` dataset.
# The printed `derivation` entity references that dataset's id suffixed with `?rev=1`.
associations.add_derivation(persons)

In [16]:
associations.add_contribution(jane)

In [17]:
forge.register(associations)

<action> _register_one
<succeeded> True


In [18]:
print(associations)

{
    context: http://context.example.org
    id: https://bbp.epfl.ch/dke/kgforge/23108bb4-0091-4f48-aa58-1561be453e76
    type: Dataset
    contribution:
    {
        type: Contribution
        agent:
        {
            id:
            {
                type: Person
                name: Jane Doe
            }
            type: Agent
        }
    }
    derivation:
    {
        type: Derivation
        entity:
        {
            id: https://bbp.epfl.ch/dke/kgforge/e2c922c6-afec-4f52-91c2-7617735ce52f?rev=1
            type: Dataset
            name: Interesting Persons
        }
    }
    hasPart:
    {
        type: DataDownload
        contentSize:
        {
            unitCode: bytes
            value: 506
        }
        contentUrl: https://staging.nexus.ocp.bbp.epfl.ch/v1/files/dke/kgforge/6d8e8c35-c5c5-4ad7-bde0-ab785e617d3e
        digest:
        {
            algorithm: SHA-256
            value: 9639abc864e91c645779f510ae5c06a1618941d569eb1af619c2767ba790230d
    

In [19]:
# Download the files added to the dataset ("files" source) into ./downloaded/.
associations.download("files", "./downloaded/")

In [20]:
! ls ./downloaded

associations.tsv


In [21]:
! rm -R ./downloaded

## Creation with resources

In [22]:
# Prepare the file as a distribution for a resource. The value returned by `forge.attach`
# is printed later in this notebook as `LazyAction(operation=Store.upload, ...)`.
# NOTE(review): presumably the upload itself happens when the resource is registered - confirm.
distribution_1 = forge.attach("../../data/associations.tsv")

In [23]:
distribution_2 = forge.attach("../../data/persons.csv")

In [24]:
jane = Resource(type="Person", name="Jane Doe", distribution=distribution_1)

In [25]:
john = Resource(type="Person", name="John Smith", distribution=distribution_2)

In [26]:
persons = [jane, john]

In [27]:
forge.register(persons)

<count> 2
<action> _register_many
<succeeded> True


In [28]:
dataset = Dataset(forge, name="Interesting people")

In [29]:
# Add the registered resources as parts of the dataset. Each one is printed under
# `hasPart` with its id (suffixed with `?rev=1`), type, distribution content URL and name.
dataset.add_parts(persons)

In [30]:
print(dataset)

{
    type: Dataset
    hasPart:
    [
        {
            id: https://bbp.epfl.ch/dke/kgforge/26d941f8-c2ce-4190-9655-cfa0ec5b0134?rev=1
            type: Person
            distribution:
            {
                contentUrl: https://staging.nexus.ocp.bbp.epfl.ch/v1/files/dke/kgforge/3e11cb25-7b86-4fb4-aeff-bd23afa9a441
            }
            name: Jane Doe
        }
        {
            id: https://bbp.epfl.ch/dke/kgforge/621b606c-4549-4c96-b83a-360df44bbdf9?rev=1
            type: Person
            distribution:
            {
                contentUrl: https://staging.nexus.ocp.bbp.epfl.ch/v1/files/dke/kgforge/f66cd57e-5df0-4c85-bc39-7b3df9e7cf63
            }
            name: John Smith
        }
    ]
    name: Interesting people
}


In [31]:
forge.register(dataset)

<action> _register_one
<succeeded> True


In [32]:
# Download the files attached to the dataset's parts ("parts" source) into ./downloaded/.
dataset.download("parts", "./downloaded/")

In [33]:
! ls ./downloaded

associations.tsv persons.csv


In [34]:
! rm -R ./downloaded

### specifying custom content-type

In [35]:
# Attach a file with an explicit content type. After registration, the value shows up
# as the distribution's `encodingFormat`.
data_file = forge.attach("../../data/my_data.xwz", content_type="application/xwz")

In [36]:
experiment = Resource(type="Experiment", name="generated data", distribution=data_file)

In [37]:
forge.register(experiment)

<action> _register_one
<succeeded> True


In [38]:
print(experiment)

{
    context: http://context.example.org
    id: https://bbp.epfl.ch/dke/kgforge/2993d9b9-fac4-4aa0-81f5-862a76d685dc
    type: Experiment
    distribution:
    {
        type: DataDownload
        contentSize:
        {
            unitCode: bytes
            value: 16
        }
        contentUrl: https://staging.nexus.ocp.bbp.epfl.ch/v1/files/dke/kgforge/84298cec-4bd3-4126-985a-11b152f21059
        digest:
        {
            algorithm: SHA-256
            value: df03e7e93f870c6731540b3cae26391670da682c7a8dbdd18448cbcfc4fb7981
        }
        encodingFormat: application/xwz
        name: my_data.xwz
    }
    name: generated data
}


## Creation from a dataframe

See the notebook `DataFrame IO.ipynb` for details on converting a Pandas DataFrame into instances of `Resource`.

### basics

In [39]:
dataframe = pd.read_csv("../../data/persons.csv")

In [40]:
dataframe

Unnamed: 0,type,name,distribution
0,Person,Marie Curie,../../data/scientists-database/marie_curie.txt
1,Person,Albert Einstein,../../data/scientists-database/albert_einstein...


In [41]:
# Convert the DataFrame rows into resources (two here, as reported when registering them).
# The `distribution` column is kept as a plain path string in this basic example:
# nothing is attached, so the printed parts show the path rather than a content URL.
persons = forge.from_dataframe(dataframe)

In [42]:
forge.register(persons)

<count> 2
<action> _register_many
<succeeded> True


In [43]:
dataset = Dataset(forge, name="Interesting people")

In [44]:
dataset.add_parts(persons)

In [45]:
print(dataset)

{
    type: Dataset
    hasPart:
    [
        {
            id: https://bbp.epfl.ch/dke/kgforge/cf0a7b63-85b4-4f28-8897-0df4846a93c1?rev=1
            type: Person
            distribution: ../../data/scientists-database/marie_curie.txt
            name: Marie Curie
        }
        {
            id: https://bbp.epfl.ch/dke/kgforge/3d96e638-9bee-4380-a080-7b29854b72c9?rev=1
            type: Person
            distribution: ../../data/scientists-database/albert_einstein.txt
            name: Albert Einstein
        }
    ]
    name: Interesting people
}


In [46]:
forge.register(dataset)

<action> _register_one
<succeeded> True


### advanced

In [47]:
# Load the tab-separated associations file, keeping only the columns used below.
dataframe = pd.read_csv(
    "../../data/associations.tsv",
    sep="\t",
    usecols=[
        "name",
        "type",
        "agent__type",
        "agent__name",
        "agent__gender__label",
        "distribution",
    ],
)

In [48]:
dataframe

Unnamed: 0,name,type,agent__type,agent__name,agent__gender__label,distribution
0,Curie Association,Association,Person,Marie Curie,female,../../data/scientists-database/marie_curie.txt
1,Einstein Association,Association,Person,Albert Einstein,male,../../data/scientists-database/albert_einstein...


In [49]:
# Replace each file path in the `distribution` column with the value returned by
# `forge.attach` (printed below as `LazyAction(operation=Store.upload, ...)`).
# The bound method is passed to `map` directly; no wrapping lambda is needed.
# Caution: this overwrites the column in place, so the cell is not idempotent.
# Running it a second time would hand the previous results back to `forge.attach`.
dataframe["distribution"] = dataframe["distribution"].map(forge.attach)

In [50]:
# Convert the rows into resources. With `nesting="__"`, a column such as
# `agent__gender__label` becomes the nested property agent -> gender -> label,
# as shown in the printed resource below.
# NOTE(review): `na="(missing)"` presumably names the cell value to treat as absent - confirm.
associations = forge.from_dataframe(dataframe, na="(missing)", nesting="__")

In [51]:
print(associations[0])

{
    type: Association
    agent:
    {
        type: Person
        gender:
        {
            label: female
        }
        name: Marie Curie
    }
    distribution: LazyAction(operation=Store.upload, args=['../../data/scientists-database/marie_curie.txt', None])
    name: Curie Association
}


In [52]:
forge.register(associations)

<count> 2
<action> _register_many
<succeeded> True


In [53]:
dataset = Dataset(forge, name="Interesting associations")

In [54]:
print(dataset)

{
    type: Dataset
    name: Interesting associations
}


In [55]:
dataset.add_parts(associations)

In [56]:
print(dataset)

{
    type: Dataset
    hasPart:
    [
        {
            id: https://bbp.epfl.ch/dke/kgforge/c9306dcb-db27-473d-9a83-90b762fbd277?rev=1
            type: Association
            distribution:
            {
                contentUrl: https://staging.nexus.ocp.bbp.epfl.ch/v1/files/dke/kgforge/98dc3465-2fa1-4117-b3a3-bd82851439b2
            }
            name: Curie Association
        }
        {
            id: https://bbp.epfl.ch/dke/kgforge/df29e12a-805a-4a91-a1a2-5330b49c5b59?rev=1
            type: Association
            distribution:
            {
                contentUrl: https://staging.nexus.ocp.bbp.epfl.ch/v1/files/dke/kgforge/1c716e5e-d460-4278-b1ad-3f8f54e13dcb
            }
            name: Einstein Association
        }
    ]
    name: Interesting associations
}


In [57]:
forge.register(dataset)

<action> _register_one
<succeeded> True


In [58]:
dataset.download("parts", "./downloaded/")

In [59]:
! ls ./downloaded

albert_einstein.txt marie_curie.txt


In [60]:
! rm -R ./downloaded