# Datasets

A `Dataset` is a specialization of `Resource` used to register (upload) files together with their metadata.

In [1]:
from kgforge.core import KnowledgeGraphForge

In [2]:
import getpass

In [3]:
# Prompt for the token interactively so that it is not hardcoded in the notebook.
token = getpass.getpass()

In [4]:
# Create the forge session from the demo Nexus configuration file, passing the token collected above.
forge = KnowledgeGraphForge("../../configurations/demo-forge-nexus.yml", token=token)

## Imports

In [5]:
from kgforge.core import Resource

In [6]:
from kgforge.specializations.resources import Dataset

In [7]:
import pandas as pd

In [8]:
import json

In [9]:
def pp(x):
    """Pretty-print ``x`` as JSON, indented by four spaces."""
    rendered = json.dumps(x, indent=4)
    print(rendered)

## Creation with files

In [10]:
# List only the regular files of the data directory: `ls -p` appends "/" to directories and `grep -v` drops those lines.
# `grep` is used instead of the obsolescent `egrep`, and the pattern is quoted so that it reaches grep unchanged.
! ls -p ../../data | grep -v '/$'

associations.tsv
persons.csv


In [11]:
jane = Resource(type="Person", name="Jane Doe")

In [12]:
persons = Dataset(forge, name="Interesting Persons")

In [13]:
persons.add_files("../../data/persons.csv")

In [14]:
persons.add_contribution(jane)

In [15]:
forge.register(persons)

<action> _register_one
<succeeded> True


In [16]:
# Show the registered dataset as pretty-printed JSON (print(persons) would give the forge's plain-text rendering instead).
pp(forge.as_json(persons))

{
    "id": "https://bbp.epfl.ch/dke/kgforge_tests/d065dcb0-3417-4f3b-81f0-513e720beb02",
    "type": "Dataset",
    "contribution": {
        "type": "Contribution",
        "agent": {
            "id": {
                "type": "Person",
                "name": "Jane Doe"
            },
            "type": "Agent"
        }
    },
    "hasPart": {
        "type": "DataDownload",
        "contentSize": {
            "unitCode": "bytes",
            "value": 163
        },
        "contentUrl": "https://staging.nexus.ocp.bbp.epfl.ch/v1/files/dke/kgforge_tests/f20526f9-c564-4545-b577-130d4030e750",
        "digest": {
            "algorithm": "SHA-256",
            "value": "4ea2f323c33e1fde5b7e48ab14fda50490e56e55dd6b1b369113fc3615c4e7ea"
        },
        "encodingFormat": "text/csv",
        "name": "persons.csv"
    },
    "name": "Interesting Persons"
}


In [17]:
associations = Dataset(forge, name="Associations data")

In [18]:
associations.add_files("../../data/associations.tsv")

In [19]:
# Record that this dataset derives from the already-registered `persons` dataset; the JSON printed
# further down shows it under "derivation", referenced by its id with a "?rev=1" suffix.
associations.add_derivation(persons)

In [20]:
associations.add_contribution(jane)

In [21]:
forge.register(associations)

<action> _register_one
<succeeded> True


In [22]:
# Show the registered dataset as pretty-printed JSON (print(associations) would give the forge's plain-text rendering instead).
pp(forge.as_json(associations))

{
    "id": "https://bbp.epfl.ch/dke/kgforge_tests/fed316f9-bee8-4ad3-9d45-041b86657237",
    "type": "Dataset",
    "contribution": {
        "type": "Contribution",
        "agent": {
            "id": {
                "type": "Person",
                "name": "Jane Doe"
            },
            "type": "Agent"
        }
    },
    "derivation": {
        "type": "Derivation",
        "entity": {
            "id": "https://bbp.epfl.ch/dke/kgforge_tests/d065dcb0-3417-4f3b-81f0-513e720beb02?rev=1",
            "type": "Dataset",
            "name": "Interesting Persons"
        }
    },
    "hasPart": {
        "type": "DataDownload",
        "contentSize": {
            "unitCode": "bytes",
            "value": 506
        },
        "contentUrl": "https://staging.nexus.ocp.bbp.epfl.ch/v1/files/dke/kgforge_tests/c39aa4a3-f712-4422-aa5a-51b5838c966a",
        "digest": {
            "algorithm": "SHA-256",
            "value": "9639abc864e91c645779f510ae5c06a1618941d569eb1af619c2767

In [23]:
# Download the dataset's files into ./downloaded/; the directory listing in the next cell shows associations.tsv.
associations.download("files", "./downloaded/")

In [24]:
! ls ./downloaded

associations.tsv


In [25]:
! rm -R ./downloaded

## Creation with resources

In [26]:
# forge.attach returns a lazy value rather than an uploaded file: a later cell in this notebook prints
# one as LazyAction(operation=Store.upload, ...). Presumably the upload runs at registration — TODO confirm.
distribution_1 = forge.attach("../../data/associations.tsv")

In [27]:
distribution_2 = forge.attach("../../data/persons.csv")

In [28]:
jane = Resource(type="Person", name="Jane Doe", distribution=distribution_1)

In [29]:
john = Resource(type="Person", name="John Smith", distribution=distribution_2)

In [30]:
# NOTE: `persons` is rebound here from the Dataset created in the previous section to a plain list of Resources.
persons = [jane, john]

In [31]:
forge.register(persons)

<count> 2
<action> _register_many
<succeeded> True


In [32]:
dataset = Dataset(forge, name="Interesting people")

In [33]:
# Add the registered resources as parts of the dataset; they appear under "hasPart" in the JSON printed below.
dataset.add_parts(persons)

In [34]:
# Show the dataset as pretty-printed JSON before registering it (print(dataset) would give the forge's plain-text rendering instead).
pp(forge.as_json(dataset))

{
    "type": "Dataset",
    "hasPart": [
        {
            "id": "https://bbp.epfl.ch/dke/kgforge_tests/0e0e2e2e-ab9b-451b-b0d0-8ad07e75d085?rev=1",
            "type": "Person",
            "distribution": {
                "contentUrl": "https://staging.nexus.ocp.bbp.epfl.ch/v1/files/dke/kgforge_tests/c110f43e-d29e-484d-8b3f-9992c7756fbf"
            },
            "name": "Jane Doe"
        },
        {
            "id": "https://bbp.epfl.ch/dke/kgforge_tests/2b3e93a0-8ed3-4611-befe-5df1658214ef?rev=1",
            "type": "Person",
            "distribution": {
                "contentUrl": "https://staging.nexus.ocp.bbp.epfl.ch/v1/files/dke/kgforge_tests/86d44420-0e37-4406-bcdb-ae67d5b67760"
            },
            "name": "John Smith"
        }
    ],
    "name": "Interesting people"
}


In [35]:
forge.register(dataset)

<action> _register_one
<succeeded> True


In [36]:
# Download into ./downloaded/ using the "parts" of the dataset; the listing in the next cell shows
# associations.tsv and persons.csv, i.e. the files attached to the two parts.
dataset.download("parts", "./downloaded/")

In [37]:
! ls ./downloaded

associations.tsv persons.csv


In [38]:
! rm -R ./downloaded

## Creation from a dataframe

See the notebook `DataFrame IO.ipynb` for details on converting a Pandas DataFrame into instances of `Resource`.

### Basics

In [39]:
dataframe = pd.read_csv("../../data/persons.csv")

In [40]:
dataframe

Unnamed: 0,type,name,distribution
0,Person,Marie Curie,../../data/scientists-database/marie_curie.txt
1,Person,Albert Einstein,../../data/scientists-database/albert_einstein...


In [41]:
persons = forge.from_dataframe(dataframe)

In [42]:
forge.register(persons)

<count> 2
<action> _register_many
<succeeded> True


In [43]:
dataset = Dataset(forge, name="Interesting people")

In [44]:
dataset.add_parts(persons)

In [45]:
print(dataset)

{
    type: Dataset
    hasPart:
    [
        {
            id: https://bbp.epfl.ch/dke/kgforge_tests/f39167ae-c61b-4a2b-aa11-dca4a7001c1a?rev=1
            type: Person
            distribution: ../../data/scientists-database/marie_curie.txt
            name: Marie Curie
        }
        {
            id: https://bbp.epfl.ch/dke/kgforge_tests/e797d7c9-0016-4d06-ad48-0d8e407d8ced?rev=1
            type: Person
            distribution: ../../data/scientists-database/albert_einstein.txt
            name: Albert Einstein
        }
    ]
    name: Interesting people
}


In [46]:
forge.register(dataset)

<action> _register_one
<succeeded> True


### Advanced

In [47]:
# Read the tab-separated associations file, keeping only the columns used to build the resources.
dataframe = pd.read_csv(
    "../../data/associations.tsv",
    sep="\t",
    usecols=[
        "name",
        "type",
        "agent__type",
        "agent__name",
        "agent__gender__label",
        "distribution",
    ],
)

In [48]:
dataframe

Unnamed: 0,name,type,agent__type,agent__name,agent__gender__label,distribution
0,Curie Association,Association,Person,Marie Curie,female,../../data/scientists-database/marie_curie.txt
1,Einstein Association,Association,Person,Albert Einstein,male,../../data/scientists-database/albert_einstein...


In [49]:
# Replace each file path in the "distribution" column with the result of forge.attach for that path.
dataframe["distribution"] = dataframe["distribution"].map(forge.attach)

In [50]:
# nesting="__" makes column names such as agent__gender__label become nested properties
# (agent -> gender -> label), as the resource printed in the next cell shows.
# na="(missing)" presumably names the placeholder treated as a missing value — TODO confirm against the from_dataframe documentation.
associations = forge.from_dataframe(dataframe, na="(missing)", nesting="__")

In [51]:
print(associations[0])

{
    type: Association
    agent:
    {
        type: Person
        gender:
        {
            label: female
        }
        name: Marie Curie
    }
    distribution: LazyAction(operation=Store.upload, args=['../../data/scientists-database/marie_curie.txt'])
    name: Curie Association
}


In [52]:
forge.register(associations)

<count> 2
<action> _register_many
<succeeded> True


In [53]:
dataset = Dataset(forge, name="Interesting associations")

In [54]:
print(dataset)

{
    type: Dataset
    name: Interesting associations
}


In [55]:
dataset.add_parts(associations)

In [56]:
print(dataset)

{
    type: Dataset
    hasPart:
    [
        {
            id: https://bbp.epfl.ch/dke/kgforge_tests/1264fc5a-354b-49fc-8488-52a6a04f712a?rev=1
            type: Association
            distribution:
            {
                contentUrl: https://staging.nexus.ocp.bbp.epfl.ch/v1/files/dke/kgforge_tests/d3889e03-8b07-408d-961e-2e64376cba26
            }
            name: Curie Association
        }
        {
            id: https://bbp.epfl.ch/dke/kgforge_tests/6134eca2-cc3b-4d43-8d0e-0763037fb2b5?rev=1
            type: Association
            distribution:
            {
                contentUrl: https://staging.nexus.ocp.bbp.epfl.ch/v1/files/dke/kgforge_tests/d846c409-c7b0-4f5d-a833-cb1411e74945
            }
            name: Einstein Association
        }
    ]
    name: Interesting associations
}


In [57]:
forge.register(dataset)

<action> _register_one
<succeeded> True


In [58]:
dataset.download("parts", "./downloaded/")

In [59]:
! ls ./downloaded

albert_einstein.txt marie_curie.txt


In [60]:
! rm -R ./downloaded