# Datasets

A `Dataset` is a specialization of `Resource` designed to register (upload) files together with their metadata.

In [1]:
from kgforge.core import KnowledgeGraphForge

In [2]:
import getpass

In [3]:
# Prompt for the access token interactively so it is never written into the notebook.
token = getpass.getpass()

In [4]:
# Build the forge from the demo Nexus configuration file, passing the token entered above.
forge = KnowledgeGraphForge("../../configurations/demo-forge-nexus.yml", token=token)

## Imports

In [5]:
from kgforge.core import Resource

In [6]:
from kgforge.specializations.resources import Dataset

In [7]:
import pandas as pd

In [8]:
import json

In [9]:
def pp(x):
    """Pretty-print `x` as JSON text indented by four spaces."""
    rendered = json.dumps(x, indent=4)
    print(rendered)

## Creation with files

In [10]:
! ls -p ../../data | egrep -v /$

associations.tsv
my_data.xwz
persons.csv


In [11]:
# A plain Resource describing a person; it is passed to add_contribution() further down.
jane = Resource(type="Person", name="Jane Doe")

In [12]:
# Create a Dataset bound to the forge; files and provenance are added in the next cells.
persons = Dataset(forge, name="Interesting Persons")

In [13]:
# Add the CSV file to the dataset; once registered it shows up under "hasPart" as a DataDownload.
persons.add_files("../../data/persons.csv")

In [14]:
# Record Jane as contributor; the JSON printed below shows her under "contribution" / "agent".
persons.add_contribution(jane)

In [15]:
# Register the dataset through the forge; the output reports whether the action succeeded.
forge.register(persons)

<action> _register_one
<succeeded> True


In [16]:
# Show the registered dataset as JSON. `print(persons)` would show the same resource
# in the forge's own text rendering instead.
pp(forge.as_json(persons))

{
    "id": "https://bbp.epfl.ch/dke/kgforge_tests/43b30848-b2d5-4cff-8409-057a70a8c44a",
    "type": "Dataset",
    "contribution": {
        "type": "Contribution",
        "agent": {
            "id": {
                "type": "Person",
                "name": "Jane Doe"
            },
            "type": "Agent"
        }
    },
    "hasPart": {
        "type": "DataDownload",
        "contentSize": {
            "unitCode": "bytes",
            "value": 163
        },
        "contentUrl": "https://staging.nexus.ocp.bbp.epfl.ch/v1/files/dke/kgforge_tests/0017a10b-958d-42f7-90b0-b18aa6b41280",
        "digest": {
            "algorithm": "SHA-256",
            "value": "4ea2f323c33e1fde5b7e48ab14fda50490e56e55dd6b1b369113fc3615c4e7ea"
        },
        "encodingFormat": "text/csv",
        "name": "persons.csv"
    },
    "name": "Interesting Persons"
}


In [17]:
# A second dataset; it is declared below as derived from the "Interesting Persons" dataset.
associations = Dataset(forge, name="Associations data")

In [18]:
# Add the TSV file to this dataset.
associations.add_files("../../data/associations.tsv")

In [19]:
# Declare that this dataset derives from `persons`; the JSON printed below references
# that dataset by its id, including the revision.
associations.add_derivation(persons)

In [20]:
# Same contributor as for the first dataset.
associations.add_contribution(jane)

In [21]:
# Register the derived dataset through the forge.
forge.register(associations)

<action> _register_one
<succeeded> True


In [22]:
# Show the registered dataset as JSON, including its "derivation" entry.
# `print(associations)` would show it in the forge's own text rendering instead.
pp(forge.as_json(associations))

{
    "id": "https://bbp.epfl.ch/dke/kgforge_tests/8721e427-6ecb-4648-b166-d9c5463ba825",
    "type": "Dataset",
    "contribution": {
        "type": "Contribution",
        "agent": {
            "id": {
                "type": "Person",
                "name": "Jane Doe"
            },
            "type": "Agent"
        }
    },
    "derivation": {
        "type": "Derivation",
        "entity": {
            "id": "https://bbp.epfl.ch/dke/kgforge_tests/43b30848-b2d5-4cff-8409-057a70a8c44a?rev=1",
            "type": "Dataset",
            "name": "Interesting Persons"
        }
    },
    "hasPart": {
        "type": "DataDownload",
        "contentSize": {
            "unitCode": "bytes",
            "value": 506
        },
        "contentUrl": "https://staging.nexus.ocp.bbp.epfl.ch/v1/files/dke/kgforge_tests/134b7ac8-c4d3-4871-8749-7b84d9e38838",
        "digest": {
            "algorithm": "SHA-256",
            "value": "9639abc864e91c645779f510ae5c06a1618941d569eb1af619c2767

In [23]:
# Download the files that were added to the dataset into ./downloaded/.
associations.download("files", "./downloaded/")

In [24]:
# Check what was downloaded.
! ls ./downloaded

associations.tsv


In [25]:
# Remove the download directory again.
! rm -R ./downloaded

## Creation with resources

In [26]:
# Prepare the TSV file as an attachment; it is used below as the distribution of a resource.
distribution_1 = forge.attach("../../data/associations.tsv")

In [27]:
# Prepare the CSV file as a second attachment.
distribution_2 = forge.attach("../../data/persons.csv")

In [28]:
# NOTE: rebinds `jane` (created earlier without a distribution) to a new Person
# resource that carries the first attachment.
jane = Resource(type="Person", name="Jane Doe", distribution=distribution_1)

In [29]:
# A second Person resource carrying the second attachment.
john = Resource(type="Person", name="John Smith", distribution=distribution_2)

In [30]:
# NOTE: `persons` named a Dataset earlier in this notebook; here it is rebound to a
# plain list of Resources.
persons = [jane, john]

In [31]:
# Register both resources in one call; the output reports a count of 2.
forge.register(persons)

<count> 2
<action> _register_many
<succeeded> True


In [32]:
# A dataset that groups resources (added as parts below) rather than files.
dataset = Dataset(forge, name="Interesting people")

In [33]:
# Add the registered persons as parts of the dataset; they appear under "hasPart" below.
dataset.add_parts(persons)

In [34]:
# Show the not-yet-registered dataset as JSON. `print(dataset)` would show it in the
# forge's own text rendering instead.
pp(forge.as_json(dataset))

{
    "type": "Dataset",
    "hasPart": [
        {
            "id": "https://bbp.epfl.ch/dke/kgforge_tests/0e10bb04-fcd7-4b4b-88e5-5f72c1f4ddcf?rev=1",
            "type": "Person",
            "distribution": {
                "contentUrl": "https://staging.nexus.ocp.bbp.epfl.ch/v1/files/dke/kgforge_tests/0d05bfe2-2a94-4cb2-ad47-35a458cb250d"
            },
            "name": "Jane Doe"
        },
        {
            "id": "https://bbp.epfl.ch/dke/kgforge_tests/267d4f09-2bbb-4340-9d93-5f0c56654b4f?rev=1",
            "type": "Person",
            "distribution": {
                "contentUrl": "https://staging.nexus.ocp.bbp.epfl.ch/v1/files/dke/kgforge_tests/4f609c47-5a7e-4e32-9bf2-a162ad334cdf"
            },
            "name": "John Smith"
        }
    ],
    "name": "Interesting people"
}


In [35]:
# Register the dataset itself now that its parts are set.
forge.register(dataset)

<action> _register_one
<succeeded> True


In [36]:
# Download the distributions of the dataset's parts into ./downloaded/.
dataset.download("parts", "./downloaded/")

In [37]:
# Check what was downloaded: one file per part.
! ls ./downloaded

associations.tsv persons.csv


In [38]:
# Remove the download directory again.
! rm -R ./downloaded

### specifying custom content-type

In [39]:
# Give the content type explicitly for this file; it ends up as "encodingFormat" on
# the registered distribution printed below.
data_file = forge.attach("../../data/my_data.xwz", content_type="application/xwz")

In [40]:
# A resource whose distribution is the attachment with the custom content type.
experiment = Resource(type="Experiment", name="generated data", distribution=data_file)

In [41]:
# Register the resource through the forge.
forge.register(experiment)

<action> _register_one
<succeeded> True


In [42]:
# After registration the distribution is a DataDownload description with size,
# URL, digest, encoding format and file name.
print(experiment.distribution)

{
    type: DataDownload
    contentSize:
    {
        unitCode: bytes
        value: 16
    }
    contentUrl: https://staging.nexus.ocp.bbp.epfl.ch/v1/files/dke/kgforge_tests/8bdf701b-bde3-462e-b3d2-2a210bf2ffbc
    digest:
    {
        algorithm: SHA-256
        value: df03e7e93f870c6731540b3cae26391670da682c7a8dbdd18448cbcfc4fb7981
    }
    encodingFormat: application/xwz
    name: my_data.xwz
}


## Creation from a dataframe

See notebook `DataFrame IO.ipynb` for details on conversions of instances of Resource from a Pandas DataFrame.

### basics

In [43]:
# Load the persons table; each row has a type, a name and a path in "distribution".
dataframe = pd.read_csv("../../data/persons.csv")

In [44]:
# Display the loaded table (two rows).
dataframe

Unnamed: 0,type,name,distribution
0,Person,Marie Curie,../../data/scientists-database/marie_curie.txt
1,Person,Albert Einstein,../../data/scientists-database/albert_einstein...


In [45]:
# Convert the dataframe rows into Resources.
# NOTE: this rebinds `persons`, which was already used for other objects above.
persons = forge.from_dataframe(dataframe)

In [46]:
# Register the converted resources in one call; the output reports a count of 2.
forge.register(persons)

<count> 2
<action> _register_many
<succeeded> True


In [47]:
# NOTE: reuses both the variable `dataset` and the dataset name "Interesting people"
# from the previous section for a new Dataset.
dataset = Dataset(forge, name="Interesting people")

In [48]:
# Add the registered persons as parts of the dataset.
dataset.add_parts(persons)

In [49]:
# In this basic flow the "distribution" values are still the plain path strings read
# from the CSV: forge.attach() was not applied to them.
print(dataset)

{
    type: Dataset
    hasPart:
    [
        {
            id: https://bbp.epfl.ch/dke/kgforge_tests/ea2e5071-6181-499e-b1a6-079b841c220d?rev=1
            type: Person
            distribution: ../../data/scientists-database/marie_curie.txt
            name: Marie Curie
        }
        {
            id: https://bbp.epfl.ch/dke/kgforge_tests/93bbce98-8b9e-45af-a388-5f6264030617?rev=1
            type: Person
            distribution: ../../data/scientists-database/albert_einstein.txt
            name: Albert Einstein
        }
    ]
    name: Interesting people
}


In [50]:
# Register the dataset through the forge.
forge.register(dataset)

<action> _register_one
<succeeded> True


### advanced

In [51]:
# Load only the listed columns from the tab-separated associations file.
dataframe = pd.read_csv(
    "../../data/associations.tsv",
    sep="\t",
    usecols=[
        "name",
        "type",
        "agent__type",
        "agent__name",
        "agent__gender__label",
        "distribution",
    ],
)

In [52]:
# Display the loaded table (two rows); the "__" in column names is used for nesting below.
dataframe

Unnamed: 0,name,type,agent__type,agent__name,agent__gender__label,distribution
0,Curie Association,Association,Person,Marie Curie,female,../../data/scientists-database/marie_curie.txt
1,Einstein Association,Association,Person,Albert Einstein,male,../../data/scientists-database/albert_einstein...


In [53]:
# Replace each path in the "distribution" column with the value returned by forge.attach().
# NOTE: this overwrites the column in place, so run the cell once per freshly loaded dataframe.
dataframe["distribution"] = dataframe["distribution"].map(forge.attach)

In [54]:
# With nesting="__", a column such as agent__gender__label becomes the nested property
# agent -> gender -> label (see the printed resource below).
# NOTE(review): `na` is presumably the marker treated as a missing value - confirm in
# the DataFrame IO notebook. This also rebinds `associations`, which named a Dataset earlier.
associations = forge.from_dataframe(dataframe, na="(missing)", nesting="__")

In [55]:
# Before registration the distribution is still a LazyAction wrapping Store.upload.
print(associations[0])

{
    type: Association
    agent:
    {
        type: Person
        gender:
        {
            label: female
        }
        name: Marie Curie
    }
    distribution: LazyAction(operation=Store.upload, args=['../../data/scientists-database/marie_curie.txt', None])
    name: Curie Association
}


In [56]:
# Register both associations in one call; the output reports a count of 2.
forge.register(associations)

<count> 2
<action> _register_many
<succeeded> True


In [57]:
# NOTE: rebinds `dataset` once more, this time to a Dataset grouping the associations.
dataset = Dataset(forge, name="Interesting associations")

In [58]:
# A freshly created dataset only has its type and name.
print(dataset)

{
    type: Dataset
    name: Interesting associations
}


In [59]:
# Add the registered associations as parts of the dataset.
dataset.add_parts(associations)

In [60]:
# Each part is now listed with its id, type, name and the contentUrl of its distribution.
print(dataset)

{
    type: Dataset
    hasPart:
    [
        {
            id: https://bbp.epfl.ch/dke/kgforge_tests/ccbb5419-4334-468e-ac9a-b18f0da099de?rev=1
            type: Association
            distribution:
            {
                contentUrl: https://staging.nexus.ocp.bbp.epfl.ch/v1/files/dke/kgforge_tests/4581b3a5-a6d7-4c01-9390-e2d9ec882972
            }
            name: Curie Association
        }
        {
            id: https://bbp.epfl.ch/dke/kgforge_tests/cdcac522-fd03-4f26-8410-87fdb73b6cfe?rev=1
            type: Association
            distribution:
            {
                contentUrl: https://staging.nexus.ocp.bbp.epfl.ch/v1/files/dke/kgforge_tests/29acae13-8da0-43bc-aaa9-1fc94951abb4
            }
            name: Einstein Association
        }
    ]
    name: Interesting associations
}


In [61]:
# Register the dataset through the forge.
forge.register(dataset)

<action> _register_one
<succeeded> True


In [62]:
# Download the distributions of the dataset's parts into ./downloaded/.
dataset.download("parts", "./downloaded/")

In [63]:
# Check what was downloaded: the two text files referenced by the associations.
! ls ./downloaded

albert_einstein.txt marie_curie.txt


In [64]:
# Remove the download directory again.
! rm -R ./downloaded