# Quick start

Training and testing with only one node.
- dataset: Titanic
- algo: random forest
- tasks: one traintuple and one testtuple

In [1]:
from pathlib import Path
import os
import zipfile

In [2]:
import substra

In [3]:
# Show the version of the imported substra package.
substra.__version__

'0.13.0'

In [9]:
# Relative location of the Titanic example assets used by every cell below.
assets_directory = Path("..", "titanic-assets", "assets")

In [10]:
# Echo the assets path so the notebook displays it.
assets_directory

PosixPath('../titanic-assets/assets')

## Registering data samples and dataset

In [11]:
# Create the Substra client used by all following cells, with debug mode enabled.
client = substra.Client(debug=True)

In [12]:
# List the datasets known to the client before anything is registered.
client.list_dataset()

[]

In [13]:
# Access rights attached to the dataset: not public, no explicitly authorised ids.
permissions = {
    "public": False,
    "authorized_ids": [],
}

# Dataset specification: opener script and description file both live under
# the "dataset" folder of the assets directory.
DATASET = {
    "name": "Titanic dataset - Node 1",
    "type": "csv",
    "data_opener": assets_directory.joinpath("dataset", "opener.py"),
    "description": assets_directory.joinpath("dataset", "description.md"),
    "permissions": permissions,
}

# Register the dataset and keep its key: later cells attach data samples and
# tasks to it.
dataset_key_1 = client.add_dataset(DATASET)
print(f'Dataset key {dataset_key_1}')

InvalidRequest: Could not copy ../titanic-assets/assets/dataset/opener.py

### Adding train data samples

In [8]:
# Every entry found directly under "train_data_samples" is registered as one
# training data sample attached to the dataset created above.
train_data_sample_folder = assets_directory.joinpath("train_data_samples")
train_data_sample_paths = [*train_data_sample_folder.glob("*")]
train_data_sample_keys = []

for path in train_data_sample_paths:
    data_sample_key = client.add_data_sample(
        {
            "data_manager_keys": [dataset_key_1],
            "test_only": False,  # usable for training
            "path": path,
        },
        local=True,
    )
    train_data_sample_keys.append(data_sample_key)

print(f"{len(train_data_sample_keys)} data samples were registered")

10 data samples were registered


### Adding test data samples

In [9]:
# Every entry found directly under "test_data_samples" is registered as one
# test-only data sample attached to the same dataset.
test_data_sample_folder = assets_directory.joinpath("test_data_samples")
test_data_sample_paths = [*test_data_sample_folder.glob("*")]
test_data_sample_keys = []

for path in test_data_sample_paths:
    data_sample_key = client.add_data_sample(
        {
            "data_manager_keys": [dataset_key_1],
            "test_only": True,  # reserved for evaluation
            "path": path,
        },
        local=True,
    )
    test_data_sample_keys.append(data_sample_key)

print(f"{len(test_data_sample_keys)} data samples were registered")

2 data samples were registered


## Adding objective

In [13]:
# Objective specification. "metrics" is the path of the zip archive that is
# (re)built below from the metrics script and its Dockerfile.
OBJECTIVE = {
    "name": "Titanic: Machine Learning From Disaster",
    "description": assets_directory.joinpath("objective", "description.md"),
    "metrics_name": "accuracy",
    "metrics": assets_directory.joinpath("objective", "metrics.zip"),
    "permissions": {
        "public": False,
        "authorized_ids": [],
    },
}

# Files packed into the metrics archive.
METRICS_DOCKERFILE_FILES = [
    assets_directory.joinpath("objective", "metrics.py"),
    assets_directory.joinpath("objective", "Dockerfile"),
]

# Write the archive with flat entry names (file name only, no directories).
archive_path = OBJECTIVE["metrics"]
with zipfile.ZipFile(archive_path, mode="w") as z:
    for filepath in METRICS_DOCKERFILE_FILES:
        z.write(filepath, arcname=filepath.name)

# Register the metric from the archive that was just written.
objective_key = client.add_metric(
    {
        "name": OBJECTIVE["name"],
        "file": OBJECTIVE["metrics"],
        "description": OBJECTIVE["description"],
        "permissions": OBJECTIVE["permissions"],
    }
)
assert objective_key, 'Missing objective key'

## Adding algo

In [30]:
ALGO_KEYS_JSON_FILENAME = "algo_random_forest_keys.json"

# Algo specification; the "file" entry is filled in once the archive exists.
ALGO = {
    "name": "Titanic: Random Forest",
    "description": assets_directory.joinpath("algo_random_forest", "description.md"),
    "permissions": {
        "public": False,
        "authorized_ids": [],
    },
}

# Files packed into the algo archive.
ALGO_DOCKERFILE_FILES = [
    assets_directory.joinpath("algo_random_forest", "algo.py"),
    assets_directory.joinpath("algo_random_forest", "Dockerfile"),
]

# Write the archive with flat entry names (file name only, no directories).
archive_path = assets_directory.joinpath("algo_random_forest", "algo_random_forest.zip")
with zipfile.ZipFile(archive_path, mode="w") as z:
    for filepath in ALGO_DOCKERFILE_FILES:
        z.write(filepath, arcname=filepath.name)
ALGO["file"] = archive_path

In [31]:
# Register the algo: the listed fields are taken from ALGO in this order, and
# the category is set explicitly.
algo_key = client.add_algo(
    {
        **{field: ALGO[field] for field in ("name", "file", "description", "permissions")},
        "category": "ALGO_SIMPLE",
    }
)

## Registering tasks

In [32]:
# Register the training task: the algo is run on the registered train data
# samples of the dataset.
traintuple_key = client.add_traintuple(
    {
        "algo_key": algo_key,
        "data_manager_key": dataset_key_1,
        "rank": 0,
        "train_data_sample_keys": train_data_sample_keys,
    }
)
assert traintuple_key, 'Missing traintuple key'


In [38]:
# Register the test task: it references the traintuple above, the registered
# metric, and the test data samples of the same dataset.
testtuple_key = client.add_testtuple(
    {
        "metric_keys": [objective_key],
        "traintuple_key": traintuple_key,
        "test_data_sample_keys": test_data_sample_keys,
        "data_manager_key": dataset_key_1,
    }
)
assert testtuple_key, 'Missing testtuple key'

## Results

In [39]:
# Fetch the test task registered above by its key.
testtuple = client.get_testtuple(testtuple_key)

In [40]:
# Display the status of the fetched test task.
testtuple.status

<Status.done: 'STATUS_DONE'>

In [43]:
# Print the full test task record.
print(testtuple)

{
    "key": "cabb28d3f51c4a32971ef3182b6a8303",
    "category": "TASK_TEST",
    "algo": {
        "key": "680f024da49147fc8c6f5415f20a4b40",
        "name": "Titanic: Random Forest",
        "owner": "local-backend",
        "permissions": {
            "process": {
                "public": false,
                "authorized_ids": [
                    "local-backend"
                ]
            },
            "download": {
                "public": false,
                "authorized_ids": [
                    "local-backend"
                ]
            }
        },
        "metadata": {},
        "category": "ALGO_SIMPLE",
        "creation_date": "2021-10-27T09:18:17.042830",
        "description": {
            "checksum": "935d06ca1045e7ea949b5252c1655039c3b8da31ba0fc19870d80d3bf0423c54",
            "storage_address": "/Users/rgoussault/code/romaingoussault/cookbooks/local-worker/xhr7on5x/680f024da49147fc8c6f5415f20a4b40/description.md"
        },
        "algorithm": {
  

In [45]:
# Fetch the dataset record by key; the output lists its data sample keys.
client.get_dataset(dataset_key_1)

{
    "key": "42fb299648274020aa567dbf55b7ad1c",
    "name": "Titanic dataset - Node 1",
    "owner": "local-backend",
    "permissions": {
        "process": {
            "public": false,
            "authorized_ids": [
                "local-backend"
            ]
        },
        "download": {
            "public": false,
            "authorized_ids": [
                "local-backend"
            ]
        }
    },
    "type": "csv",
    "train_data_sample_keys": [
        "27731ab54dfc461e8df03b9aa0f09be2",
        "28c80acc64174960a670b53a63fdc957",
        "0719e7fca7684af1a8d8965cbab98605",
        "a998af35fd5342cd8b46cb678a7841c1",
        "01f5111780ba411bacc332b6f607647d",
        "7bb52e15186f43e1abe6027d16595e06",
        "3252dfe9df56439f8aa229c7faff63b9",
        "83593787a77a43489be35ed0344d4bf1",
        "fb5ff35ab77e4c36bfce964744a77993",
        "090749873b08484fa83c5c77fac9d2bb"
    ],
    "test_data_sample_keys": [
        "8beb9d60a2de4d95baf1e185337f2580",
   