# Quick start

Training and testing with only one node.
- dataset: Titanic
- algo: random forest
- tasks: one traintuple and one testtuple

In [25]:
from pathlib import Path
import os
import zipfile

In [26]:
import substra

In [27]:
# Echo the installed substra version so the notebook records which release it ran against.
substra.__version__

'0.13.0'

In [28]:
# Root folder of the example assets; every asset path below is built from it.
assets_directory = Path("titanic", "assets")

## Registering data samples and dataset

In [29]:
# Toggle read by the next cell: when True the client is built with
# substra.Client(debug=True); when False it is created through
# substra.Client.from_config_file("node_A").
DEBUG = False

In [30]:
# Build the client used by every later cell: a debug client when DEBUG is set,
# otherwise one created from the "node_A" entry via from_config_file
# (presumably a profile name in the substra config file - confirm).
client = (
    substra.Client(debug=DEBUG)
    if DEBUG
    else substra.Client.from_config_file("node_A")
)

In [32]:
# Not public and no extra authorized ids (presumably this restricts the asset
# to the owning node - confirm against the substra permissions documentation).
permissions = {'public': False, 'authorized_ids': []}

# The opener script and the description both live in the dataset asset folder.
dataset_directory = assets_directory / 'dataset'
DATASET = {
    'name': 'Titanic dataset - Node 1',
    'type': 'csv',
    'data_opener': dataset_directory / 'opener.py',
    'description': dataset_directory / 'description.md',
    'permissions': permissions,
}

# Register the dataset; the returned key is reused when adding data samples
# and when creating the train/test tasks.
dataset_key_1 = client.add_dataset(DATASET)
print(f'Dataset key {dataset_key_1}')

Dataset key 27755e01-f736-4bb7-af61-afa4a4040e0a


### Adding train data samples

In [33]:
train_data_sample_folder = assets_directory / 'train_data_samples'
train_data_sample_paths = [*train_data_sample_folder.glob('*')]

# Register each entry of the folder as a training sample (test_only=False)
# attached to the dataset, and collect the keys returned by the client.
train_data_sample_keys = [
    client.add_data_sample(
        {
            'data_manager_keys': [dataset_key_1],
            'test_only': False,
            'path': sample_path,
        },
        local=True,
    )
    for sample_path in train_data_sample_paths
]

print(f"{len(train_data_sample_keys)} data samples were registered")

10 data samples were registered


In [34]:
# Echo the folder the training samples were globbed from.
train_data_sample_folder

PosixPath('titanic/assets/train_data_samples')

### Adding test data samples

In [35]:
test_data_sample_folder = assets_directory / 'test_data_samples'
test_data_sample_paths = [*test_data_sample_folder.glob('*')]

# Same registration as for the training samples, but flagged test_only=True.
test_data_sample_keys = [
    client.add_data_sample(
        {
            'data_manager_keys': [dataset_key_1],
            'test_only': True,
            'path': sample_path,
        },
        local=True,
    )
    for sample_path in test_data_sample_paths
]

print(f"{len(test_data_sample_keys)} data samples were registered")

2 data samples were registered


## Adding objective

In [36]:
# Settings for the objective. Only 'name', 'description', 'permissions' and the
# archive at 'metrics' are forwarded to add_metric below; 'metrics_name' is
# kept for reference and is not sent.
OBJECTIVE = {
    'name': 'Titanic: Machine Learning From Disaster',
    'description': assets_directory / 'objective' / 'description.md',
    'metrics_name': 'accuracy',
    'metrics': assets_directory / 'objective' / 'metrics.zip',
    'permissions': {
        'public': False,
        'authorized_ids': [],
    },
}

# Files bundled into the metrics archive.
METRICS_DOCKERFILE_FILES = [
    assets_directory / 'objective' / 'metrics.py',
    assets_directory / 'objective' / 'Dockerfile',
]

# (Re)create the archive, storing each file at the archive root under its
# bare file name.
archive_path = OBJECTIVE['metrics']
with zipfile.ZipFile(archive_path, 'w') as z:
    for filepath in METRICS_DOCKERFILE_FILES:
        z.write(filepath, arcname=filepath.name)

# Upload the archive that was just written: 'file' reuses archive_path instead
# of spelling the path a second time, so the two can never drift apart.
objective_key = client.add_metric({
    'name': OBJECTIVE['name'],
    'file': archive_path,
    'description': OBJECTIVE['description'],
    'permissions': OBJECTIVE['permissions'],
})
assert objective_key, 'Missing objective key'

## Adding algo

In [37]:
ALGO_KEYS_JSON_FILENAME = 'algo_random_forest_keys.json'

# Files bundled into the algo archive.
ALGO_DOCKERFILE_FILES = [
    assets_directory / 'algo_random_forest' / 'algo.py',
    assets_directory / 'algo_random_forest' / 'Dockerfile',
]

# (Re)create the archive, storing each file at the archive root under its
# bare file name.
archive_path = assets_directory.joinpath('algo_random_forest', 'algo_random_forest.zip')
with zipfile.ZipFile(archive_path, 'w') as z:
    for filepath in ALGO_DOCKERFILE_FILES:
        z.write(filepath, arcname=filepath.name)

# Settings for the algo; 'file' points at the archive written just above.
ALGO = {
    'name': 'Titanic: Random Forest',
    'description': assets_directory / 'algo_random_forest' / 'description.md',
    'permissions': {
        'public': False,
        'authorized_ids': [],
    },
    'file': archive_path,
}

In [38]:
# Forward the relevant ALGO settings (same keys, same order) and tag the algo
# with the "ALGO_SIMPLE" category.
algo_key = client.add_algo({
    **{field: ALGO[field] for field in ('name', 'file', 'description', 'permissions')},
    'category': "ALGO_SIMPLE",
})

## Registering tasks

In [39]:
# Training task: run the registered algo on the registered training samples
# of the dataset.
traintuple_spec = {
    'algo_key': algo_key,
    'data_manager_key': dataset_key_1,
    'rank': 0,
    'train_data_sample_keys': train_data_sample_keys,
}
traintuple_key = client.add_traintuple(traintuple_spec)
assert traintuple_key, 'Missing traintuple key'


In [40]:
# Testing task: score the traintuple above with the registered metric on the
# registered test samples of the dataset.
testtuple_spec = {
    'metric_keys': [objective_key],
    'traintuple_key': traintuple_key,
    'test_data_sample_keys': test_data_sample_keys,
    'data_manager_key': dataset_key_1,
}
testtuple_key = client.add_testtuple(testtuple_spec)
assert testtuple_key, 'Missing testtuple key'

## Results

In [41]:
# Fetch the testtuple as it currently stands. The status recorded below was
# still 'STATUS_WAITING', so the task had not finished at that point;
# presumably this cell must be re-run to see a later status - confirm.
testtuple = client.get_testtuple(testtuple_key)

In [42]:
# Echo the status of the testtuple object fetched in the previous cell.
testtuple.status

'STATUS_WAITING'

In [24]:
# Echo the recorded performances; the captured output is a dict with one
# float score per key (presumably keyed by metric key, and only filled in
# once the testtuple has finished - confirm).
testtuple.test.perfs

{'eb68084eaa29406b99bf8c108b861acc': 0.8156424581005587}