In [18]:
from dlhub_sdk import DLHubClient
from sklearn.svm import SVC
import pickle as pkl
import pandas as pd

# Train a scikit-learn Model on the Iris Dataset

In [19]:
# Load the data
#   header=1: the second line of the file supplies the column names, and the
#   line before it is not read as data
data = pd.read_csv('iris.csv', header=1)
print('Loaded {} rows with {} columns:'.format(len(data), len(data.columns)),
      data.columns.tolist())

# Split the table into inputs (every column but the last) and labels (last column)
#   Slicing before converting avoids the object-dtype array that `data.values`
#   produces when the frame mixes numeric and string columns
features = data.iloc[:, :-1].to_numpy()
labels = data.iloc[:, -1].to_numpy()

# Make the model
#   probability=True makes SVC fit its probability estimates with an internal,
#   shuffled cross-validation; fixing random_state makes the saved model
#   reproducible when the notebook is re-run from a fresh kernel
model = SVC(kernel='linear', C=1, probability=True, random_state=0)
model.fit(features, labels)
print('Trained a SVC model')

# Save the model using pickle
#   'model.pkl' is the path handed to ScikitLearnModel.create_model later on
with open('model.pkl', 'wb') as fp:
    pkl.dump(model, fp)
print('Saved model to disk')

Loaded 150 rows with 5 columns: ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
Trained a SVC model
Saved model to disk


# Describe the Model

In [20]:
from dlhub_sdk.models.servables.sklearn import ScikitLearnModel
from dlhub_sdk.models.datasets import TabularDataset
import pandas as pd
import json


# --- Dataset description -----------------------------------------------------
# Start the metadata record from the CSV file, telling it how the file is read
dataset_info = TabularDataset.create_model('iris.csv', read_kwargs={'header': 1})

#   Load the table itself; its column names drive the input/label marking and
#   the model description further down
data = pd.read_csv('iris.csv', header=1)
column_names = data.columns

#   Where the data was downloaded from
dataset_info.add_alternate_identifier("https://archive.ics.uci.edu/ml/datasets/Iris", "URL")

#   The paper that describes the dataset
dataset_info.add_related_identifier("10.1111/j.1469-1809.1936.tb02137.x", "DOI", "IsDescribedBy")

#   Scientific domain of the dataset
dataset_info.set_domains(["biology"])

#   Per-column annotations: the four measurements share the same unit,
#   the species column is annotated separately as a string
measured_columns = [
    ("sepal_length", "Length of sepal"),
    ("sepal_width", "Width of sepal"),
    ("petal_length", "Length of petal"),
    ("petal_width", "Width of petal"),
]
for column, description in measured_columns:
    dataset_info.annotate_column(column, description=description, units="cm")
dataset_info.annotate_column("species", description="Species", data_type='string')

#   Every column except the last is an input; the last column is the label
input_columns = column_names[:-1]
label_columns = column_names[-1:]
dataset_info.mark_inputs(input_columns)
dataset_info.mark_labels(label_columns)

#   Title, short name and authorship of the dataset
dataset_info.set_title("Iris Dataset")
dataset_info.set_name("iris_dataset")
dataset_info.set_authors(["Marshall, R.A."])

# --- Model description -------------------------------------------------------
# Describe the pickled model: number of input columns and the class names
n_features = len(column_names) - 1
#   unique() lists the species in order of first appearance in the column
#   NOTE(review): confirm this order matches the order of the classes the
#   trained model reports
class_names = data['species'].unique()
model_info = ScikitLearnModel.create_model('model.pkl', n_input_columns=n_features,
                                           classes=class_names)

#   Title, short name and domain of the model
model_info.set_title("Example Scikit-Learn Model")
model_info.set_name("iris_svm")
model_info.set_domains(["biology"])

# --- Show both records as indented JSON --------------------------------------
for banner, info in (('--> Dataset Information <--', dataset_info),
                     ('\n--> Model Information <--', model_info)):
    print(banner)
    print(json.dumps(info.to_dict(), indent=2))

--> Dataset Information <--
{
  "datacite": {
    "creators": [
      {
        "givenName": "R.A.",
        "familyName": "Marshall",
        "affiliations": []
      }
    ],
    "titles": [
      {
        "title": "Iris Dataset"
      }
    ],
    "publisher": "DLHub",
    "publicationYear": "2019",
    "identifier": {
      "identifier": "10.YET/UNASSIGNED",
      "identifierType": "DOI"
    },
    "descriptions": [],
    "fundingReferences": [],
    "relatedIdentifiers": [
      {
        "relatedIdentifier": "10.1111/j.1469-1809.1936.tb02137.x",
        "relatedIdentifierType": "DOI",
        "relationType": "IsDescribedBy"
      }
    ],
    "alternateIdentifiers": [
      {
        "alternateIdentifier": "https://archive.ics.uci.edu/ml/datasets/Iris",
        "alternateIdentifierType": "URL"
      }
    ],
    "rightsList": [],
    "resourceType": {
      "resourceTypeGeneral": "Dataset"
    }
  },
  "dlhub": {
    "version": "0.8.2",
    "domains": [
      "biology"
    ],
  

# Publish the Model

In [21]:
# Create the DLHub client; the cells below use it to publish the model
# description and to look up the status of that publication
dl = DLHubClient()

In [22]:
# Submit the model description built above for publication.
# The return value is kept because the next cell passes it to get_task_status
# (presumably a task identifier; confirm against the DLHub SDK docs)
res = dl.publish_servable(model_info)

In [24]:
# Look up the status of the publication submitted above; the result is
# displayed as the cell output (a dict with a 'status' entry)
dl.get_task_status(res)

{'status': 'RUNNING'}