# Training example
This notebook shows an example of integrating Verta during the model training phase of the flow.

In [1]:
import itertools
import os
import pandas as pd

import sklearn
from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

## Initialize the client

In [2]:
from verta import Client
from verta.utils import ModelAPI

VERTA_HOST = "https://cm.dev.verta.ai"

client = Client(VERTA_HOST)

set email from environment
set developer key from environment
connection successfully established


## Download the dataset version
We'll also link to models later.

In [3]:
dataset = client.get_dataset(name="Census Income S3")
dataset_version = dataset.get_latest_version()
# dataset_version.download()

set existing Dataset: Census Income S3 from personal workspace
got existing dataset version: d5a01a87188b0a2884466a51aa2e721a4a13d7f3629a4a8e76f92f6ebc82d8ee


In [6]:
df_train = pd.read_csv("census-train.csv")
X_train = df_train.iloc[:,:-1]
y_train = df_train.iloc[:, -1]
df_train.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,workclass_local-gov,workclass_private,workclass_self-emp-inc,workclass_self-emp-not-inc,workclass_state-gov,workclass_without-pay,...,occupation_handlers-cleaners,occupation_machine-op-inspct,occupation_other-service,occupation_priv-house-serv,occupation_prof-specialty,occupation_protective-serv,occupation_sales,occupation_tech-support,occupation_transport-moving,>50k
0,44,0,0,40,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,21,0,0,40,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,53,7298,0,60,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3,49,0,0,40,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,53,0,1485,40,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


## Set up the training state
Creata a project and experiment, and configure the hyperparamters. Note that we could hyperparameters coming from a repository to use what is the latest.

In [7]:
project = client.set_project(name="Census Income S3")
experiment = client.set_experiment(name="Linear regression")

hyperparam_candidates = {
    'C': [1e-6, 1e-4, 1e-2, 1e0],
    'solver': ['lbfgs'],
    'max_iter': [10, 20, 30],
    'balanced': [1, 0],
}
hyperparam_sets = [dict(zip(hyperparam_candidates.keys(), values))
                   for values
                   in itertools.product(*hyperparam_candidates.values())]

got existing Project: Census Income S3
got existing Experiment: Linear regression


## Train the models and log information to ModelDB

In [None]:
def run_experiment(hyperparams):
    # create object to track experiment run
    run = client.set_experiment_run()

    # create validation split
    (X_val_train, X_val_test,
     y_val_train, y_val_test) = model_selection.train_test_split(X_train, y_train,
                                                                 test_size=0.2,
                                                                 shuffle=True)

    # log hyperparameters
    run.log_hyperparameters(hyperparams)
    print(hyperparams, end=' ')
    hyperparams['class_weight'] = 'balanced' if hyperparams['balanced'] else None
    del hyperparams['balanced']

    # create and train model
    model = linear_model.LogisticRegression(**hyperparams)
    model.fit(X_train, y_train)

    # calculate and log validation accuracy
    train_acc = model.score(X_val_train, y_val_train)
    run.log_metric("train_acc", train_acc)
    val_acc = model.score(X_val_test, y_val_test)
    run.log_metric("val_acc", val_acc)
    print("Validation accuracy: {:.4f}".format(val_acc))

    # create deployment artifacts
    model_api = ModelAPI(X_train, y_train)
    requirements = ["scikit-learn"]

    # save and log model
    run.log_model(model, model_api=model_api)
    run.log_requirements(requirements)

    # log dataset snapshot as version
    run.log_dataset_version("train", dataset_version)

    # log Git information as code version
    run.log_code()
    
for hyperparams in hyperparam_sets:
    run_experiment(hyperparams)

## Search through the results
After we have done the experimentation, we can search through the results by leveraging the backend and then post-processing on the client.

In [9]:
project.expt_runs.find('hyperparameters.balanced == 1').as_dataframe()

Unnamed: 0,hpp.C,hpp.balanced,hpp.max_iter,hpp.solver,metric.train_acc,metric.val_acc
f782935c-23fd-4d2f-b182-16ab43c1f7cb,1.0,1,30,lbfgs,0.730693,0.735351
cb0333bb-f31e-4d53-8beb-3fb13a92e27f,1.0,1,20,lbfgs,0.775612,0.767966
aa01cb36-2627-449a-825d-ce79970eb0a6,1.0,1,10,lbfgs,0.773643,0.774599
fc509fa6-573f-48a1-8ce6-1ae12e333116,0.01,1,30,lbfgs,0.73173,0.731481
39bae924-6aa0-4914-97d3-0836349cdff2,0.01,1,20,lbfgs,0.77468,0.771697
65c0b93f-6e57-4595-846f-e5a97395eabb,0.01,1,10,lbfgs,0.773643,0.774599
ec9f53ad-4894-466f-a55b-63022b884db6,0.0001,1,30,lbfgs,0.734287,0.735766
9781d90c-941f-485d-a898-4b1757052847,0.0001,1,20,lbfgs,0.773194,0.77764
0246265e-a39e-48f6-a04f-818070a8f64b,0.0001,1,10,lbfgs,0.774576,0.770868
40f10ff4-b768-4d61-b711-978af7ecb8b8,1e-06,1,30,lbfgs,0.772779,0.774185


In [11]:
project.expt_runs.top_k('metrics.val_acc', 10).as_dataframe()

Unnamed: 0,hpp.C,hpp.balanced,hpp.max_iter,hpp.solver,metric.train_acc,metric.val_acc
1415aa21-bd13-4bd2-8021-172d751c3312,0.0001,0,20,lbfgs,0.788881,0.799475
48794ac1-136b-483a-b331-db72ee215a12,1e-06,0,20,lbfgs,0.786669,0.795467
f09c0bfd-3bf9-4fb5-8ede-08df609aece0,0.0001,0,30,lbfgs,0.78895,0.795467
d87821f3-f6ce-401f-aea8-b5751428a17c,0.0001,0,10,lbfgs,0.78933,0.793947
8126aa1d-d0d1-4e90-9dbe-55a30afc7d91,1.0,0,10,lbfgs,0.789434,0.792565
d89d7e45-2f58-4d6f-a9dc-25ca3dbf4736,1.0,0,30,lbfgs,0.792613,0.792565
0de7f95f-4c37-46c9-b630-b9a0eb8442eb,1.0,0,20,lbfgs,0.790125,0.79215
083bbb1f-889e-4557-9491-7e4d4d3bbd64,0.01,0,30,lbfgs,0.793027,0.791183
5c3b1713-6907-4e53-a508-ddda9ceffa20,0.01,0,20,lbfgs,0.790608,0.790216
28ccfe22-477c-454c-bebc-6fb1e02810ba,1e-06,0,30,lbfgs,0.789883,0.789801
