# Introducción a MLFlow y Databricks: acelerando el Machine Learning LifeCycle - Python Sevilla 2019

## MLFlow Tracking

### Basic example

In [1]:
import mlflow

In [2]:
# Honour the standard MLFLOW_TRACKING_URI environment variable when it is set,
# and fall back to the local tracking server used throughout this demo.
import os

tracking_uri = os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000")
mlflow.set_tracking_uri(tracking_uri)

In [3]:
# test_experiment = mlflow.create_experiment('test_1')
mlflow.set_experiment('test_1')

In [4]:
run =  mlflow.start_run()
# Alternative: `with mlflow.start_run() as run:` ends the run automatically when the block exits.

In [5]:
mlflow.log_param('param1', 1)
mlflow.log_metric('metric1', 2)

In [6]:
mlflow.log_param('param1', 1)
mlflow.log_metric('metric1', 2)

In [7]:
mlflow.end_run()

### Breast cancer: Scikit-learn

In [8]:
# experiment = mlflow.create_experiment('breast_cancer')

In [9]:
mlflow.set_experiment('breast_cancer')

INFO: 'breast_cancer' does not exist. Creating a new experiment


In [10]:
import numpy as np
import pandas
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [11]:
cancer = load_breast_cancer()
cancer.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [26]:
type(cancer)

sklearn.utils.Bunch

In [12]:
# Copy the feature matrix and the label vector out of the loaded dataset as NumPy arrays.
X, y = np.array(cancer.data), np.array(cancer.target)
print(f'X: {X.shape}, y: {y.shape}')

X: (569, 30), y: (569,)


In [13]:
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=426, test_size=143, random_state=0)

In [14]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [15]:
# Function to validate a model        
def validate_model(model, x_test, y_test):
    """Score a binary classifier on a held-out set.

    Args:
        model: Any object exposing ``predict(x_test)``. The prediction may be
            hard 0/1 labels or scores/probabilities, shaped ``(n,)`` or
            ``(n, 1)``; values above 0.5 count as the positive class.
        x_test: Features, forwarded unchanged to ``model.predict``.
        y_test: Ground-truth binary labels (0 is negative, non-zero positive).

    Returns:
        tuple: ``(precision, recall, accuracy)`` as floats. A ratio whose
        denominator is zero (for example, precision when nothing is predicted
        positive) is reported as ``0.0`` instead of NaN.
    """
    # Flatten both sides so (n, 1) network outputs line up element-wise with (n,) labels.
    predicted = np.asarray(model.predict(x_test)).ravel() > 0.5
    actual = np.asarray(y_test).ravel().astype(bool)

    # Count the four confusion-matrix cells directly. Deriving them from the
    # data never collapses to a 1x1 matrix when only one class is present.
    tp = int(np.count_nonzero(predicted & actual))
    fp = int(np.count_nonzero(predicted & ~actual))
    fn = int(np.count_nonzero(~predicted & actual))
    tn = int(np.count_nonzero(~predicted & ~actual))

    def _ratio(numerator, denominator):
        # Guard against 0/0 so downstream metric logging never receives NaN.
        return numerator / denominator if denominator else 0.0

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    accuracy = _ratio(tp + tn, tp + fp + tn + fn)

    return precision, recall, accuracy

In [16]:
# Model 1: Logistic Regression
def breast_cancer_lr(solver="lbfgs", C=1.0):
    """Train a logistic-regression classifier and record it as one MLflow run.

    Fits on the notebook-level ``x_train``/``y_train``, scores on
    ``x_test``/``y_test`` through ``validate_model``, and logs the
    hyper-parameters, a "model type" tag, the three metrics and the fitted
    model (under the "models" artifact path) to the run.

    Args:
        solver: Passed to ``LogisticRegression(solver=...)``.
        C: Passed to ``LogisticRegression(C=...)``.
    """
    import mlflow.sklearn
    from sklearn.linear_model import LogisticRegression

    with mlflow.start_run():
        classifier = LogisticRegression(solver=solver, C=C)

        for param_name, param_value in (("solver", solver), ("C", C)):
            mlflow.log_param(param_name, param_value)
        mlflow.set_tag("model type", "sklearn - LogisticRegression")

        classifier.fit(x_train, y_train)
        precision, recall, accuracy = validate_model(classifier, x_test, y_test)

        scores = (("precision", precision), ("recall", recall), ("accuracy", accuracy))
        for metric_name, metric_value in scores:
            mlflow.log_metric(metric_name, metric_value)

        mlflow.sklearn.log_model(classifier, "models")
        print("Model saved in run %s" % mlflow.active_run().info.run_uuid)

In [17]:
# Three runs: the defaults, the liblinear solver, and liblinear with C lowered to 0.5.
for lr_kwargs in ({}, {"solver": "liblinear"}, {"solver": "liblinear", "C": 0.5}):
    breast_cancer_lr(**lr_kwargs)

Model saved in run 9bafba7ad7e8443098496ebb1ccaa885
Model saved in run 58be448c055148b0a78c16b431c9785b
Model saved in run 3d7f599649ae496294a5b8454a29013d


In [18]:
# Model 2: Random Forest
def breast_cancer_rf(n_estimators=100, max_depth=2, criterion="gini", random_state=0):
    """Train a random-forest classifier and record it as one MLflow run.

    Fits on the notebook-level ``x_train``/``y_train``, scores on
    ``x_test``/``y_test`` through ``validate_model``, and logs the
    hyper-parameters, a "model type" tag, the three metrics and the fitted
    model (under the "models" artifact path) to the run.

    Args:
        n_estimators: Passed to ``RandomForestClassifier(n_estimators=...)``.
        max_depth: Passed to ``RandomForestClassifier(max_depth=...)``.
        criterion: Passed to ``RandomForestClassifier(criterion=...)``.
        random_state: Seed for the forest. It is fixed by default so that
            re-running the notebook reproduces the logged metrics; pass
            ``None`` to get a differently seeded forest on every call.
    """
    from sklearn.ensemble import RandomForestClassifier
    import mlflow.sklearn
    with mlflow.start_run() as run:
        clf = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            criterion=criterion,
            random_state=random_state,
        )
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_param("criterion", criterion)
        # The seed is part of what makes a run reproducible, so track it too.
        mlflow.log_param("random_state", random_state)
        mlflow.set_tag("model type", "sklearn - RandomForest")
        clf.fit(x_train, y_train)
        precision, recall, accuracy = validate_model(clf, x_test, y_test)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.sklearn.log_model(clf, "models")
        print("Model saved in run %s" % run.info.run_uuid)

In [19]:
# Three runs: the defaults, deeper trees, and a larger forest split on entropy.
rf_configurations = (
    {},
    {"max_depth": 5},
    {"n_estimators": 500, "criterion": "entropy"},
)
for rf_kwargs in rf_configurations:
    breast_cancer_rf(**rf_kwargs)

Model saved in run d86c6c4faa58461d9a7014596bfca5ff
Model saved in run 8ad398b6f51d4e719be0f08ad5c1f808
Model saved in run 8a72d22516034c5faf8d137f5d2b69e4


In [24]:
# Model 3: Keras
from keras.callbacks import Callback
class LossHistory(Callback):
    """Keras callback that mirrors per-epoch progress into the active MLflow run.

    After every epoch it logs the training loss and accuracy reported by
    Keras, plus precision/recall/accuracy computed on the notebook-level
    ``x_test``/``y_test`` via ``validate_model``. Training losses are also
    kept in ``self.losses``.
    """

    def on_train_begin(self, logs=None):
        """Reset the in-memory loss history when training starts."""
        self.losses = []

    def on_epoch_end(self, epoch, logs=None):
        """Log this epoch's metrics to MLflow, using ``epoch`` as the step."""
        # A None default avoids sharing one mutable dict between calls.
        logs = logs or {}
        loss = logs.get('loss')
        # Older Keras releases report the metric as 'acc' instead of 'accuracy'.
        acc = logs.get('accuracy', logs.get('acc'))
        # log_metric rejects None, so skip values Keras did not provide.
        if loss is not None:
            mlflow.log_metric("loss", loss, step=epoch)
        if acc is not None:
            # NOTE(review): 'accuracy' in `logs` is the training-set metric, yet it
            # is logged as "val_accuracy"; the key is kept unchanged so existing
            # runs stay comparable. Consider renaming it.
            mlflow.log_metric("val_accuracy", acc, step=epoch)
        precision, recall, accuracy = validate_model(self.model, x_test, y_test)
        mlflow.log_metric("precision", precision, step=epoch)
        mlflow.log_metric("recall", recall, step=epoch)
        mlflow.log_metric("accuracy", accuracy, step=epoch)
        self.losses.append(loss)
    

def breast_cancer_keras(optimizer='adam',dropout=0.00, nb_epoch=20):
    """Train a small Keras MLP and record it as one MLflow run.

    The network has two 16-unit ReLU hidden layers, each followed by dropout,
    and a single sigmoid output. It is fitted on the notebook-level
    ``x_train``/``y_train`` with a ``LossHistory`` callback that logs
    per-epoch metrics; the trained model is stored under the "models"
    artifact path.

    Args:
        optimizer: Optimizer name or instance handed to ``model.compile``.
        dropout: Dropout rate applied after each hidden layer.
        nb_epoch: Number of training epochs.
    """
    import mlflow.keras
    from keras.models import Sequential
    from keras.layers import Dense, Dropout
    with mlflow.start_run():
        mlflow.set_tag("model type", "keras - MLP")
        # Record the hyper-parameters up front so they are kept even if training fails.
        mlflow.log_param("optimizer", optimizer)
        mlflow.log_param("dropout", dropout)
        mlflow.log_param("nb_epoch", nb_epoch)

        model = Sequential()
        # Input layer and first hidden layer; the input width follows the training data.
        model.add(Dense(units=16, kernel_initializer='random_uniform', activation='relu',
                        input_dim=x_train.shape[1]))
        # Dropout to limit overfitting
        model.add(Dropout(rate=dropout))
        # Second hidden layer
        model.add(Dense(units=16, kernel_initializer='random_uniform', activation='relu'))
        # Dropout to limit overfitting
        model.add(Dropout(rate=dropout))
        # Output layer: one sigmoid unit for the binary label
        model.add(Dense(units=1, kernel_initializer='random_uniform', activation='sigmoid'))
        # Compile with the optimizer the caller requested, not a hard-coded one.
        model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
        history = LossHistory()
        model.fit(x_train, y_train, batch_size=100, epochs=nb_epoch, callbacks=[history])
        mlflow.keras.log_model(model, "models")

In [25]:
breast_cancer_keras()

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
breast_cancer_keras(nb_epoch=100)

In [None]:
breast_cancer_keras(dropout=0.25, nb_epoch=100)

In [35]:
# Model 4: Custom model
import mlflow.pyfunc
from numpy import random
class CustomClassifier(mlflow.pyfunc.PythonModel):
    """Baseline pyfunc model that answers with a random 0/1 label per input row."""

    def predict(self, context=None, model_input=None):
        """Return one random label (0 or 1) for each row of the input.

        Works both when called directly as ``predict(data)`` (as
        ``validate_model`` does) and when MLflow invokes a loaded model as
        ``predict(context, model_input)``.

        Args:
            context: The MLflow model context, or the input data itself when
                the method is called with a single positional argument.
            model_input: The input data when a context is supplied.

        Returns:
            numpy.ndarray: Integers in ``{0, 1}``, one per input row.
        """
        # Called as predict(data): the single positional argument lands in `context`.
        if model_input is None:
            model_input = context
        return np.random.randint(2, size=len(model_input))

In [39]:
from tempfile import NamedTemporaryFile
def save_numpy_array(np_array):
    """Write ``np_array`` to a temporary ``.npy`` file and return the open file.

    Args:
        np_array: Array to serialise with ``np.save``.

    Returns:
        The open ``NamedTemporaryFile``. Its ``.name`` is the path on disk.
        The file is removed when the object is closed, so keep a reference
        to it for as long as the path is needed.
    """
    outfile = NamedTemporaryFile(suffix=".npy")
    np.save(outfile, np_array)
    # np.save writes through a buffered file object. Without a flush, small
    # arrays are still in memory and anyone opening the file by name (for
    # example, an artifact upload) would read an empty or truncated file.
    outfile.flush()
    return outfile

In [41]:
# Track the random baseline as its own run: tags, metrics, the pyfunc model
# and the exact train/test splits that were used.
with mlflow.start_run() as run:
    ccl = CustomClassifier()
    precision, recall, accuracy = validate_model(ccl, x_test, y_test)
    mlflow.set_tag("model type", "pyfunc - random")
    mlflow.set_tag("dataset_uri", "https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)")
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("accuracy", accuracy)
    # New metric for this model. F1 is undefined when precision + recall is 0
    # (or NaN), so report 0.0 in that case instead of dividing by zero.
    precision_recall_sum = precision + recall
    f1 = (2 * precision * recall / precision_recall_sum) if precision_recall_sum > 0 else 0.0
    mlflow.log_metric("f1", f1)
    # Log custom model by means of pyfunc api
    mlflow.pyfunc.log_model("models", python_model=ccl)
    # Log dataset and splits used to train/test
    dataset_splits = (
        ("x_train", x_train),
        ("x_test", x_test),
        ("y_train", y_train),
        ("y_test", y_test),
    )
    for split_name, split_array in dataset_splits:
        split_file = save_numpy_array(split_array)
        try:
            # Make sure the bytes are on disk before MLflow copies the file by name.
            split_file.flush()
            mlflow.log_artifact(split_file.name, "dataset/" + split_name)
        finally:
            # Closing the temporary file also deletes it.
            split_file.close()