https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning#3-setup-a-new-conda-environment

This notebook demonstrates how to extract features for a sentence similarity task using the pretrained models InferSent and Google Universal Sentence Encoder. We then demonstrate how the AutoML package can automate model selection and hyperparameter tuning.

In [1]:
# set the environment path to find NLP
import sys
sys.path.append("../../../")
import time
import os
import pandas as pd
import shutil
import numpy as np
import torch
import sys
from scipy.stats import pearsonr
from scipy.spatial import distance
from utils_nlp.azureml import azureml_utils

#tensorflow dependencies for Google Universal Sentence Encoder
import tensorflow as tf
import tensorflow_hub as hub

#AzureML packages
import azureml as aml
import logging
from azureml.telemetry import set_diagnostics_collection
set_diagnostics_collection(send_diagnostics=True)
from azureml.train.automl import AutoMLConfig
from azureml.core.experiment import Experiment
from azureml.widgets import RunDetails

# Record the interpreter and library versions this run was executed with,
# so the outputs below can be tied to a concrete environment.
print("System version: {}".format(sys.version))
print("Azure ML SDK Version:", aml.core.VERSION)
print("Pandas version: {}".format(pd.__version__))
print("Tensorflow Version:", tf.VERSION)

W0612 09:40:01.239300 40168 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


Turning diagnostics collection on. 
System version: 3.6.7 |Anaconda, Inc.| (default, Dec 10 2018, 20:35:02) [MSC v.1915 64 bit (AMD64)]
Azure ML SDK Version: 1.0.41
Pandas version: 0.23.4
Tensorflow Version: 1.13.1


In [2]:
# Relative path to the repository-level data folder.
# NOTE(review): no cell visible in this notebook references this constant; the
# loading cell opens the sts-*.csv files relative to the working directory.
BASE_DATA_PATH = '../../../data'

# Feature Engineering

We'll collect the Google Universal Sentence Encoder encodings for each sentence pair.

In [3]:
# TODO: replace all this with our util!
def _load_sts_split(path):
    """Read one tab-separated STS split file into a DataFrame.

    Each non-blank line is split on tabs; field 4 is parsed as the float
    similarity score and fields 5 and 6 are taken as the two sentences,
    which are stripped and lower-cased.

    Args:
        path: Path of the split file to read (opened as UTF-8 text).

    Returns:
        pandas.DataFrame with columns 'sentence1', 'sentence2' and 'score',
        one row per non-blank input line.
    """
    rows = []
    with open(path, 'r', encoding="utf-8") as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # A blank (e.g. trailing) line has no fields; skip it instead
                # of failing with an IndexError.
                continue
            fields = stripped.split("\t")
            rows.append([fields[5].strip().lower(), fields[6].strip().lower(), float(fields[4])])
    return pd.DataFrame(rows, columns=['sentence1', 'sentence2', 'score'])


train = _load_sts_split("sts-train.csv")
test = _load_sts_split("sts-test.csv")
dev = _load_sts_split("sts-dev.csv")

## Google Universal Sentence Encoder

In [4]:
# TF Hub handle of the pretrained encoder used in this notebook
# (universal-sentence-encoder-large, version 3 per the URL).
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3"

# Import the Universal Sentence Encoder's TF Hub module
google_USE_embed = hub.Module(module_url)

# Reduce logging output.
# NOTE(review): this runs after hub.Module(...) above, so it presumably does not
# silence anything logged while the module is being created -- confirm whether
# it should be moved before the module is loaded.
tf.logging.set_verbosity(tf.logging.ERROR)

## Embed Sentences

In [5]:
def embed_google_universal_sentence_encoder(dataset, embedding_model):
    """Embed both sentence columns of ``dataset`` with ``embedding_model``.

    Two string placeholders are created, passed through ``embedding_model``
    and L2-normalised along axis 1. Both encodings are then evaluated in a
    fresh ``tf.Session`` in a single ``run`` call, feeding the whole
    'sentence1' and 'sentence2' columns at once.

    Args:
        dataset: Mapping/DataFrame providing 'sentence1' and 'sentence2'.
        embedding_model: Callable applied to a string tensor (here the TF Hub
            module) that returns the sentence encodings.

    Returns:
        Tuple ``(emb1, emb2)`` with the normalised encodings of 'sentence1'
        and 'sentence2' respectively.
    """
    # Graph inputs for the first and second sentence of every pair.
    first_sentences = tf.placeholder(tf.string, shape=None)
    second_sentences = tf.placeholder(tf.string, shape=None)

    # Normalised encodings of each input.
    first_encoding = tf.nn.l2_normalize(embedding_model(first_sentences), axis=1)
    second_encoding = tf.nn.l2_normalize(embedding_model(second_sentences), axis=1)

    feeds = {
        first_sentences: dataset['sentence1'],
        second_sentences: dataset['sentence2'],
    }

    with tf.Session() as session:
        # Variables and lookup tables must be initialised before evaluation.
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        first_embeddings, second_embeddings = session.run(
            [first_encoding, second_encoding], feed_dict=feeds
        )
    return first_embeddings, second_embeddings
    
def feature_engineering(dataset, googleUSE_embedding_model):
    """Build the model-ready feature table for one dataset split.

    The two sentence columns are embedded with
    ``embed_google_universal_sentence_encoder`` and the two embedding
    matrices are concatenated column-wise. Columns are named
    ``USEEmb1_<i>`` for the first sentence and ``USEEmb2_<i>`` for the second.

    Args:
        dataset: DataFrame with 'sentence1', 'sentence2' and 'score' columns.
        googleUSE_embedding_model: Embedding model forwarded to
            ``embed_google_universal_sentence_encoder``.

    Returns:
        pandas.DataFrame with a default integer index holding the embedding
        columns followed by a 'score' column, one row per row of ``dataset``.
    """
    google_USE_emb1, google_USE_emb2 = embed_google_universal_sentence_encoder(dataset, googleUSE_embedding_model)
    n_google = google_USE_emb1.shape[1]
    names = ['USEEmb1_' + str(i) for i in range(n_google)] + ['USEEmb2_' + str(i) for i in range(n_google)]
    features = pd.DataFrame(np.concatenate((google_USE_emb1, google_USE_emb2), axis=1), columns=names)
    # Assign the scores positionally. Assigning the Series itself would align on
    # index labels, so a dataset with a non-default index (filtered, shuffled,
    # re-indexed) would silently end up with NaN or misplaced targets.
    features['score'] = np.asarray(dataset['score'])
    return features

In [6]:
# Embed every split with the same encoder: train -> training_data,
# dev -> validation_data, test -> testing_data (evaluated in that order).
training_data, validation_data, testing_data = (
    feature_engineering(split, google_USE_embed) for split in (train, dev, test)
)

In [7]:
# Take out when notebook is ready for publishing
# Cache the engineered feature tables as CSV files under Data/.
# to_csv does not create missing parent directories, so make sure the
# folder exists before writing.
os.makedirs("Data", exist_ok=True)
# index=False: the row index is not written to the file.
training_data.to_csv("Data/training_set.csv", index=False)
testing_data.to_csv("Data/testing_set.csv", index=False)
validation_data.to_csv("Data/validation_set.csv", index=False)

In [8]:
def get_baseline_performance(data):
    """Print the Pearson correlation of the cosine-similarity baseline.

    For every row, the similarity between the 'USEEmb1*' columns and the
    'USEEmb2*' columns is computed as ``1 - cosine distance``; the resulting
    similarities are then correlated with the 'score' column.

    Args:
        data: DataFrame containing columns whose names include 'USEEmb1',
            columns whose names include 'USEEmb2', and a 'score' column.

    Returns:
        None. The correlation is printed, not returned.
    """
    first_columns = [name for name in data.columns if 'USEEmb1' in name]
    second_columns = [name for name in data.columns if 'USEEmb2' in name]
    first_vectors = data[first_columns].values.tolist()
    second_vectors = data[second_columns].values.tolist()

    # Row-wise cosine similarity between the two sentence embeddings.
    similarities = []
    for first_vector, second_vector in zip(first_vectors, second_vectors):
        similarities.append(1 - distance.cosine(first_vector, second_vector))

    gold_scores = data['score'].values.tolist()
    correlation = pearsonr(similarities, gold_scores)[0]
    print("Google Universal Sentence Encoder Pearson Correlation:", correlation)

In [9]:
get_baseline_performance(testing_data)

Google Universal Sentence Encoder Pearson Correlation: 0.7640280696312057


# AutoML - no AmlCompute

In [10]:
# Connection settings for the AzureML workspace; fill in the placeholders
# before running this cell.
workspace_settings = dict(
    subscription_id="<SUBSCRIPTION_ID>",
    resource_group="<RESOURCE_GROUP>",
    workspace_name="<WORKSPACE_NAME>",
    workspace_region="<WORKSPACE_REGION>",
)
ws = azureml_utils.get_or_create_workspace(**workspace_settings)

# One line per workspace attribute.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print("\n".join(workspace_summary))

If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


Performing interactive authentication. Please follow the instructions on the terminal.


W0612 09:49:08.846869 29376 _profile.py:1082] Note, we have launched a browser for you to login. For old experience with device code, use "az login --use-device-code"
W0612 09:49:20.330181 40168 _profile.py:774] You have logged in. Now let us find all the subscriptions to which you have access...


Interactive authentication successfully completed.
Workspace name: MAIDAPTest
Azure region: eastus2
Subscription id: <SUBSCRIPTION_ID>
Resource group: nlprg


In [4]:
#Take out when notebook is ready for publishing
# Reload the cached feature tables from disk, in the order
# training -> testing -> validation.
cached_feature_files = (
    "Data/training_set.csv",
    "Data/testing_set.csv",
    "Data/validation_set.csv",
)
training_data, testing_data, validation_data = map(pd.read_csv, cached_feature_files)

In [11]:
def _split_features_and_target(frame):
    """Return ``(features, target)`` for one feature table.

    ``features`` holds every column whose name contains 'USE';
    ``target`` is the 'score' column.
    """
    feature_columns = [column for column in frame.columns if 'USE' in column]
    return frame[feature_columns], frame['score']


train_x, train_y = _split_features_and_target(training_data)
validation_x, validation_y = _split_features_and_target(validation_data)
test_x, test_y = _split_features_and_target(testing_data)

In [12]:
train_x.shape

(5749, 1024)

In [13]:
# Keyword arguments that are unpacked into AutoMLConfig further down.
automl_settings = dict(
    iteration_timeout_minutes=15,
    iterations=50,
    primary_metric='spearman_correlation',
    preprocess=True,
    verbosity=logging.ERROR,
)

In [14]:
# local compute
# Configure a regression AutoML run: the train_x/train_y arrays are used for
# fitting and validation_x/validation_y are passed as the explicit validation
# set. The remaining options are taken from the automl_settings dict.
# y and y_valid are flattened to 1-D arrays before being passed in.
automated_ml_config = AutoMLConfig(task = 'regression',
                     debug_log = 'automated_ml_errors.log',
                     path = './automated-ml-regression',
                     X = train_x.values,
                     y = train_y.values.flatten(),
                     X_valid = validation_x.values,
                     y_valid = validation_y.values.flatten(),
                     **automl_settings)

In [15]:
# Create (or reference) the experiment named 'automated-ml-regression' in the
# workspace `ws` and submit the AutoML configuration to it.
# show_output=True: presumably what streams the per-iteration progress table
# into the cell output -- confirm against the AzureML SDK docs.
experiment=Experiment(ws, 'automated-ml-regression')
local_run = experiment.submit(automated_ml_config, show_output=True)

Running on local machine
Parent Run ID: AutoML_089672db-2a8f-4d74-84ce-b3bee49733dd
Current status: DatasetFeaturization. Beginning to featurize the dataset.
Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturizationCompleted. Completed featurizing the dataset.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
ITERATION: The iteration being evaluated.
PIPELINE: A summary description of the pipeline being evaluated.
DURATION: Time taken for the current iteration.
METRIC: The result of computing score on the fitted pipeline.
BEST: The best observed score thus far.
****************************************************************************************************

 ITERATION   PIPELINE                                       DURATION      METRIC      BEST
         0 

In [16]:
RunDetails(local_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', 'sd…

In [17]:
widget_data = RunDetails(local_run).get_widget_data()

In [21]:
# Register the model produced by the AutoML run, with a free-text description
# and no tags.
description = 'AutoML Sentence Similarity Model'
tags = None
model = local_run.register_model(description = description, tags = tags)

# Print the id of the registered model as exposed on the run object.
print(local_run.model_id) # This will be written to the script file later in the notebook.

Registering model AutoML089672db2best
AutoML089672db2best


In [22]:
# Retrieve the run and fitted pipeline selected by the metric named in
# lookup_metric (the same metric name configured as primary_metric above).
lookup_metric = "spearman_correlation"
best_run, fitted_model = local_run.get_output(metric = lookup_metric)
# Show the run summary and the structure of the fitted pipeline.
print(best_run)
print(fitted_model)

Run(Experiment: automated-ml-regression,
Id: AutoML_089672db-2a8f-4d74-84ce-b3bee49733dd_48,
Type: None,
Status: Completed)
RegressionPipeline(pipeline=Pipeline(memory=None,
     steps=[('datatransformer', DataTransformer(enable_feature_sweeping=None, feature_sweeping_timeout=None,
        is_onnx_compatible=None, logger=None, observer=None, task=None)), ('prefittedsoftvotingregressor', PreFittedSoftVotingRegressor(estimators=[('24', Pipeline(memory=None,
     steps=[('stand...333333333333, 0.06666666666666667, 0.06666666666666667, 0.06666666666666667, 0.06666666666666667]))]),
          stddev=None)


In [23]:
# Score the held-out test features with the fitted pipeline and report the
# Pearson correlation between predictions and gold scores.
# NOTE(review): the model was selected on 'spearman_correlation' but is
# evaluated here with Pearson (as the cosine baseline is) -- confirm this
# mismatch is intentional.
y_pred = fitted_model.predict(test_x.values)
print(pearsonr(y_pred, test_y)[0])

0.7817644762555442


In [52]:
fitted_model.pipeline.steps[1][1].get_params()

{'estimators': [('24', Pipeline(memory=None,
        steps=[('standardscalerwrapper', <automl.client.core.common.model_wrappers.StandardScalerWrapper object at 0x000002669728A1D0>), ('xgboostregressor', XGBoostRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
            colsample_bytree=1, eta=0.01, gamma=0, learning_rate=0.1,
            ma...ale_pos_weight=1, seed=None,
            silent=True, subsample=0.7, tree_method='auto', verbose=-10))])),
  ('33', Pipeline(memory=None,
        steps=[('maxabsscaler', MaxAbsScaler(copy=True)), ('lightgbmregressor', LightGBMRegressor(boosting_type='gbdt', class_weight=None,
            colsample_bytree=0.7000000000000001, importance_type='split',
            learning_rate=0.16842263157894738, max_bin=7, max_depth=3,
            min_child_samples=14,...ue, subsample=0.5499999999999999,
            subsample_for_bin=200000, subsample_freq=3, verbose=-1))])),
  ('25', Pipeline(memory=None,
        steps=[('truncatedsvdwrapper', Tru

In [44]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone joblib package and fall back to the vendored copy
# only on older environments where it is not installed.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# File the fitted AutoML pipeline is serialised to (relative to the working
# directory).
model_path = 'sentence_similarity_regressor.pkl'

joblib.dump(fitted_model, model_path)

['sentence_similarity_regressor.pkl']

In [53]:
m2 = joblib.load('sentence_similarity_regressor.pkl')

In [54]:
m2

RegressionPipeline(pipeline=Pipeline(memory=None,
     steps=[('datatransformer', DataTransformer(enable_feature_sweeping=None, feature_sweeping_timeout=None,
        is_onnx_compatible=None, logger=None, observer=None, task=None)), ('prefittedsoftvotingregressor', PreFittedSoftVotingRegressor(estimators=[('24', Pipeline(memory=None,
     steps=[('stand...333333333333, 0.06666666666666667, 0.06666666666666667, 0.06666666666666667, 0.06666666666666667]))]),
          stddev=None)