In [None]:
import os
import azureml.core
import pandas as pd
from azureml.core.runconfig import JarLibrary
from azureml.core.compute import ComputeTarget, DatabricksCompute
from azureml.exceptions import ComputeTargetException
from azureml.core.datastore import Datastore
from azureml.data.data_reference import DataReference
from azureml.core.databricks import PyPiLibrary

from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveConfig, PrimaryMetricGoal
from azureml.core import Workspace, Environment, Experiment, Datastore, Dataset, ScriptRunConfig
from azureml.pipeline.core import Pipeline, PipelineData, TrainingOutput
from azureml.pipeline.steps import DatabricksStep, PythonScriptStep
from azureml.train.hyperdrive import choice, loguniform

# Echo the azureml-core version loaded in this kernel.
print(
    "SDK version:",
    azureml.core.VERSION,
)


In [None]:
# Connect to the workspace via Workspace.from_config(), then echo its name,
# resource group, location and subscription id, one value per line.
ws = Workspace.from_config()
print('\n'.join(
    str(field)
    for field in (ws.name, ws.resource_group, ws.location, ws.subscription_id)
))


In [None]:
# Name of the Databricks compute to look up in the workspace.
db_compute_name = "Databricks"

# Resolve the compute by name; the confirmation below is only reached when the
# constructor call above did not raise.
databricks_compute = DatabricksCompute(
    name=db_compute_name,
    workspace=ws,
)
print('Compute target {} already exists'.format(db_compute_name))


In [None]:
from azureml.pipeline.core import PipelineParameter
from azureml.pipeline.core.pipeline_output_dataset import PipelineOutputAbstractDataset

# Fetch the workspace's default datastore and report its name.
def_blob_store = ws.get_default_datastore()
print(
    'Datastore {} will be used'.format(def_blob_store.name)
)


In [None]:
# step_output_train = PipelineData("output_train", datastore=def_blob_store)
# step_output_validation = PipelineData("output_validation", datastore=def_blob_store)
# step_output_test = PipelineData("output_test", datastore=def_blob_store)
# step_output_temporal_test = PipelineData("output_temporal_test", datastore=def_blob_store)
# 
# ds_step_output_train = step_output_train.as_dataset()
# ds_step_output_validation = step_output_validation.as_dataset()
# ds_step_output_test = step_output_test.as_dataset()
# ds_step_output_temporal_test = step_output_temporal_test.as_dataset()


In [None]:
# ds_base_dataframe = Dataset.get_by_name(ws, 'base_dataframe')
# print(ds_base_dataframe.tags)
# ds_base_dataframe.tags['temporal_date']

In [None]:
# Folder passed as `source_directory` to every pipeline step in this notebook.
source_directory = './project'
# Script run by the ADB_Feature_Eng Databricks step.
preprocessing_script_name = 'preprocessing_factory.py'


In [None]:
# Value forwarded to the Databricks scripts as --base_file_name.
base_file_name = 'ecd_tickets_cleaned_2_more_withNewLongDescs'
# Existing Databricks cluster used by the Databricks steps
# (cluster label: "AML Cluster do not use please").
cluster_id = '0916-144740-3ql755ed'

# First pipeline step: runs prep_base_dataset_from_SQL.py on the existing
# Databricks cluster, passing only the base file name. No extra PyPI libraries
# are requested for this step.
# NOTE(review): allow_reuse=True means an unchanged script/argument set may be
# served from a previous run; presumably the script reads an external SQL
# source, in which case new rows would not trigger a re-run -- confirm.
adb_prep_base = DatabricksStep(
    name="ADB_Prep_Base",
    run_name='ADB_Prep_Base',
    source_directory=source_directory,
    python_script_name='prep_base_dataset_from_SQL.py',
    python_script_params=['--base_file_name', base_file_name],
    pypi_libraries=[],
    compute_target=databricks_compute,
    existing_cluster_id=cluster_id,
    permit_cluster_restart=True,
    allow_reuse=True,
)


In [None]:

# Feature-engineering settings. Every value is a string because it is forwarded
# as a command-line value to the preprocessing script.
# The date-like values look like YYYYMM -- TODO confirm against the script.
cut_off_for_training = "201808"
valid_classes_period = "202105"
cut_off_date_recent = "202107"
# Last training date. Keep it 1 month behind the current date (or use the
# present date).
temporal_test_date = "202209"
logic_v = "4"
top_n = "120"
base_file_name = 'ecd_tickets_cleaned_2_more_withNewLongDescs'

# Second pipeline step: runs the preprocessing script on the same existing
# Databricks cluster, with the feature-engineering settings passed as
# flag/value pairs.
# NOTE(review): this step is tied to ADB_Prep_Base only through run_after, and
# allow_reuse=True keys reuse on this step's own script/arguments -- confirm
# that a refreshed base dataset is still picked up.
adb_prep_step = DatabricksStep(
    name="ADB_Feature_Eng",
    run_name='ADB_Feature_Eng',
    source_directory=source_directory,
    python_script_name=preprocessing_script_name,
    python_script_params=[
        '--cut_off_for_training', cut_off_for_training,
        '--valid_classes_period', valid_classes_period,
        '--cut_off_date_recent', cut_off_date_recent,
        '--temporal_test_date', temporal_test_date,
        '--logic_v', logic_v,
        '--top_n', top_n,
        '--base_file_name', base_file_name,
    ],
    # Libraries requested for this step, one PyPiLibrary per package.
    pypi_libraries=[
        PyPiLibrary(package=package_name)
        for package_name in ('azureml-sdk', 'fsspec', 'plotly', 'kaleido')
    ],
    compute_target=databricks_compute,
    existing_cluster_id=cluster_id,
    permit_cluster_restart=True,
    allow_reuse=True,
)

# Feature engineering must wait for the base dataset preparation.
adb_prep_step.run_after(adb_prep_base)


In [None]:
# exp = Experiment(workspace=ws, name='transformer_hp')
# 
# steps = [dbNbStep]
# pipeline = Pipeline(workspace=ws, steps=steps)
# pipeline_run = exp.submit(pipeline)

In [None]:
# pipeline_run

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the compute used for training; it is looked up by name, not created.
cluster_name = "NC6s-v3-SingleNode"
compute_target = ComputeTarget(
    name=cluster_name,
    workspace=ws,
)

In [None]:
# Registered environment shared by the training, registration and deployment steps.
env = Environment.get(
    name="Transformer-DeBerta",
    workspace=ws,
)

In [None]:
from azureml.core import ScriptRunConfig

# Fixed command-line arguments for train_transformer.py, written as
# (flag, value) pairs and flattened into the flat list ScriptRunConfig receives.
# Integer values stay integers, exactly as in a hand-written flat list.
args = [
    token
    for flag_and_value in (
        ('--target-name', 'target'),
        ('--text-field', 'TEXT_FINAL'),
        ('--is-test', 0),
        ('--is-final', 1),
        ('--is-jump', 0),
        ('--is-local', 0),
        ('--evaluation-strategy', "epoch"),
    )
    for token in flag_and_value
]

# Run configuration for one training trial: the training script from the project
# folder, the fixed arguments above, the training compute and the shared environment.
src = ScriptRunConfig(
    script='train_transformer.py',
    source_directory=source_directory,
    arguments=args,
    environment=env,
    compute_target=compute_target,
)


In [None]:
from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveConfig, PrimaryMetricGoal
from azureml.train.hyperdrive import choice, loguniform

# Hyperparameter search space. Every entry is a single-value `choice`, so the
# space currently contains exactly one configuration.
ps = RandomParameterSampling({
    # Other checkpoints noted by the author: "bert-large-cased",
    # "microsoft/deberta-v3-small", "distilbert-base-uncased", "bert-base-uncased".
    '--base-checkpoint': choice("bert-base-cased"),
    '--batch-size': choice(16),
    '--no-epochs': choice(4),
    # Other learning rates noted by the author:
    # 5e-5, 4.5e-5, 4e-5, 5.5e-5, 6e-5, 3.5e-5, 6.5e-5.
    '--learning-rate': choice(5.5e-5),
    '--warmup-steps': choice(0),
    '--weight-decay': choice(0.0),
    '--adam-beta1': choice(0.9),
    '--adam-beta2': choice(0.999),
    '--adam-epsilon': choice(1e-8),
})


In [None]:
# Early-termination policy for the HyperDrive trials.
policy = BanditPolicy(slack_factor=0.1, evaluation_interval=5)

# HyperDrive configuration: maximise eval_f1_weighted over the sampled
# configurations, with at most 20 trials in total and 3 running at once.
# NOTE(review): the sampling space defined in this notebook holds a single
# configuration -- confirm that max_total_runs=20 is intended.
hyperdrive_config = HyperDriveConfig(
    run_config=src,
    hyperparameter_sampling=ps,
    policy=policy,
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    primary_metric_name='eval_f1_weighted',
    max_concurrent_runs=3,
    max_total_runs=20,
)


In [None]:
from azureml.pipeline.steps import HyperDriveStep, HyperDriveStepRun, PythonScriptStep

hd_step_name = 'HyperDrive_Step'

# Pipeline step that runs the HyperDrive sweep configured above.
#
# Reuse is disabled on purpose. The step is linked to the feature-engineering
# step only through run_after, and none of its own arguments carry the prep
# parameters (for example the temporal test date). Step reuse is decided from
# this step's own script, arguments and declared inputs, so with
# allow_reuse=True a refreshed dataset would not cause retraining and the
# previous sweep would be replayed instead.
hd_step = HyperDriveStep(
    name=hd_step_name,
    hyperdrive_config=hyperdrive_config,
    allow_reuse=False)

# Training must wait for feature engineering to finish.
hd_step.run_after(adb_prep_step)


In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Compute used by the registration and deployment steps, looked up by name.
reg_compute_target = ComputeTarget(
    name="NC6s-v3-SingleNode",
    workspace=ws,
)

In [None]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# conda_dep = CondaDependencies()
# conda_dep.add_pip_package("azureml-sdk")
# conda_dep.env

# Run configuration for the registration script: defaults plus the shared
# environment fetched earlier in the notebook.
rcfg = RunConfiguration()
rcfg.environment = env

# Step that runs register_model.py after the HyperDrive sweep.
#
# Reuse is disabled on purpose. The step is expected to have an external side
# effect (registering a model), and the sweep results reach it only through
# run_after ordering, not through declared inputs. With allow_reuse=True and an
# unchanged script/argument list, the step could be skipped and its cached
# result replayed even after a fresh training run.
register_model_step = PythonScriptStep(
    name="Register_Best_Model",
    script_name='register_model.py',
    source_directory=source_directory,
    compute_target=reg_compute_target,
    runconfig=rcfg,
    arguments=['--is-test', 0,
               '--test-run-id', '',
               '--metric-name', 'temporal_test_f1_weighted',
               '--second-metric', 'temporal_test_f1',
               '--temporal-test-date', temporal_test_date,
               '--model-name', 'service_desk_concierge'],
    allow_reuse=False)

# Registration must wait for the hyperparameter sweep to finish.
register_model_step.run_after(hd_step)


In [None]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# conda_dep = CondaDependencies()
# conda_dep.add_pip_package("azureml-sdk")
# conda_dep.env

# Run configuration for the deployment script: defaults plus the shared
# environment fetched earlier in the notebook.
rcfg = RunConfiguration()
rcfg.environment = env

# Step that runs deploy_model.py after model registration.
#
# Reuse is disabled on purpose. The arguments are fully static and the step is
# linked to registration only through run_after. With allow_reuse=True, every
# re-run that leaves the script folder unchanged would replay the cached result
# instead of deploying, so a newly registered model would not reach the endpoint.
deploy_model_step = PythonScriptStep(
    name="Deploy_Latest_Model",
    script_name='deploy_model.py',
    source_directory=source_directory,
    compute_target=reg_compute_target,
    runconfig=rcfg,
    arguments=['--endpoint-name', 'help-desk-service-prod',
               '--model-name', 'service_desk_concierge'],
    allow_reuse=False)

# Deployment must wait for the model registration step.
deploy_model_step.run_after(register_model_step)


In [None]:
# Experiment whose name is used when the pipeline is submitted.
exp = Experiment(name='transformer_hp', workspace=ws)

# Only the final step is listed; the earlier steps are linked to it through the
# run_after chain declared on each step.
steps = [deploy_model_step]
pipeline = Pipeline(
    steps=steps,
    workspace=ws,
)

In [None]:
from datetime import datetime

# Minute-resolution timestamp embedded in the published pipeline's name.
timenow = datetime.now().strftime('%Y-%m-%d-%H-%M')

pipeline_name = ''.join(['ECD-horizon-', timenow, "-Pipeline"])
print(pipeline_name)

# Publish the pipeline, using the timestamped name as both name and description,
# then report the id of the published pipeline.
published_pipeline = pipeline.publish(
    description=pipeline_name,
    name=pipeline_name,
)
print(
    "Newly published pipeline id: {}".format(published_pipeline.id)
)

In [None]:
# Submit the pipeline under the experiment's name with credential passthrough
# enabled; being the cell's last expression, the returned run is displayed.
pipeline.submit(
    experiment_name=exp.name,
    credential_passthrough=True,
)