In [1]:
import os
import sys
from pathlib import Path

import datarobot as dr # type: ignore
from dotenv import load_dotenv # type: ignore

# The notebook should be executed from the project root directory
if "_correct_path" not in locals():
    os.chdir("..")
    sys.path.append(".")
    print(f"changed dir to {Path('.').resolve()})")
    _correct_path = True
load_dotenv()
client = dr.Client()

changed dir to /Users/farooq.azam/SAP_Related_Code/predictive-ai-starter)


In [2]:
print("Client version:", dr.__version__)

Client version: 3.6.0


In [3]:
import warnings
from datarobot.errors import DataRobotDeprecationWarning # type: ignore
warnings.filterwarnings('ignore', category=DataRobotDeprecationWarning)

In [4]:
from datarobotx.idp.use_cases import get_or_create_use_case # type: ignore
from infra.settings_main import use_case_args

if "DATAROBOT_DEFAULT_USE_CASE" in os.environ:
    use_case_id = os.environ["DATAROBOT_DEFAULT_USE_CASE"]
else:
    use_case_id = get_or_create_use_case(
        endpoint=client.endpoint,
        token=client.token,
        name=use_case_args.resource_name,
        description=use_case_args.description,
    )
    print(f"Use case {use_case_args.resource_name} created with id {use_case_id}")

Use case Recipe Template Use Case [LPM_FA_V22] created with id 677095535fe1ddf91d6057b6


# Data Ingest and Preparation

In [5]:
import pandas as pd # type: ignore
from infra.settings_datasets import training_dataset
from datarobotx.idp.datasets import get_or_create_dataset_from_datasource # type: ignore
from datarobotx.idp.datasource import get_or_create_datasource # type: ignore
from datarobotx.idp.datastore import get_or_create_datastore # type: ignore
from datarobotx.idp.credentials import get_replace_or_create_credential # type: ignore


sap_dsp_data_store_canonical_name=os.getenv("SAP_DSP_DATA_STORE_CANONICAL_NAME")
sap_dsp_host_name=os.getenv("SAP_DSP_HOST_NAME")
sap_dsp_port=os.getenv("SAP_DSP_PORT")
sap_dsp_data_source=os.getenv("SAP_DSP_DATA_SOURCE") 
sap_dsp_data_query=os.getenv("SAP_DSP_LATE_PAYMENTS_DATA_QUERY")
sap_dsp_training_data_set=os.getenv("SAP_DSP_LATE_PAYMENTS_TRAINING_DATA_SET")
sap_dsp_credentials=os.getenv("SAP_DSP_CREDENTIALS")

sap_dsp_credentials_id = get_replace_or_create_credential(
    endpoint=client.endpoint,
    token=client.token,
    name=sap_dsp_credentials,
    credential_type="basic",
)
print(f"SAP Credentials ID - {sap_dsp_credentials_id}")
sap_dsp_credentials = dr.Credential.get(sap_dsp_credentials_id)
print(f"SAP Credentials - {sap_dsp_credentials}")
sap_dsp_data_store_id=get_or_create_datastore(
    endpoint=client.endpoint,
    token=client.token,
    canonical_name=sap_dsp_data_store_canonical_name,
    driver_id='66c8ecdd45d2b5465fd74b49',
    data_store_type='dr-database-v1',
    fields=[{"id":"host","name":"Host Name","value":sap_dsp_host_name},{"id":"port","name":"port","value":sap_dsp_port}],
)
sap_dsp_data_store=dr.DataStore.get(sap_dsp_data_store_id)
print(f"SAP Datastore ID - {sap_dsp_data_store_id}")
print(f"SAP Datastore - {sap_dsp_data_store}")
params = dr.DataSourceParameters(
    data_store_id=sap_dsp_data_store.id,
    query=sap_dsp_data_query,
)
sap_dsp_data_source_id = get_or_create_datasource(
    endpoint=client.endpoint,
    token=client.token,
    data_source_type='dr-database-v1', 
    canonical_name=sap_dsp_data_source,
    params=params
)
print(f"SAP Datasource ID - {sap_dsp_data_source_id}")
print(f"SAP Datasource - {dr.DataSource.get(sap_dsp_data_source_id)}")
training_dataset_id=get_or_create_dataset_from_datasource(
    endpoint=client.endpoint,
    token=client.token,
    data_source_id=sap_dsp_data_source_id,
    name=sap_dsp_training_data_set,
    credential_id=sap_dsp_credentials.credential_id
)
print(f"SAP Training data set ID - {training_dataset_id}")
print(f"SAP Training data set - {dr.Dataset.get(training_dataset_id)}")
training_dataset=dr.Dataset.get(training_dataset_id)
training_dataset.training_data_id=training_dataset_id

SAP Credentials ID - 6747707642a92d74cd796960
SAP Credentials - Credential('6747707642a92d74cd796960', 'DR_SAP_TEMPLATE_CRED', 'basic')
SAP Datastore ID - 676c46ce57b0ab717e45149a
SAP Datastore - DataStore('DR_SAP_TEMPLATE [72d5954]')
SAP Datasource ID - 676c46d257b0ab717e45149b
SAP Datasource - DataSource('LATE_PAYMENTS_TRAINING_DATA_DSP [4f747c4]')
SAP Training data set ID - 676cad9bfc42109cd1340d6c
SAP Training data set - Dataset(name='DRS_LATE_PAYMENTS_TRAINING_DATA_VIEW_DSP [3092c80]', id='676cad9bfc42109cd1340d6c')


# Model Training

In [6]:
def get_or_create_featurelist(dataset: dr.Dataset, name: str, features: list[str]) -> str:
    try:
        feature_lists = dataset.get_featurelists()
        for feature_list in feature_lists:
            if feature_list.name == name:   
                return feature_list.id
        raise dr.errors.ClientError("Could not find featurelist!",404)
    except dr.errors.ClientError:
        print(f"Creating featurelist {name}")
        featurelist = dataset.create_featurelist(name=name, features=features)
    return featurelist.id

In [7]:
self_join_features = [
    "Days_Late", #target, invoice due date to actual paid date
    "Days_to_Ship", #historical information, order date to ship date
    "ORDER_DATE",
    "INVOICE_DUE_DATE",
    "Payment_Status", #late, on time, early
    "EXPECTED_AMOUNT",
    "PAYMENT_TERM",
    "CUSTOMER_NAME",
    "ACTUAL_INVOICED_QUANTITY_CASES",
    "MATERIAL_NAME", 
]
known_features = [
    "ORDER_DATE",
    "EXPECTED_AMOUNT",
    "PAYMENT_TERM",
    "CUSTOMER_NAME",
    "ACTUAL_INVOICED_QUANTITY_CASES",
    "MATERIAL_NAME", 
]

In [8]:
SELF_JOIN_LIST_NAME = "self_join_features_list"
KNOWN_FEATURES_LIST_NAME = "known_features_list"
self_join_features_list_id = get_or_create_featurelist(training_dataset, SELF_JOIN_LIST_NAME, self_join_features)
known_features_list_id = get_or_create_featurelist(training_dataset, KNOWN_FEATURES_LIST_NAME, known_features)
print(f"Self join feature list ID: {self_join_features_list_id}")
print(f"Known features list ID: {known_features_list_id}")

Self join feature list ID: 676cae1ee757893abb576c28
Known features list ID: 676cae1e1004162e5a8e54d7


In [9]:
from infra.settings_main import project_name
xpa_use_case=dr.UseCase.get(use_case_id)
project = dr.Project.create_from_dataset(
    dataset_id=training_dataset.id,
    project_name=f"Recipe Template Project [{project_name}]",
    use_case=xpa_use_case
)
print(f"[{project_name}] Project ID: {project.id}")

[LPM_FA_V22] Project ID: 6770a228734fd7a7927176e2


In [10]:
feature_lists_for_project = project.get_featurelists()
primary_featurelist = [ flist for flist in feature_lists_for_project if flist.name.find("known") != -1][0]
primary_featurelist_id = primary_featurelist.id
print(f"Primary feature list ID: {primary_featurelist_id}")

Primary feature list ID: 6770a232bae86891c5751167


In [11]:
# customer_secondary = dr.DatasetDefinition(
customer_secondary_dataset=dr.helpers.feature_discovery.DatasetDefinition(
    identifier="CUSTOMER_DATASET", #name of the secondary dataset
    catalog_id=training_dataset.id,
    catalog_version_id=training_dataset.version_id,
    primary_temporal_key="ORDER_DATE",
    feature_list_id=self_join_features_list_id,
    snapshot_policy = "latest",
)


In [12]:
#Define the relationship between the primary and secondary datasets
#customer_relationship = dr.Relationship(
customer_relationship = dr.helpers.feature_discovery.Relationship(
    dataset2_identifier="CUSTOMER_DATASET",
    dataset1_keys=["CUSTOMER_NAME"],
    dataset2_keys=["CUSTOMER_NAME"],
    feature_derivation_windows=[
        {"start": -91, "end": -61, "unit": "DAY"},
        {"start": -151, "end": -61, "unit": "DAY"},
    ],
    prediction_point_rounding=1,
    prediction_point_rounding_time_unit="DAY",
)


In [13]:
# Put the  data set definitions and four relationships into lists
datasets_definitions = [customer_secondary_dataset]
datasets_relationships = [customer_relationship]

In [14]:
# Define the aggregations to be created
feature_discovery_settings=[  
        {"name": "enable_days_from_prediction_point", "value": True},
        {"name": "enable_hour", "value": False},
        {"name": "enable_categorical_num_unique", "value": True},
        {"name": "enable_categorical_statistics", "value": True},
        {"name": "enable_numeric_minimum", "value": True},
        {"name": "enable_token_counts", "value": False},
        {"name": "enable_latest_value", "value": True},
        {"name": "enable_numeric_standard_deviation", "value": True},
        {"name": "enable_numeric_skewness", "value": True},
        {"name": "enable_day_of_week", "value": True},
        {"name": "enable_entropy", "value": True},
        {"name": "enable_numeric_median", "value": True},
        {"name": "enable_word_count", "value": False},
        {"name": "enable_pairwise_time_difference", "value": True},
        {"name": "enable_days_since_previous_event", "value": True},
        {"name": "enable_numeric_maximum", "value": True},
        {"name": "enable_numeric_kurtosis", "value": True},
        {"name": "enable_most_frequent", "value": True},
        {"name": "enable_day", "value": True},
        {"name": "enable_numeric_average", "value": True},
        {"name": "enable_summarized_counts", "value": True},
        {"name": "enable_missing_count", "value": False},
        {"name": "enable_record_count", "value": True},
        {"name": "enable_numeric_sum", "value": True},
]

In [15]:
# Create the relationships configuration to define connection between the datasets
#relationship_config=dr.RelationshipsConfiguration.create(
relationship_config = dr.RelationshipsConfiguration.create(
    dataset_definitions=datasets_definitions,
    relationships=datasets_relationships,
    feature_discovery_settings=feature_discovery_settings,
)
print(f"Relationship configuration ID: {relationship_config.id}")

Relationship configuration ID: 6770a25ad556b7e6126057dd


In [16]:
from datarobotx.idp.autopilot import get_or_create_autopilot_run # type: ignore
from datarobotx.idp.registered_model_versions import ( # type: ignore
    get_or_create_registered_leaderboard_model_version,
)
advanced_options = dr.AdvancedOptions(
    blend_best_models=False, 
    prepare_model_for_deployment=True,
    seed=0,
)
datetime_partition_spec = dr.DatetimePartitioningSpecification(
    datetime_partition_column="ORDER_DATE",
    use_time_series=False,  # This is not a time series project
    disable_holdout=False,
    holdout_start_date=pd.to_datetime("2023-12-01",format='%Y-%m-%d'),  # Adjust based on your data
    holdout_end_date=pd.to_datetime("2024-05-01",format='%Y-%m-%d') ,
    #validation_start_date="2023-12-01",  # Adjust based on your data
    validation_duration="P6M",
    #gap=61,  # Matching your feature derivation window gap
    number_of_backtests=2,
)

print("Running Autopilot...")
project.analyze_and_model(
    target="Days_Late",
    mode=dr.AUTOPILOT_MODE.QUICK,
    featurelist_id=primary_featurelist_id,
    relationships_configuration_id=relationship_config.id,
    feature_engineering_prediction_point="ORDER_DATE",
    partitioning_method=datetime_partition_spec,
    advanced_options=advanced_options,
    worker_count=-1, 
    max_wait=6000 * 6000 * 6000,
 )

project.wait_for_autopilot()
model_id = dr.ModelRecommendation.get(project.id).model_id
print(f"Recommended model ID: {model_id}")
registered_model_name = f"Recipe Template Registered Model [{project_name}]"
print("Registered recommended model...")
registered_model_version_id = get_or_create_registered_leaderboard_model_version(
    endpoint=client.endpoint,
    token=client.token,
    model_id=model_id,
    registered_model_name=registered_model_name,
)
print(f"Registered model version ID: {registered_model_version_id}")

Running Autopilot...
In progress: 15, queued: 0 (waited: 0s)
In progress: 15, queued: 0 (waited: 1s)
In progress: 15, queued: 0 (waited: 2s)
In progress: 15, queued: 0 (waited: 3s)
In progress: 15, queued: 0 (waited: 4s)
In progress: 15, queued: 0 (waited: 6s)
In progress: 15, queued: 0 (waited: 10s)
In progress: 15, queued: 0 (waited: 18s)
In progress: 15, queued: 0 (waited: 31s)
In progress: 15, queued: 0 (waited: 52s)
In progress: 15, queued: 0 (waited: 73s)
In progress: 15, queued: 0 (waited: 93s)
In progress: 10, queued: 0 (waited: 114s)
In progress: 9, queued: 0 (waited: 135s)
In progress: 9, queued: 0 (waited: 155s)
In progress: 9, queued: 0 (waited: 176s)
In progress: 8, queued: 0 (waited: 197s)
In progress: 8, queued: 0 (waited: 218s)
In progress: 8, queued: 0 (waited: 238s)
In progress: 7, queued: 0 (waited: 259s)
In progress: 6, queued: 0 (waited: 280s)
In progress: 5, queued: 0 (waited: 301s)
In progress: 5, queued: 0 (waited: 321s)
In progress: 5, queued: 0 (waited: 342s)


In [None]:
from datarobotx.idp.deployments import ( # type: ignore
    _lookup_registered_model_version,
    get_or_create_deployment_from_registered_model_version,
    get_replace_or_create_deployment_from_registered_model,
)
from infra.settings_main import model_training_output_path
import yaml # type: ignore

dr.PredictionServer.list()
prediction_server = dr.PredictionServer.list()[0]
deployment_name = f"Recipe Template Deployment [{project_name}]"
print("Creating deployment...")
deployment_id = get_or_create_deployment_from_registered_model_version(
    endpoint=client.endpoint,
    token=client.token,
    label=deployment_name,
    registered_model_version_id=registered_model_version_id,
    default_prediction_server_id=prediction_server.id,
)
print(f"Deployment ID: {deployment_id}")

scoring_related_data_file = "scoring_related_info.yaml"
scoring_related_data = {
    "use_case_id": use_case_id,
    "model_id": model_id,
    "project_id": project.id,
    "deployment_id": deployment_id,
    "registered_model_version_id": registered_model_version_id
    }
# wrtie to file for later use
with open(scoring_related_data_file, "w") as file:
    yaml.dump(scoring_related_data, file)
print(f"Saved scoring related information to {scoring_related_data_file}")

In [42]:
target = project.target

In [43]:
import yaml

from infra.settings_main import model_training_output_path
from starter.i18n import gettext
from starter.schema import AppSettings

print("Capturing settings required to deploy the frontend...")
registered_model = next(
    rm
    for rm in dr.RegisteredModel.list(search=registered_model_name)
    if rm.name == registered_model_name
)

app_settings = AppSettings(
    registered_model_version_id=registered_model_version_id,
    registered_model_name=registered_model_name,
    use_case_id=use_case_id,
    project_id=project.id,
    model_id=model_id,
    target=target,
    training_dataset_id=training_dataset_id,
    page_title=gettext("Predictive AI Starter"),
    page_description=gettext(
        "An application designed to simplify interactions with predictions while providing clear insights into the key drivers behind those predictions."
    ),
)
with open(model_training_output_path, "w") as f:
    yaml.dump(app_settings.model_dump(), f)

Capturing settings required to deploy the frontend...


FileNotFoundError: [Errno 2] No such file or directory: 'frontend/train_model_output.LPM_FA_V22.yaml'