# Autosetup
> Updated to version 3 of watsonx.data API corresponding to docs https://cloud.ibm.com/apidocs/watsonxdata-v3

> This notebook provisions Cloud Object Storage buckets, registers them in the watsonx.data instance, associates catalogs, creates the Presto and Spark engines and the Milvus service, and adds Postgres and Netezza as data sources.  
* Start Auto-setup only after :exclamation:**the watsonx.data quickstart is finished**; before that, the watsonx.data API used in this notebook is not available.
* Additionally, make sure you have `.env_setup` created in the same folder and filled with relevant information.
* COS instance and Postgres credentials should be added in `./credentials` folder in json format
* Package requirements to run this Jupyter notebook are listed in `requirements_autosetup.txt`



## Expected output in watsonx.data infrastructure manager
![](attachments/2025-05-23-19-34-59-pasted-vscode.png)


In [1]:
import os

import json
import random
import time
from datetime import datetime
import requests
import xmltodict
import io

import pandas as pd

from dotenv import load_dotenv

load_dotenv('./.env_setup')

True

## Initial configurations
Names of engines and catalogs for watsonx.data

In [2]:
# Unique suffix for this run: the current epoch timestamp with the decimal point removed.
current_timestamp = "".join(str(datetime.now().timestamp()).split("."))

# Settings collected throughout the notebook and written to the output env file at the end.
env_json = {}
# Logical bucket names; the actual COS bucket names get a unique suffix appended on creation.
buckets_to_create = ["wxd", "hive", "milvus", "input-data"]

# Display names for the engines and the Milvus service.
presto_engine_name = "presto_" + current_timestamp
spark_engine_name = "spark_" + current_timestamp
milvus_servis_name = "milvus_" + current_timestamp

# Catalog definitions: the catalog name plus its watsonx.data catalog type.
hive_catalog = dict(catalog_name="hive_catalog", catalog_type="hive-hadoop2")
iceberg_catalog = dict(catalog_name="iceberg_data", catalog_type="iceberg")
postgres_catalog = dict(catalog_name="postgres_catalog", catalog_type="postgresql")
netezza_catalog = dict(catalog_name="nz_catalog", catalog_type="netezza")

## Credentials and configurations based on env and credentials folder

### Read in environmental variables

In [3]:
# environmental variables for .data API support
# Warn early about unset variables: os.getenv returns None for them, which would
# otherwise end up as the literal text "None" inside the URLs built below.
for required_var in ("IDENTITY_URL", "WXD_INSTANCE_HOST", "WXD_REGION"):
    if not os.getenv(required_var):
        print(f"WARNING: {required_var} is not set - check .env_setup")

# urls
identityURL = os.getenv("IDENTITY_URL")
# The host is read into a variable first: reusing the same quote character inside an
# f-string replacement field is a SyntaxError before Python 3.12.
wxd_instance_host = os.getenv("WXD_INSTANCE_HOST")
wxd_url = f"https://{wxd_instance_host}/lakehouse/api/v3"
# buckets location the same as watsonx.data
buckets_location = os.getenv("WXD_REGION")
bucket_endpoint = f"https://s3.{buckets_location}.cloud-object-storage.appdomain.cloud"

print("watsonx.data url", wxd_url)
print(f"COS endpoint - the same as watsonx.data location {buckets_location}", bucket_endpoint)

watsonx.data url https://console-ibm-cator.lakehouse.saas.ibm.com/lakehouse/api/v3
COS endpoint - the same as watsonx.data location ca-tor https://s3.ca-tor.cloud-object-storage.appdomain.cloud


In [4]:
# Netezza credentials: each "nz_<field>" entry is read from the matching "NZ_<FIELD>"
# environment variable (None when the variable is not set).
nz_credentials = {
    f"nz_{field}": os.getenv(f"NZ_{field.upper()}")
    for field in ("database", "host", "port", "username", "password")
}

In [5]:
# Display the configured COS credentials path as a quick sanity check.
os.environ.get('COS_CREDENTIALS_PATH')

'./credentials/cos.json'

### Parsing COS Credentials

In [6]:
# Read the COS service credentials (JSON) from the path configured in the environment.
cos_credentials_path = os.getenv('COS_CREDENTIALS_PATH')
with open(cos_credentials_path) as j_f:
    cos_credentials_json = json.loads(j_f.read())

In [7]:
# Where each COS setting lives inside the credentials JSON (path of nested keys).
cos_field_paths = (
    ('COS_API_KEY', ('apikey',)),
    ('COS_ACCESS_KEY', ('cos_hmac_keys', 'access_key_id')),
    ('COS_SECRET_KEY', ('cos_hmac_keys', 'secret_access_key')),
    ('COS_INSTANCE_CRN', ('resource_instance_id',)),
)
for env_key, json_path in cos_field_paths:
    field_value = cos_credentials_json
    # walk down the nested keys to reach the value
    for path_part in json_path:
        field_value = field_value[path_part]
    env_json[env_key] = field_value

In [8]:
# wx.data credentials: API key and instance CRN come from the environment,
# the user name is a fixed value.
env_json.update({
    'CLOUD_API_KEY': os.getenv("CLOUD_API_KEY"),
    'WXD_INSTANCE_CRN': os.getenv("WXD_INSTANCE_CRN"),
    'WXD_USER': "ibmlhapikey",
})

### Generate token and use it for the session

In [9]:
def generate_token(verify=True, timeout=60):
    """Generate the access token used to authenticate the other requests.

    Posts the API key stored in ``env_json['CLOUD_API_KEY']`` to ``identityURL``
    with the ``urn:ibm:params:oauth:grant-type:apikey`` grant type.

    Args:
        verify: Forwarded to ``requests.post``. TLS certificate verification is
            on by default because the request carries the API key; pass a CA
            bundle path (or ``False``) only when the environment requires it.
        timeout: Seconds to wait for the identity service before giving up.

    Returns:
        The ``access_token`` value of the JSON response.

    Raises:
        requests.HTTPError: when the identity service answers with an error status.
    """
    headers = {
        "Content-Type": "application/x-www-form-urlencoded"
    }

    payload = {
        'grant_type': 'urn:ibm:params:oauth:grant-type:apikey',
        'apikey': f"{env_json['CLOUD_API_KEY']}"
    }

    res = requests.post(identityURL, headers=headers, data=payload, verify=verify, timeout=timeout)
    if res.status_code in [200, 201, 202]:
        print("Successfully generated token")
    else:
        print("Code for token generation", res.status_code)
        print("Message", res.text)
        # Fail here with the HTTP error instead of a confusing KeyError on 'access_token'.
        res.raise_for_status()

    return res.json()['access_token']

### Functions to create sessions

In [10]:
def create_cos_session(access_token):
    """Return a ``requests.Session`` whose only headers are the COS auth headers.

    The session carries the bearer ``access_token`` and the COS instance CRN
    taken from ``env_json['COS_INSTANCE_CRN']``.
    """
    cos_session = requests.Session()
    # drop the default headers so that only the ones set below are sent
    cos_session.headers.clear()
    cos_session.headers.update({
        "Authorization": f"Bearer {access_token}",
        "ibm-service-instance-id": env_json['COS_INSTANCE_CRN'],
    })
    print("COS API session created")
    return cos_session

In [11]:
def create_wxdata_session(access_token):
    """Return a ``requests.Session`` preconfigured for the watsonx.data API.

    The session sends JSON content headers, the bearer ``access_token`` and the
    instance CRN from ``env_json['WXD_INSTANCE_CRN']`` with every request.
    """
    instance_crn = str(env_json['WXD_INSTANCE_CRN'])
    wxd_session = requests.Session()
    # replace the default headers entirely with the watsonx.data ones
    wxd_session.headers.clear()
    wxd_session.headers.update({
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Connection": "keep-alive",
        "Authorization": f"Bearer {access_token}",
        "AuthInstanceID": instance_crn,
    })
    print("wx data API session created")
    return wxd_session

## COS

### COS authentication -> session

In [12]:
# Get a fresh token and open the COS session (`s_cos`) used by the bucket cells below.
cur_token = generate_token()
s_cos = create_cos_session(cur_token)



Successfully generated token
COS API session created


### Create bucket

[bucket_endpoints](https://cloud.ibm.com/docs/cloud-object-storage?topic=cloud-object-storage-endpoints)  
[COS API](https://cloud.ibm.com/apidocs/cos/cos-compatibility#createbucket)

In [13]:
# Sanity check: show the raw response text of a GET on the COS endpoint (the bucket listing).
s_cos.get(bucket_endpoint).text

'<?xml version="1.0" encoding="UTF-8" standalone="yes"?><ListAllMyBucketsResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Owner><ID>2932a509-4bbe-44d8-b1b2-f006afa4a973</ID><DisplayName>2932a509-4bbe-44d8-b1b2-f006afa4a973</DisplayName></Owner><Buckets/></ListAllMyBucketsResult>'

In [14]:
# List the buckets that already exist in the COS instance.
r = s_cos.get(bucket_endpoint)
if r.status_code != 200:
    raise RuntimeError(f"Could not list COS buckets: {r.status_code} {r.text}")
buckets_list = xmltodict.parse(r.text)['ListAllMyBucketsResult']['Buckets']
# 'Buckets' can be empty, hold a single 'Bucket' mapping, or a list of them: normalise
# to a list. isinstance is used so mapping subclasses are recognised as well.
buckets_dict = buckets_list.get('Bucket', []) if buckets_list else []
if isinstance(buckets_dict, dict):
    buckets_dict = [buckets_dict]
elif not isinstance(buckets_dict, list):
    buckets_dict = []

# dictionary of created buckets: logical name -> actual COS bucket name (None = missing)
created_buckets = dict.fromkeys(buckets_to_create)
ex_buckets_names = []
for bucket in buckets_dict:
    # Bucket names are "<logical name>-<suffix>". Only names whose prefix is exactly one of
    # the expected logical names are reused, so an unrelated bucket that merely contains
    # e.g. "hive" somewhere in its name no longer adds a spurious entry.
    cur_key = bucket['Name'].rsplit("-", 1)[0]
    if cur_key in created_buckets:
        ex_buckets_names.append(bucket['Name'])
        created_buckets[cur_key] = bucket['Name']

# to create buckets if some of them are None
for bucket_name, existing_bucket in created_buckets.items():
    if existing_bucket is not None:
        continue
    f_bucket_name = f"{bucket_name}-{current_timestamp}{random.randint(100, 999)}"
    url_new_bucket = f"{bucket_endpoint}/{f_bucket_name}"
    r = s_cos.put(url_new_bucket)
    if r.status_code == 200:
        created_buckets[bucket_name] = f_bucket_name
    else:
        # Report the failure: later cells rely on these bucket names not being None.
        print(f"Failed to create bucket {f_bucket_name}: {r.status_code} {r.text}")


### Delete bucket

In [15]:
# s_cos.delete(url_new_bucket)

# watsonx.data

### watsonx.data API urls

In [16]:
# Endpoints of the watsonx.data API, all relative to the base url.
presto_url = "/".join((wxd_url, "instance"))
wxd_b_url = "/".join((wxd_url, "storage_registrations"))
wxd_db_url = "/".join((wxd_url, "database_registrations"))
wxd_p_url = "/".join((wxd_url, "presto_engines"))
wxd_s_url = "/".join((wxd_url, "spark_engines"))
wxd_m_url = "/".join((wxd_url, "milvus_services"))

### authenticate session

In [17]:
# Get a fresh token and open the watsonx.data session (`s`) used by the cells below.
cur_token = generate_token()
s = create_wxdata_session(cur_token)



Successfully generated token
wx data API session created


### Initial information

In [18]:
# keep headers for the whole session
# Fetch the instance information to confirm the API is reachable with this session.
wxd_info = s.get(presto_url)
print(wxd_info.status_code)
wxd_info.json()

200


{'deploymentresponse': {'deployment': {'account_type': 'PAYG',
   'cas_status': 'running',
   'cloud_type': 'ibm',
   'disablements': [],
   'enable_private_endpoints': False,
   'enable_public_endpoints': True,
   'endpoints': [{'connection_type': 'spark_private',
     'connections': [{'hostname': 'console-ibm-cator.cubsfi3r058mvjkbt2c0.private.lakehouse.saas.ibm.com',
       'port': 443,
       'protocol': 'https',
       'service_name': '*.cubsfi3r058mvjkbt2c0.private.lakehouse.saas.ibm.com'}]}],
   'first_time_use': False,
   'formation_id': 'c13d827f-a365-4908-847f-fa322af111ba',
   'id': 'crn:v1:bluemix:public:lakehouse:ca-tor:a/c877b54435304cd8990c5099b9c0c8b2:c13d827f-a365-4908-847f-fa322af111ba::',
   'instance_status': 'COMPLETED',
   'lite_tiny_milvus': False,
   'mds_status': 'RUNNING',
   'open_search': False,
   'pg_status': True,
   'plan_id': 'lakehouse-enterprise-mcsp',
   'platform_options': {},
   'private_preview_enabled': [],
   'region': 'ca-tor',
   'resource_gro

### .data COS buckets registration

In [19]:
# Fetch the storage registrations currently present in watsonx.data.
r = s.get(wxd_b_url)
wxd_buckets_info = r.json()

In [20]:
# buckets to register
# ids of the storage registrations watsonx.data already has
buckets_registerations = [
    registration['id'] for registration in wxd_buckets_info['storage_registrations']
]
buckets_to_register = {}
for bucket_name, bucket_f_name in created_buckets.items():
    # the input-data bucket and already registered buckets are left out
    needs_registration = (
        bucket_name != 'input-data' and bucket_f_name not in buckets_registerations
    )
    if needs_registration:
        buckets_to_register[bucket_name] = bucket_f_name
print('Current buckets to register', buckets_to_register)

Current buckets to register {'wxd': 'wxd-1764608347913076777', 'hive': 'hive-1764608347913076781', 'milvus': 'milvus-1764608347913076573'}


In [21]:
# Catalog attached at registration time; buckets not listed here are registered without one.
catalog_by_bucket = {'wxd': iceberg_catalog, 'hive': hive_catalog}

for bucket_name in buckets_to_register:
    if bucket_name == 'input-data':
        continue
    connection = {
        "name": created_buckets[bucket_name],
        "endpoint": bucket_endpoint,
        "secret_key": env_json['COS_SECRET_KEY'],
        "access_key": env_json['COS_ACCESS_KEY'],
    }
    data = {
        "connection": connection,
        "display_name": bucket_name,
        "type": "ibm_cos",
        "description": f"COS bucket for {bucket_name} data",
        "managed_by": "customer",
        "region": buckets_location,
    }
    # iceberg catalog for the wxd bucket, hive catalog for the hive bucket
    has_catalog = bucket_name in catalog_by_bucket
    if has_catalog:
        data['associated_catalog'] = catalog_by_bucket[bucket_name]
    r = s.post(wxd_b_url, data=json.dumps(data))
    print("Register", bucket_name, r.status_code, r.text)
    # activate only the catalog-backed buckets, and only after a successful registration
    if has_catalog and r.status_code in (200, 201):
        wxd_ba_url = f"{wxd_b_url}/{created_buckets[bucket_name]}/activate"
        r = s.post(wxd_ba_url, data="")
        print("Activate", r.status_code, r.text)
  

Register wxd 201 {"actions":null,"associated_catalogs":[{"base_path":"/","catalog_name":"iceberg_data","catalog_tags":[]}],"connection":{"auth_mode":"hmac","endpoint":"https://s3.ca-tor.cloud-object-storage.appdomain.cloud","name":"wxd-1764608347913076777"},"created_at":"1764608381","created_by":"anna.istomina@ibm.com","description":"COS bucket for wxd data","display_name":"wxd","id":"wxd-1764608347913076777","managed_by":"customer","region":"ca-tor","state":"active","tags":[],"type":"ibm_cos"}

Activate 201 {"response":{"message_code":"Success"}}

Register hive 201 {"actions":null,"associated_catalogs":[{"base_path":"/","catalog_name":"hive_catalog","catalog_tags":[]}],"connection":{"auth_mode":"hmac","endpoint":"https://s3.ca-tor.cloud-object-storage.appdomain.cloud","name":"hive-1764608347913076781"},"created_at":"1764608386","created_by":"anna.istomina@ibm.com","description":"COS bucket for hive data","display_name":"hive","id":"hive-1764608347913076781","managed_by":"customer","re

In [22]:
# Re-read the storage registrations so the data reflects the registrations made above.
r = s.get(wxd_b_url)
wxd_buckets_info = r.json()

In [23]:
# find bucket registered for iceberg catalog (default bucket)
for bucket_info in wxd_buckets_info['storage_registrations']:
    # a registration without catalogs simply yields no iterations here
    for associated_catalog in bucket_info['associated_catalogs']:
        found_catalog_name = associated_catalog['catalog_name']
        if 'iceberg' not in found_catalog_name:
            continue
        # adopt the catalog name that is actually registered
        iceberg_catalog['catalog_name'] = found_catalog_name
        # and remember its bucket when none was found or created for 'wxd'
        if created_buckets.get('wxd') is None:
            created_buckets['wxd'] = bucket_info['id']

### postgres as data source

#### Postgres credentials

In [24]:
# Read the Postgres service credentials (JSON) from the path configured in the environment.
postgres_credentials_path = os.getenv('POSTGRES_CREDENTIALS_PATH')
with open(postgres_credentials_path) as j_f:
    postgres_cred_json = json.loads(j_f.read())

In [25]:
# The first CLI argument is split on spaces; the text after the last '=' of fields 0, 1
# and 2 is used as hostname, port and database name respectively.
pg_connection = postgres_cred_json["connection"]
pg_cli_fields = pg_connection["cli"]["arguments"][0][0].split(' ')

data = {
    "display_name": "Postgres",
    "type": "postgresql",
    "associated_catalog": {
        "catalog_name": postgres_catalog["catalog_name"],
        "catalog_type": postgres_catalog["catalog_type"],
    },
    "connection": {
        "hostname": pg_cli_fields[0].split('=')[-1],
        "port": int(pg_cli_fields[1].split('=')[-1]),
        "name": pg_cli_fields[2].split('=')[-1],
        "password": pg_connection["postgres"]["authentication"]["password"],
        "username": pg_connection["postgres"]["authentication"]["username"],
    },
}

In [26]:
# check if added, if not add
postgres_registration = False
database_registations = s.get(wxd_db_url).json()['database_registrations']
# a missing (null) registration list is treated like an empty one
for database_registration in database_registations or []:
    if database_registration['type'] != 'postgresql':
        continue
    postgres_registration = True
    print("Postgres registration already exists, won't be adding a new one")
if postgres_registration is False:
    r = s.post(wxd_db_url, data=json.dumps(data))
    print(r.text)

{"actions":[],"associated_catalog":{"catalog_name":"postgres_catalog","catalog_tags":[]},"connection":{"hostname":"90589fd1-5b19-4342-9446-a16fc575e7a2.c1vt02ul0q3fa0509bog.databases.appdomain.cloud","name":"ibmclouddb","port":31117},"created_at":"1764608406","created_by":"anna.istomina@ibm.com","display_name":"Postgres","id":"postgresql352","properties":[{"encrypt":null,"key":"connector.name","value":"postgresql"},{"encrypt":null,"key":"connection-url","value":"jdbc:postgresql://90589fd1-5b19-4342-9446-a16fc575e7a2.c1vt02ul0q3fa0509bog.databases.appdomain.cloud:31117/ibmclouddb"},{"encrypt":null,"key":"connection-user","value":"ibm_cloud_94f1cffc_616a_4070_96c0_761573d921df"},{"encrypt":null,"key":"connection-password","value":"ibmlhenc__0002__77+RYDrFQVh9I/mNIQqn4+dTb7Fj583b0fldqrJ/fmUXpP7zVQ3v2zkVWpgWXZdQs2cwpgfvLBE4ZDIo"},{"encrypt":null,"key":"allow-drop-table","value":"true"}],"tables":[],"tags":null,"topics":[],"type":"postgresql"}



### Netezza as data source

In [27]:
# Registration payload for the Netezza data source, built from the catalog definition
# and the credentials read from the environment.
nz_associated_catalog = {
    catalog_key: netezza_catalog[catalog_key]
    for catalog_key in ("catalog_name", "catalog_type")
}
nz_connection = {
    "hostname": nz_credentials["nz_host"],
    "port": int(nz_credentials["nz_port"]),
    "name": nz_credentials["nz_database"],
    "password": nz_credentials["nz_password"],
    "username": nz_credentials["nz_username"],
}
nz_data = {
    "display_name": "INVESTMENTS_NZ",
    "type": "netezza",
    "associated_catalog": nz_associated_catalog,
    "connection": nz_connection,
}

In [28]:
# check if added, if not add
netezza_registration = False
database_registations = s.get(wxd_db_url).json()['database_registrations']
# a missing (null) registration list is treated like an empty one
for database_registration in database_registations or []:
    if database_registration['type'] != 'netezza':
        continue
    netezza_registration = True
    print("Netezza registration already exists, won't be adding a new one")
if netezza_registration is False:
    r = s.post(wxd_db_url, data=json.dumps(nz_data))
    print(r.text)

{"actions":[],"associated_catalog":{"catalog_name":"nz_catalog","catalog_tags":[]},"connection":{"hostname":"nz-b998fddd-a7b4-410f-aa8a-a9fb677c3b44.us-east-1.data-warehouse.cloud.ibm.com","name":"INVESTMENTS","port":5480},"created_at":"1764608411","created_by":"anna.istomina@ibm.com","display_name":"INVESTMENTS_NZ","id":"netezza284","properties":[],"tables":[],"tags":null,"topics":[],"type":"netezza"}



### .data engines and service

#### presto engine

In [29]:
# Payload for creating the presto engine: one starter coordinator and one starter worker.
starter_node = {"node_type": "starter", "quantity": 1}
data = {
    "description": "",
    "configuration": {
        "coordinator": dict(starter_node),
        "size_config": "custom",
        "worker": dict(starter_node),
    },
    # created with a "2" suffix; a later cell renames the engine to presto_engine_name
    "display_name": presto_engine_name + "2",
    "origin": "native",
    "associated_catalogs": [
        catalog['catalog_name'] for catalog in (iceberg_catalog, hive_catalog)
    ],
    "tags": [],
}

In [None]:
# check if presto engine exists, if not create one
presto_engines = s.get(wxd_p_url).json()
# provision an engine when none is listed (null or an empty list)
if not presto_engines['presto_engines']:
    r = s.post(wxd_p_url, data=json.dumps(data))
    print(r.status_code)
    print(r.text)
    # wait till provisioning is finished
    if r.status_code in [200, 201, 202]:
        start_time = time.time()
        engine_wait = True
        while engine_wait and time.time() - start_time < 500:
            try:
                r = s.get(wxd_p_url)
                engines = r.json()['presto_engines']
            except requests.RequestException as e:
                print(f"Request failed: {e}")
                # back off before retrying instead of hammering the API
                time.sleep(10)
                continue
            if not engines:
                print("Provisioning not started yes. Waiting...")
                time.sleep(10)
                continue
            status = engines[0]['status'].upper()
            if status == 'RUNNING':
                print("Engine has provisioned")
                engine_wait = False
            elif status in ['PENDING', 'PROVISIONING']:
                print("Provisioning presto engine. Waiting...")
                time.sleep(30)
            else:
                # any other status: keep polling, but never without a pause
                print(f"Unexpected presto engine status {status}. Waiting...")
                time.sleep(30)
        # reported only when the wait really ran out of time
        if engine_wait:
            print("Timeout reached.  Engine not provisioned")


201
{"actions":null,"associated_catalogs":null,"drivers":null,"external_host_name":null,"id":"presto972","origin":"native","resource_groups":null,"size_config":"custom","status":"running","status_code":null,"tags":null}

Provisioning not started yes. Waiting...
Provisioning presto engine. Waiting...
Provisioning presto engine. Waiting...
Provisioning presto engine. Waiting...
Provisioning presto engine. Waiting...
Provisioning presto engine. Waiting...
Provisioning presto engine. Waiting...
Provisioning presto engine. Waiting...
Provisioning presto engine. Waiting...
Provisioning presto engine. Waiting...
Provisioning presto engine. Waiting...


In [31]:
# rename presto engine
presto_engines = s.get(wxd_p_url).json()['presto_engines']
if not presto_engines:
    raise RuntimeError("No presto engine found - provision one before running this cell")
presto_engine_id = presto_engines[0]['id']
if presto_engines[0]['display_name'] != presto_engine_name:
    data = {
        "description": "Updated presto engine",
        "display_name": presto_engine_name
    }
    # The merge-patch content type is passed for this request only, so the session keeps
    # its JSON headers even if the request raises.
    r = s.patch(
        f"{wxd_p_url}/{presto_engine_id}",
        data=json.dumps(data),
        headers={'Content-Type': 'application/merge-patch+json'},
    )
    print(r.status_code, r.text)

200 {"actions":null,"associated_catalogs":["iceberg_data","hive_catalog"],"build_version":"2.2.2.0.0","coordinator":{"node_type":"starter","quantity":1},"created_at":1764608417,"created_by":"anna.istomina@ibm.com","description":"Updated presto engine","display_name":"presto_1764608347913076","drivers":null,"external_host_name":null,"group_id":"presto972","host_name":"c13d827f-a365-4908-847f-fa322af111ba.d1oo8kjr06m4m2irtj4g.lakehouse.ibmappdomain.cloud","id":"presto972","origin":null,"port":31491,"query_status":"not_started","resource_groups":null,"size_config":"custom","status":"PROVISIONING","status_code":33,"tags":[],"type":"presto","worker":{"node_type":"starter","quantity":1}}



In [38]:
# associate catalogs if not yet associated
# 'associated_catalogs' can be null in the API response, so fall back to an empty list
p_catalogs = list(presto_engines[0]['associated_catalogs'] or [])
catalogs_associate = []
if not any('hive' in cat for cat in p_catalogs):
    catalogs_associate.append(hive_catalog['catalog_name'])
if not any('iceberg' in cat for cat in p_catalogs):
    catalogs_associate.append(iceberg_catalog['catalog_name'])
if not any('postgre' in cat for cat in p_catalogs):
    catalogs_associate.append(postgres_catalog['catalog_name'])
if not any('nz' in cat for cat in p_catalogs):
    catalogs_associate.append(netezza_catalog['catalog_name'])

if catalogs_associate:
    data = {
        "catalog_names": catalogs_associate
    }
    r = s.post(f"{wxd_p_url}/{presto_engine_id}/catalogs", data=json.dumps(data))
    print(r.status_code, r.text)

201 {"catalogs":[{"actions":null,"associated_databases":["postgresql352"],"associated_engines":["presto972"],"associated_storage":[],"managed_by":"anna.istomina@ibm.com","name":"postgres_catalog","tags":[]},{"actions":null,"associated_databases":["netezza284"],"associated_engines":["presto972"],"associated_storage":[],"managed_by":"anna.istomina@ibm.com","name":"nz_catalog","tags":[]}]}



#### spark engines

In [33]:
# Payload for creating the spark engine; the wxd bucket is used as the engine home storage.
spark_configuration = {
    "default_version": "3.5",
    "scale_config": {
        "node_type": "small",
        "number_of_nodes": 3,
    },
    "engine_home": {"storage_name": created_buckets['wxd']},
}
data = {
    "description": "Spark engine",
    "type": "spark",
    "configuration": spark_configuration,
    "display_name": spark_engine_name,
    "origin": "native",
    "associated_catalogs": [
        catalog['catalog_name'] for catalog in (iceberg_catalog, hive_catalog)
    ],
}

In [34]:
# check if spark engine exists, if not create one
spark_engines = s.get(wxd_s_url).json()
# provision an engine when none is listed (null or an empty list)
if not spark_engines['spark_engines']:
    r = s.post(wxd_s_url, data=json.dumps(data))
    print(r.status_code)
    print(r.text)
    # wait till provisioning is finished
    if r.status_code in [200, 201, 202]:
        start_time = time.time()
        engine_wait = True
        while engine_wait and time.time() - start_time < 200:
            try:
                r = s.get(wxd_s_url)
                engines = r.json()['spark_engines']
            except requests.RequestException as e:
                print(f"Request failed: {e}")
                # back off before retrying instead of hammering the API
                time.sleep(10)
                continue
            if not engines:
                print("Provisioning not started yes. Waiting...")
                time.sleep(10)
                continue
            status = engines[0]['status'].upper()
            if status == 'RUNNING':
                print("Engine has provisioned")
                engine_wait = False
            elif status in ['PENDING', 'PROVISIONING']:
                print("Provisioning spark engine. Waiting...")
                time.sleep(30)
            else:
                # any other status: keep polling, but never without a pause
                print(f"Unexpected spark engine status {status}. Waiting...")
                time.sleep(30)
        # reported only when the wait really ran out of time
        if engine_wait:
            print('Request has timed out')

202
{"actions":null,"associated_catalogs":["iceberg_data","hive_catalog"],"id":"spark72","origin":"native","status":"provisioning","tags":null}

Provisioning spark engine. Waiting...
Provisioning spark engine. Waiting...
Provisioning spark engine. Waiting...
Provisioning spark engine. Waiting...
Provisioning spark engine. Waiting...
Engine has provisioned


In [35]:
# make sure that iceberg and hive catalogs are associated
# associate catalogs if not yet associated
spark_engines = s.get(wxd_s_url).json()
spark_engine_id = spark_engines['spark_engines'][0]['id']
# 'associated_catalogs' can be null in the API response, so fall back to an empty list
s_catalogs = list(spark_engines['spark_engines'][0]['associated_catalogs'] or [])
catalogs_associate = []
if not any('hive' in cat for cat in s_catalogs):
    catalogs_associate.append(hive_catalog['catalog_name'])
if not any('iceberg' in cat for cat in s_catalogs):
    catalogs_associate.append(iceberg_catalog['catalog_name'])

if catalogs_associate:
    # NOTE(review): the presto cell sends "catalog_names" as a list while this cell sends
    # a comma-separated string - confirm against the API reference which form is expected.
    data = {
        "catalog_names": ','.join(catalogs_associate)
    }
    r = s.post(f"{wxd_s_url}/{spark_engine_id}/catalogs", data=json.dumps(data))
    print(r.status_code, r.text)

#### milvus service

In [36]:
# root path should not have underscores, so they are replaced with dashes
milvus_root_path = ("/" + milvus_servis_name + "-metadata").replace('_', '-')
data = {
    "origin": "native",
    "storage_name": created_buckets["milvus"],
    "root_path": milvus_root_path,
    "display_name": milvus_servis_name,
    "tshirt_size": "starter",
}

In [37]:
# check if exists, if not create (wait till it's provisioned)
milvus_services = s.get(wxd_m_url).json()
# provision a service when none is listed (null or an empty list)
if not milvus_services['milvus_services']:
    r = s.post(wxd_m_url, data=json.dumps(data))
    print(r.status_code)
    print(r.text)
    # wait till provisioning is finished
    if r.status_code in [200, 201, 202]:
        start_time = time.time()
        engine_wait = True
        while engine_wait and time.time() - start_time < 1200:
            try:
                r = s.get(wxd_m_url)
                services = r.json()['milvus_services']
            except requests.RequestException as e:
                print(f"Request failed: {e}")
                # back off before retrying instead of hammering the API
                time.sleep(10)
                continue
            if not services:
                print("Provisioning not started yes. Waiting...")
                time.sleep(10)
                continue
            status = services[0]['status'].upper()
            if status == 'RUNNING':
                print("Milvus service has provisioned")
                engine_wait = False
            elif status in ['PENDING', 'PROVISIONING']:
                # the message now names the service that is actually being provisioned
                print("Provisioning milvus service. Waiting...")
                time.sleep(30)
            else:
                # any other status: keep polling, but never without a pause
                print(f"Unexpected milvus service status {status}. Waiting...")
                time.sleep(30)
        # reported only when the wait really ran out of time
        if engine_wait:
            print('Request has timed out')

202
{"actions":[],"id":"milvus402","index_type":null,"origin":"native","status":"running","status_code":null,"tags":[],"tshirt_size":null}

Provisioning spark engine. Waiting...
Provisioning spark engine. Waiting...
Provisioning spark engine. Waiting...
Provisioning spark engine. Waiting...
Provisioning spark engine. Waiting...
Provisioning spark engine. Waiting...
Provisioning spark engine. Waiting...
Provisioning spark engine. Waiting...
Provisioning spark engine. Waiting...
Provisioning spark engine. Waiting...
Provisioning spark engine. Waiting...
Provisioning spark engine. Waiting...
Engine has provisioned


## Create env file

### Add initial information from env file

In [39]:
# Carry these settings over unchanged from the input env file into the output.
env_json.update({
    'IDENTITY_URL': os.getenv("IDENTITY_URL"),
    'WXD_REGION': os.getenv("WXD_REGION"),
})

### Update watsonx.data session

In [40]:
# Generate a new token and rebuild the watsonx.data session before reading the final state.
cur_token = generate_token()
s = create_wxdata_session(cur_token)



Successfully generated token
wx data API session created


### Add COS buckets information and catalogs associated

In [41]:
# Fetch the storage registrations whose details are copied into the env file below.
r_buckets = s.get(wxd_b_url)
buckets_info = r_buckets.json()

In [42]:
# Collect bucket names, catalog names and endpoints of the registered storages.
endpoints = {}
for bucket_info in buckets_info['storage_registrations']:
    connection = bucket_info['connection']
    if 'milvus' in bucket_info['id']:
        env_json['MILVUS_BUCKET'] = connection['name']
        endpoints['milvus'] = connection['endpoint']
        continue
    # A registration may have no associated catalog; skip it instead of failing on [0].
    associated_catalogs = bucket_info['associated_catalogs'] or []
    if not associated_catalogs:
        continue
    catalog_name = associated_catalogs[0]['catalog_name']
    if 'hive' in catalog_name:
        env_json['HIVE_CATALOG'] = catalog_name
        env_json['HIVE_BUCKET'] = connection['name']
        endpoints['hive'] = connection['endpoint']
    elif 'iceberg' in catalog_name:
        env_json['ICEBERG_CATALOG'] = catalog_name
        env_json['WXD_BUCKET'] = connection['name']
        endpoints['iceberg'] = connection['endpoint']
env_json['INPUT_BUCKET'] = created_buckets['input-data']

In [43]:
# Take the fourth-from-last dot-separated part of every endpoint host
# (e.g. "ca-tor" in s3.ca-tor.cloud-object-storage.appdomain.cloud).
end_values = [url_v.split("://")[1].split(".")[-4] for url_v in endpoints.values()]
all_same = all(v == end_values[0] for v in end_values)

print('Are all of the bucket endpoints the same?', all_same)

if end_values:
    # Same choice as before when two or more endpoints exist; with a single endpoint
    # fall back to it instead of raising IndexError.
    env_json['COS_BUCKETS_LOCATION'] = end_values[1] if len(end_values) > 1 else end_values[0]
else:
    print("No registered bucket endpoints found; COS_BUCKETS_LOCATION was not set")

Are all of the bucket endpoints the same? True


If the result is False, check which endpoint differs and whether the difference is only in the region or in the endpoint type (direct / public).

### Update env file with engines and milvus service

:exclamation:make sure that all engines and milvus are provisioned (in the UI)!!!  
**Milvus** might take longer

In [44]:
# update info first
wxd_info = s.get(presto_url)

# Presto: host and port of the first engine
presto_engine_info = s.get(wxd_p_url)
if presto_engine_info.status_code == 200:
    first_presto_engine = presto_engine_info.json()['presto_engines'][0]
    env_json['PRESTO_HOST'] = first_presto_engine['host_name']
    env_json['PRESTO_PORT'] = first_presto_engine['port']

# Spark: id of the first engine
spark_engine_info = s.get(wxd_s_url)
if spark_engine_info.status_code == 200:
    first_spark_engine = spark_engine_info.json()['spark_engines'][0]
    env_json['SPARK_ENGINE_ID'] = first_spark_engine['id']

# Milvus: gRPC host and port of the first service; stop here while 'grpc_host' is missing
milvus_service_info = s.get(wxd_m_url)
if milvus_service_info.status_code == 200:
    first_milvus_service = milvus_service_info.json()['milvus_services'][0]
    assert 'grpc_host' in first_milvus_service, 'MILVUS is still being setup'
    env_json['MILVUS_HOST'] = first_milvus_service['grpc_host']
    env_json['MILVUS_PORT'] = first_milvus_service['grpc_port']

In [45]:
# Write to .env format
with open('.env_output', 'w', encoding='utf-8') as env_file:
    for key, value in env_json.items():
        if value is None:
            # A missing value is written as an empty string rather than the text "None",
            # which a consumer would read as a real (truthy) value.
            print(f"WARNING: {key} has no value; writing it empty")
            value = ""
        # Escape backslashes and double quotes so the quoted value stays intact.
        escaped_value = str(value).replace('\\', '\\\\').replace('"', '\\"')
        env_file.write(f'{key}="{escaped_value}"\n')
print("Checkout .env_output file in the current directory")

Checkout .env_output file in the current directory
