# Autosetup
> This notebook provisions Cloud Object Storage buckets, registers them in the watsonx.data instance, associates catalogs, creates the Presto and Spark engines and the Milvus service, and adds Postgres and Netezza as data sources.  
* Start Auto-setup only after :exclamation:**watsonx.data quickstart is finished**; until then the watsonx.data API used in this notebook is not available.
* Additionally, make sure you have created `.env_setup` in the same folder and filled it with the relevant information.
* COS instance and Postgres credentials should be added to the `./credentials` folder in JSON format.
* The package requirements for running this Jupyter notebook are listed in `requirements_autosetup.txt`.



## Expected output in watsonx.data infrastructure manager
![](attachments/2025-05-23-19-34-59-pasted-vscode.png)


In [5]:
import os

import json
import random
import time
from datetime import datetime
import requests
import xmltodict
import io

import pandas as pd

from dotenv import load_dotenv

load_dotenv('./.env_setup')

True

## Initial configurations
Names of engines and catalogs for watsonx.data

In [6]:
# Run-unique suffix: the current POSIX timestamp with its decimal point removed.
current_timestamp = ''.join(str(datetime.now().timestamp()).split('.'))

# Settings collected throughout the notebook and written to `.env_output` at the end.
env_json = {}
# Logical bucket names; each is used as the prefix of a COS bucket name.
buckets_to_create = ['hive', 'milvus', 'input-data']

# Display names of the engines / service, all sharing the run-unique suffix.
presto_engine_name = "presto_" + current_timestamp
spark_engine_name = "spark_" + current_timestamp
milvus_servis_name = "milvus_" + current_timestamp

# Catalog descriptors (name + type) referenced by the registration payloads below.
hive_catalog = dict(catalog_name='hive_catalog', catalog_type='hive-hadoop2')
iceberg_catalog = dict(catalog_name='iceberg_data', catalog_type='iceberg')
postgres_catalog = dict(catalog_name='postgres_catalog', catalog_type='postgresql')
netezza_catalog = dict(catalog_name='nz_catalog', catalog_type='netezza')

## Credentials and configurations based on env and credentials folder

### Read in environmental variables

In [7]:
# Environment variables needed for the watsonx.data and COS REST calls.
# The COS buckets are placed in the same region as the watsonx.data instance.
buckets_location = os.getenv("WXD_REGION")
identityURL = os.getenv("IDENTITY_URL")

# Region-specific base URLs of the watsonx.data API and of the COS S3 endpoint.
wxd_url = "https://{}.lakehouse.cloud.ibm.com/lakehouse/api/v2".format(buckets_location)
bucket_endpoint = "https://s3.{}.cloud-object-storage.appdomain.cloud".format(buckets_location)

print("watsonx.data url", wxd_url)
print(f"COS endpoint - the same as watsonx.data location {buckets_location}", bucket_endpoint)

watsonx.data url https://eu-de.lakehouse.cloud.ibm.com/lakehouse/api/v2
COS endpoint - the same as watsonx.data location eu-de https://s3.eu-de.cloud-object-storage.appdomain.cloud


In [8]:
# Netezza connection settings: each `nz_*` key is filled from the environment
# variable with the same name in upper case (e.g. nz_host <- NZ_HOST).
nz_credentials = {
    key: os.getenv(key.upper())
    for key in ("nz_database", "nz_host", "nz_port", "nz_username", "nz_password")
}

### watsonx.data API urls

In [9]:
# watsonx.data REST resources used in this notebook, all rooted at `wxd_url`.
presto_url = wxd_url + "/instance_details"
wxd_b_url = wxd_url + "/bucket_registrations"
wxd_db_url = wxd_url + "/database_registrations"
wxd_p_url = wxd_url + "/presto_engines"
wxd_s_url = wxd_url + "/spark_engines"
wxd_m_url = wxd_url + "/milvus_services"

In [10]:
# Sanity check: display the path of the COS credentials file configured in the environment.
os.getenv('COS_CREDENTIALS_PATH')

'./credentials/cos.json'

### Parsing COS Credentials

In [11]:
# Read the COS service credentials from the JSON file named by COS_CREDENTIALS_PATH.
cos_credentials_path = os.getenv('COS_CREDENTIALS_PATH')
with open(cos_credentials_path) as j_f:
    cos_credentials_json = json.load(j_f)

In [12]:
# Copy the COS API key, the HMAC key pair and the instance CRN into the collected settings.
env_json['COS_API_KEY'] = cos_credentials_json['apikey']
cos_hmac_keys = cos_credentials_json['cos_hmac_keys']
env_json['COS_ACCESS_KEY'] = cos_hmac_keys['access_key_id']
env_json['COS_SECRET_KEY'] = cos_hmac_keys['secret_access_key']
env_json['COS_INSTANCE_CRN'] = cos_credentials_json['resource_instance_id']

In [13]:
# watsonx.data credentials: API key and instance CRN come from the environment,
# the user name is the fixed value "ibmlhapikey".
env_json.update({
    'CLOUD_API_KEY': os.getenv("CLOUD_API_KEY"),
    'WXD_INSTANCE_CRN': os.getenv("WXD_INSTANCE_CRN"),
    'WXD_USER': "ibmlhapikey",
})

### Generate token and use it for the session

In [14]:
def generate_token():
    """Exchange the IBM Cloud API key for an IAM access token.

    Posts an ``apikey`` grant to ``identityURL`` using the key stored in
    ``env_json['CLOUD_API_KEY']``.

    Returns:
        str: the ``access_token`` field of the JSON response.

    Raises:
        requests.HTTPError: if the identity service answers with a 4xx/5xx status.
        requests.RequestException: on connection problems or when the request times out.
    """
    headers = {
        "Content-Type": "application/x-www-form-urlencoded"
    }

    payload = {
        'grant_type': 'urn:ibm:params:oauth:grant-type:apikey',
        'apikey': env_json['CLOUD_API_KEY']
    }

    # TLS certificate verification is left at requests' default (enabled):
    # this request carries the API key, so it must not go to an unverified peer.
    # The timeout keeps a stalled connection from hanging the notebook.
    res = requests.post(identityURL, headers=headers, data=payload, timeout=60)
    if res.status_code in [200, 201, 202]:
        print("Successfully generated token")
    else:
        print("Code for token generation", res.status_code)
        print("Message", res.text)
        # Fail here with the HTTP error instead of a KeyError on the missing token below.
        res.raise_for_status()

    return res.json()['access_token']

### Functions to create sessions

In [15]:
def create_cos_session(access_token):
    """Build a requests session authorised for the COS REST API.

    Args:
        access_token: IAM bearer token, e.g. the value returned by ``generate_token``.

    Returns:
        requests.Session whose session-level headers are the bearer
        ``Authorization`` and ``ibm-service-instance-id`` (the COS instance
        CRN taken from ``env_json``).
    """
    session = requests.Session()
    # Remove requests' default session headers before setting ours.
    session.headers.clear()
    session.headers.update({
        "Authorization": f"Bearer {access_token}",
        "ibm-service-instance-id": env_json['COS_INSTANCE_CRN'],
    })
    print("COS API session created")
    return session

In [51]:
def create_wxdata_session(access_token):
    """Build a requests session authorised for the watsonx.data REST API.

    Args:
        access_token: IAM bearer token, e.g. the value returned by ``generate_token``.

    Returns:
        requests.Session whose session-level headers request and send JSON and
        carry the bearer ``Authorization`` plus ``AuthInstanceID`` (the
        watsonx.data instance CRN taken from ``env_json``).
    """
    instance_crn = f"{env_json['WXD_INSTANCE_CRN']}"
    session = requests.Session()
    # Remove requests' default session headers before setting ours.
    session.headers.clear()
    session.headers.update({
        "Accept": "application/json",
        "Content-Type": "application/json",
        "Connection": "keep-alive",
        "Authorization": f"Bearer {access_token}",
        "AuthInstanceID": instance_crn,
    })
    print("wx data API session created")
    return session

## COS

### COS authentication -> session

In [17]:
# Obtain a fresh IAM token and open the COS session (`s_cos`) used by the bucket cells below.
cur_token = generate_token()
s_cos = create_cos_session(cur_token)



Successfully generated token
COS API session created


### Create bucket

[bucket_endpoints](https://cloud.ibm.com/docs/cloud-object-storage?topic=cloud-object-storage-endpoints)  
[COS API](https://cloud.ibm.com/apidocs/cos/cos-compatibility#createbucket)

In [13]:
# Display the raw XML bucket listing returned by a GET on the COS endpoint.
s_cos.get(bucket_endpoint).text

'<?xml version="1.0" encoding="UTF-8" standalone="yes"?><ListAllMyBucketsResult xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Owner><ID>3701ca4b-7d2a-4dca-aed6-29b96d29bddd</ID><DisplayName>3701ca4b-7d2a-4dca-aed6-29b96d29bddd</DisplayName></Owner><Buckets><Bucket><Name>watsonx-data-70685b2e-03dc-4cfa-a8bd-fa7a5de42bcf</Name><CreationDate>2025-07-11T12:18:15.109Z</CreationDate></Bucket></Buckets></ListAllMyBucketsResult>'

In [14]:
# Find buckets that already exist for this setup and create the missing ones.
r = s_cos.get(bucket_endpoint)
# Stop on an HTTP error instead of failing later on an unexpected XML document.
r.raise_for_status()

# xmltodict yields None for an empty <Buckets/> element and a single mapping
# (not a list) when exactly one bucket exists - normalise both cases to a list.
buckets_list = xmltodict.parse(r.text)['ListAllMyBucketsResult'].get('Buckets') or {}
buckets_dict = buckets_list.get('Bucket') or []
if isinstance(buckets_dict, dict):
    buckets_dict = [buckets_dict]

# A bucket belongs to this setup only if its name is "<logical name>-<suffix>".
# Matching on that prefix (rather than on a substring) keeps unrelated buckets
# from being added to `created_buckets` under an unexpected key.
ex_buckets_names = [
    bucket['Name']
    for bucket in buckets_dict
    if bucket['Name'].rsplit("-", 1)[0] in buckets_to_create
]

# dictionary of created buckets: logical name -> COS bucket name (None = missing)
created_buckets = dict.fromkeys(buckets_to_create)
for bucket_name in ex_buckets_names:
    created_buckets[bucket_name.rsplit("-", 1)[0]] = bucket_name

# to create buckets if some of them are None
for bucket_name, existing_bucket in created_buckets.items():
    if existing_bucket is not None:
        continue
    # Timestamp plus a random 3-digit number makes the bucket name unique per run.
    f_bucket_name = f"{bucket_name}-{current_timestamp}{random.randint(100, 999)}"
    url_new_bucket = f"{bucket_endpoint}/{f_bucket_name}"
    r = s_cos.put(url_new_bucket)
    if r.status_code == 200:
        created_buckets[bucket_name] = f_bucket_name
    else:
        # Report the failure; the entry stays None and later cells depend on it.
        print("Failed to create bucket", f_bucket_name, r.status_code, r.text)


### Delete bucket

In [15]:
# s_cos.delete(url_new_bucket)

# watsonx.data

### authenticate session

In [47]:
# Obtain a fresh IAM token and open the watsonx.data session (`s`) used by the cells below.
cur_token = generate_token()
s = create_wxdata_session(cur_token)



Successfully generated token
wx data API session created


### Initial information

In [48]:
# Fetch the instance details through the authenticated session (its headers are
# reused for every request) and display the status code and JSON body.
wxd_info = s.get(presto_url)
print(wxd_info.status_code)
wxd_info.json()

200


{'engines_services': [{'details': [{'id': 'presto718',
     'internal': {'hostname': '70685b2e-03dc-4cfa-a8bd-fa7a5de42bcf.d1d6f6nf0k9rj3mtfm80.lakehouse.appdomain.cloud',
      'port': 32170},
     'jdbc_class': 'com.facebook.presto.jdbc.PrestoDriver',
     'jdbc_urls': {'internal': 'jdbc:presto://username:password@70685b2e-03dc-4cfa-a8bd-fa7a5de42bcf.d1d6f6nf0k9rj3mtfm80.lakehouse.appdomain.cloud:32170?SSL=true&SSLTrustStorePath=path/to/presto-java2-ssl-certificate'},
     'name': 'starter',
     'ssl_certificate': '-----BEGIN CERTIFICATE-----\nMIIFQDCCBCigAwIBAgISBsOIb4XStsu1yXdFClk1TaacMA0GCSqGSIb3DQEBCwUA\nMDMxCzAJBgNVBAYTAlVTMRYwFAYDVQQKEw1MZXQncyBFbmNyeXB0MQwwCgYDVQQD\nEwNSMTAwHhcNMjUwNjMwMTM1NjQ5WhcNMjUwOTI4MTM1NjQ4WjAkMSIwIAYDVQQD\nExlsYWtlaG91c2UuYXBwZG9tYWluLmNsb3VkMIIBIjANBgkqhkiG9w0BAQEFAAOC\nAQ8AMIIBCgKCAQEA2dYEMvc6i5td+SSxYiHS3VurNvgtnbc6ySa0+xgrP6hjvrpx\n/MHLAgrLR4gkKcWhVW+saXhs6TbXLyqaU7oLxDbxfFiCAVtf3DPkPPBAF0cc52y+\nqsTmGMCO2VMU5czj7aywKyJrCvIKTODREthYrqjrciDOJ2SAxTf

### .data COS buckets registration

In [18]:
# Fetch the bucket registrations currently known to watsonx.data.
r = s.get(wxd_b_url)
wxd_buckets_info = r.json()

In [19]:
# buckets to register
# Names of the buckets watsonx.data already has registered.
buckets_registerations = [
    registration['bucket_details']['bucket_name']
    for registration in wxd_buckets_info['bucket_registrations']
]
# Logical name -> COS bucket name for every bucket that is not registered yet;
# the 'input-data' bucket is never registered.
buckets_to_register = {
    logical_name: cos_name
    for logical_name, cos_name in created_buckets.items()
    if cos_name not in buckets_registerations and logical_name != 'input-data'
}
print('Current buckets to register', buckets_to_register)

Current buckets to register {'hive': 'hive-1752683278008064580', 'milvus': 'milvus-1752683278008064504'}


In [20]:
# Register every pending bucket in watsonx.data. Buckets that back a catalog
# ('wxd' -> iceberg, 'hive' -> hive) get that catalog attached and are activated
# once the registration succeeded.
catalog_for_bucket = {'wxd': iceberg_catalog, 'hive': hive_catalog}
for bucket_name in buckets_to_register:
    if bucket_name == 'input-data':
        continue
    bucket_details = {
        "access_key": env_json['COS_ACCESS_KEY'],
        "bucket_name": created_buckets[bucket_name],
        "endpoint": bucket_endpoint,
        "secret_key": env_json['COS_SECRET_KEY'],
    }
    data = {
        "bucket_details": bucket_details,
        "bucket_display_name": bucket_name,
        "bucket_type": "ibm_cos",
        "description": f"COS bucket for {bucket_name} data",
        "managed_by": "customer",
        "region": buckets_location,
    }
    has_catalog = bucket_name in catalog_for_bucket
    if has_catalog:
        data['associated_catalog'] = catalog_for_bucket[bucket_name]
    r = s.post(wxd_b_url, data=json.dumps(data))
    print("Register", bucket_name, r.status_code, r.text)
    # to activate if created successfully
    if has_catalog and r.status_code in [200, 201]:
        wxd_ba_url = f"{wxd_b_url}/{created_buckets[bucket_name]}/activate"
        r = s.post(wxd_ba_url, data="")
        print("Activate", r.status_code, r.text)
  

Register hive 201 {"actions":null,"associated_catalogs":[{"base_path":"/","catalog_name":"hive_catalog","catalog_tags":[]}],"bucket_details":{"bucket_name":"hive-1752683278008064580","endpoint":"https://s3.eu-de.cloud-object-storage.appdomain.cloud"},"bucket_display_name":"hive","bucket_id":"hive-1752683278008064580","bucket_type":"ibm_cos","created_by":"anna.istomina@ibm.com","created_on":"1752683311","description":"COS bucket for hive data","managed_by":"customer","region":"eu-de","state":"active","tags":[]}

Activate 201 {"response":{"message":"Activate Bucket","message_code":"Success"}}

Register milvus 201 {"actions":null,"associated_catalogs":[],"bucket_details":{"bucket_name":"milvus-1752683278008064504","endpoint":"https://s3.eu-de.cloud-object-storage.appdomain.cloud"},"bucket_display_name":"milvus","bucket_id":"milvus-1752683278008064504","bucket_type":"ibm_cos","created_by":"anna.istomina@ibm.com","created_on":"1752683316","description":"COS bucket for milvus data","managed_

In [21]:
# find bucket registered for iceberg catalog (default bucket)
# Uses the registrations fetched before this notebook registered its own buckets.
for bucket_info in wxd_buckets_info['bucket_registrations']:
    bucket_catalogs = bucket_info['associated_catalogs']
    if len(bucket_catalogs) == 0:
        continue
    for associated_catalog in bucket_catalogs:
        found_catalog_name = associated_catalog['catalog_name']
        if 'iceberg' not in found_catalog_name:
            continue
        # Adopt the catalog name the instance actually uses.
        iceberg_catalog['catalog_name'] = found_catalog_name
        # Remember the bucket behind it as the 'wxd' bucket unless one is already set.
        if created_buckets.get('wxd') is None:
            created_buckets['wxd'] = bucket_info['bucket_details']['bucket_name']

### postgres as data source

#### Postgres credentials

In [22]:
# Read the Postgres service credentials from the JSON file named by POSTGRES_CREDENTIALS_PATH.
postgres_credentials_path = os.getenv('POSTGRES_CREDENTIALS_PATH')
with open(postgres_credentials_path) as j_f:
    postgres_cred_json = json.load(j_f)

In [23]:
# Registration payload for the Postgres data source.
pg_connection = postgres_cred_json["connection"]
# Host, port and database name are read positionally from the first CLI argument
# string: the value after '=' of its 1st, 2nd and 3rd space-separated token.
# NOTE(review): assumes that string is ordered host, port, database - confirm
# against the credentials file.
pg_cli_values = [
    token.split('=')[-1]
    for token in pg_connection["cli"]["arguments"][0][0].split(' ')
]
pg_authentication = pg_connection["postgres"]["authentication"]

data = {
    "database_display_name": "Postgres",
    "database_type": "postgresql",
    "associated_catalog": {
        key: postgres_catalog[key] for key in ("catalog_name", "catalog_type")
    },
    "database_details": {
        "hostname": pg_cli_values[0],
        "port": int(pg_cli_values[1]),
        "database_name": pg_cli_values[2],
        "password": pg_authentication["password"],
        "username": pg_authentication["username"],
    },
}

In [24]:
# Register Postgres as a data source unless a 'postgresql' registration already exists.
postgres_registration = False
database_registations = s.get(wxd_db_url).json()['database_registrations']
for database_registration in database_registations or []:
    if database_registration['database_type'] != 'postgresql':
        continue
    postgres_registration = True
    print("Postgres registration already exists, won't be adding a new one")
if postgres_registration is False:
    r = s.post(wxd_db_url, data=json.dumps(data))
    print(r.text)

{"actions":[],"associated_catalog":{"catalog_name":"postgres_catalog","catalog_tags":[]},"created_by":"anna.istomina@ibm.com","created_on":"1752683356","database_details":{"database_name":"ibmclouddb","hostname":"a485c259-ff39-4c65-b74b-29c3fa30a4d3.bn2a2vgd01r3l0hfmvc0.databases.appdomain.cloud","port":30632},"database_display_name":"Postgres","database_id":"postgresql28","database_properties":[{"encrypt":null,"key":"connector.name","value":"postgresql"},{"encrypt":null,"key":"connection-url","value":"jdbc:postgresql://a485c259-ff39-4c65-b74b-29c3fa30a4d3.bn2a2vgd01r3l0hfmvc0.databases.appdomain.cloud:30632/ibmclouddb"},{"encrypt":null,"key":"connection-user","value":"ibm_cloud_0c5df8d3_d68b_4f72_b9e8_bd3d9629acd9"},{"encrypt":null,"key":"allow-drop-table","value":"true"}],"database_type":"postgresql","tables":[],"tags":null,"topics":[]}



### Netezza as data source

In [22]:
# Registration payload for the Netezza data source, built from `nz_credentials`.
nz_database_details = {
    "hostname": nz_credentials["nz_host"],
    # The port is read from the environment as text and sent as an integer.
    "port": int(nz_credentials["nz_port"]),
    "database_name": nz_credentials["nz_database"],
    "password": nz_credentials["nz_password"],
    "username": nz_credentials["nz_username"],
}
nz_data = {
    "database_display_name": "INVESTMENTS_NZ",
    "database_type": "netezza",
    "associated_catalog": {
        key: netezza_catalog[key] for key in ("catalog_name", "catalog_type")
    },
    "database_details": nz_database_details,
}

In [23]:
# Register Netezza as a data source unless a 'netezza' registration already exists.
netezza_registration = False
database_registations = s.get(wxd_db_url).json()['database_registrations']
for database_registration in database_registations or []:
    if database_registration['database_type'] != 'netezza':
        continue
    netezza_registration = True
    print("Netezza registration already exists, won't be adding a new one")
if netezza_registration is False:
    r = s.post(wxd_db_url, data=json.dumps(nz_data))
    print(r.text)

Netezza registration already exists, won't be adding a new one


### .data engines and service

#### presto engine

In [24]:
# Creation payload for a minimal Presto engine: one "starter" coordinator and
# one "starter" worker, associated with the iceberg and hive catalogs.
starter_node = {"node_type": "starter", "quantity": 1}
data = {
    "description": "",
    "engine_details": {
        "coordinator": dict(starter_node),
        "size_config": "custom",
        "worker": dict(starter_node),
    },
    # Created with a trailing "2"; a later cell renames the engine to presto_engine_name.
    "engine_display_name": presto_engine_name + "2",
    "origin": "native",
    "associated_catalogs": [iceberg_catalog['catalog_name'], hive_catalog['catalog_name']],
    "version": "v0.286",
    "tags": [],
    "region": "",
}

In [29]:
# check if presto engine exists, if not create one
presto_engines = s.get(wxd_p_url).json()
# provision an engine when none is listed (null or empty list)
if not presto_engines['presto_engines']:
    r = s.post(wxd_p_url, data=json.dumps(data))
    print(r.status_code)
    print(r.text)
    # wait till provisioning is finished, for at most 200 seconds
    if r.status_code in [200, 201, 202]:
        start_time = time.time()
        engine_wait = True
        while time.time() - start_time < 200 and engine_wait:
            try:
                engines = s.get(wxd_p_url).json()['presto_engines']
            except requests.RequestException as e:
                print(f"Request failed: {e}")
                # Back off before retrying instead of hammering the API.
                time.sleep(10)
                continue
            if not engines:
                print("Provisioning not started yes. Waiting...")
                time.sleep(10)
                continue
            status = engines[0]['status'].upper()
            if status == 'RUNNING':
                print("Engine has provisioned")
                engine_wait = False
            elif status in ['PENDING', 'PROVISIONING']:
                print("Provisioning presto engine. Waiting...")
                time.sleep(30)
            else:
                # Any other status: keep polling, but with a pause between requests.
                print(f"Presto engine status is {status}. Waiting...")
                time.sleep(10)
        # Still waiting after the loop means the time budget ran out.
        if engine_wait:
            print("Timeout reached.  Engine not provisioned")


In [50]:
# rename presto engine
# Re-read the engine list and, if the first engine's display name differs from
# `presto_engine_name`, PATCH it to that name. `presto_engines` and
# `presto_engine_id` set here are reused by the catalog-association cell.
# NOTE(review): the recorded run of this cell printed "422 Unprocessable Entity",
# so the PATCH request as written appears to be rejected - confirm the expected
# request format against the watsonx.data API reference.
# NOTE(review): assumes at least one Presto engine is listed; `presto_engines[0]`
# fails otherwise.
presto_engines = s.get(wxd_p_url).json()['presto_engines']
presto_engine_id = presto_engines[0]['engine_id']
if presto_engines[0]['engine_display_name'] != presto_engine_name:
    data = {
        "engine_display_name": presto_engine_name
    }
    r = s.patch(f"{wxd_p_url}/{presto_engine_id}", data=json.dumps(data))
    print(r.status_code, r.text)

422 Unprocessable Entity



In [33]:
# Associate the hive, iceberg, postgres and netezza catalogs with the Presto
# engine unless they are associated already.
# A null catalog list is treated as "nothing associated yet".
p_catalogs = list(presto_engines[0]['associated_catalogs'] or [])
catalogs_associate = []
if not any('hive' in cat for cat in p_catalogs):
    catalogs_associate.append(hive_catalog['catalog_name'])
if not any('iceberg' in cat for cat in p_catalogs):
    catalogs_associate.append(iceberg_catalog['catalog_name'])
if not any('postgre' in cat for cat in p_catalogs):
    catalogs_associate.append(postgres_catalog['catalog_name'])
# The configured Netezza catalog name ('nz_catalog') does not contain 'netezza',
# so the exact name has to be checked too; otherwise the catalog would be
# re-associated on every run.
if not any('netezza' in cat or cat == netezza_catalog['catalog_name'] for cat in p_catalogs):
    catalogs_associate.append(netezza_catalog['catalog_name'])

if catalogs_associate:
    # The API receives the catalog names as one comma-separated string.
    data = {
        "catalog_names": ','.join(catalogs_associate)
    }
    r = s.post(f"{wxd_p_url}/{presto_engine_id}/catalogs", data=json.dumps(data))
    print(r.status_code, r.text)

201 {"catalogs":[{"actions":null,"associated_buckets":null,"associated_databases":null,"associated_engines":null,"catalog_name":"nz_catalog","managed_by":null,"sync_exception":null,"tags":null}]}



#### spark engines

In [29]:
# Creation payload for the Spark engine: three "small" nodes, default Spark
# version 3.5, with the 'wxd' bucket as the engine home bucket.
spark_engine_details = {
    "default_version": "3.5",
    "scale_config": {"node_type": "small", "number_of_nodes": 3},
    "engine_home_bucket_name": created_buckets['wxd'],
}
data = {
    "description": "Spark engine",
    "type": "spark",
    "engine_details": spark_engine_details,
    "engine_display_name": spark_engine_name,
    "origin": "native",
    "associated_catalogs": [iceberg_catalog['catalog_name'], hive_catalog['catalog_name']],
}

In [30]:
# check if spark engine exists, if not create one
spark_engines = s.get(wxd_s_url).json()
# provision an engine when none is listed (null or empty list)
if not spark_engines['spark_engines']:
    r = s.post(wxd_s_url, data=json.dumps(data))
    print(r.status_code)
    print(r.text)
    # wait till provisioning is finished, for at most 200 seconds
    if r.status_code in [200, 201, 202]:
        start_time = time.time()
        engine_wait = True
        while time.time() - start_time < 200 and engine_wait:
            try:
                engines = s.get(wxd_s_url).json()['spark_engines']
            except requests.RequestException as e:
                print(f"Request failed: {e}")
                # Back off before retrying instead of hammering the API.
                time.sleep(10)
                continue
            if not engines:
                print("Provisioning not started yes. Waiting...")
                time.sleep(10)
                continue
            status = engines[0]['status'].upper()
            if status == 'RUNNING':
                print("Engine has provisioned")
                engine_wait = False
            elif status in ['PENDING', 'PROVISIONING']:
                print("Provisioning spark engine. Waiting...")
                time.sleep(30)
            else:
                # Any other status: keep polling, but with a pause between requests.
                print(f"Spark engine status is {status}. Waiting...")
                time.sleep(10)
        # Still waiting after the loop means the time budget ran out.
        if engine_wait:
            print('Request has timed out')

202
{"actions":null,"associated_catalogs":["iceberg_data","hive_catalog"],"engine_id":"spark145","origin":"native","status":"provisioning","tags":null}

Provisioning spark engine. Waiting...
Provisioning spark engine. Waiting...
Provisioning spark engine. Waiting...
Provisioning spark engine. Waiting...
Provisioning spark engine. Waiting...
Provisioning spark engine. Waiting...
Provisioning spark engine. Waiting...


In [31]:
# make sure that iceberg and hive catalogs are associated with the Spark engine;
# associate them if they are not yet
spark_engines = s.get(wxd_s_url).json()
spark_engine_id = spark_engines['spark_engines'][0]['engine_id']
# A null catalog list is treated as "nothing associated yet" (this API returns
# null for other empty collections such as `tags` and `actions`).
s_catalogs = list(spark_engines['spark_engines'][0]['associated_catalogs'] or [])
catalogs_associate = []
if not any('hive' in cat for cat in s_catalogs):
    catalogs_associate.append(hive_catalog['catalog_name'])
if not any('iceberg' in cat for cat in s_catalogs):
    catalogs_associate.append(iceberg_catalog['catalog_name'])

if catalogs_associate:
    # The API receives the catalog names as one comma-separated string.
    data = {
        "catalog_names": ','.join(catalogs_associate)
    }
    r = s.post(f"{wxd_s_url}/{spark_engine_id}/catalogs", data=json.dumps(data))
    print(r.status_code, r.text)

#### milvus service

In [32]:
# Creation payload for the Milvus service: "starter" size, stored in the milvus
# COS bucket under a root path derived from the service name.
data = dict(
    origin="native",
    bucket_name=created_buckets["milvus"],
    bucket_type="ibm_cos",
    service_display_name=milvus_servis_name,
    root_path="/{}_metadata".format(milvus_servis_name),
    tshirt_size="starter",
)

In [33]:
# Display the Milvus services currently provisioned in the instance.
s.get(wxd_m_url).json()

{'milvus_services': []}

In [34]:
# Create the Milvus service only when none exists yet, so that re-running this
# cell does not provision a second service.
if s.get(wxd_m_url).json()['milvus_services']:
    print("Milvus service already exists, won't be adding a new one")
else:
    r = s.post(wxd_m_url, data=json.dumps(data))
    print(r.status_code, r.text)

201 {"actions":[],"origin":"native","service_id":"milvus496","status":"running","status_code":null,"tags":[]}



In [35]:
# check if a Milvus service exists, if not create one (and wait till it's provisioned)
milvus_services = s.get(wxd_m_url).json()
# provision the service when none is listed (null or empty list)
if not milvus_services['milvus_services']:
    r = s.post(wxd_m_url, data=json.dumps(data))
    print(r.status_code)
    print(r.text)
    # wait till provisioning is finished, for at most 1200 seconds
    if r.status_code in [200, 201, 202]:
        start_time = time.time()
        engine_wait = True
        while time.time() - start_time < 1200 and engine_wait:
            try:
                services = s.get(wxd_m_url).json()['milvus_services']
            except requests.RequestException as e:
                print(f"Request failed: {e}")
                # Back off before retrying instead of hammering the API.
                time.sleep(10)
                continue
            if not services:
                print("Provisioning not started yes. Waiting...")
                time.sleep(10)
                continue
            status = services[0]['status'].upper()
            if status == 'RUNNING':
                print("Engine has provisioned")
                engine_wait = False
            elif status in ['PENDING', 'PROVISIONING']:
                print("Provisioning milvus service. Waiting...")
                time.sleep(30)
            else:
                # Any other status: keep polling, but with a pause between requests.
                print(f"Milvus service status is {status}. Waiting...")
                time.sleep(10)
        # Still waiting after the loop means the time budget ran out.
        if engine_wait:
            print('Request has timed out')

## Create env file

### Add initial information from env file

In [36]:
# Carry the identity URL and region over from the setup environment into the output settings.
env_json.update(
    IDENTITY_URL=os.getenv("IDENTITY_URL"),
    WXD_REGION=os.getenv("WXD_REGION"),
)

### Update watsonx.data session

In [37]:
# Generate a new IAM token and rebuild the watsonx.data session before collecting the final settings.
cur_token = generate_token()
s = create_wxdata_session(cur_token)

Successfully generated token
wx data API session created




### Add COS buckets information and catalogs associated

In [38]:
# Fetch the bucket registrations again, now including the buckets registered above.
r_buckets = s.get(wxd_b_url)
buckets_info = r_buckets.json()

In [39]:
# Record the bucket and catalog behind each registration in the output
# settings, and remember each bucket's endpoint for the consistency check below.
endpoints = {}
for bucket_info in buckets_info['bucket_registrations']:
    bucket_details = bucket_info['bucket_details']
    # The milvus bucket is recognised by its name.
    if 'milvus' in bucket_details['bucket_name']:
        env_json['MILVUS_BUCKET'] = bucket_details['bucket_name']
        endpoints['milvus'] = bucket_details['endpoint']
        continue
    # Any other registered bucket without a catalog has nothing to record;
    # skipping it avoids an IndexError on the empty catalog list.
    associated_catalogs = bucket_info.get('associated_catalogs') or []
    if not associated_catalogs:
        continue
    # Hive and iceberg buckets are recognised by their first associated catalog.
    catalog_name = associated_catalogs[0]['catalog_name']
    if 'hive' in catalog_name:
        env_json['HIVE_CATALOG'] = catalog_name
        env_json['HIVE_BUCKET'] = bucket_details['bucket_name']
        endpoints['hive'] = bucket_details['endpoint']
    elif 'iceberg' in catalog_name:
        env_json['ICEBERG_CATALOG'] = catalog_name
        env_json['WXD_BUCKET'] = bucket_details['bucket_name']
        endpoints['iceberg'] = bucket_details['endpoint']
env_json['INPUT_BUCKET'] = created_buckets['input-data']

In [40]:
# Reduce every endpoint URL to the 4th-from-last dot-separated part of its host,
# e.g. "https://s3.eu-de.cloud-object-storage.appdomain.cloud" -> "eu-de".
end_values = [url_v.split("://")[1].split(".")[-4] for url_v in endpoints.values()]
if not end_values:
    raise RuntimeError("No registered bucket endpoints were found; cannot set COS_ENDPOINT")
all_same = all(v == end_values[0] for v in end_values)

print('Are all of the bucket endpoints the same?', all_same)

# Keep the original choice (the second value) when several endpoints were
# collected, and fall back to the only value instead of raising IndexError.
env_json['COS_ENDPOINT'] = end_values[1] if len(end_values) > 1 else end_values[0]

Are all of the bucket endpoints the same? True


If the result is `False`, check which endpoint differs from the others, and whether the difference is only in the region or in the endpoint type (direct / public).

### Update env file with engines and milvus service

:exclamation:make sure that all engines and milvus are provisioned (in the UI)!!!  
**Milvus** might take longer

In [41]:
# Refresh the instance details and copy the Presto, Spark and Milvus connection
# settings into the output settings.
wxd_info = s.get(presto_url)
if wxd_info.status_code == 200:
    wxd_engines = wxd_info.json()['engines_services']
    for c_engine in wxd_engines:
        engine_type = c_engine['type']
        # Only the first entry of each engine's details is used.
        first_detail = c_engine['details'][0]
        if engine_type == 'presto':
            env_json['PRESTO_HOST'] = first_detail['internal']['hostname']
            env_json['PRESTO_PORT'] = first_detail['internal']['port']
        elif engine_type == 'spark_native':
            env_json['SPARK_ENGINE_ID'] = first_detail['id']
        elif engine_type == 'milvus':
            # A missing or empty gRPC endpoint is reported with the same message
            # as a missing hostname: the service is not ready yet.
            grpc_endpoint = first_detail.get('grpc_api_endpoint') or {}
            assert 'hostname' in grpc_endpoint, 'MILVUS is still being setup'
            env_json['MILVUS_HOST'] = grpc_endpoint['hostname']
            env_json['MILVUS_PORT'] = grpc_endpoint['port']
else:
    # Report the failure rather than silently leaving the engine settings out.
    print("Could not read instance details", wxd_info.status_code, wxd_info.text)

AssertionError: MILVUS is still being setup

In [None]:
# Write the collected settings to `.env_output` in .env format:
# one KEY="value" line per entry, with every value stringified and double-quoted.
with open('.env_output', 'w') as env_file:
    env_file.writelines(
        '{}="{}"\n'.format(key, str(value)) for key, value in env_json.items()
    )
print("Checkout .env_output file in the current directory")