In [1]:
%run /db_utils

StatementMeta(, , -1, Finished, Available)

In [2]:
import concurrent.futures
import os
from concurrent.futures import ThreadPoolExecutor

import msal
import requests
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType

StatementMeta(, 942c7f91-4ac8-401b-88c0-a49c5fb77d3a, 5, Finished, Available)

In [3]:
# Setup variables
# Identifiers of the Business Central tenant, environment, app registration and
# company used to build the API URL and to request the access token below.
tenant_id = "20496407-13b5-4bb9-82e6-51962685d376"
environment = "ISPLBCDemoIN"
client_id = "10825cb2-6f85-436c-8cef-521427c780fa"
# SECURITY: the client secret must never be stored in the notebook itself, because
# notebooks are shared, exported and committed together with their contents.
# It is read from the BC_CLIENT_SECRET environment variable instead; any secret
# that was ever saved in this notebook should be treated as leaked and rotated.
# NOTE(review): if the platform offers a secret store (e.g. a Key Vault helper),
# prefer that over an environment variable.
client_secret = os.environ.get("BC_CLIENT_SECRET")
if not client_secret:
    print("WARNING: BC_CLIENT_SECRET is not set; no client secret is available for authentication.")
cronus_in_company_id = "afbd363d-54e9-ed11-884e-6045bdaa603c"
# Name of the Business Central API entity to copy; also used as the lakehouse table name.
table_name = "GeneralLedgerEntries"

# Spark settings requested for the "BC2Fabric" session, applied in this order.
_spark_settings = {
    "spark.executor.cores": "64",
    "spark.executor.memory": "100g",
    "spark.executor.instances": "2",
    "spark.dynamicAllocation.enabled": "true",
    "spark.dynamicAllocation.maxExecutors": "2",
    "spark.executor.memoryOverhead": "10g",
    "spark.sql.shuffle.partitions": "128",
}

# Feed every setting into the builder, then create the session (or reuse the
# one that already exists).
_session_builder = SparkSession.builder.appName("BC2Fabric")
for _conf_key, _conf_value in _spark_settings.items():
    _session_builder = _session_builder.config(_conf_key, _conf_value)
spark = _session_builder.getOrCreate()

StatementMeta(, 942c7f91-4ac8-401b-88c0-a49c5fb77d3a, 6, Finished, Available)

In [4]:
def getToken(tenant, client_id, client_secret):
    """Acquire an app-only access token for the Business Central API.

    Runs the MSAL client-credentials flow for the given app registration.

    Args:
        tenant: Tenant identifier appended to the Microsoft login authority URL.
        client_id: Application (client) id of the app registration.
        client_secret: Client secret of the app registration.

    Returns:
        The access token string, or None when no token could be acquired.
        The reason for a failure is printed rather than raised.
    """
    authority = "https://login.microsoftonline.com/" + tenant
    scope = ["https://api.businesscentral.dynamics.com/.default"]

    try:
        # Building the client is kept inside the try so that a failure there is
        # reported the same way as a failed token request (print + None).
        app = msal.ConfidentialClientApplication(client_id, authority=authority, client_credential=client_secret)
        token_response = app.acquire_token_for_client(scopes=scope)
    except Exception as err:
        print(err)
        return None

    access_token = token_response.get('access_token')
    if access_token:
        print('New access token retrieved....')
    else:
        # A response without 'access_token' describes the failure in its
        # 'error' / 'error_description' entries; surface them instead of
        # discarding the only diagnostic information available.
        error_code = token_response.get('error')
        error_detail = token_response.get('error_description')
        print(f'Error acquiring authorization token. {error_code}: {error_detail}')

    return access_token

StatementMeta(, 942c7f91-4ac8-401b-88c0-a49c5fb77d3a, 7, Finished, Available)

In [5]:
class Connect(object):
    """Pages an OData-style HTTP endpoint into Spark DataFrames.

    Records are requested with the ``$top`` / ``$skip`` query parameters and each
    page is converted into a DataFrame whose columns are all strings.
    """

    # (connect, read) timeout in seconds for every HTTP request, so that a
    # stalled call cannot block the notebook indefinitely. Tune as needed.
    REQUEST_TIMEOUT = (10, 300)

    def __init__(self, url, auth=None, headers=None):
        """Store the endpoint URL and the auth / headers sent with every request."""
        self.url = url
        self._auth = auth
        self._headers = headers
        self._etag = str()
        self.except_error = None

    def _fetch_batch(self, spark, batch_start, batch_size):
        """Fetch one page of records and return it as a DataFrame.

        Args:
            spark: Spark session used to build the DataFrame.
            batch_start: Number of records to skip (``$skip``).
            batch_size: Maximum number of records to request (``$top``).

        Returns:
            A DataFrame with one string column per key of the first record
            (excluding ``@odata.etag``), or None when the request did not return
            HTTP 200 or the page contained no records.
        """
        response = requests.get(
            self.url,
            auth=self._auth,
            headers=self._headers,
            params={"$top": batch_size, "$skip": batch_start},
            timeout=self.REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            print(f"Request for batch {batch_start} failed with HTTP status {response.status_code}")
            return None

        value_list = response.json().get("value", [])
        print("Get the Response:", batch_start)
        if not value_list:
            return None

        # The schema is derived from the first record only; the etag column is
        # left out of the schema, so it never reaches the DataFrame.
        schema = StructType([
            StructField(col, StringType(), True)
            for col in value_list[0]
            if col.lower() != "@odata.etag"
        ])
        batch_df = spark.createDataFrame(value_list, schema=schema)
        print("Fetched batch:", batch_start)
        return batch_df

    def overwrite(self, batch_size=20000, total_records=20000):
        """Fetch the first page of the endpoint for an initial load.

        Args:
            batch_size: Maximum number of records to request.
            total_records: Accepted for signature compatibility; only a single
                page starting at offset 0 is fetched.

        Returns:
            The page as a DataFrame, or None when nothing was returned or an
            error occurred (the error is printed).
        """
        try:
            spark = SparkSession.builder.getOrCreate()
            batch_start = 0
            print("Batch Start:", batch_start)  # Debugging: Print batch_start
            return self._fetch_batch(spark, batch_start, batch_size)
        except Exception as e:
            print(f"An error occurred: {e}")
            return None

    def upsert(self, batch_size=20000, total_records=100000):
        """Fetch the next ``total_records`` records after those already loaded.

        The starting offset is the current row count of
        ``BC_Lakehouse.<table_name>``, where ``table_name`` is the module-level
        variable. Pages are downloaded by up to 5 threads and combined in offset
        order.

        Because the next run resumes from the table's row count, only the
        contiguous run of successful pages is kept: collection stops at the
        first page that is empty or failed, so a gap can never be written.

        Args:
            batch_size: Maximum number of records per request; must be positive.
            total_records: Upper bound on the number of records fetched by this call.

        Returns:
            The union of the fetched pages as a DataFrame, or None when no data
            was retrieved or an error occurred (the error is printed).
        """
        try:
            if batch_size <= 0:
                raise ValueError("batch_size must be a positive integer")

            spark = SparkSession.builder.getOrCreate()
            batch_start = spark.sql(f"SELECT count(*) FROM BC_Lakehouse.{table_name}").collect()[0][0]
            print("Batch Start:", batch_start)  # Debugging: Print batch_start

            def fetch_batch(offset, size):
                # A page that raises is reported and treated like a failed page,
                # so the pages collected before it are not thrown away.
                print("Fetching batch starting from:", offset)  # Debugging: Print batch_start
                try:
                    return self._fetch_batch(spark, offset, size)
                except Exception as err:
                    print(f"Batch {offset} failed: {err}")
                    return None

            # One (offset, size) pair per page; the last page is shortened so a
            # total that is not a multiple of batch_size is still fully covered.
            batches = [
                (batch_start + done, min(batch_size, total_records - done))
                for done in range(0, total_records, batch_size)
            ]

            df = None
            with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
                futures = [executor.submit(fetch_batch, offset, size) for offset, size in batches]
                # Walk the futures in submission (offset) order rather than
                # completion order, so the result is deterministic and the
                # "stop at first missing page" rule below is meaningful.
                for future in futures:
                    batch_df = future.result()
                    if batch_df is None:
                        break
                    df = batch_df if df is None else df.union(batch_df)

            # Process the collected data
            if df is not None:
                print("Total Records: " + str(df.count()))
                return df
            print("No data retrieved.")
            return None
        except Exception as e:
            print(f"An error occurred: {e}")
            return None


StatementMeta(, 942c7f91-4ac8-401b-88c0-a49c5fb77d3a, 8, Finished, Available)

In [6]:
reqToken = getToken(tenant_id, client_id, client_secret)

# Check if the access token is obtained successfully
if reqToken:
    try:
        # Build the request URL for the configured Business Central entity (table_name)
        url = f"https://api.businesscentral.dynamics.com/v2.0/{tenant_id}/{environment}/api/v2.0/companies({cronus_in_company_id})/{table_name}"

        # Build the request Headers
        reqHeaders = {"Accept-Language": "en-us", "Authorization": f"Bearer {reqToken}"}

        # Create a Connect instance for the entity.
        # (The "customers_*" names are historical; the data is whatever table_name selects.)
        customers_connect = Connect(url, headers=reqHeaders)
        spark = SparkSession.builder.getOrCreate()
        # NOTE(review): this existence check looks in the session's current database,
        # while Connect.upsert counts rows in BC_Lakehouse.<table_name> - confirm both
        # refer to the same table.
        if spark.sql(f"SHOW TABLES LIKE '{table_name}'").count() == 0:
            # First load: the table does not exist yet, so fetch the first page and create it.
            customers_df = customers_connect.overwrite(batch_size=20000,total_records=20000)
            # overwrite() returns None when nothing was fetched or an error occurred;
            # never hand None to the writer.
            if customers_df is None:
                print("Initial load returned no data; nothing was written.")
            else:
                write_delta_v2(
                    type="data_copy",
                    df=customers_df,
                    mode="overwrite",
                    dbname="dbo",
                    targetTable=table_name,
                    sourceTable=table_name,
                    keyCol="id",
                )
        else :
            # Incremental load: each upsert() call fetches up to its default
            # total_records (100000) records past the current row count; repeat up to 5 times.
            for i in range(5):
                customers_df = customers_connect.upsert(batch_size=20000)
                # upsert() returns None when no data was retrieved or an error occurred.
                # Stop instead of passing None to the writer and re-requesting the same offset.
                if customers_df is None:
                    print(f"No data returned in round {i + 1}; stopping incremental load.")
                    break
                write_delta_v2(
                    type="data_copy",
                    df=customers_df,
                    mode="upsert",
                    dbname="dbo",
                    targetTable=table_name,
                    sourceTable=table_name,
                    keyCol="id",
                )
    except Exception as e:
        print(f"An error occurred: {e}")
else:
    print("Failed to obtain access token.")

StatementMeta(, 942c7f91-4ac8-401b-88c0-a49c5fb77d3a, 9, Submitted, Running)

New access token retrieved....
Batch Start: 10020000
Fetching batch starting from: 10020000
Fetching batch starting from: 10040000
Fetching batch starting from: 10060000
Fetching batch starting from: 10080000
Fetching batch starting from: 10100000
Get the Response: 10080000
Fetched batch: 10080000
Get the Response: 10040000
Fetched batch: 10040000
Get the Response: 10060000
Fetched batch: 10060000
Get the Response: 10100000
Fetched batch: 10100000
Get the Response: 10020000
Fetched batch: 10020000
Total Records: 100000
table already exists!
upsert mode
Batch Start: 10120000
Fetching batch starting from: 10120000
Fetching batch starting from: 10140000
Fetching batch starting from: 10160000
Fetching batch starting from: 10180000
Fetching batch starting from: 10200000
Get the Response: 10140000
Fetched batch: 10140000
Get the Response: 10120000
Fetched batch: 10120000
Get the Response: 10200000
Fetched batch: 10200000
Get the Response: 10160000
Fetched batch: 10160000
Get the Response: 10

In [None]:
# Show how many rows the lakehouse table currently holds.
_row_count_query = f"SELECT count(*) FROM BC_Lakehouse.{table_name}"
df = spark.sql(_row_count_query)
display(df)

StatementMeta(, , , Waiting, )