Optimized the partitioning strategy implementation details to avoid unnecessarily high RU usage #39438

Merged
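This PR touches two kinds of files: the connector's partition planner, and samples that pin `spark.cosmos.read.partitioning.strategy`. The strategy values that appear in the diffs are `Default`, `Restrictive`, `Custom`, and `Aggressive`; every sample below moves to `Restrictive`. For orientation, a minimal Scala batch read using that option might look like the sketch below. It is assembled from the option keys visible in the diffs, not taken from this PR, and the endpoint, key, database, and container values are placeholders.

```scala
import org.apache.spark.sql.SparkSession

object RestrictiveReadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("cosmos-read").getOrCreate()

    // Placeholder credentials; substitute your account's endpoint and key.
    val readCfg = Map(
      "spark.cosmos.accountEndpoint" -> "https://<account>.documents.azure.com:443/",
      "spark.cosmos.accountKey" -> "<key>",
      "spark.cosmos.database" -> "SampleDatabase",
      "spark.cosmos.container" -> "GreenTaxiRecords",
      // Restrictive keeps partition planning cheap, which is the point of this PR.
      "spark.cosmos.read.partitioning.strategy" -> "Restrictive",
      "spark.cosmos.read.inferSchema.enabled" -> "false"
    )

    val df = spark.read.format("cosmos.oltp").options(readCfg).load()
    println(df.count())
  }
}
```

Judging from the planner change at the end of this diff, `Restrictive` avoids the storage-aligned planning path (and its per-partition metadata lookups), which is where the unnecessary RU usage came from.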
1 change: 1 addition & 0 deletions sdk/cosmos/azure-cosmos-spark_3-1_2-12/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#### Bugs Fixed

#### Other Changes
* Optimized the partitioning strategy implementation details to avoid unnecessarily high RU usage. - See [PR 39438](https://github.com/Azure/azure-sdk-for-java/pull/39438)

### 4.28.4 (2024-03-18)

Expand Down
1 change: 1 addition & 0 deletions sdk/cosmos/azure-cosmos-spark_3-2_2-12/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#### Bugs Fixed

#### Other Changes
* Optimized the partitioning strategy implementation details to avoid unnecessarily high RU usage. - See [PR 39438](https://github.com/Azure/azure-sdk-for-java/pull/39438)

### 4.28.4 (2024-03-18)

Expand Down
1 change: 1 addition & 0 deletions sdk/cosmos/azure-cosmos-spark_3-3_2-12/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#### Bugs Fixed

#### Other Changes
* Optimized the partitioning strategy implementation details to avoid unnecessarily high RU usage. - See [PR 39438](https://github.com/Azure/azure-sdk-for-java/pull/39438)

### 4.28.4 (2024-03-18)

Expand Down
1 change: 1 addition & 0 deletions sdk/cosmos/azure-cosmos-spark_3-4_2-12/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#### Bugs Fixed

#### Other Changes
* Optimized the partitioning strategy implementation details to avoid unnecessarily high RU usage. - See [PR 39438](https://github.com/Azure/azure-sdk-for-java/pull/39438)

### 4.28.4 (2024-03-18)

Expand Down
Sample notebook, C#-style configuration cells (file path not captured):

```diff
@@ -380,7 +380,7 @@
 "    { \"spark.cosmos.accountKey\", cosmosMasterKey },\r\n",
 "    { \"spark.cosmos.database\", \"SampleDatabase\" },\r\n",
 "    { \"spark.cosmos.container\", \"GreenTaxiRecords\" },\r\n",
-"    { \"spark.cosmos.read.partitioning.strategy\", \"Default\" }, \r\n",
+"    { \"spark.cosmos.read.partitioning.strategy\", \"Restrictive\" }, \r\n",
 "    { \"spark.cosmos.read.inferSchema.enabled\", \"false\" },\r\n",
 "    { \"spark.cosmos.changeFeed.startFrom\", \"Beginning\" },\r\n",
 "    { \"spark.cosmos.changeFeed.mode\", \"Incremental\" }\r\n",
@@ -437,7 +437,7 @@
 "    { \"spark.cosmos.accountKey\", cosmosMasterKey },\r\n",
 "    { \"spark.cosmos.database\", \"SampleDatabase\" },\r\n",
 "    { \"spark.cosmos.container\", \"GreenTaxiRecords\" },\r\n",
-"    { \"spark.cosmos.read.partitioning.strategy\", \"Default\" }, \r\n",
+"    { \"spark.cosmos.read.partitioning.strategy\", \"Restrictive\" }, \r\n",
 "    { \"spark.cosmos.read.inferSchema.enabled\", \"false\" }\r\n",
 "};\r\n",
 "\r\n",
```
Scala sample (file path not captured):

```diff
@@ -48,7 +48,7 @@ def createCosmosView(cosmosDatabaseName: String, cosmosContainerName: String, co
     spark.cosmos.database = '$cosmosDatabaseName',
     spark.cosmos.container = '$cosmosContainerName',
     spark.cosmos.read.inferSchema.enabled = 'False',
-    spark.cosmos.read.partitioning.strategy = 'Default'
+    spark.cosmos.read.partitioning.strategy = 'Restrictive'
   );
 """
 println("Executing create View...")
```
Scala sample (file path not captured):

```diff
@@ -72,7 +72,7 @@ spark.sql(createTargetResources)
   "spark.cosmos.accountKey" -> cosmosSourceMasterKey,
   "spark.cosmos.database" -> cosmosSourceDatabaseName,
   "spark.cosmos.container" -> cosmosSourceContainerName,
-  "spark.cosmos.read.partitioning.strategy" -> "Default",
+  "spark.cosmos.read.partitioning.strategy" -> "Restrictive",
   "spark.cosmos.read.inferSchema.enabled" -> "false",
   "spark.cosmos.changeFeed.startFrom" -> "Beginning",
   "spark.cosmos.changeFeed.mode" -> "Incremental",
```
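Several samples apply the same switch to change feed reads. Assembled from the options in the hunk above and the `cosmos.oltp.changeFeed` format used in the notebooks further down, a complete change feed read might look like this sketch; `spark` is assumed to be the ambient session of a notebook or spark-shell, and the endpoint and key are placeholders.

```scala
// Sketch: read a container's change feed from the beginning with the
// Restrictive partitioning strategy, mirroring the sample configs in this PR.
val changeFeedCfg = Map(
  "spark.cosmos.accountEndpoint" -> "https://<account>.documents.azure.com:443/",
  "spark.cosmos.accountKey" -> "<key>",
  "spark.cosmos.database" -> "SampleDatabase",
  "spark.cosmos.container" -> "GreenTaxiRecords",
  "spark.cosmos.read.partitioning.strategy" -> "Restrictive",
  "spark.cosmos.read.inferSchema.enabled" -> "false",
  "spark.cosmos.changeFeed.startFrom" -> "Beginning",
  "spark.cosmos.changeFeed.mode" -> "Incremental"
)

val changeFeedDf = spark.read
  .format("cosmos.oltp.changeFeed") // change feed source, as used in the notebooks below
  .options(changeFeedCfg)
  .load()

println(changeFeedDf.count())
```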
Scala sample with embedded Spark SQL (file path not captured):

```diff
@@ -52,7 +52,7 @@ OPTIONS (
   spark.cosmos.database = '${cosmosSourceDatabaseName}', -- source database
   spark.cosmos.container = '${cosmosSourceContainerName}', -- source container
   spark.cosmos.read.inferSchema.enabled = 'False',
-  spark.cosmos.read.partitioning.strategy = 'Default');
+  spark.cosmos.read.partitioning.strategy = 'Restrictive');
 """

 var selectView = s"""
```
Python sample notebook, raw Databricks JSON source (file path not captured):

```diff
@@ -271,7 +271,7 @@
 {
 "cell_type": "code",
 "source": [
-"print(\"Starting validation via change feed: \", datetime.datetime.utcnow().strftime(\"%Y-%m-%d %H:%M:%S.%f\"))\nchangeFeedCfg = {\n  \"spark.cosmos.accountEndpoint\": cosmosEndpoint,\n  \"spark.cosmos.accountKey\": cosmosMasterKey,\n  \"spark.cosmos.database\": \"SampleDatabase\",\n  \"spark.cosmos.container\": \"GreenTaxiRecords\",\n  \"spark.cosmos.read.partitioning.strategy\": \"Default\",\n  \"spark.cosmos.read.inferSchema.enabled\" : \"false\",\n  \"spark.cosmos.changeFeed.startFrom\" : \"Beginning\",\n  \"spark.cosmos.changeFeed.mode\" : \"Incremental\"\n}\nchangeFeed_df = spark.read.format(\"cosmos.oltp.changeFeed\").options(**changeFeedCfg).load()\ncount_changeFeed = changeFeed_df.count()\nprint(\"Number of records retrieved via change feed: \", count_changeFeed) \nprint(\"Finished validation via change feed: \", datetime.datetime.utcnow().strftime(\"%Y-%m-%d %H:%M:%S.%f\"))\n\nassert count_source == count_changeFeed"
+"print(\"Starting validation via change feed: \", datetime.datetime.utcnow().strftime(\"%Y-%m-%d %H:%M:%S.%f\"))\nchangeFeedCfg = {\n  \"spark.cosmos.accountEndpoint\": cosmosEndpoint,\n  \"spark.cosmos.accountKey\": cosmosMasterKey,\n  \"spark.cosmos.database\": \"SampleDatabase\",\n  \"spark.cosmos.container\": \"GreenTaxiRecords\",\n  \"spark.cosmos.read.partitioning.strategy\": \"Restrictive\",\n  \"spark.cosmos.read.inferSchema.enabled\" : \"false\",\n  \"spark.cosmos.changeFeed.startFrom\" : \"Beginning\",\n  \"spark.cosmos.changeFeed.mode\" : \"Incremental\"\n}\nchangeFeed_df = spark.read.format(\"cosmos.oltp.changeFeed\").options(**changeFeedCfg).load()\ncount_changeFeed = changeFeed_df.count()\nprint(\"Number of records retrieved via change feed: \", count_changeFeed) \nprint(\"Finished validation via change feed: \", datetime.datetime.utcnow().strftime(\"%Y-%m-%d %H:%M:%S.%f\"))\n\nassert count_source == count_changeFeed"
 ],
 "metadata": {
 "application/vnd.databricks.v1+cell": {
@@ -301,7 +301,7 @@
 {
 "cell_type": "code",
 "source": [
-"import math\n\nprint(\"Starting to identify to be deleted documents: \", datetime.datetime.utcnow().strftime(\"%Y-%m-%d %H:%M:%S.%f\"))\nreadCfg = {\n  \"spark.cosmos.accountEndpoint\": cosmosEndpoint,\n  \"spark.cosmos.accountKey\": cosmosMasterKey,\n  \"spark.cosmos.database\": \"SampleDatabase\",\n  \"spark.cosmos.container\": \"GreenTaxiRecords\",\n  \"spark.cosmos.read.partitioning.strategy\": \"Default\",\n  \"spark.cosmos.read.inferSchema.enabled\" : \"false\",\n}\n\ntoBeDeleted_df = spark.read.format(\"cosmos.oltp\").options(**readCfg).load().limit(100_000)\nprint(\"Number of records to be deleted: \", toBeDeleted_df.count()) \n\nprint(\"Starting to bulk delete documents: \", datetime.datetime.utcnow().strftime(\"%Y-%m-%d %H:%M:%S.%f\"))\ndeleteCfg = writeCfg.copy()\ndeleteCfg[\"spark.cosmos.write.strategy\"] = \"ItemDelete\"\ntoBeDeleted_df \\\n  .write \\\n  .format(\"cosmos.oltp\") \\\n  .mode(\"Append\") \\\n  .options(**deleteCfg) \\\n  .save()\nprint(\"Finished deleting documents: \", datetime.datetime.utcnow().strftime(\"%Y-%m-%d %H:%M:%S.%f\"))\n\nprint(\"Starting count validation via query: \", datetime.datetime.utcnow().strftime(\"%Y-%m-%d %H:%M:%S.%f\"))\ncount_query_schema=StructType(fields=[StructField(\"Count\", LongType(), True)])\nreadCfg[\"spark.cosmos.read.customQuery\"] = \"SELECT COUNT(0) AS Count FROM c\"\nquery_df = spark.read.format(\"cosmos.oltp\").schema(count_query_schema).options(**readCfg).load()\ncount_query = query_df.select(F.sum(\"Count\").alias(\"TotalCount\")).first()[\"TotalCount\"]\nprint(\"Number of records retrieved via query: \", count_query) \nprint(\"Finished count validation via query: \", datetime.datetime.utcnow().strftime(\"%Y-%m-%d %H:%M:%S.%f\"))\n\nassert max(0, count_source - 100_000) == count_query"
+"import math\n\nprint(\"Starting to identify to be deleted documents: \", datetime.datetime.utcnow().strftime(\"%Y-%m-%d %H:%M:%S.%f\"))\nreadCfg = {\n  \"spark.cosmos.accountEndpoint\": cosmosEndpoint,\n  \"spark.cosmos.accountKey\": cosmosMasterKey,\n  \"spark.cosmos.database\": \"SampleDatabase\",\n  \"spark.cosmos.container\": \"GreenTaxiRecords\",\n  \"spark.cosmos.read.partitioning.strategy\": \"Restrictive\",\n  \"spark.cosmos.read.inferSchema.enabled\" : \"false\",\n}\n\ntoBeDeleted_df = spark.read.format(\"cosmos.oltp\").options(**readCfg).load().limit(100_000)\nprint(\"Number of records to be deleted: \", toBeDeleted_df.count()) \n\nprint(\"Starting to bulk delete documents: \", datetime.datetime.utcnow().strftime(\"%Y-%m-%d %H:%M:%S.%f\"))\ndeleteCfg = writeCfg.copy()\ndeleteCfg[\"spark.cosmos.write.strategy\"] = \"ItemDelete\"\ntoBeDeleted_df \\\n  .write \\\n  .format(\"cosmos.oltp\") \\\n  .mode(\"Append\") \\\n  .options(**deleteCfg) \\\n  .save()\nprint(\"Finished deleting documents: \", datetime.datetime.utcnow().strftime(\"%Y-%m-%d %H:%M:%S.%f\"))\n\nprint(\"Starting count validation via query: \", datetime.datetime.utcnow().strftime(\"%Y-%m-%d %H:%M:%S.%f\"))\ncount_query_schema=StructType(fields=[StructField(\"Count\", LongType(), True)])\nreadCfg[\"spark.cosmos.read.customQuery\"] = \"SELECT COUNT(0) AS Count FROM c\"\nquery_df = spark.read.format(\"cosmos.oltp\").schema(count_query_schema).options(**readCfg).load()\ncount_query = query_df.select(F.sum(\"Count\").alias(\"TotalCount\")).first()[\"TotalCount\"]\nprint(\"Number of records retrieved via query: \", count_query) \nprint(\"Finished count validation via query: \", datetime.datetime.utcnow().strftime(\"%Y-%m-%d %H:%M:%S.%f\"))\n\nassert max(0, count_source - 100_000) == count_query"
 ],
 "metadata": {
 "application/vnd.databricks.v1+cell": {
@@ -421,7 +421,7 @@
 {
 "cell_type": "code",
 "source": [
-"%sql\nCREATE TABLE cosmosCatalog.SampleDatabase.GreenTaxiRecordsView \n  (id STRING, _ts TIMESTAMP, vendorID INT, totalAmount DOUBLE)\nUSING cosmos.oltp\nTBLPROPERTIES(isCosmosView = 'True')\nOPTIONS (\n  spark.cosmos.database = 'SampleDatabase',\n  spark.cosmos.container = 'GreenTaxiRecords',\n  spark.cosmos.read.inferSchema.enabled = 'False',\n  spark.cosmos.read.inferSchema.includeSystemProperties = 'True',\n  spark.cosmos.read.partitioning.strategy = 'Aggressive');\n\nSELECT * FROM cosmosCatalog.SampleDatabase.GreenTaxiRecordsView LIMIT 10"
+"%sql\nCREATE TABLE cosmosCatalog.SampleDatabase.GreenTaxiRecordsView \n  (id STRING, _ts TIMESTAMP, vendorID INT, totalAmount DOUBLE)\nUSING cosmos.oltp\nTBLPROPERTIES(isCosmosView = 'True')\nOPTIONS (\n  spark.cosmos.database = 'SampleDatabase',\n  spark.cosmos.container = 'GreenTaxiRecords',\n  spark.cosmos.read.inferSchema.enabled = 'False',\n  spark.cosmos.read.inferSchema.includeSystemProperties = 'True',\n  spark.cosmos.read.partitioning.strategy = 'Restrictive');\n\nSELECT * FROM cosmosCatalog.SampleDatabase.GreenTaxiRecordsView LIMIT 10"
 ],
 "metadata": {
 "application/vnd.databricks.v1+cell": {
```
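The last cell above also shows the catalog-based view DDL the samples use (note `isCosmosView = 'True'` and the switch from `Aggressive` to `Restrictive`). The same DDL can be issued from Scala; this sketch assumes a session whose `cosmosCatalog` catalog is already wired to the Cosmos account, as in the sample notebooks.

```scala
// Sketch: the view DDL from the notebook cell above, issued via spark.sql.
// Assumes `spark` is an ambient session with the cosmosCatalog catalog configured.
spark.sql(
  """
    |CREATE TABLE cosmosCatalog.SampleDatabase.GreenTaxiRecordsView
    |  (id STRING, _ts TIMESTAMP, vendorID INT, totalAmount DOUBLE)
    |USING cosmos.oltp
    |TBLPROPERTIES(isCosmosView = 'True')
    |OPTIONS (
    |  spark.cosmos.database = 'SampleDatabase',
    |  spark.cosmos.container = 'GreenTaxiRecords',
    |  spark.cosmos.read.inferSchema.enabled = 'False',
    |  spark.cosmos.read.inferSchema.includeSystemProperties = 'True',
    |  spark.cosmos.read.partitioning.strategy = 'Restrictive')
  """.stripMargin)

spark.sql("SELECT * FROM cosmosCatalog.SampleDatabase.GreenTaxiRecordsView LIMIT 10").show()
```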
Sample notebook, raw JSON source (file path not captured):

```diff
@@ -242,7 +242,7 @@
 "  \"spark.cosmos.accountKey\": cosmosMasterKey,\n",
 "  \"spark.cosmos.database\": \"SampleDatabase\",\n",
 "  \"spark.cosmos.container\": \"GreenTaxiRecords\",\n",
-"  \"spark.cosmos.read.partitioning.strategy\": \"Default\",\n",
+"  \"spark.cosmos.read.partitioning.strategy\": \"Restrictive\",\n",
 "  \"spark.cosmos.read.inferSchema.enabled\" : \"true\",\n",
 "  \"spark.cosmos.read.inferSchema.forceNullableProperties\" : \"true\",\n",
 "  \"spark.cosmos.changeFeed.startFrom\" : \"Beginning\",\n",
@@ -338,7 +338,7 @@
 "  spark.cosmos.database = 'SampleDatabase',\n",
 "  spark.cosmos.container = 'GreenTaxiRecordsCFSink',\n",
 "  spark.cosmos.read.inferSchema.enabled = 'False',\n",
-"  spark.cosmos.read.partitioning.strategy = 'Default');\n",
+"  spark.cosmos.read.partitioning.strategy = 'Restrictive');\n",
 "\n",
 "SELECT COUNT(*) FROM cosmosCatalog.SampleDatabase.GreenTaxiRecordsCFSinkView"
 ]
```
Scala sample (file path not captured):

```diff
@@ -228,7 +228,7 @@ val changeFeedCfg = Map(
   "spark.cosmos.accountKey" -> cosmosMasterKey,
   "spark.cosmos.database" -> "SampleDatabase",
   "spark.cosmos.container" -> "GreenTaxiRecords",
-  "spark.cosmos.read.partitioning.strategy" -> "Default",
+  "spark.cosmos.read.partitioning.strategy" -> "Restrictive",
   "spark.cosmos.read.inferSchema.enabled" -> "false",
   "spark.cosmos.changeFeed.startFrom" -> "Beginning",
   "spark.cosmos.changeFeed.mode" -> "Incremental"
@@ -255,7 +255,7 @@ val readCfg = Map(
   "spark.cosmos.accountKey" -> cosmosMasterKey,
   "spark.cosmos.database" -> "SampleDatabase",
   "spark.cosmos.container" -> "GreenTaxiRecords",
-  "spark.cosmos.read.partitioning.strategy" -> "Default",
+  "spark.cosmos.read.partitioning.strategy" -> "Restrictive",
   "spark.cosmos.read.inferSchema.enabled" -> "false",
 )

@@ -329,7 +329,7 @@ assert(df_Tables.count() == 3)
 // MAGIC   spark.cosmos.container = 'GreenTaxiRecords',
 // MAGIC   spark.cosmos.read.inferSchema.enabled = 'False',
 // MAGIC   spark.cosmos.read.inferSchema.includeSystemProperties = 'True',
-// MAGIC   spark.cosmos.read.partitioning.strategy = 'Aggressive');
+// MAGIC   spark.cosmos.read.partitioning.strategy = 'Restrictive');
 // MAGIC
 // MAGIC SELECT * FROM cosmosCatalog.SampleDatabase.GreenTaxiRecordsView LIMIT 10
```
Scala sample using AAD authentication (file path not captured):

```diff
@@ -253,7 +253,7 @@ val changeFeedCfg = Map(
   "spark.cosmos.auth.aad.clientSecret" -> clientSecret,
   "spark.cosmos.database" -> "SampleDatabase",
   "spark.cosmos.container" -> "GreenTaxiRecords",
-  "spark.cosmos.read.partitioning.strategy" -> "Default",
+  "spark.cosmos.read.partitioning.strategy" -> "Restrictive",
   "spark.cosmos.read.inferSchema.enabled" -> "false",
   "spark.cosmos.changeFeed.startFrom" -> "Beginning",
   "spark.cosmos.changeFeed.mode" -> "Incremental"
@@ -284,7 +284,7 @@ val readCfg = Map(
   "spark.cosmos.auth.aad.clientId" -> clientId,
   "spark.cosmos.auth.aad.clientSecret" -> clientSecret, "spark.cosmos.database" -> "SampleDatabase",
   "spark.cosmos.container" -> "GreenTaxiRecords",
-  "spark.cosmos.read.partitioning.strategy" -> "Default",
+  "spark.cosmos.read.partitioning.strategy" -> "Restrictive",
   "spark.cosmos.read.inferSchema.enabled" -> "false",
 )

@@ -358,7 +358,7 @@ assert(df_Tables.count() == 3)
 // MAGIC   spark.cosmos.container = 'GreenTaxiRecords',
 // MAGIC   spark.cosmos.read.inferSchema.enabled = 'False',
 // MAGIC   spark.cosmos.read.inferSchema.includeSystemProperties = 'True',
-// MAGIC   spark.cosmos.read.partitioning.strategy = 'Aggressive');
+// MAGIC   spark.cosmos.read.partitioning.strategy = 'Restrictive');
 // MAGIC
 // MAGIC SELECT * FROM cosmosCatalog.SampleDatabase.GreenTaxiRecordsView LIMIT 10
```
Connector source, CosmosPartitionPlanner (file path not captured; the +/- markers below are restored from the hunk counts, which the rendered page dropped):

```diff
@@ -76,24 +76,18 @@ private object CosmosPartitionPlanner extends BasicLoggingTrait {

     cosmosPartitioningConfig.partitioningStrategy match {
       case PartitioningStrategies.Restrictive =>
+      case PartitioningStrategies.Default =>
         applyRestrictiveStrategy(planningInfo)
       case PartitioningStrategies.Custom =>
         applyCustomStrategy(
           container,
           planningInfo,
           cosmosPartitioningConfig.targetedPartitionCount.get)
-      case PartitioningStrategies.Default =>
-        applyStorageAlignedStrategy(
-          container,
-          planningInfo,
-          1 / defaultMaxPartitionSizeInMB.toDouble,
-          defaultMinimalPartitionCount
-        )
       case PartitioningStrategies.Aggressive =>
         applyStorageAlignedStrategy(
           container,
           planningInfo,
-          5 / defaultMaxPartitionSizeInMB.toDouble,
+          1 / defaultMaxPartitionSizeInMB.toDouble,
           defaultMinimalPartitionCount
         )
     }
```
Expand Down