In [0]:
# Configuration for the sample.
# The REPLACEME values are placeholders for the Cosmos DB account endpoint and key.
cosmosEndpoint = "https://REPLACEME.documents.azure.com:443/"
cosmosMasterKey = "REPLACEME"
cosmosDatabaseName = "sampleDB"
cosmosContainerName = "sampleContainer"

# Base connector options: account endpoint/key plus the target database and container.
cfg = {
    "spark.cosmos.accountEndpoint": cosmosEndpoint,
    "spark.cosmos.accountKey": cosmosMasterKey,
    "spark.cosmos.database": cosmosDatabaseName,
    "spark.cosmos.container": cosmosContainerName,
}

# Same options as cfg, with read-side schema inference switched on.
# Built as a separate dict so cfg itself is left untouched.
cfgWithAutoSchemaInferance = {
    **cfg,
    "spark.cosmos.read.inferSchemaEnabled": "true",
}

In [0]:
# Create the Cosmos database and container through the Spark catalog API.

# Register the catalog implementation under the name "cosmosCatalog" and hand it
# the account endpoint and key. Settings are applied in the order listed.
catalog_settings = {
    "spark.sql.catalog.cosmosCatalog": "com.azure.cosmos.spark.CosmosCatalog",
    "spark.sql.catalog.cosmosCatalog.spark.cosmos.accountEndpoint": cosmosEndpoint,
    "spark.sql.catalog.cosmosCatalog.spark.cosmos.accountKey": cosmosMasterKey,
}
for setting_name, setting_value in catalog_settings.items():
    spark.conf.set(setting_name, setting_value)

# Create the database; IF NOT EXISTS keeps the statement safe to re-run.
create_database_sql = "CREATE DATABASE IF NOT EXISTS cosmosCatalog.{};".format(cosmosDatabaseName)
spark.sql(create_database_sql)

# Create the container with partition key path '/id' and manualThroughput '1100',
# again guarded by IF NOT EXISTS.
create_container_sql = "CREATE TABLE IF NOT EXISTS cosmosCatalog.{}.{} using cosmos.items TBLPROPERTIES(partitionKeyPath = '/id', manualThroughput = '1100')".format(cosmosDatabaseName, cosmosContainerName)
spark.sql(create_container_sql)

In [0]:
# Ingestion: build a two-row DataFrame of sample cats and append it to the
# container named in cfg.
cat_rows = (
    ("cat-alive", "Schrodinger cat", 2, True),
    ("cat-dead", "Schrodinger cat", 2, False),
)
cat_columns = ("id", "Name", "Age", "isAlive")

(
    spark.createDataFrame(cat_rows)
    .toDF(*cat_columns)  # assign the column names positionally
    .write
    .format("cosmos.items")
    .options(**cfg)
    .mode("APPEND")
    .save()
)

In [0]:
# Load the container with the base options (cfg does not enable schema inference),
# then print the resulting schema followed by the rows.
df = (
    spark.read
    .format("cosmos.items")
    .options(**cfg)
    .load()
)

df.printSchema()
df.show()

In [0]:
# Load the container again, this time with the options that set
# spark.cosmos.read.inferSchemaEnabled to "true", then print the schema and the rows.
# Note: this rebinds df, replacing the DataFrame loaded without inference.
df = (
    spark.read
    .format("cosmos.items")
    .options(**cfgWithAutoSchemaInferance)
    .load()
)

df.printSchema()
df.show()

In [0]:
# Select the rows whose isAlive flag is true and display them with Age increased by one.
# This only displays the result; nothing is written back to the container.
from pyspark.sql.functions import col

is_alive = col("isAlive") == True  # Column comparison, evaluated by Spark per row

(
    df.filter(is_alive)
    .withColumn("Age", col("Age") + 1)
    .show()
)