In [None]:
// --- Azure Cosmos DB account (placeholders: replace with real values before running) ---
// NOTE(review): prefer a linked service or secret store over pasting the key into the notebook.
string cosmosEndpoint = "<cosmos-endpoint>";
string cosmosMasterKey = "<cosmos-master-key>";

// --- Source: database name and the container whose change feed is read ---
string cosmosDatabaseName = "ContosoHospital";
string cosmosContainerName = "Patient";

// --- Sink: container that receives the copied documents, and the streaming checkpoint path ---
string targetContainerName = "CopyContainer";
string checkpointLocation = "/tmp/streaming";

// Patient Document -- partition key: patientId
// {
//   "id": "9c9a1156-e936-40f3-a442-e9528b55a2fb",
//   "patientId": "423ab2cf-dd1c-4404-8524-86cee045c179",
//   "patientName": "John Doe",
//   "doctorId" : "629f49da-9cfc-45a4-8e1c-d4f8b7ab1f4e",
//   "doctorName": "Sung Ondricka"
// }

In [None]:
//-----filtering examples-with-schema-inference-disabled---------------------

// Options for the Cosmos DB change feed streaming source used by the examples in this notebook.
var changeFeedCfg = new Dictionary<string, string>(){
  // The account is addressed through a Synapse linked service rather than endpoint/key options.
  {"spark.synapse.linkedService", "CosmosDBLSContosoHospital"},
  {"spark.cosmos.container" , cosmosContainerName},
  // Schema inference is disabled; the examples work on the "id" and "_rawBody" columns.
  {"spark.cosmos.read.inferSchema.enabled" , "false"},
  {"spark.cosmos.changeFeed.startFrom" , "Beginning"},
  {"spark.cosmos.changeFeed.mode" , "Incremental"},
  // Trailing comma is intentional: the optional entries below can be uncommented as-is.
  {"spark.cosmos.changeFeed.itemCountPerTriggerHint" , "100000"},
  // Optional configuration for throughput control
  // {"spark.cosmos.throughputControl.enabled", "true"},
  // {"spark.cosmos.throughputControl.name", "SourceContainerThroughputControl"},
  // {"spark.cosmos.throughputControl.targetThroughputThreshold", "0.30"},
  // {"spark.cosmos.throughputControl.globalControl.database", "database-v4"},
  // {"spark.cosmos.throughputControl.globalControl.container", "ThroughputControl"}
};

// Sink options shared by Example 1 and Example 2: same linked service as the source,
// writing to the target container and tracking streaming progress at checkpointLocation.
var writeCfg = new Dictionary<string, string>
{
    ["spark.synapse.linkedService"] = "CosmosDBLSContosoHospital",
    ["spark.cosmos.container"] = targetContainerName,
    ["checkpointLocation"] = checkpointLocation,
};

In [None]:
//optional configuration for creating throughput control metadata container

// spark.Conf().Set("spark.sql.catalog.cosmosCatalog", "com.azure.cosmos.spark.CosmosCatalog");
// spark.Conf().Set("spark.sql.catalog.cosmosCatalog.spark.cosmos.accountEndpoint", cosmosEndpoint);
// spark.Conf().Set("spark.sql.catalog.cosmosCatalog.spark.cosmos.accountKey", cosmosMasterKey);


In [None]:
-- %%sql

-- CREATE DATABASE IF NOT EXISTS cosmosCatalog.`database-v4`;

-- CREATE TABLE IF NOT EXISTS cosmosCatalog.`database-v4`.ThroughputControl
-- USING cosmos.oltp
-- TBLPROPERTIES(partitionKeyPath = '/groupId', autoScaleMaxThroughput = '4000');


In [None]:
// ----- EXAMPLE 1 -----
// Copy only the documents whose id contains a given value as its second "-"-separated segment.

var changeFeedDF = spark
    .ReadStream()
    .Format("cosmos.oltp.changefeed")
    .Options(changeFeedCfg)
    .Load();

// Split the id column by "-" and keep only the second item (index 1) in a helper column
// e.g. Id = XXXXX-SubIdWeAreInterestedIn-XXXXX
DataFrame splitByDash = changeFeedDF.WithColumn("idSplit", Functions.Split(Column("id"), "-"));
DataFrame withSplitId = splitByDash.WithColumn("newId", Column("idSplit").GetItem(1));

// Filter by newId to keep only the rows we want to migrate, then drop the helper columns
DataFrame filteredByNewId = withSplitId
    .Filter(Column("newId") == 7926)
    .Drop("idSplit")
    .Drop("newId");

// Preserve system properties like _ts, _etag by renaming the raw-body column.
// The source name is spelled exactly ("_rawBody"): WithColumnRenamed silently does nothing
// when the name does not resolve, e.g. if spark.sql.caseSensitive is enabled.
DataFrame df_withAuditFields = filteredByNewId.WithColumnRenamed("_rawBody", "_origin_rawBody");

// Write the streaming DataFrame to the target Cosmos DB container.
// AwaitTermination blocks this cell until the streaming query stops.
df_withAuditFields
    .WriteStream()
    .Format("cosmos.oltp")
    .Options(writeCfg)
    .OutputMode("append")
    .Start()
    .AwaitTermination();

In [None]:
// ----- EXAMPLE 2 -----
// Uses a filter with string matching on the raw document to keep rows whose doctorId matches
// a given value, without the need for the spark.read.json feature or any joins.

var changeFeedDF = spark
    .ReadStream()
    .Format("cosmos.oltp.changefeed")
    .Options(changeFeedCfg)
    .Load();

// Keep only documents whose raw JSON contains the doctorId we are interested in.
// NOTE(review): this is an exact substring match, so it assumes the raw body is serialized
// without whitespace around ':' — confirm against actual _rawBody content.
DataFrame filteredDf = changeFeedDF.Filter(Column("_rawBody").Contains("\"doctorId\":\"5b15f027-74d1-4ab8-9ad3-cca848837f66\""));

// Preserve system properties like _ts, _etag by renaming the raw-body column.
// The source name is spelled exactly ("_rawBody"), matching the filter above: WithColumnRenamed
// silently does nothing when the name does not resolve, e.g. if spark.sql.caseSensitive is enabled.
DataFrame df_withAuditFields = filteredDf.WithColumnRenamed("_rawBody", "_origin_rawBody");

// Write the streaming DataFrame to the target container.
// AwaitTermination blocks this cell until the streaming query stops.
df_withAuditFields
    .WriteStream()
    .Format("cosmos.oltp")
    .Options(writeCfg)
    .OutputMode("append")
    .Start()
    .AwaitTermination();

In [None]:
// ----- EXAMPLE 3 -----
// Write the data into a container whose partition key differs from the source container's.

// This example uses its own checkpoint path and target container; these declarations
// replace the values set in the first cell for the cells that follow.
string checkpointLocation = "/tmp/pk_doctorId_checkpoint";
string targetContainerName = "CopyWithDoctorId";

In [None]:
// Register the Cosmos DB catalog under the name "cosmosCatalog" and give it the account
// endpoint and key, so SQL cells can reference cosmosCatalog.<database>.<container>.
{
    var sessionConf = spark.Conf();
    sessionConf.Set("spark.sql.catalog.cosmosCatalog", "com.azure.cosmos.spark.CosmosCatalog");
    sessionConf.Set("spark.sql.catalog.cosmosCatalog.spark.cosmos.accountEndpoint", cosmosEndpoint);
    sessionConf.Set("spark.sql.catalog.cosmosCatalog.spark.cosmos.accountKey", cosmosMasterKey);
}

In [None]:
%%sql

-- Create the target Azure Cosmos DB container through the catalog API.
-- It is partitioned by /doctorId and uses autoscale throughput with a maximum of 1000.
CREATE TABLE IF NOT EXISTS cosmosCatalog.ContosoHospital.CopyWithDoctorId
USING cosmos.oltp
TBLPROPERTIES(partitionKeyPath = '/doctorId', autoScaleMaxThroughput = '1000');


In [None]:

// Sink options for Example 3: same linked service, using whatever targetContainerName and
// checkpointLocation currently hold when this cell runs.
var writeCfgWithNewPK = new Dictionary<string, string>
{
    ["spark.synapse.linkedService"] = "CosmosDBLSContosoHospital",
    ["spark.cosmos.container"] = targetContainerName,
    ["checkpointLocation"] = checkpointLocation,
};

// Read streaming data from the change feed of the source container
DataFrame changeFeedDF = spark
    .ReadStream()
    .Format("cosmos.oltp.changefeed")
    .Options(changeFeedCfg)
    .Load();

// Preserve system properties like _ts, _etag by renaming the raw-body column.
// This is derived from the stream read in THIS cell; previously the cell wrote a
// df_withAuditFields variable left over from an earlier example, so the frame read
// above was never used and the cell failed on a fresh kernel.
DataFrame changeFeedWithAuditFields = changeFeedDF.WithColumnRenamed("_rawBody", "_origin_rawBody");

// Write the streaming data into the new container with a different partition key.
// AwaitTermination blocks this cell until the streaming query stops.
changeFeedWithAuditFields
    .WriteStream()
    .Format("cosmos.oltp")
    .Options(writeCfgWithNewPK)
    .OutputMode("append")
    .Start()
    .AwaitTermination();