# Global Distribution with Azure Cosmos DB

In this notebook we will compare the read and write latency observed from this notebook, which runs in Central US, across three different Cosmos DB accounts.

First, you must create three Cosmos DB accounts with the following configurations.

- Account 0: Single-master, single region: East US 2.
- Account 1: Single-master, two regions: East US 2 (master replica) and Central US (read replica).
- Account 2: Multi-master, two regions: East US 2 (master replica) and Central US (master replica).


## Initialize Resources
In this cell we connect to the three accounts, put the clients into an array, and then iterate over the array, creating a database and a container in each account.

In [None]:
import logging
logger = logging.getLogger()
#logger.setLevel(logging.CRITICAL)

import azure.cosmos.documents as documents
import azure.cosmos.cosmos_client as cosmos
from azure.cosmos.partition_key import PartitionKey
import time

#
# Endpoint, key and connection policy for each of the three accounts.
# The notebook runs in Central US, so accounts 1 and 2 prefer their Central US
# replica, while account 0 only exists in East US 2.
acct0_uri = "<fill-me>"
acct0_key = "<fill-me>"
acct0_connection_policy = documents.ConnectionPolicy()
acct0_connection_policy.PreferredLocations = ["East US 2"]

acct1_uri = "<fill-me>"
acct1_key = "<fill-me>"
acct1_connection_policy = documents.ConnectionPolicy()
acct1_connection_policy.PreferredLocations = ["Central US"]

acct2_uri = "<fill-me>"
acct2_key = "<fill-me>"
acct2_connection_policy = documents.ConnectionPolicy()
acct2_connection_policy.PreferredLocations = ["Central US"]
acct2_connection_policy.UseMultipleWriteLocations = True

# accounts[i] is the client for account i; the later cells index into this
# list, so the order here must be account 0, 1, 2.
accounts = [
    cosmos.CosmosClient(
        url=uri,
        auth={'masterKey': key},
        consistency_level=documents.ConsistencyLevel.Eventual,
        connection_policy=policy,
    )
    for uri, key, policy in (
        (acct0_uri, acct0_key, acct0_connection_policy),
        (acct1_uri, acct1_key, acct1_connection_policy),
        (acct2_uri, acct2_key, acct2_connection_policy),
    )
]

db_name = "db1"
container_name  = "c1"
db_query = "select * from r where r.id = '{0}'".format(db_name)
container_query = "select * from r where r.id = '{0}'".format(container_name)

for account in accounts:
    # Create the database if it doesn't exist
    if list(account.query_databases(db_query)):
        print('Database already exists')
    else:
        account.create_database(id=db_name)
        print('Database created')
        # Brief pause before the new database is used
        # (presumably to let the creation settle; confirm whether it is needed).
        time.sleep(3)

    # (Re)create the container so every run starts from an empty container.
    db = account.get_database_client(db_name)
    containers = db.read_all_containers()
    if any(container['id'] == container_name for container in containers):
        db.delete_container(container_name)  # delete and recreate to clear out old data
        print('delete container')
    # Items are partitioned on their own id.
    pk = PartitionKey(path='/id', kind='Hash')
    db.create_container(container_name, pk)
    print('Container created')


## Pre-load Data
In this cell, pre-load Account 0 and Account 1 with 100 items to prepare them for a read-latency test.

In [None]:
import sys
import json
import random
import uuid

!{sys.executable} -m pip install Faker --user
from faker import Faker
fake = Faker()

# Container clients for account 0 and account 1, the two accounts that the
# read-latency cells query.
c0, c1 = (
    accounts[i].get_database_client(db_name).get_container_client(container_name)
    for i in (0, 1)
)

for _ in range(100):
    person = {
      "id": str(uuid.uuid4()),
      "name": fake.name(),
      "city": fake.city(),
      "state": fake.state(),
      "uid": random.randint(0,100)
    }
    # The very same document is written to both accounts.
    c0.create_item(body=person)
    c1.create_item(body=person)
print("Data load complete")

## Read Latency Benchmark Single-master, Single-region
This cell will benchmark the read latency for a Notebook running in Central US against a database with only a single read/write region in East US 2.

This test will first query the container to get the ids of its 100 items, then iterate through them and execute 100 point reads, measuring the latency and RU cost of each read. It will then print the average latency and the average RU charge per read at the end.

In [None]:
# Read-latency benchmark against account 0: point-read every item in the
# container, timing each read and recording its request charge.
c = accounts[0].get_database_client(db_name).get_container_client(container_name)

# Load the list of ids to point-read. The container is partitioned on /id,
# so each id is also its own partition key value.
sql = "SELECT value c.id FROM c"
ids = list(c.query_items(query=sql, enable_cross_partition_query=True))
if not ids:
    # Without this the averages below would fail with a bare ZeroDivisionError.
    raise RuntimeError("No items found in the container; run the 'Pre-load Data' cell first.")

latencies_ms = []  # latency of each read, in whole milliseconds
charges = []       # request charge (RU) of each read

for item_id in ids:
    # perf_counter is monotonic and high resolution, which suits short intervals.
    start = time.perf_counter()
    c.read_item(item=item_id, partition_key=item_id)
    end = time.perf_counter()
    latencies_ms.append(round((end - start) * 1000))
    charges.append(float(c.client_connection.last_response_headers['x-ms-request-charge']))

# Average over the fastest 99% of reads (rounded up), i.e. drop the slowest 1%.
# With 100 samples this keeps 99, and it scales if the container holds more.
latencies_ms.sort()
keep = (len(latencies_ms) * 99 + 99) // 100
latencies_ms = latencies_ms[:keep]
avgL = round(sum(latencies_ms) / len(latencies_ms))
avgR = round(sum(charges) / len(charges))

print("Read Latency for Single-Master, Single-Region: Reader in Central US, Read Replica in East US 2")
print("Average Read Latency (P99): " + str(avgL) + "ms, Average RU: " + str(avgR))

## Read Latency Benchmark Single-master, Multi-region
This cell will benchmark the read latency for a Notebook running in Central US against a database with a master replica in East US 2 and a read replica in Central US.

This test will first query the container to get the ids of its 100 items, then iterate through them and execute 100 point reads, measuring the latency and RU cost of each read. It will then print the average latency and the average RU charge per read at the end.

In [None]:
# Read-latency benchmark against account 1: point-read every item in the
# container, timing each read and recording its request charge.
c = accounts[1].get_database_client(db_name).get_container_client(container_name)

# Load the list of ids to point-read. The container is partitioned on /id,
# so each id is also its own partition key value.
sql = "SELECT value c.id FROM c"
ids = list(c.query_items(query=sql, enable_cross_partition_query=True))
if not ids:
    # Without this the averages below would fail with a bare ZeroDivisionError.
    raise RuntimeError("No items found in the container; run the 'Pre-load Data' cell first.")

latencies_ms = []  # latency of each read, in whole milliseconds
charges = []       # request charge (RU) of each read

for item_id in ids:
    # perf_counter is monotonic and high resolution, which suits short intervals.
    start = time.perf_counter()
    c.read_item(item=item_id, partition_key=item_id)
    end = time.perf_counter()
    latencies_ms.append(round((end - start) * 1000))
    charges.append(float(c.client_connection.last_response_headers['x-ms-request-charge']))

# Average over the fastest 99% of reads (rounded up), i.e. drop the slowest 1%.
# The container may hold more than 100 items if the write benchmark for this
# account has already been run, so the cut-off scales with the sample count.
latencies_ms.sort()
keep = (len(latencies_ms) * 99 + 99) // 100
latencies_ms = latencies_ms[:keep]
avgL = round(sum(latencies_ms) / len(latencies_ms))
avgR = round(sum(charges) / len(charges))

print("Read Latency for Single-Master, Multi-Region: Reader in Central US, Read Replica in Central US")
print("Average Read Latency (P99): " + str(avgL) + "ms, Average RU: " + str(avgR))

## Write Latency Benchmark Single-master, Multi-region

This cell will benchmark the write latency for a Notebook running in Central US against a database with its write replica in East US 2.

This test will first generate 100 items to insert, then iterate through the list and do 100 inserts, measuring the latency and RU cost of each write. It will then print the average write latency and the average RU charge per write at the end.

In [None]:
# Write-latency benchmark against account 1: insert 100 generated items,
# timing each insert and recording its request charge.
c = accounts[1].get_database_client(db_name).get_container_client(container_name)

fake = Faker()
print("create 100 items for test")
# Build all items up front so that data generation is not part of the timing.
items = [
    {
      "id": str(uuid.uuid4()),
      "name": fake.name(),
      "city": fake.city(),
      "state": fake.state(),
      "uid": random.randint(0,100)
    }
    for _ in range(100)
]

latencies_ms = []  # latency of each insert, in whole milliseconds
charges = []       # request charge (RU) of each insert

for item in items:
    # perf_counter is monotonic and high resolution, which suits short intervals.
    start = time.perf_counter()
    c.create_item(body=item)
    end = time.perf_counter()
    latencies_ms.append(round((end - start) * 1000))
    charges.append(float(c.client_connection.last_response_headers['x-ms-request-charge']))

# Average over the fastest 99% of writes (rounded up), i.e. drop the slowest 1%.
# With 100 samples this keeps 99; the cut-off scales if the item count changes.
latencies_ms.sort()
keep = (len(latencies_ms) * 99 + 99) // 100
latencies_ms = latencies_ms[:keep]
avgL = round(sum(latencies_ms) / len(latencies_ms))
avgR = round(sum(charges) / len(charges))

print("Write Latency for Single-Master, Multi-Region: Writer in Central US, Write Replica in East US 2")
print("Average Write Latency (P99): " + str(avgL) + "ms, Average RU: " + str(avgR))

## Write Latency Benchmark Multi-master, Multi-region

This cell will benchmark the write latency for a Notebook running in Central US against a multi-master database in East US 2 and Central US regions.

This test will first generate 100 items to insert, then iterate through the list and do 100 inserts, measuring the latency and RU cost of each write. It will then print the average write latency and the average RU charge per write at the end.

In [None]:
# Write-latency benchmark against account 2 (multi-master): insert 100
# generated items, timing each insert and recording its request charge.
c = accounts[2].get_database_client(db_name).get_container_client(container_name)

fake = Faker()
print("create 100 items for test")
# Build all items up front so that data generation is not part of the timing.
items = [
    {
      "id": str(uuid.uuid4()),
      "name": fake.name(),
      "city": fake.city(),
      "state": fake.state(),
      "uid": random.randint(0,100)
    }
    for _ in range(100)
]

latencies_ms = []  # latency of each insert, in whole milliseconds
charges = []       # request charge (RU) of each insert

for item in items:
    # perf_counter is monotonic and high resolution, which suits short intervals.
    start = time.perf_counter()
    c.create_item(body=item)
    end = time.perf_counter()
    latency = round((end - start) * 1000)
    ru = float(c.client_connection.last_response_headers['x-ms-request-charge'])
    latencies_ms.append(latency)
    charges.append(ru)
    # Printed after the timer has stopped, so it does not affect the measurement.
    print("Write Latency: " + str(latency) + "ms, RU: " + str(ru))

# Average over the fastest 99% of writes (rounded up), i.e. drop the slowest 1%.
# With 100 samples this keeps 99; the cut-off scales if the item count changes.
latencies_ms.sort()
keep = (len(latencies_ms) * 99 + 99) // 100
latencies_ms = latencies_ms[:keep]
avgL = round(sum(latencies_ms) / len(latencies_ms))
avgR = round(sum(charges) / len(charges))

print("Write Latency for Multi-Master, Multi-Region: Writer in Central US, Write Replica in Central US")
print("Average Write Latency (P99): " + str(avgL) + "ms, Average RU: " + str(avgR))
