### Setup

In [1]:
import openai, json, os, requests, time, csv, uuid
from openai import AzureOpenAI

from tenacity import retry, wait_random_exponential, stop_after_attempt  
from concurrent.futures import ThreadPoolExecutor
from functools import partial
from dotenv import load_dotenv
from cosmosdb_mongodb import insert_one_if_not_exists, create_index
from urllib.parse import quote
from pymongo import MongoClient

# Load settings through python-dotenv before reading them from the environment.
load_dotenv()

# Target environment variable -> source variable that must already be defined.
_AZURE_ENV_ALIASES = {
    "OPENAI_API_VERSION": "AZURE_OPENAI_API_VERSION",
    "azure_endpoint": "AZURE_OPENAI_ENDPOINT",
    "OPENAI_API_KEY": "AZURE_OPENAI_API_KEY",
    "OPENAI_EMBEDDINGS_MODEL_NAME": "AZURE_OPENAI_EMBEDDING_MODEL",
}

# Assigning None into os.environ raises an opaque "str expected, not NoneType"
# TypeError, so check up front and report every missing variable by name.
_missing_settings = [
    source for source in _AZURE_ENV_ALIASES.values() if os.getenv(source) is None
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_settings)
    )

os.environ["OPENAI_API_TYPE"] = "azure"
for _target, _source in _AZURE_ENV_ALIASES.items():
    os.environ[_target] = os.environ[_source]



### Helper functions

In [2]:
@retry(wait=wait_random_exponential(min=1, max=20), stop=stop_after_attempt(10))
def generate_embeddings(openai_client, text):
    """
    Return the embedding vector for `text` from the embeddings endpoint (OpenAI API v1.x).

    The model name is read from the AZURE_OPENAI_EMBEDDING_MODEL environment
    variable on every call. Retry behaviour is configured by the tenacity
    decorator above (random exponential wait, at most 10 attempts).
    """
    model_name = os.getenv("AZURE_OPENAI_EMBEDDING_MODEL")
    result = openai_client.embeddings.create(input=text, model=model_name)

    # A single input was sent, so the first entry of the response is used.
    return result.data[0].embedding

In [3]:
def init_cosmos():
    """
    Initialize the CosmosDB client, database, and collections

    Connection settings are read from the environment variables
    COSMOSDB_MONGODB_HOST, COSMOSDB_MONGODB_USERNAME, COSMOSDB_MONGODB_PASSWORD,
    COSMOSDB_MONGODB_DATABASE, COSMOSDB_MONGODB_PRODUCTS and
    COSMOSDB_MONGODB_CUSTOMERS.

    Returns:
        database: CosmosDB database
        products_collection: CosmosDB collection for products
        customers_collection: CosmosDB collection for customers

    Raises:
        ValueError: if any of the settings above is missing or empty.
    """
    setting_names = (
        'COSMOSDB_MONGODB_HOST',
        'COSMOSDB_MONGODB_USERNAME',
        'COSMOSDB_MONGODB_PASSWORD',
        'COSMOSDB_MONGODB_DATABASE',
        'COSMOSDB_MONGODB_PRODUCTS',
        'COSMOSDB_MONGODB_CUSTOMERS',
    )
    settings = {name: os.getenv(name) for name in setting_names}

    # Fail with the names of the missing settings instead of an unrelated
    # TypeError from quote(None) or from indexing the client with None.
    missing = [name for name, value in settings.items() if not value]
    if missing:
        raise ValueError(
            "Missing Cosmos DB settings (environment variables): " + ", ".join(missing)
        )

    host = settings['COSMOSDB_MONGODB_HOST']

    # Both credentials are embedded in the URI, so both must be percent-escaped;
    # reserved characters such as '@' or ':' would otherwise corrupt the URI.
    encoded_username = quote(settings['COSMOSDB_MONGODB_USERNAME'], safe='')
    encoded_password = quote(settings['COSMOSDB_MONGODB_PASSWORD'], safe='')

    connection_string = f'mongodb+srv://{encoded_username}:{encoded_password}@{host}/?tls=true&authMechanism=SCRAM-SHA-256&retrywrites=false&maxIdleTimeMS=120000'

    # in case of problems with SSL certificates, you might want to try with 'tlsAllowInvalidCertificates=True'
    # connection_string = f'mongodb+srv://{encoded_username}:{encoded_password}@{host}/?tls=true&tlsAllowInvalidCertificates=true&authMechanism=SCRAM-SHA-256&retrywrites=false&maxIdleTimeMS=120000'

    client = MongoClient(connection_string)

    database = client[settings['COSMOSDB_MONGODB_DATABASE']]
    products_collection = database[settings['COSMOSDB_MONGODB_PRODUCTS']]
    customers_collection = database[settings['COSMOSDB_MONGODB_CUSTOMERS']]

    return database, products_collection, customers_collection

In [4]:
def add_doc(openai_client, collection, doc):
    """ 
    Add document to Azure Cosmos DB for MongoDB vCore collection

    The document is serialised to JSON and stored in doc["textContent"], its
    embedding is stored in doc["vectorContent"], and the document is then
    passed to insert_one_if_not_exists. `doc` is modified in place.

    Errors are reported on stdout rather than raised, so a single failing
    document does not stop the other worker threads.
    """
    # Resolved outside the try block so the error message can name the document.
    doc_id = doc.get("id") if isinstance(doc, dict) else None
    try:
        # Serialise only the source fields. If this dict was already processed
        # (e.g. the cell is re-run on the same in-memory list) it still carries
        # the previous textContent/vectorContent, which must not be embedded.
        source_fields = {
            key: value
            for key, value in doc.items()
            if key not in ("textContent", "vectorContent")
        }
        doc["textContent"] = json.dumps(source_fields)
        doc["vectorContent"] = generate_embeddings(openai_client, doc["textContent"])
        insert_one_if_not_exists(collection, doc)
        print(doc["id"])
    except Exception as e:
        print(f"Failed to add document {doc_id}: {e}")

### Populate Cosmos DB for MongoDB from JSON files

In [5]:
# Init cosmos db: build the client via init_cosmos() and keep handles to the
# database and to the products/customers collections used by the cells below.
database, products_collection, customers_collection = init_cosmos()

In [6]:
# Insert products
# Decode explicitly as UTF-8 instead of relying on the platform's locale encoding.
with open('product.json', encoding='utf-8') as file:
    products = json.load(file)

openai_client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)

# Embed and insert the products on a small thread pool. add_doc reports its own
# errors, and leaving the `with` block waits for every submitted task to finish,
# so the count below runs only after all inserts were attempted.
with ThreadPoolExecutor(max_workers=5) as executor:
    for product in products:
        executor.submit(add_doc, openai_client, products_collection, product)

# count products
c = products_collection.count_documents({})
print(f"There are {c} products in the collection")

027D0B9A-F9D9-4C96-8213-C8546C4AAE71
0A7E57DA-C73F-467F-954F-17B7AFD6227E
08225A9E-F2B3-4FA3-AB08-8C70ADD6C3C2
201D0D79-81AD-43D2-AD6E-F09EEE6AC2D7
24BE4267-85D8-4C1A-B184-C08709495752
290B4594-95BE-47C5-863A-4EFAAFC0AED7
1A176FDB-D9A8-4888-BDD9-CE4F12E97AAE
14174164-F6C0-47FC-83FB-604C6A63408D
29663491-D2E9-47B4-83AE-D9459B6B5B67
2C981511-AC73-4A65-9DA3-A0577E386394
3F105575-8677-42F9-8E1F-76E4B450F136
3FE1A99E-DE14-4D11-B635-F5D39258A0B9
47C70E1E-E500-41B3-8615-DCCB963D9E35
44873725-7B3B-4B28-804D-963D2D62E761
4B0848F8-7BF5-4DB9-84A7-C4D69F2E3E8E
4E4B38CB-0D82-43E5-89AF-20270CD28A04
5089E32E-8A60-4117-AA98-5EF8AB9A61D1
52FAD88C-567E-469D-A35E-574EA3BF147F
5308BAE7-B0CB-4883-9A93-192CB10DC94F
5996B5E0-6EC7-4CB7-A924-7B5A053AE980
5BFADECD-2240-4480-9485-1256D1D60EA8
5B5E90B8-FEA2-4D6C-B728-EC586656FA6D
6E3AA511-67DF-4EAD-8F0C-4C9F91F7D335
6FB5B2D5-5725-4998-9B6C-2FF2B7A3E3E0
71BDFE67-6499-4A8E-9CCA-9E9AF7D92A7A
7BAA49C9-21B5-4EEF-9F6B-BCD6DA7C2239
7EA0EEEB-824E-42E9-B787-019219CE4466
8

In [7]:
# Insert customers
# Loaded into its own `customers` variable so the `products` list from the
# previous cell is not overwritten with customer records.
# Decode explicitly as UTF-8 instead of relying on the platform's locale encoding.
with open('customer.json', encoding='utf-8') as file:
    customers = json.load(file)

openai_client = AzureOpenAI(
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
)

# Embed and insert the customers on a small thread pool. add_doc reports its own
# errors, and leaving the `with` block waits for every submitted task to finish,
# so the count below runs only after all inserts were attempted.
with ThreadPoolExecutor(max_workers=5) as executor:
    for customer in customers:
        executor.submit(add_doc, openai_client, customers_collection, customer)

# count customers
c = customers_collection.count_documents({})
print(f"There are {c} customers in the collection")

29C95F8A-9C52-48DB-A1C4-8A14C430FF06
23A65A9A-479C-44D2-9F6A-E6CDA8B0BE08
0E57A241-1B95-43A2-BCFB-637608B0AD1A
34E7A125-0F66-4673-A80B-20B4C46EAD3A
022BB1FA-35E6-4CC5-9079-8EA61FE7FAAE
35D52474-3D1A-433C-A310-10FA7DF8950B
45E422FD-0AE2-4C73-8883-61B1C3BB4431
44A6D5F6-AF44-4B34-8AB5-21C5DC50926E
3945DB3E-2632-466C-BCBE-0C252729C937
00500AA1-3E9D-4E83-9C21-07B0AF482B3F
00E4B453-A79A-4590-8CCB-28ED95003CB7
0148B088-5124-45FB-B815-22851683D8AA
01AFA50A-2009-4AC3-A008-B6F8E2003BB6
00CC8882-4B98-4273-BDD3-732CB8F5A2E0
01D792C9-8C84-4120-B9FB-87C707D94B30
4FEAA310-61D4-4A89-8E78-3CA6B34F7934
0288E4FA-1C89-411E-801E-5098B0CDC414
029E366C-0023-40AB-9684-51F8BC734BCB
039896DA-698A-4C60-8A6B-083926A5C281
0379883C-C865-4D57-BF65-AF79267CE961
537E369C-C65B-4F23-B7C0-D07DFFFAC08B
04B0E36F-E776-4B93-89A2-3FDD59B4F0B5
06794E40-1A3E-49B6-9914-94438A2D881305223D68-4D04-4EF3-A4E2-BE7DEA6A8066

05F826D9-3526-463D-96CE-A2340DE8E554
06C02E4F-484B-48AF-8C97-B952FF4E1313
06CA14C2-29D5-4306-B9D1-8E804C358494
0