In [None]:
! pip install python-dotenv

! pip install openai

! pip install geopy

! pip install azure-cosmos

In [2]:
# Import the required libraries
import time
import os
import json
import uuid
from dotenv import dotenv_values
from openai import OpenAI, AzureOpenAI


#Cosmos DB imports
from azure.cosmos import CosmosClient

In [4]:
env_name = "variables.env" # following example.env template change to your own .env file name
config = dotenv_values(env_name)

nosql_uri=config["NOSQL_URI"]
cosmos_key = config['NOSQL_PRIMARY_KEY']
cosmos_database = "Account"
cosmos_container = "Transactions"

openai_endpoint = config['AOAI_ENDPOINT']
openai_key = config['AOAI_KEY']
openai_api_version = config['API_VERSION']
openai_embeddings_deployment = config['AOAI_EMBEDDING_DEPLOYMENT']
openai_embeddings_model = config['AOAI_EMBEDDING_DEPLOYMENT_MODEL']
#openai_embeddings_dimensions = int(config['openai_embeddings_dimensions'])

# Create the Azure Cosmos DB for NoSQL client
cosmos_client = CosmosClient(url=nosql_uri, credential=cosmos_key)

azure_openai_embeddings = AzureOpenAI(
    api_version=openai_api_version,
    api_key= openai_key,
    azure_endpoint= openai_endpoint,
)

In [5]:
#create database
DATABASE_NAME = "fraud-nosql-db"
db= cosmos_client.create_database_if_not_exists(
    id=DATABASE_NAME
)
properties = db.read()
print(json.dumps(properties))

{"id": "fraud-nosql-db", "_rid": "+jUIAA==", "_self": "dbs/+jUIAA==/", "_etag": "\"0000f500-0000-4700-0000-66ebf3df0000\"", "_colls": "colls/", "_users": "users/", "_ts": 1726739423}


In [6]:
vector_embedding_policy = {
    "vectorEmbeddings": [
        {
            "path":"/locationVector",
            "dataType":"float32",
            "distanceFunction":"cosine",
            "dimensions":1536
        },
    ]
}

indexing_policy = {
    "includedPaths": [
        {
            "path": "/*"
        }
    ],
    "excludedPaths": [
        {
            "path": "/\"_etag\"/?"
        },
        {
            "path": "/locationVector/*"
        }
    ],
    "vectorIndexes": [
        {"path": "/locationVector",
         "type": "diskANN"
        }
    ]
}

In [20]:
from azure.cosmos import PartitionKey, exceptions

CONTAINER_NAME = "fraud-nosql-cont"
try:    
    container = db.create_container_if_not_exists(
                    id=CONTAINER_NAME,
                    partition_key=PartitionKey(path="/TenantId"),
                    indexing_policy=indexing_policy,
                    vector_embedding_policy=vector_embedding_policy)

    properties = container.read()
    print('Container with properties \'{0}\' created'.format(properties))

except exceptions.CosmosResourceExistsError:
    print('A container with id \'{0}\' already exists'.format(id))

Container with properties '{'id': 'fraud-nosql-cont', 'indexingPolicy': {'indexingMode': 'consistent', 'automatic': True, 'includedPaths': [{'path': '/*'}], 'excludedPaths': [{'path': '/"_etag"/?'}, {'path': '/locationVector/*'}], 'vectorIndexes': [{'path': '/locationVector', 'type': 'diskANN'}]}, 'partitionKey': {'paths': ['/TenantId'], 'kind': 'Hash', 'version': 2}, 'conflictResolutionPolicy': {'mode': 'LastWriterWins', 'conflictResolutionPath': '/_ts', 'conflictResolutionProcedure': ''}, 'geospatialConfig': {'type': 'Geography'}, 'vectorEmbeddingPolicy': {'vectorEmbeddings': [{'path': '/locationVector', 'dataType': 'float32', 'dimensions': 1536, 'distanceFunction': 'cosine'}]}, '_rid': '+jUIAPV8yqA=', '_ts': 1726750562, '_self': 'dbs/+jUIAA==/colls/+jUIAPV8yqA=/', '_etag': '"00004201-0000-4700-0000-66ec1f620000"', '_docs': 'docs/', '_sprocs': 'sprocs/', '_triggers': 'triggers/', '_udfs': 'udfs/', '_conflicts': 'conflicts/'}' created


In [8]:
def generate_embeddings(lat_lon):
    lat_lon_str = f"{lat_lon[0]},{lat_lon[1]}"
    
    # Call OpenAI to generate embeddings (assuming text input is required)
    response = azure_openai_embeddings.embeddings.create(input=lat_lon_str, model=openai_embeddings_model)
    embeddings = response.model_dump()
    
    time.sleep(0.5)  # To avoid API rate limits
    
    return embeddings['data'][0]['embedding']

In [9]:
from geopy.geocoders import Nominatim

def get_city_coordinates(city_name):
    try:
        # Create a geolocator object using Nominatim service
        geolocator = Nominatim(user_agent="MyAPP")
        
        # Geocode the city name to get location details
        location = geolocator.geocode(city_name)
        
        if location:
            # Extract the latitude and longitude from the location object
            lat = location.latitude
            lon = location.longitude
            return lat, lon
        else:
            print(f"City '{city_name}' not found.")
            return None, None
    except Exception as e:
        print(f"Error occurred: {e}")
        return None, None



In [22]:
# Load text-sample_w_embeddings.json which has embeddings pre-computed
data_file = open(file="data/data_with_tenants.json", mode="r") 

data = json.load(data_file)
data_file.close()

In [23]:
# Take a peek at one data item
print(json.dumps(data[2], indent=2))

{
  "TransactionID": "T7356",
  "Amount": 360.73,
  "Timestamp": "2024-09-15 14:04:38",
  "Location": "New York",
  "Merchant": "Amazon",
  "Fraud": false,
  "TenantId": "1"
}


In [24]:
# Generate embeddings for each location and store data in cosmos db container
for item in data:
    transaction_id = item["TransactionID"]
    item['id'] = transaction_id
    location = item["Location"]
    location_coord = get_city_coordinates(location)
    location_embeddings = generate_embeddings(location_coord)
    item['locationVector'] = location_embeddings
    item['@search.action'] = 'upload'
   
    print("Creating embeddings for transaction:", transaction_id, end='\r')
    
    # Insert the item into the container
    container.upsert_item(item)    

Creating embeddings for transaction: T8612

In [15]:
import numpy as np

def get_average_location_vector(container = container, num_purchases=10):
    # SQL query to get the last 'num_purchases' transactions ordered by timestamp
    sql_query = """
    SELECT TOP @num_purchases c.locationVector
    FROM c
    ORDER BY c.Timestamp DESC
    """
    
    # Parameters for the query
    parameters = [
        {"name": "@num_purchases", "value": num_purchases}
    ]
    
    # Execute the query to get the location vectors
    results = container.query_items(
        query=sql_query,
        parameters=parameters,
        enable_cross_partition_query=True
    )
    
    # Collect the location vectors
    vectors = []
    for result in results:
        vectors.append(result['locationVector'])
    
    # If no vectors are found, return None
    if not vectors:
        return None
    
    # Convert the list of vectors into a numpy array
    vectors_np = np.array(vectors)
    
    # Calculate the element-wise average of the vectors
    avg_vector = np.mean(vectors_np, axis=0)
    
    return avg_vector


In [16]:
def vector_search( current_location_vector, average_location_vector, amount, num_results=5):

    if isinstance(current_location_vector, np.ndarray):
        current_location_vector = current_location_vector.tolist()
    if isinstance(average_location_vector, np.ndarray):
        average_location_vector = average_location_vector.tolist()
    # SQL query for vector similarity search combined with vector distances and amount filtering
    sql_query = """
    SELECT TOP @num_results 
        c.TransactionID,
        c.Amount, 
        c.timestamp, 
        c.location, 
        c.merchant, 
        VectorDistance(@average_location_vector, @current_location_vector) AS ProximityofCurrentToAverage
    FROM c
    WHERE 
        VectorDistance(@average_location_vector, @current_location_vector) > 0.1  -- Proximity to the current location vector
        AND c.Amount > @amount * 0.5 
        AND c.Amount < @amount * 2.0
    -- ORDER BY ProximityofCurrentToAverage
    """

    # Parameters for the SQL query
    parameters = [
        {"name": "@num_results", "value": num_results},    # Number of results
        {"name": "@average_location_vector", "value": average_location_vector},  # Use the average location vector
        {"name": "@current_location_vector", "value": current_location_vector},  # Use the current location vector
        {"name": "@amount", "value": amount},  # Transaction amount range filtering
    ]

    results = container.query_items(
        query=sql_query,
        parameters=parameters,
        enable_cross_partition_query=True
    )

    return list(results)


In [25]:
# Assuming you already computed the average location vector and current location vector
average_location_vector = get_average_location_vector(container)
current_location_vector = generate_embeddings(get_city_coordinates("Phoenix"))

# Query parameters
query = "Recent purchase at a grocery store"
amount = 100.0  # The current transaction amount

# Perform vector search
results = vector_search( current_location_vector, average_location_vector, amount, num_results=5)

for result in results:
    print(result)


{'TransactionID': 'T2243', 'Amount': 165.86, 'ProximityofCurrentToAverage': 0.7936539014827947}
{'TransactionID': 'T5109', 'Amount': 108.79, 'ProximityofCurrentToAverage': 0.7936539014827947}
{'TransactionID': 'T5751', 'Amount': 194.48, 'ProximityofCurrentToAverage': 0.7936539014827947}
{'TransactionID': 'T7623', 'Amount': 137.76, 'ProximityofCurrentToAverage': 0.7936539014827947}
{'TransactionID': 'T6345', 'Amount': 60.86, 'ProximityofCurrentToAverage': 0.7936539014827947}
