## Overview

This project implements a **fraud detection system** that integrates **Azure Cosmos DB** with **Azure OpenAI embeddings**. It flags suspicious activity based on transaction patterns, geographical information, and vector similarity over embeddings generated through the Azure OpenAI API. The system geocodes each transaction's location, generates an embedding for the resulting coordinates, stores the transaction data together with that embedding in Cosmos DB, and runs vector-based queries to detect anomalous transactions.


## Prerequisites

To set up and run this project, the following Python packages are required:

1. **python-dotenv**: For loading environment variables from a `.env` file.
2. **openai**: To interact with the OpenAI API for generating embeddings.
3. **geopy**: For geocoding city names into latitude and longitude coordinates.
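4. **azure-cosmos**: For interacting with the Azure Cosmos DB service.
5. **pandas**: For displaying the search results as a table (install it with pip if it is not already present in your environment).
6. **numpy**: For averaging the stored location vectors (install it with pip if it is not already present in your environment).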

You can install these packages by running:


In [None]:
! pip install python-dotenv
! pip install openai
! pip install geopy
! pip install azure-cosmos

In [207]:
# Import the required libraries
import time
import os
import json
from dotenv import dotenv_values
from openai import OpenAI, AzureOpenAI
import pandas as pd

#Cosmos DB imports
from azure.cosmos import CosmosClient, PartitionKey, exceptions

## Environment Setup

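You need to set up a `.env` file that contains your connection details for Azure Cosmos DB and Azure OpenAI. The notebook reads the following variables from it: `NOSQL_URI`, `NOSQL_PRIMARY_KEY`, `AOAI_ENDPOINT`, `AOAI_KEY`, `API_VERSION`, `AOAI_EMBEDDING_DEPLOYMENT`, and `AOAI_EMBEDDING_DEPLOYMENT_MODEL`.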


In [98]:
# Name of the dotenv file holding the connection settings. Follow the
# example.env template and change this to your own .env file name.
env_name = "variable.env"
config = dotenv_values(env_name)

# --- Azure Cosmos DB (NoSQL) settings --------------------------------------
nosql_uri = config["NOSQL_URI"]
cosmos_key = config["NOSQL_PRIMARY_KEY"]
DATABASE_NAME = "fraud-nosql-db2"
CONTAINER_NAME = "fraud-nosql-cont"

# --- Azure OpenAI settings -------------------------------------------------
# Looked up in the same order as before, so a missing key still surfaces as a
# KeyError naming the first absent variable.
(
    openai_endpoint,
    openai_key,
    openai_api_version,
    openai_embeddings_deployment,
    openai_embeddings_model,
) = (
    config["AOAI_ENDPOINT"],
    config["AOAI_KEY"],
    config["API_VERSION"],
    config["AOAI_EMBEDDING_DEPLOYMENT"],
    config["AOAI_EMBEDDING_DEPLOYMENT_MODEL"],
)

# Client used for all database and container operations below.
cosmos_client = CosmosClient(url=nosql_uri, credential=cosmos_key)

# Client used to request embeddings from the Azure OpenAI resource.
azure_openai_embeddings = AzureOpenAI(
    azure_endpoint=openai_endpoint,
    api_key=openai_key,
    api_version=openai_api_version,
)

In [None]:
# Create the database on first run; on later runs the existing one is reused.
db = cosmos_client.create_database_if_not_exists(id=DATABASE_NAME)

# Read the database properties back and print them as JSON to confirm that the
# connection works.
properties = db.read()
print(json.dumps(properties))

In [90]:
# Vector embedding policy: declares that /locationVector holds a float32
# vector compared with the cosine distance function.
# NOTE(review): 1536 dimensions presumably matches the configured embedding
# model's output size - confirm against the deployment in use.
vector_embedding_policy = dict(
    vectorEmbeddings=[
        dict(
            path="/locationVector",
            dataType="float32",
            distanceFunction="cosine",
            dimensions=1536,
        ),
    ],
)

# Indexing policy: every path is included by default, while the _etag property
# and the contents of /locationVector are excluded; /locationVector is instead
# covered by a dedicated diskANN vector index.
indexing_policy = dict(
    includedPaths=[
        dict(path="/*"),
    ],
    excludedPaths=[
        dict(path='/"_etag"/?'),
        dict(path="/locationVector/*"),
    ],
    vectorIndexes=[
        dict(path="/locationVector", type="diskANN"),
    ],
)

In [None]:
# Create the transactions container (partitioned by /TenantId) with the
# indexing and vector embedding policies defined above, or reuse the existing
# container if one with this id is already present.
try:
    container = db.create_container_if_not_exists(
        id=CONTAINER_NAME,
        partition_key=PartitionKey(path="/TenantId"),
        indexing_policy=indexing_policy,
        vector_embedding_policy=vector_embedding_policy,
    )
except exceptions.CosmosResourceExistsError:
    # Report the container id. The previous code formatted the builtin `id`
    # function into the message instead of the container name.
    print(f"A container with id '{CONTAINER_NAME}' already exists")
    # Still bind `container` so the cells below can use the existing container.
    container = db.get_container_client(CONTAINER_NAME)
else:
    properties = container.read()
    print(f"Container with properties '{properties}' created")

In [92]:
def generate_embeddings(lat_lon):
    """Return the embedding vector for a latitude/longitude pair.

    The pair is serialised as the text ``"<lat>,<lon>"`` and sent to the
    embeddings endpoint of the module-level ``azure_openai_embeddings`` client.

    Args:
        lat_lon: Two-item sequence ``(latitude, longitude)``.

    Returns:
        The ``embedding`` entry of the first item in the response's ``data``.

    Raises:
        ValueError: If ``lat_lon`` is ``None`` or either coordinate is
            ``None`` (for example, after a failed geocoding lookup). Without
            this check the literal text ``"None,None"`` would be embedded and
            returned as if it were a real location.
    """
    if lat_lon is None or lat_lon[0] is None or lat_lon[1] is None:
        raise ValueError(f"Cannot generate embeddings for missing coordinates: {lat_lon!r}")

    lat_lon_str = f"{lat_lon[0]},{lat_lon[1]}"

    # The embeddings API takes text input, hence the string form above.
    # NOTE(review): `openai_embeddings_deployment` is read from the config but
    # never used; confirm that the model name is also the Azure deployment name.
    response = azure_openai_embeddings.embeddings.create(
        input=lat_lon_str,
        model=openai_embeddings_model,
    )
    embeddings = response.model_dump()

    time.sleep(0.5)  # To avoid API rate limits

    return embeddings['data'][0]['embedding']

In [93]:
from geopy.geocoders import Nominatim

def get_city_coordinates(city_name):
    """Geocode a place name into a ``(latitude, longitude)`` pair.

    Uses the Nominatim geocoder. Any failure is reported with ``print``
    rather than raised.

    Args:
        city_name: Name of the place to look up.

    Returns:
        ``(latitude, longitude)`` of the geocoding result, or ``(None, None)``
        when nothing is found or the lookup raises.
    """
    try:
        # Build a Nominatim geocoder and resolve the name in one step.
        match = Nominatim(user_agent="MyAPP").geocode(city_name)

        # Guard clause: the geocoder found nothing for this name.
        if not match:
            print(f"City '{city_name}' not found.")
            return None, None

        return match.latitude, match.longitude
    except Exception as e:
        print(f"Error occurred: {e}")
        return None, None



In [94]:
# Load the sample transaction records from the JSON data file.
# The context manager closes the file even if parsing fails, and the explicit
# encoding keeps the read independent of the platform's default.
with open(file="data/data_with_tenants.json", mode="r", encoding="utf-8") as data_file:
    data = json.load(data_file)

In [None]:
# Take a peek at one data item: pretty-print the record at index 4 to see
# which fields a transaction carries.
print(json.dumps(data[4], indent=2))

In [None]:
# Generate an embedding for each transaction's location and store the enriched
# record in the Cosmos DB container.
#
# Embeddings are cached per location so that repeated locations cost one
# geocoding request and one embeddings request instead of one per transaction.
# A failed lookup is cached as None so it is not retried for every record.
# NOTE(review): assumes item["Location"] is a string (hashable) - confirm.
location_vector_cache = {}

for item in data:
    transaction_id = item["TransactionID"]
    item['id'] = transaction_id
    location = item["Location"]

    if location not in location_vector_cache:
        lat, lon = get_city_coordinates(location)
        if lat is None or lon is None:
            location_vector_cache[location] = None
        else:
            location_vector_cache[location] = generate_embeddings((lat, lon))

    location_embeddings = location_vector_cache[location]
    if location_embeddings is None:
        # Without coordinates there is no meaningful vector to store; skip the
        # record rather than upserting a vector for a placeholder string.
        print(f"Skipping transaction {transaction_id}: no coordinates for location '{location}'.")
        continue

    item['locationVector'] = location_embeddings
    # NOTE(review): '@search.action' is not read anywhere in this notebook;
    # kept so the stored documents are unchanged - confirm it is needed.
    item['@search.action'] = 'upload'

    print("Creating embeddings for transaction:", transaction_id, end='\r')

    # Insert the item into the container (replacing any item with the same id)
    container.upsert_item(item)

In [201]:
import numpy as np

def get_average_location_vector(container, tenant_id):
    """Return the element-wise mean of a tenant's stored location vectors.

    Args:
        container: Object exposing ``query_items(query=..., parameters=...,
            enable_cross_partition_query=...)`` that yields dict-like rows.
        tenant_id: Value matched against ``c.TenantId``.

    Returns:
        A numpy array holding the per-dimension average of every
        ``locationVector`` returned for the tenant, or ``None`` when no row
        carries a vector.
    """
    # All of the tenant's transactions are fetched; the query applies no limit.
    sql_query = """
    SELECT c.locationVector
    FROM c
    WHERE c.TenantId = @tenant_id
    ORDER BY c.Timestamp DESC
    """

    # Parameters for the query
    parameters = [
        {"name": "@tenant_id", "value": tenant_id}
    ]

    # Execute the query to get the location vectors
    results = container.query_items(
        query=sql_query,
        parameters=parameters,
        enable_cross_partition_query=True
    )

    # Collect the vectors, skipping rows that have none (e.g. documents stored
    # without an embedding) instead of failing with a KeyError.
    vectors = [
        row["locationVector"]
        for row in results
        if row.get("locationVector") is not None
    ]

    # No usable vectors means there is no baseline for this tenant.
    if not vectors:
        return None

    # Average across transactions (axis 0), one value per vector dimension.
    return np.mean(np.array(vectors), axis=0)

In [202]:
def vector_search(current_location_vector, tenant_id, average_location_vector, amount, num_results=5):
    """Query a tenant's transactions against the current location vector.

    Runs against the module-level ``container``. A row is returned when both
    ``VectorDistance(c.locationVector, @current_location_vector)`` and
    ``VectorDistance(@current_location_vector, @average_location_vector)``
    exceed 0.1 and the row's ``TenantId`` matches.

    Args:
        current_location_vector: Embedding of the location being checked
            (list or numpy array).
        tenant_id: Value matched against ``c.TenantId``.
        average_location_vector: Baseline vector for the tenant (list or numpy
            array), or ``None`` when the tenant has no baseline.
        amount: Accepted for interface compatibility; the query does not
            filter on it.
            TODO(review): implement the intended amount filtering or drop it.
        num_results: Maximum number of rows to return.

    Returns:
        A list of result dicts, empty when there is no baseline vector.
    """
    # Without a baseline the second distance cannot be computed, so return
    # early instead of sending a null vector to the service.
    if average_location_vector is None:
        return []

    # Query parameters must be JSON-serialisable, so convert numpy arrays.
    if isinstance(current_location_vector, np.ndarray):
        current_location_vector = current_location_vector.tolist()
    if isinstance(average_location_vector, np.ndarray):
        average_location_vector = average_location_vector.tolist()

    # TOP @num_results applies the caller's limit; it used to be bound as a
    # parameter but never referenced. There is no ORDER BY, so which rows make
    # the cut is decided by the service.
    sql_query = """
 SELECT TOP @num_results
        c.TransactionID,
        c.Amount, 
        c.Timestamp, 
        c.Location, 
        c.Merchant,
        c.TenantId, 
        VectorDistance(c.locationVector, @current_location_vector) AS ProximityOfCurrentToLast,
        VectorDistance(@current_location_vector, @average_location_vector) AS ProximityOfAverageToLast
    FROM c
    WHERE 
        VectorDistance(c.locationVector, @current_location_vector) > 0.1
        AND VectorDistance(@current_location_vector, @average_location_vector) > 0.1
        AND c.TenantId = @tenant_id
    """

    # Only parameters that the query text actually references are bound.
    parameters = [
        {"name": "@num_results", "value": num_results},
        {"name": "@average_location_vector", "value": average_location_vector},
        {"name": "@current_location_vector", "value": current_location_vector},
        {"name": "@tenant_id", "value": tenant_id},
    ]

    results = container.query_items(
        query=sql_query,
        parameters=parameters,
        enable_cross_partition_query=True
    )

    return list(results)


In [205]:
def perform_search(tenant_id, city, query, amount):
    """Check a transaction location against a tenant's stored transactions.

    Computes the tenant's average location vector, embeds the coordinates of
    ``city``, and passes both to ``vector_search``.

    Args:
        tenant_id: Tenant whose transactions are searched.
        city: Place name to geocode and embed.
        query: Not used by the search; kept so existing callers keep working.
        amount: Forwarded to ``vector_search``.

    Returns:
        The list of rows returned by ``vector_search`` (at most five), or an
        empty list when the tenant has no stored location vectors.

    Raises:
        ValueError: If ``city`` cannot be geocoded. Searching with an
            embedding of missing coordinates would produce meaningless results.
    """
    average_location_vector = get_average_location_vector(container, tenant_id)
    if average_location_vector is None:
        # No baseline for this tenant; skip the geocoding and embedding calls.
        return []

    # Resolve the city before embedding so a failed lookup is reported clearly.
    lat, lon = get_city_coordinates(city)
    if lat is None or lon is None:
        raise ValueError(f"Could not determine coordinates for '{city}'.")

    # Generate embeddings for the current location
    current_location_vector = generate_embeddings((lat, lon))

    # Perform vector search
    return vector_search(
        current_location_vector,
        tenant_id,
        average_location_vector,
        amount,
        num_results=5,
    )

In [None]:
# Example run: look up tenant "10" for a transaction located in "Sweden".
# The merchant is passed as perform_search's `query` argument.
tenant_id, city, merchant, amount = "10", "Sweden", "Walmart", 1000

results = perform_search(tenant_id, city, query=merchant, amount=amount)

# Show the returned rows as a table.
print(pd.DataFrame(results))