# Cosmos DB in Fabric
# Credit Card Fraud Detection Sample – Part 1: Data Generation

This section demonstrates how to generate synthetic credit card data and transactions for use in a Cosmos DB container within Microsoft Fabric. The goal is to create realistic sample data that can later be used for fraud detection scenarios.

### Prerequisites
Before running this notebook, ensure you have:

- A **Cosmos DB artifact** created in Microsoft Fabric.
- Two containers:
    - **CCTransactions** – Stores credit card transaction records.

      Vector indexing policy:

      ```json
      {
        "path": "/embedding",
        "type": "DiskANN",
        "dimensions": 1536,
        "metric": "cosine",
        "quantizationByteSize": 4,
        "indexingSearchListSize": 128,
        "vectorIndexShardKey": ["/card_id"]
      }
      ```

      Data type: `float32`
    - **CreditCards** – Stores credit card details.
 
- An **OpenAI endpoint and key** for generating embeddings (placeholders will be used in this sample).
- Installed required Python packages.

### Install Required Packages ###

In [None]:
%pip install azure-core azure-cosmos
%pip install openai

### Imports and Configuration ###

Set up imports and define configuration values for Cosmos DB and OpenAI. Replace placeholder strings with your actual values when running in your environment.

In [None]:
# Imports
import base64, json
import openai
import os
import uuid
import random
import time
import math
import numpy as np
from datetime import datetime, timezone
from typing import Any, Optional, List, Dict, Tuple

from azure.cosmos import CosmosClient, PartitionKey, ThroughputProperties
from azure.core.credentials import TokenCredential, AccessToken

# Cosmos DB configuration
# Replace the two placeholders below before running; the container names must
# match the containers listed in the prerequisites.
COSMOS_ENDPOINT = '<COSMOS_ENDPOINT>' # The Cosmos DB artifact endpoint from the artifact settings
COSMOS_DATABASE_NAME = '<COSMOS_DATABASE_NAME>' # The Cosmos DB artifact name is the database name
COSMOS_TRANSACTION_CONTAINER_NAME = "CCTransactions"  # container that receives transaction documents
COSMOS_CC_CONTAINER_NAME = "CreditCards"  # container that receives credit card documents

# OpenAI configuration
# NOTE(review): presumably read by the openai client for Azure OpenAI endpoints;
# no endpoint or API key is configured in this file - confirm how they are supplied.
os.environ["OPENAI_API_VERSION"] = "2023-05-15"

# Embedding model name passed to openai.embeddings.create() in embed_text().
OPEN_AI_MODEL = "text-embedding-ada-002"



### Authentication Class ###

Use a custom credential class to authenticate securely with Cosmos DB using Fabric tokens.

In [None]:
## Authentication Class

class FabricTokenCredential(TokenCredential):
    """Token credential that passes Fabric-issued tokens to the Cosmos DB SDK.

    Each ``get_token`` call asks ``notebookutils`` for a fresh token for the
    Cosmos DB scope and reports the expiry found in the token's ``exp`` claim.
    This class does no caching and no retrying of its own.
    """

    # Scope always requested from Fabric; the scopes passed to get_token are ignored.
    _COSMOS_SCOPE = "https://cosmos.azure.com/.default"

    @staticmethod
    def _expiry_from_jwt(jwt: str) -> int:
        """Return the ``exp`` claim (as an int) from a JWT's payload segment.

        The signature is not verified; the payload is only decoded to read the
        expiry.

        Raises:
            ValueError: if the token has fewer than two dot-separated segments,
                the payload is not valid base64url/UTF-8/JSON, the payload is
                not a JSON object, or the ``exp`` claim is missing or not
                numeric.
        """
        parts = jwt.split(".")
        if len(parts) < 2:
            raise ValueError("Invalid JWT format")
        payload_b64 = parts[1]
        # JWT segments are base64url without padding; restore it for the decoder.
        payload_b64 += "=" * ((-len(payload_b64)) % 4)
        payload_json = base64.urlsafe_b64decode(payload_b64.encode("utf-8")).decode("utf-8")
        payload = json.loads(payload_json)
        if not isinstance(payload, dict):
            raise ValueError("JWT payload is not a JSON object")
        exp = payload.get("exp")
        if exp is None:
            raise ValueError("exp claim missing in token")
        # AccessToken.expires_on is an integer timestamp; coerce float/str claims.
        return int(exp)

    def get_token(self, *scopes: str, claims: Optional[str] = None, tenant_id: Optional[str] = None,
                  enable_cae: bool = False, **kwargs: Any) -> AccessToken:
        """Fetch a Cosmos DB access token from the Fabric notebook runtime.

        All arguments are accepted for interface compatibility and ignored.

        Returns:
            AccessToken carrying the raw token and its ``exp`` claim.

        Raises:
            ValueError: if the returned token cannot be parsed (see
                ``_expiry_from_jwt``).
        """
        # NOTE(review): ``notebookutils`` is not imported in this file; presumably
        # it is a global provided by the Fabric notebook runtime - confirm.
        access_token = notebookutils.credentials.getToken(self._COSMOS_SCOPE)
        return AccessToken(token=access_token, expires_on=self._expiry_from_jwt(access_token))

### Initialize Cosmos DB Clients ###

Create clients for the database and containers.

In [None]:
# Initialize the Cosmos DB account client, authenticating with Fabric tokens
COSMOS_CLIENT = CosmosClient(COSMOS_ENDPOINT, FabricTokenCredential())

# Initialize the database client for the configured database name
DATABASE_CLIENT = COSMOS_CLIENT.get_database_client(COSMOS_DATABASE_NAME)

# Initialize one container client per container:
# txns_container -> CCTransactions, cards_container -> CreditCards
txns_container = DATABASE_CLIENT.get_container_client(COSMOS_TRANSACTION_CONTAINER_NAME)
cards_container = DATABASE_CLIENT.get_container_client(COSMOS_CC_CONTAINER_NAME)


### Merchant and Location Data ###

Define sample merchants and U.S. states for transaction generation.

In [None]:
# Pool of merchant names that customer profiles are sampled from.
# The category comments are for readers only; the code treats this as a flat list.
merchants = [
    # Retail & E-Commerce
    "Amazon", "Walmart", "Target", "Best Buy", "Costco", "Home Depot", "Lowe's",
    "Macy's", "Nordstrom", "Kohl's", "eBay", "Wayfair", "Etsy", "AliExpress", "Shein",
    "Sam's Club", "BJ's Wholesale", "Bed Bath & Beyond",

    # Food & Beverage
    "Starbucks", "Dunkin'", "McDonald's", "Subway", "Chipotle", "Panera Bread",
    "Domino's Pizza", "Pizza Hut", "Chick-fil-A", "Burger King", "Taco Bell",
    "KFC", "Popeyes", "Shake Shack", "Five Guys",

    # Tech & Electronics
    "Apple Store", "Microsoft", "Google", "Samsung", "Dell", "HP", "Lenovo",
    "Sony", "Asus", "Acer", "Nvidia",

    # Fashion & Sports
    "Nike", "Adidas", "Under Armour", "Lululemon", "Zara", "H&M", "Gap",
    "Old Navy", "Uniqlo", "Forever 21", "Victoria's Secret",

    # Entertainment & Streaming
    "Netflix", "Hulu", "Disney+", "Spotify", "YouTube Premium", "Amazon Prime Video",
    "HBO Max", "Peacock", "Paramount+", "Apple TV+", "Crunchyroll",

    # Travel & Transport
    "Uber", "Lyft", "Airbnb", "Expedia", "Booking.com", "Delta Airlines",
    "United Airlines", "American Airlines", "Southwest Airlines", "Marriott",
    "Hilton", "Hyatt",

    # Financial & Services
    "PayPal", "Venmo", "Stripe", "Square", "Cash App", "Zelle",

    # Grocery & Pharmacy
    "Kroger", "Safeway", "Albertsons", "Publix", "Trader Joe's", "Whole Foods",
    "CVS", "Walgreens", "Rite Aid",

    # Home & Lifestyle
    "IKEA", "Ashley Furniture", "Crate & Barrel", "Williams Sonoma",

    # Luxury & Jewelry
    "Tiffany & Co.", "Cartier", "Rolex", "Gucci", "Louis Vuitton", "Prada"
]

# State names used as transaction locations: a profile's home state is drawn
# from this list, and _pick_location() draws the occasional away-from-home state.
US_STATES = [
    "Alabama","Alaska","Arizona","Arkansas","California","Colorado","Connecticut","Delaware","Florida","Georgia",
    "Hawaii","Idaho","Illinois","Indiana","Iowa","Kansas","Kentucky","Louisiana","Maine","Maryland","Massachusetts",
    "Michigan","Minnesota","Mississippi","Missouri","Montana","Nebraska","Nevada","New Hampshire","New Jersey",
    "New Mexico","New York","North Carolina","North Dakota","Ohio","Oklahoma","Oregon","Pennsylvania","Rhode Island",
    "South Carolina","South Dakota","Tennessee","Texas","Utah","Vermont","Virginia","Washington","West Virginia",
    "Wisconsin","Wyoming"
]

# Per-feature weights applied in make_embedding() before the amount, merchant
# and location parts are concatenated into one vector.
W_AMOUNT   = 0.2  # scales the normalized amount scalar
W_MERCHANT = 0.3  # scales the merchant text embedding
W_LOCATION = 0.5  # scales the location text embedding

### Embedding Helpers & Transaction Generation ###

This section adds utilities to **embed merchant, location, and amount signals**, compose them into a single vector, and **insert synthetic transactions** into the CCTransactions container. It also includes helpers to create **customer spending profiles** and **credit cards**, plus a **bulk generator** for customers and transactions.

#### Embedding Helper
Wraps a call to the embeddings API and returns a NumPy vector.

**What it does**

- Calls the embeddings endpoint with OPEN_AI_MODEL and returns float32 vectors.
- Keeps the function general so you can reuse it for merchants, locations, or any other categorical signal.

**Why this matters**

- Embeddings allow you to numerically represent text features (e.g., “Nike”, “California”) so they can be compared, clustered, or used downstream in anomaly detection.

In [None]:

# ─────────────────────────────────────────────
# Embedding helper
# ─────────────────────────────────────────────
def embed_text(text: str) -> np.ndarray:
    """Return the embedding of ``text`` as a float32 NumPy vector.

    Sends ``text`` to the embeddings API with the model named by
    ``OPEN_AI_MODEL`` and converts the first returned embedding to
    ``np.float32``.
    """
    response = openai.embeddings.create(input=text, model=OPEN_AI_MODEL)
    first_item = response.data[0]
    vector = np.array(first_item.embedding, dtype=np.float32)
    return vector

# ─────────────────────────────────────────────
# Combine embedding (amount + merchant + location)
# ─────────────────────────────────────────────
# Re-declares the feature weights (same values as the W_* constants set alongside
# the merchant/state data) so this cell can run without that one.
W_AMOUNT, W_MERCHANT, W_LOCATION = 0.2, 0.3, 0.5

def normalize_amount(amount: float, lo: float, hi: float) -> float:
    """Scale ``amount`` to [0, 1] relative to the band ``lo..hi``.

    Amounts at or below ``lo`` map to 0.0 and amounts at or above ``hi`` map to
    1.0. A band narrower than 1e-6 is treated as 1e-6 wide, which avoids a
    division by zero.
    """
    width = hi - lo
    if width < 1e-6:
        width = 1e-6
    fraction = (amount - lo) / width
    return float(np.clip(fraction, 0.0, 1.0))

# Raw text embeddings already fetched, keyed by (model name, text). Merchants
# and locations repeat across transactions, so this saves most embeddings API calls.
_EMBEDDING_CACHE: dict[tuple[str, str], np.ndarray] = {}


def _cached_embed_text(text: str) -> np.ndarray:
    """Return ``embed_text(text)``, fetching each distinct string once per model."""
    key = (OPEN_AI_MODEL, text)
    cached = _EMBEDDING_CACHE.get(key)
    if cached is None:
        cached = embed_text(text)
        _EMBEDDING_CACHE[key] = cached
    return cached


def make_embedding(merchant: str, location: str, amount: float, lo: float, hi: float) -> list:
    """Build the combined, L2-normalized feature vector for one transaction.

    The vector is the concatenation of
    ``[normalized amount * W_AMOUNT, merchant embedding * W_MERCHANT,
    location embedding * W_LOCATION]``, divided by its Euclidean norm when that
    norm is non-zero.

    Args:
        merchant: merchant name to embed.
        location: location name to embed.
        amount: transaction amount.
        lo: lower bound of the spend band used to normalize ``amount``.
        hi: upper bound of the spend band used to normalize ``amount``.

    Returns:
        The vector as a plain list of floats. Its length is 1 + 2 * d, where d
        is the length of a single text embedding, so it is longer than one text
        embedding on its own.
    """
    amount_norm = normalize_amount(amount, lo, hi)
    a_vec = np.array([amount_norm], dtype=np.float32) * W_AMOUNT
    # Multiplying builds new arrays, so the cached vectors are never modified.
    m_vec = _cached_embed_text(merchant) * W_MERCHANT
    l_vec = _cached_embed_text(location) * W_LOCATION
    combined = np.concatenate([a_vec, m_vec, l_vec]).astype(np.float32)
    norm = np.linalg.norm(combined)
    if norm > 0:
        combined /= norm
    return combined.tolist()


#### Combined Embedding: Amount + Merchant + Location

Creates a single vector by combining a scaled **amount** feature with **merchant** and **location** embeddings, then **L2-normalizes** the result.

**How it works**
- **normalize_amount** scales the transaction amount into **[0, 1]** using the **per-merchant** typical range lo..hi.
- **Weights** emphasize signal importance: amount (0.2), merchant (0.3), location (0.5).
- Concatenates [amount_scalar, merchant_embedding, location_embedding] and **L2-normalizes**.

**Dimensionality note**

- If your text embedding dimension is d, the final vector has size **1 + d + d**.
- Example: with 1536-dim embeddings → 3073 total dimensions.
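- If you plan to **index** this vector with Cosmos DB vector indexing, make sure the `dimensions` value in your **indexing policy** matches the length of the vector you store. The policy shown in the prerequisites declares 1536 dimensions, whereas the combined vector built here has 3073. Either update the policy to the combined length, project the vector down with PCA/UMAP, or use a single-text prompt like "merchant, location, amount bucket" to produce one embedding.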

In [None]:

# ─────────────────────────────────────────────
# Helpers for merchant profiles & sampling
# ─────────────────────────────────────────────

def _random_spend_range_for_merchant(name: str) -> tuple[float, float]:
    """
    Return a randomized ``(lo, hi)`` typical-spend band for a merchant.

    The lower-cased merchant name is matched against keyword groups in order;
    the first group with a keyword contained in the name supplies the base band,
    and names matching no group get a default band. The base band is then
    jittered with two draws from the module-level ``random`` generator and
    rounded to two decimals.
    """
    # (keywords, base lo, base hi) - checked top to bottom, first match wins.
    spend_bands = (
        (("airlines", "delta", "united", "american", "southwest", "marriott", "hilton", "hyatt",
          "apple store", "microsoft", "samsung", "ikea", "cartier", "rolex", "gucci", "tiffany",
          "prada", "louis vuitton"), 120.0, 600.0),
        (("uber", "lyft", "starbucks", "dunkin", "taco bell", "kfc", "mcdonald", "burger king",
          "subway", "popeyes", "chipotle", "five guys", "shake shack", "panera"), 5.0, 35.0),
        (("walmart", "target", "costco", "kroger", "safeway", "publix", "albertsons", "trader joe",
          "whole foods", "cvs", "walgreens", "rite aid"), 20.0, 180.0),
        (("netflix", "hulu", "disney", "spotify", "prime video", "youtube premium", "hbo",
          "peacock", "paramount", "apple tv"), 5.0, 40.0),
    )

    lowered = name.lower()
    base_lo, base_hi = 15.0, 250.0  # default band when no keyword matches
    for keywords, band_lo, band_hi in spend_bands:
        if any(keyword in lowered for keyword in keywords):
            base_lo, base_hi = band_lo, band_hi
            break

    # Jitter: shift the lower bound by up to +/-15% and rescale the upper bound
    # by a factor in [0.85, 1.20].
    shift = random.uniform(-0.15, 0.15)
    stretch = random.uniform(0.85, 1.20)
    low = max(1.0, base_lo * (1.0 + shift))  # never below 1.0
    high = max(low + 5.0, base_hi * stretch)  # keep the band at least 5.0 wide
    return round(low, 2), round(high, 2)


def _triangular_amount(lo: float, hi: float) -> float:
    """Draw an amount from a triangular distribution on [lo, hi] peaked at the midpoint.

    The draw uses the module-level ``random`` generator and is rounded to two
    decimals.
    """
    peak = (lo + hi) / 2.0
    draw = random.triangular(lo, hi, peak)
    return round(draw, 2)


def _pick_location(home_state: str, home_prob: float = 0.85) -> str:
    """Pick a transaction location, favouring the home state.

    With probability ``home_prob`` the home state is returned. Otherwise states
    are drawn from ``US_STATES`` until one differs from ``home_state``.
    """
    if random.random() < home_prob:
        return home_state
    # Rejection-sample: redraw until the state is not the home state.
    while True:
        candidate = random.choice(US_STATES)
        if candidate != home_state:
            return candidate


def build_customer_profile(available_merchants: list[str], available_states: list[str]) -> dict:
    """
    Build a spending profile of 3-5 distinct merchants for one customer.

    Each chosen merchant gets its own randomized spend band and its own home
    state drawn from ``available_states``. All draws use the module-level
    ``random`` generator.

    Returns: { merchant_name: {"home_state": str, "lo": float, "hi": float}, ... }
    """
    favourite_count = random.randint(3, 5)
    favourites = random.sample(available_merchants, favourite_count)

    profile = {}
    for merchant in favourites:
        band_lo, band_hi = _random_spend_range_for_merchant(merchant)
        state = random.choice(available_states)
        profile[merchant] = {"home_state": state, "lo": band_lo, "hi": band_hi}
    return profile




**Insert a Single Transaction**

Builds the transaction document (including the combined vector) and writes it to the CCTransactions container.

**Document shape (transactions)**

- id: unique UUID for the transaction.
- type: "transaction".
- card_id: credit-card identifier (used as **partition key** in the recommended model).
- customer_id: owning customer.
- merchant, location, amount: observable features.
- embedding: combined vector for semantic similarity or anomaly detection.
- timestamp: ISO 8601 timestamp taken from the notebook's clock when the document is built (not assigned by the server).


**Partitioning note**: This sample assumes **/card_id** as the partition key for transactions. Co-locating events for a single card simplifies per-card analytics.

In [None]:
# ─────────────────────────────────────────────
# Generate and insert a transaction
# ─────────────────────────────────────────────
def add_transaction(card_id: str, customer_id: str, merchant: str, location: str,
                    amount: float, lo: float, hi: float) -> None:
    """Build one transaction document and write it to the transactions container.

    Args:
        card_id: card identifier stored on the document.
        customer_id: identifier of the customer who owns the card.
        merchant: merchant name; stored and embedded.
        location: location name; stored and embedded.
        amount: transaction amount; stored and used in the embedding.
        lo: lower bound of the spend band used to normalize ``amount``.
        hi: upper bound of the spend band used to normalize ``amount``.

    Any exception raised while building the embedding or creating the item
    propagates to the caller.
    """
    emb = make_embedding(merchant, location, amount, lo, hi)
    doc = {
        "id": str(uuid.uuid4()),
        "type": "transaction",
        "card_id": card_id,
        "customer_id": customer_id,
        "merchant": merchant,
        "location": location,
        "amount": amount,
        "embedding": emb,
        # Timezone-aware UTC, so the value does not depend on the host's local
        # time zone; serialized with an explicit +00:00 offset.
        "timestamp": datetime.now(timezone.utc).replace(microsecond=0).isoformat()
    }
    txns_container.create_item(doc)

**Credit Card Helpers & Factory**

Produces **synthetic** but realistic-looking credit card items and identities to populate the CreditCards container.

**Modeling assumptions**
- CreditCards uses partition key /card_id, matching the card_id you store in transactions for co-location by card.
- These values are synthetic and for sample/demo purposes only.

In [None]:

# ─────────────────────────────────────────────
# Credit card helpers + factory (your function)
# ─────────────────────────────────────────────

def synthetic_card_number(rng: random.Random) -> str:
    """Return a synthetic 16-digit card number as four dash-separated 4-digit groups.

    Each group is an independent draw from ``rng`` in 0..9999, zero-padded.
    """
    groups = (f"{rng.randint(0, 9999):04d}" for _ in range(4))
    return "-".join(groups)

def synthetic_cvv(rng: random.Random) -> str:
    """Return a synthetic 3-digit security code ('000'..'999') drawn from ``rng``."""
    code = rng.randint(0, 999)
    return str(code).zfill(3)

def synthetic_expiration_date(rng: random.Random) -> str:
    """Return a synthetic expiry as 'MM/YYYY', one to five years after the current year.

    The month is drawn first, then the year offset, both from ``rng``.
    """
    month = rng.randint(1, 12)
    year = datetime.now().year + rng.randint(1, 5)
    return f"{month:02d}/{year}"

def create_credit_card(customer_id: str, card_id: str, rng: random.Random) -> Dict:
    """Build a synthetic credit card document.

    Args:
        customer_id: identifier of the customer who owns the card.
        card_id: card identifier; used as both the item ``id`` and ``card_id``.
        rng: random generator the synthetic number, security code and expiry
            are drawn from.

    Returns:
        The card document as a dict. The card starts "unlocked" with an empty
        lock reason, and ``last_updated`` and ``createdAt`` hold the same UTC
        timestamp.
    """
    # One timezone-aware clock read, so both timestamps agree and do not depend
    # on the host's local time zone.
    now_iso = datetime.now(timezone.utc).replace(microsecond=0).isoformat()
    return {
        "id": card_id,            # item id in CreditCards
        "type": "card",
        "card_id": card_id,        # PK (assumes /card_id partition key on CreditCards)
        "customer_id": customer_id,
        "card_number": synthetic_card_number(rng),  # synthetic
        "security_code": synthetic_cvv(rng),          # '000'..'999'
        "expiration_date": synthetic_expiration_date(rng),
        "status": "unlocked",
        "last_lock_reason": "",  # no lock reason
        "last_updated": now_iso,
        "createdAt": now_iso
    }


**Bulk Generation: Customers, Cards, and Transactions**

Creates N customers (IDs U0001, U0002, …), generates one card for each (IDs C0001, C0002, …), and writes a stream of transactions per customer using the profile-driven distributions.

**Behavior**

- Generates limit_customers customers and 3–5 favorite merchants each.
- Produces up to max_txns_per_customer per customer (randomized 60–100% of the cap).
- Skews locations toward a home state with probability home_prob (default 0.85).
- Uses sleep_between to throttle writes if needed (RU rate-limiting or embedding throughput).
- seed makes the generation deterministic for reproducibility.
- timestamp_spread_days is accepted for future enhancement (e.g., post-insert timestamp adjustments).

In [None]:
# ─────────────────────────────────────────────
# Main: create customers & cards, then generate txns
# ─────────────────────────────────────────────

def generate_new_customers_and_transactions(limit_customers: int = 10,
                                            max_txns_per_customer: int = 100,
                                            home_prob: float = 0.85,
                                            sleep_between: float = 0.0,
                                            seed: int | None = 42,
                                            timestamp_spread_days: int | None = None):
    """
    Create customers and their cards, then write transactions for each card.

    For every customer i in 1..limit_customers this builds ids ``U{i:04d}`` /
    ``C{i:04d}``, inserts the card into the CreditCards container, builds a
    merchant profile, and calls add_transaction once per generated transaction.
    Failures are printed as ``[WARN]`` lines and skipped: a customer whose card
    cannot be created gets no transactions, and a failed transaction does not
    stop the remaining ones. A summary line is printed at the end.

    Args:
      limit_customers: number of customers to create
      max_txns_per_customer: cap per customer (actual n is [60%, 100%] of cap);
                             a cap of 0 or less generates no transactions
      home_prob: probability a txn occurs in the merchant home state
      sleep_between: sleep (secs) after each successful insert to ease throttling
      seed: seed for deterministic synthetic data (None for non-deterministic).
            When set, the module-level ``random`` generator is seeded as well,
            because the profile, amount and location helpers draw from it.
            Document ids and timestamps are not covered by the seed.
      timestamp_spread_days: accepted for a future enhancement; currently unused,
                             every transaction is stamped by add_transaction.
    """
    rng = random.Random(seed) if seed is not None else random.Random()
    if seed is not None:
        # build_customer_profile, _triangular_amount and _pick_location use the
        # global generator, so seeding only ``rng`` would not make runs repeatable.
        random.seed(seed)

    total_cards = 0
    total_written = 0

    for i in range(1, limit_customers + 1):
        customer_id = f"U{i:04d}"
        card_id = f"C{i:04d}"
        card_doc = create_credit_card(customer_id, card_id, rng)

        # Insert card first
        try:
            cards_container.create_item(card_doc)
            total_cards += 1
        except Exception as e:
            print(f"[WARN] Failed to create card {card_id} for {customer_id}: {e}")
            continue

        # Build per-customer profile
        profile = build_customer_profile(merchants, US_STATES)
        m_names = list(profile.keys())

        # Decide how many transactions to generate for this customer.
        # randint(lo, hi) raises when lo > hi, which a cap below 1 would cause.
        if max_txns_per_customer > 0:
            n = rng.randint(max(1, math.ceil(max_txns_per_customer * 0.6)), max_txns_per_customer)
        else:
            n = 0

        # Generate transactions
        for _ in range(n):
            m = rng.choice(m_names)
            home_state = profile[m]["home_state"]
            lo = profile[m]["lo"]
            hi = profile[m]["hi"]
            amount = _triangular_amount(lo, hi)
            location = _pick_location(home_state, home_prob=home_prob)

            try:
                # add_transaction stamps the document with the current time itself
                add_transaction(
                    card_id=card_id,
                    customer_id=customer_id,
                    merchant=m,
                    location=location,
                    amount=amount,
                    lo=lo,
                    hi=hi
                )
                total_written += 1
            except Exception as e:
                print(f"[WARN] Failed to add txn for {customer_id} ({card_id}), merchant={m}: {e}")
                continue

            if sleep_between > 0:
                time.sleep(sleep_between)

    print(f"[INFO] Created {total_cards} cards and wrote {total_written} transactions.")



# Generate Transactions
# Populates both containers: 100 customers with one card each, and between
# 60 and 100 transactions per customer (cap of 100).
generate_new_customers_and_transactions(max_txns_per_customer=100, limit_customers=100)