In [1]:
import numpy as np
import pandas as pd
from datetime import datetime, timedelta
import random

In [12]:
import sys
from pathlib import Path

PROJECT_ROOT = Path().resolve().parent
sys.path.append(str(PROJECT_ROOT))

In [2]:
import os
os.getcwd()

'/Users/aishwarya/Desktop/Financial Intelligence/notebooks'

In [3]:
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
DATA_RAW = PROJECT_ROOT / "data" / "raw"

DATA_RAW.mkdir(parents=True, exist_ok=True)

In [4]:
np.random.seed(42)
random.seed(42)

In [5]:
NUM_USERS = 200
TXNS_PER_USER = 100

TRANSACTION_TYPES = ["payment", "transfer", "cashout", "deposit"]
LOCATIONS = ["IN", "US", "UK", "SG"]

In [6]:
user_profiles = {}

for user_id in range(1, NUM_USERS + 1):
    user_profiles[user_id] = {
        "avg_amount": np.random.uniform(200, 5000),
        "std_amount": np.random.uniform(50, 800),
        "active_start_hour": np.random.randint(6, 12),   # e.g. 6–11 AM
        "active_end_hour": np.random.randint(17, 23),    # e.g. 5–10 PM
        "devices": [f"device_{user_id}_{i}" for i in range(np.random.randint(1, 4))],
        "locations": random.sample(LOCATIONS, k=np.random.randint(1, 3))
    }

In [7]:
rows = []
transaction_id = 1
start_date = datetime(2024, 1, 1)

for user_id, profile in user_profiles.items():
    last_txn_time = start_date

    for _ in range(TXNS_PER_USER):
        is_fraud = np.random.rand() < 0.03  # ~3% fraud

        # Normal behavior
        amount = np.random.normal(profile["avg_amount"], profile["std_amount"])
        hour = np.random.randint(profile["active_start_hour"], profile["active_end_hour"])

        device = random.choice(profile["devices"])
        location = random.choice(profile["locations"])

        # Inject fraud patterns
        if is_fraud:
            fraud_type = random.choice([
                "amount_spike",
                "odd_hour",
                "new_device",
                "velocity",
                "new_location"
            ])

            if fraud_type == "amount_spike":
                amount *= np.random.uniform(3, 6)

            elif fraud_type == "odd_hour":
                hour = np.random.choice([0, 1, 2, 3, 4])

            elif fraud_type == "new_device":
                device = f"unknown_device_{transaction_id}"

            elif fraud_type == "new_location":
                location = random.choice(
                    list(set(LOCATIONS) - set(profile["locations"]))
                )

            elif fraud_type == "velocity":
                last_txn_time -= timedelta(minutes=np.random.randint(1, 5))

        timestamp = last_txn_time + timedelta(
            minutes=np.random.randint(10, 600)
        )
        timestamp = timestamp.replace(hour=hour)

        rows.append({
            "transaction_id": transaction_id,
            "user_id": user_id,
            "transaction_type": random.choice(TRANSACTION_TYPES),
            "amount": round(max(amount, 1), 2),
            "timestamp": timestamp,
            "device_id": device,
            "location": location,
            "is_fraud": int(is_fraud)
        })

        last_txn_time = timestamp
        transaction_id += 1

In [8]:
df = pd.DataFrame(rows)

df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)

df.to_csv(DATA_RAW / "transactions.csv", index=False)


In [9]:
df.head(200)

Unnamed: 0,transaction_id,user_id,transaction_type,amount,timestamp,device_id,location,is_fraud
0,1,1,deposit,2902.15,2024-01-01 16:55:00,device_1_0,IN,0
1,4,1,transfer,2194.25,2024-01-02 08:36:00,device_1_0,IN,0
2,2,1,cashout,1486.53,2024-01-02 10:24:00,device_1_0,IN,0
3,10,1,transfer,1479.51,2024-01-02 12:46:00,device_1_0,IN,0
4,7,1,payment,2839.44,2024-01-02 13:23:00,device_1_0,IN,0
...,...,...,...,...,...,...,...,...
195,195,2,payment,2349.12,2024-01-19 16:03:00,device_2_0,UK,0
196,197,2,deposit,2274.07,2024-01-19 19:17:00,device_2_2,IN,0
197,198,2,payment,2385.20,2024-01-20 20:27:00,device_2_2,IN,0
198,199,2,cashout,2195.43,2024-01-21 09:05:00,device_2_1,UK,0


In [19]:


from backend.app.features.feature_engineering import generate_features

In [20]:
RAW_PATH = "../data/raw/transactions.csv"

df_raw = pd.read_csv(RAW_PATH)

print("Raw shape:", df_raw.shape)
df_raw.head()

Raw shape: (20000, 8)


Unnamed: 0,transaction_id,user_id,transaction_type,amount,timestamp,device_id,location,is_fraud
0,1,1,deposit,2902.15,2024-01-01 16:55:00,device_1_0,IN,0
1,4,1,transfer,2194.25,2024-01-02 08:36:00,device_1_0,IN,0
2,2,1,cashout,1486.53,2024-01-02 10:24:00,device_1_0,IN,0
3,10,1,transfer,1479.51,2024-01-02 12:46:00,device_1_0,IN,0
4,7,1,payment,2839.44,2024-01-02 13:23:00,device_1_0,IN,0


In [21]:
df_processed = generate_features(df_raw)

print("Processed shape:", df_processed.shape)
df_processed.head()

Processed shape: (20000, 15)


Unnamed: 0,transaction_id,user_id,transaction_type,amount,timestamp,device_id,location,is_fraud,hour,day_of_week,time_since_last_txn,user_avg_amount,user_std_amount,amount_zscore,new_device
0,1,1,deposit,2902.15,2024-01-01 16:55:00,device_1_0,IN,0,16,0,0.0,2091.328,781.941911,1.035609,1
1,4,1,transfer,2194.25,2024-01-02 08:36:00,device_1_0,IN,0,8,1,56460.0,2091.328,781.941911,0.131455,0
2,2,1,cashout,1486.53,2024-01-02 10:24:00,device_1_0,IN,0,10,1,6480.0,2091.328,781.941911,-0.772469,0
3,10,1,transfer,1479.51,2024-01-02 12:46:00,device_1_0,IN,0,12,1,8520.0,2091.328,781.941911,-0.781435,0
4,7,1,payment,2839.44,2024-01-02 13:23:00,device_1_0,IN,0,13,1,2220.0,2091.328,781.941911,0.955514,0


In [22]:
# Check one user
sample_user = df_processed["user_id"].iloc[0]
df_processed[df_processed["user_id"] == sample_user][
    ["timestamp", "amount", "amount_zscore", "time_since_last_txn", "new_device"]
].head(10)

Unnamed: 0,timestamp,amount,amount_zscore,time_since_last_txn,new_device
0,2024-01-01 16:55:00,2902.15,1.035609,0.0,1
1,2024-01-02 08:36:00,2194.25,0.131455,56460.0,0
2,2024-01-02 10:24:00,1486.53,-0.772469,6480.0,0
3,2024-01-02 12:46:00,1479.51,-0.781435,8520.0,0
4,2024-01-02 13:23:00,2839.44,0.955514,2220.0,0
5,2024-01-02 13:48:00,2804.42,0.910785,1500.0,0
6,2024-01-02 13:55:00,1884.33,-0.264385,420.0,0
7,2024-01-02 15:32:00,1726.89,-0.465473,5820.0,0
8,2024-01-02 16:30:00,3234.32,1.459868,3480.0,0
9,2024-01-02 16:44:00,2661.44,0.728166,840.0,0


In [23]:
# PROCESSED_PATH = "../data/processed/transactions_features.csv"

# df_processed.to_csv(PROCESSED_PATH, index=False)

# print("Saved processed data to:", PROCESSED_PATH)

Saved processed data to: ../data/processed/transactions_features.csv


In [27]:
import pandas as pd
import numpy as np

df = df_processed.copy()

# --------------------------------
# Initialize fraud flag
# --------------------------------
df["is_fraud"] = 0

# --------------------------------
# 1. Velocity abuse
# --------------------------------
df.loc[df["time_since_last_txn"] < 120, "is_fraud"] = 1

# --------------------------------
# 2. Amount anomaly (user-relative)
# --------------------------------
df.loc[
    df["amount"] > (df["user_avg_amount"] + 3 * df["user_std_amount"]),
    "is_fraud"
] = 1

# --------------------------------
# 3. Time-based anomaly (user behavior window)
# --------------------------------
user_hour_stats = (
    df.groupby("user_id")["hour"]
    .agg(["mean", "std"])
    .reset_index()
)

df = df.merge(
    user_hour_stats,
    on="user_id",
    suffixes=("", "_user")
)

df.loc[
    abs(df["hour"] - df["mean"]) > 3 * df["std"],
    "is_fraud"
] = 1

df.drop(columns=["mean", "std"], inplace=True)

# --------------------------------
# 4. New device detection
# --------------------------------
df["new_device"] = False

for user_id, group in df.groupby("user_id"):
    seen_devices = set()
    for idx, row in group.iterrows():
        if row["device_id"] not in seen_devices:
            df.loc[idx, "new_device"] = True
            seen_devices.add(row["device_id"])

df.loc[df["new_device"], "is_fraud"] = 1

# --------------------------------
# 5. New location detection
# --------------------------------
df["new_location"] = False

for user_id, group in df.groupby("user_id"):
    seen_locations = set()
    for idx, row in group.iterrows():
        if row["location"] not in seen_locations:
            df.loc[idx, "new_location"] = True
            seen_locations.add(row["location"])

df.loc[df["new_location"], "is_fraud"] = 1

# Cleanup helper columns
df.drop(columns=["new_device", "new_location"], inplace=True)

In [28]:
df["is_fraud"].value_counts()

is_fraud
0    18473
1     1527
Name: count, dtype: int64

In [29]:
df[df["is_fraud"] == 1].head()

Unnamed: 0,transaction_id,user_id,transaction_type,amount,timestamp,device_id,location,is_fraud,hour,day_of_week,time_since_last_txn,user_avg_amount,user_std_amount,amount_zscore
0,1,1,deposit,2902.15,2024-01-01 16:55:00,device_1_0,IN,1,16,0,0.0,2091.328,781.941911,1.035609
37,41,1,cashout,1971.33,2024-01-08 08:50:00,device_1_0,US,1,8,0,45120.0,2091.328,781.941911,-0.153266
54,53,1,transfer,1530.21,2024-01-10 16:07:00,device_1_0,IN,1,16,2,0.0,2091.328,781.941911,-0.716679
57,58,1,deposit,1516.32,2024-01-10 19:06:00,device_1_0,IN,1,19,2,0.0,2091.328,781.941911,-0.73442
67,68,1,payment,2303.32,2024-01-14 20:58:00,device_1_0,IN,1,20,6,0.0,2091.328,781.941911,0.270763


In [31]:
df.to_csv("../data/processed/transactions_with_features.csv", index=False)