In [1]:
#Generating transactions using a stochastic payment simulation to replicate UPI behavior and fraud patterns.
import pandas as pd
import numpy as np
from faker import Faker
import random

# Faker instance for generating fake values.
# NOTE(review): `fake` is not referenced by any of the visible cells — confirm whether it is still needed.
fake = Faker()

# Simulation size: number of distinct users and number of transactions to draw.
n_users = 2000
n_txns = 50000

# Seed BOTH random sources used below (the stdlib `random` module and NumPy's
# legacy global RNG) so that Restart & Run All regenerates the same dataset.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# create users
users = [f"user_{i}" for i in range(n_users)]

# One list per transaction, in the same order as the DataFrame columns below.
data = []

for i in range(n_txns):
    # Pick two distinct users; re-draw the receiver until it differs from the sender.
    sender = random.choice(users)
    receiver = random.choice(users)
    while receiver == sender:
        receiver = random.choice(users)

    # Exponentially distributed amount with mean 1500 (many small, few large).
    amount = np.random.exponential(scale=1500)

    # Hour of day, uniform over 0..23 inclusive.
    hour = random.randint(0,23)

    # fraud logic (rare but patterned)
    # - large (> 8000) transfers between hours 0 and 4 are flagged with probability 0.4
    # - everything else (including large late-night transfers that were not
    #   flagged above) is flagged with a background probability of 0.002
    # The short-circuiting `and` means random.random() is only drawn when the
    # amount/hour conditions hold; keep this order to preserve the random stream.
    fraud = 0
    if amount > 8000 and hour in [0,1,2,3,4] and random.random() < 0.4:
        fraud = 1
    elif random.random() < 0.002:
        fraud = 1

    data.append([
        i,
        sender,
        receiver,
        round(amount,2),
        hour,
        fraud
    ])

df = pd.DataFrame(data, columns=[
    "transaction_id",
    "sender_id",
    "receiver_id",
    "amount",
    "hour",
    "fraud_flag"
])

df.head()


Unnamed: 0,transaction_id,sender_id,receiver_id,amount,hour,fraud_flag
0,0,user_549,user_152,893.2,12,0
1,1,user_1910,user_1360,750.37,5,0
2,2,user_328,user_1607,859.73,6,0
3,3,user_1841,user_180,791.96,16,0
4,4,user_701,user_453,1107.5,17,0


In [3]:
# Persist the simulated transactions as CSV in the working directory;
# index=False keeps the pandas row index out of the file.
df.to_csv(path_or_buf="transactions_raw.csv", index=False)

In [4]:
import pandas as pd

# Reload the raw transactions from the CSV written earlier and preview the first rows.
tx = pd.read_csv(filepath_or_buffer="transactions_raw.csv")
tx.head(n=5)

Unnamed: 0,transaction_id,sender_id,receiver_id,amount,hour,fraud_flag
0,0,user_549,user_152,893.2,12,0
1,1,user_1910,user_1360,750.37,5,0
2,2,user_328,user_1607,859.73,6,0
3,3,user_1841,user_180,791.96,16,0
4,4,user_701,user_453,1107.5,17,0


In [5]:
#Creating the users table
# sender activity stats: one row per distinct sender_id found in `tx`.
# Users that appear only as receivers get no row here.
sender_stats = tx.groupby("sender_id").agg(
    total_sent=("amount","sum"),          # total amount sent by the user
    avg_amount=("amount","mean"),         # mean amount per sent transaction
    txn_count=("transaction_id","count"), # number of sent transactions
    active_hours=("hour","nunique")       # number of distinct hours with activity
).reset_index()

# simulate account age
import numpy as np
# Use a dedicated, seeded Generator so the simulated ages are identical on
# every run and NumPy's global random state is left untouched.
ACCOUNT_AGE_SEED = 42
account_age_rng = np.random.default_rng(ACCOUNT_AGE_SEED)
# Uniform integer ages in days, lower bound inclusive and upper bound exclusive: [30, 900).
sender_stats["account_age_days"] = account_age_rng.integers(30, 900, size=len(sender_stats))

# Expose the table keyed by "user_id" instead of "sender_id".
users = sender_stats.rename(columns={"sender_id":"user_id"})
users.head()

Unnamed: 0,user_id,total_sent,avg_amount,txn_count,active_hours,account_age_days
0,user_0,40192.49,1385.947931,29,16,32
1,user_1,26770.08,1115.42,24,18,719
2,user_10,34983.07,1128.486129,31,15,838
3,user_100,39746.93,1419.533214,28,15,810
4,user_1000,46378.68,1656.381429,28,16,269


In [6]:
#Creating the behaviour signals table
# Work on a copy so the signal columns are not added to `tx` itself.
behavior = tx.copy()

# 1 when the transaction hour is between 0 and 4 (both inclusive), else 0.
behavior["late_night_txn"] = behavior["hour"].between(0,4).astype(int)
# 1 when the amount is strictly above the 95th percentile of all amounts, else 0.
behavior["high_amount"] = (behavior["amount"] > behavior["amount"].quantile(0.95)).astype(int)

# deviation from user's normal amount
# Ratio of each amount to its sender's average amount. The lookup is vectorised:
# Series.map yields NaN for senders missing from `avg_map`, and fillna(1) keeps
# the fallback divisor of 1 for those senders.
avg_map = users.set_index("user_id")["avg_amount"].to_dict()
behavior["amount_deviation"] = (
    behavior["amount"] / behavior["sender_id"].map(avg_map).fillna(1)
)

# Keep only the identifier, signal and label columns.
behavior = behavior[[
    "transaction_id",
    "sender_id",
    "late_night_txn",
    "high_amount",
    "amount_deviation",
    "fraud_flag"
]]

behavior.head()

Unnamed: 0,transaction_id,sender_id,late_night_txn,high_amount,amount_deviation,fraud_flag
0,0,user_549,0,0,0.520928,0
1,1,user_1910,0,0,0.371294,0
2,2,user_328,0,0,0.495553,0
3,3,user_1841,0,0,0.789538,0
4,4,user_701,0,0,0.813475,0


In [None]:
# Export the derived tables to CSV.
from pathlib import Path

# DataFrame.to_csv does not create missing parent directories, so make sure
# "data/" exists before writing into it (no-op when it is already there).
Path("data").mkdir(parents=True, exist_ok=True)

users.to_csv("data/users.csv", index=False)
behavior.to_csv("data/behavior_signals.csv", index=False)
# NOTE(review): this file is written to the working directory, not "data/"
# like the two above — confirm that is intended.
tx.to_csv("transactions.csv", index=False)
