********Generate Dataset********

In [38]:
import pandas as pd
import numpy as np
from faker import Faker

# -----------------------
# Setup
# -----------------------
fake = Faker()
np.random.seed(42)
Faker.seed(42)

ROWS = 50000
FRAUD_RATE = 0.035

merchant_categories = [
    "Grocery", "Fuel", "Electronics",
    "Travel", "Dining", "E-commerce"
]

transaction_types = ["Online", "POS", "ATM"]
countries = ["India", "USA", "UK", "UAE", "Singapore"]

records = []

# -----------------------
# Data Generation Loop
# -----------------------
for _ in range(ROWS):

    # Fraud flag
    is_fraud = np.random.choice([0, 1], p=[1 - FRAUD_RATE, FRAUD_RATE])

    # Transaction amount
    amount = (
        np.random.normal(2500, 1200)
        if is_fraud == 0
        else np.random.uniform(8000, 50000)
    )

    # Time gap (velocity)
    hours_gap = (
        np.random.exponential(10)
        if is_fraud == 0
        else np.random.uniform(0.1, 1.5)
    )

    # Foreign transaction flag
    foreign_txn = (
        np.random.choice([0, 1], p=[0.85, 0.15])
        if is_fraud == 0
        else np.random.choice([0, 1], p=[0.3, 0.7])
    )

    # Risk score
    risk_score = round(
        (amount / 50000) * 0.4 +
        (1 / max(hours_gap, 0.1)) * 0.4 +
        foreign_txn * 0.2,
        2
    )

    # Append record
    records.append({
        "transaction_id": fake.uuid4(),
        "transaction_date": fake.date_time_between("-6M", "now"),
        "customer_id": f"CUST-{np.random.randint(10000, 99999)}",
        "merchant_category": np.random.choice(merchant_categories),
        "transaction_amount": round(max(amount, 50), 2),
        "transaction_type": np.random.choice(transaction_types),
        "merchant_country": np.random.choice(countries),
        "hours_since_last_txn": round(hours_gap, 2),
        "is_foreign_transaction": foreign_txn,
        "fraud_risk_score": risk_score,
        "is_fraud": is_fraud
    })

# -----------------------
# Create DataFrame & Save
# -----------------------
df = pd.DataFrame(records)
df.to_csv("transactions.csv", index=False)

print(df.shape)
print(df["is_fraud"].value_counts(normalize=True))


(50000, 11)
is_fraud
0    0.96468
1    0.03532
Name: proportion, dtype: float64


*Customer Dataset*

In [39]:
import pandas as pd
import numpy as np
from faker import Faker

fake = Faker()
np.random.seed(42)

# --------------------------------
# Step 1: Transactions se unique customer_id nikaalna
# --------------------------------
# Kyun: Customers dataset ko transactions se link karna hai
customer_ids = df["customer_id"].unique()

# --------------------------------
# Step 2: Empty list banani (records store karne ke liye)
# --------------------------------
customer_records = []

# --------------------------------
# Step 3: Har customer ke liye profile banana
# --------------------------------
for cust_id in customer_ids:

    # Age generate karna (realistic banking range)
    age = np.random.randint(18, 70)

    # Income band decide karna (simple segmentation)
    income_band = np.random.choice(
        ["Low", "Medium", "High"],
        p=[0.4, 0.4, 0.2]   # zyada log low/medium income hote hain
    )

    # Credit score generate karna
    credit_score = (
        np.random.randint(300, 600) if income_band == "Low"
        else np.random.randint(550, 750) if income_band == "Medium"
        else np.random.randint(700, 900)
    )
    # Low income → low credit score, High income → better score

    # Customer since date (kab se bank ka customer hai)
    customer_since = fake.date_between(start_date="-10y", end_date="-1y")

    # Home country
    home_country = np.random.choice(
        ["India", "USA", "UK", "UAE", "Singapore"],
        p=[0.6, 0.15, 0.1, 0.1, 0.05]
    )

    # Risk segment derive karna (business-friendly label)
    if credit_score < 550:
        risk_segment = "High"
    elif credit_score < 700:
        risk_segment = "Medium"
    else:
        risk_segment = "Low"

    # --------------------------------
    # Step 4: Customer record append karna
    # --------------------------------
    customer_records.append({
        "customer_id": cust_id,
        "age": age,
        "income_band": income_band,
        "credit_score": credit_score,
        "customer_since": customer_since,
        "home_country": home_country,
        "risk_segment": risk_segment
    })

# --------------------------------
# Step 5: DataFrame banana
# --------------------------------
customers_df = pd.DataFrame(customer_records)

# --------------------------------
# Step 6: CSV me save karna
# --------------------------------
customers_df.to_csv("customers.csv", index=False)

# Quick sanity check
print(customers_df.shape)
customers_df.head()


(38413, 7)


Unnamed: 0,customer_id,age,income_band,credit_score,customer_since,home_country,risk_segment
0,CUST-54131,56,Medium,564,2020-02-10,USA,Medium
1,CUST-93104,38,Low,514,2020-02-28,India,High
2,CUST-28431,41,Low,451,2020-01-19,USA,High
3,CUST-90038,19,Medium,707,2022-06-30,India,Low
4,CUST-33483,38,Medium,607,2018-08-28,India,Medium


*Merchants Dataset*

In [40]:
import pandas as pd
import numpy as np

np.random.seed(42)

# --------------------------------
# Step 1: Transactions se unique merchant_category nikaalna
# --------------------------------
# Kyun: Merchants ko transactions se link karna hai
merchant_categories = df["merchant_category"].unique()

# --------------------------------
# Step 2: Empty list banani (merchant records store karne ke liye)
# --------------------------------
merchant_records = []

# --------------------------------
# Step 3: Har merchant category ke liye profile banana
# --------------------------------
for category in merchant_categories:

    # Average transaction value (category-specific behavior)
    avg_txn_value = np.random.uniform(500, 5000)
    # Grocery/Fuel jaise categories lower, Travel/Electronics higher ho sakte hain

    # Merchant country (mostly domestic, thoda foreign mix)
    merchant_country = np.random.choice(
        ["India", "USA", "UK", "UAE", "Singapore"],
        p=[0.55, 0.15, 0.1, 0.1, 0.1]
    )

    # Merchant risk score (0–1 scale)
    merchant_risk_score = round(
        (avg_txn_value / 5000) * 0.6 +
        np.random.uniform(0, 0.4),
        2
    )
    # High avg value + randomness = higher risk

    # --------------------------------
    # Step 4: Merchant record append karna
    # --------------------------------
    merchant_records.append({
        "merchant_category": category,
        "merchant_country": merchant_country,
        "avg_transaction_value": round(avg_txn_value, 2),
        "merchant_risk_score": merchant_risk_score
    })

# --------------------------------
# Step 5: DataFrame banana
# --------------------------------
merchants_df = pd.DataFrame(merchant_records)

# --------------------------------
# Step 6: CSV me save karna
# --------------------------------
merchants_df.to_csv("merchants.csv", index=False)

# Quick sanity check
print(merchants_df.shape)
merchants_df.head()


(6, 4)


Unnamed: 0,merchant_category,merchant_country,avg_transaction_value,merchant_risk_score
0,Electronics,Singapore,2185.43,0.56
1,Travel,India,3193.96,0.45
2,Fuel,UAE,761.38,0.33
3,E-commerce,India,3686.33,0.83
4,Dining,India,4245.99,0.58


*Left-Join*

In [52]:
import pandas as pd

# LEFT JOIN because every transaction is important
df_txn_customer = pd.merge(
    df,
    customers_df,
    on="customer_id",
    how="left"
)

# Check output
print(df_txn_customer.head())
print(df_txn_customer.shape)

final_df = pd.merge(
    df_txn_customer,
    merchants_df,
    on="merchants_ID",
    how="left"
)

# Final dataset preview
print(final_df.head())
print(final_df.shape)


                         transaction_id    transaction_date customer_id  \
0  bdd640fb-0667-4ad1-9c80-317fa3b1799d 2025-08-17 18:49:33  CUST-54131   
1  bc8960a9-23b8-41e9-b924-56de3eb13b90 2025-07-15 06:45:30  CUST-93104   
2  8b9d2434-e465-4150-bd9c-66b3ad3c2d6d 2025-07-12 06:15:38  CUST-28431   
3  07a0ca6e-0822-48f3-ac03-1199972a8469 2025-07-13 13:44:52  CUST-90038   
4  9a1de644-815e-46d1-bb8f-aa1837f8a88b 2025-06-30 12:45:46  CUST-33483   

  merchant_category  transaction_amount transaction_type merchant_country  \
0       Electronics             1165.74              POS        Singapore   
1            Travel            42962.59              POS        Singapore   
2            Travel             2882.68              ATM               UK   
3              Fuel             2470.65           Online              UAE   
4            Travel             2926.66           Online               UK   

   hours_since_last_txn  is_foreign_transaction  fraud_risk_score  is_fraud  \
0      

KeyError: 'merchants_ID'