In [72]:
import random
import datetime
from faker import Faker
import json

fake = Faker()

# Function to generate merchant profiles
def generate_merchant_profile(merchant_id):
    """Build one synthetic merchant profile.

    Args:
        merchant_id: Identifier stored unchanged under the "merchant_id" key.

    Returns:
        dict: Keys are merchant_id, business_name, business_type,
        registration_date (a date within the last five years), gst_status
        (bool) and average_ticket_size (float in [500, 10000], 2 decimals).
    """
    # Draw each field in the same order as the keys of the returned dict.
    company = fake.company()
    sector = random.choice(["Electronics", "Fashion", "Groceries"])
    registered_on = fake.date_between(start_date="-5y", end_date="today")
    has_gst = random.choice([True, False])
    ticket_size = round(random.uniform(500, 10000), 2)

    profile = dict(
        merchant_id=merchant_id,
        business_name=company,
        business_type=sector,
        registration_date=registered_on,
        gst_status=has_gst,
        average_ticket_size=ticket_size,
    )
    return profile

# Function to generate normal transactions
def generate_normal_transactions(merchant_id, num_txns=80):
    """Create a list of non-fraudulent transaction records for one merchant.

    Args:
        merchant_id: Identifier copied into every record.
        num_txns: Number of records to produce (default 80).

    Returns:
        list[dict]: One dict per transaction. Timestamps fall within the
        last 30 days, amounts lie in [50, 500], all three fraud flags are
        False and "label" is 0.
    """
    payment_options = ["Credit Card", "Debit Card", "UPI", "Net Banking"]
    category_options = ["Electronics", "Clothing", "Groceries"]
    platform_options = ["Web", "Mobile"]

    def _make_record():
        # Fields are filled in a fixed order so the key order of every record
        # (and therefore the column order downstream) stays the same.
        record = {"transaction_id": fake.uuid4(), "merchant_id": merchant_id}
        record["timestamp"] = fake.date_time_between(start_date="-30d", end_date="now")
        record["amount"] = round(random.uniform(50, 500), 2)  # Normal transaction amounts
        record["customer_id"] = fake.uuid4()
        record["device_id"] = fake.uuid4()
        record["customer_location"] = fake.city()
        record["payment_method"] = random.choice(payment_options)
        record["product_category"] = random.choice(category_options)
        record["platform"] = random.choice(platform_options)
        record["velocity_flag"] = False
        record["amount_flag"] = False
        record["time_flag"] = False
        record["label"] = 0  # Normal transaction
        return record

    return [_make_record() for _ in range(num_txns)]


# Function to inject late-night trading fraud
def inject_late_night_trading(transactions, config):
    """Rewrite transactions in place so they look like late-night fraud.

    Each transaction receives a timestamp between 23:00 and 04:00 (next
    morning) on a randomly chosen night within roughly the last 30 days,
    and is marked with ``time_flag = True`` and ``label = 1``.

    Previously every timestamp was drawn from the single window
    "23:00 today -> 04:00 tomorrow", which is normally in the future and
    puts all late-night fraud on one calendar date, unlike the normal
    traffic that spans the past 30 days.

    Args:
        transactions: List of transaction dicts; mutated in place.
        config: Unused; accepted for signature parity with the other
            ``inject_*`` functions.

    Returns:
        The same ``transactions`` list that was passed in.
    """
    # Loop-invariant values are computed once instead of once per transaction.
    today = datetime.datetime.now().date()
    window_seconds = 5 * 60 * 60  # 23:00 -> 04:00 is a five-hour window

    for txn in transactions:
        # Nights starting 2..30 days ago: the most recent one ends at 04:00
        # yesterday, so a generated timestamp can never lie in the future.
        night_date = today - datetime.timedelta(days=random.randint(2, 30))
        night_start = datetime.datetime.combine(night_date, datetime.time(23, 0))

        # Second-resolution offset inside the window (both ends inclusive).
        offset = datetime.timedelta(seconds=random.randint(0, window_seconds))

        txn['timestamp'] = night_start + offset
        txn['time_flag'] = True
        txn['label'] = 1  # Fraudulent transaction
    return transactions


# Function to inject high-velocity spikes fraud
def inject_high_velocity_spikes(transactions, config):
    """Collapse all given transactions onto one instant to mimic a burst.

    A single timestamp from the last 30 days is drawn once and written to
    every transaction, together with ``velocity_flag = True`` and
    ``label = 1``.

    Args:
        transactions: List of transaction dicts; mutated in place.
        config: Unused by this function.

    Returns:
        The same ``transactions`` list that was passed in.
    """
    burst_moment = fake.date_time_between(start_date="-30d", end_date="now")
    spike_fields = {
        'timestamp': burst_moment,
        'velocity_flag': True,
        'label': 1,  # Fraudulent transaction
    }
    for record in transactions:
        record.update(spike_fields)
    return transactions

# Function to inject customer concentration fraud
def inject_customer_concentration(transactions, config):
    """Rewrite transactions in place so they concentrate on a few customers.

    A pool of five customer ids is generated and every transaction is
    reassigned to one id from that pool, then marked with
    ``amount_flag = True`` and ``label = 1``.

    The earlier version only flagged a transaction when its existing
    ``customer_id`` already appeared in the freshly generated pool. Both
    sides are independent random UUIDs, so the condition never held and no
    transaction was ever flagged or labelled as fraud.

    Args:
        transactions: List of transaction dicts; mutated in place.
        config: Unused by this function.

    Returns:
        The same ``transactions`` list that was passed in.
    """
    concentration_customer_ids = [fake.uuid4() for _ in range(5)]  # 5 high-risk customers
    for txn in transactions:
        # Route the transaction through one of the high-risk customers so the
        # concentration pattern is actually present in the data.
        txn['customer_id'] = random.choice(concentration_customer_ids)
        txn['amount_flag'] = True
        txn['label'] = 1  # Fraudulent transaction
    return transactions

# Function to generate fraudulent transactions for a merchant
def generate_fraudulent_transactions(merchant_id, num_txns=20):
    """Generate fraud-pattern transactions for one merchant.

    ``num_txns`` is split across three patterns: late-night trading,
    high-velocity spikes and customer concentration. The default of 20
    yields the 6 / 7 / 7 split that used to be hard-coded; previously the
    ``num_txns`` argument was accepted but ignored.

    Args:
        merchant_id: Identifier copied into every generated transaction.
        num_txns: Total number of transactions to generate (default 20).
            Negative values are treated as 0.

    Returns:
        list[dict]: Late-night, then high-velocity, then customer-concentration
        transactions, concatenated in that order.
    """
    total = max(int(num_txns), 0)

    # One third (rounded down) goes to late-night; the remainder is split as
    # evenly as possible, with high-velocity taking the extra one if odd.
    late_night_count = total // 3
    remaining = total - late_night_count
    high_velocity_count = (remaining + 1) // 2
    concentration_count = remaining - high_velocity_count

    # Each group starts as ordinary transactions, generated in one call per
    # group, and is then rewritten by the matching injector.
    late_night_fraud = inject_late_night_trading(
        generate_normal_transactions(merchant_id, late_night_count),
        config=None
    )
    high_velocity_fraud = inject_high_velocity_spikes(
        generate_normal_transactions(merchant_id, high_velocity_count),
        config=None
    )
    customer_concentration_fraud = inject_customer_concentration(
        generate_normal_transactions(merchant_id, concentration_count),
        config=None
    )

    return late_night_fraud + high_velocity_fraud + customer_concentration_fraud

# Generate data for multiple merchants

# Seed both random sources so a fresh "Restart & Run All" draws the same
# random values. Timestamps are still computed relative to the current
# clock, so they shift with the day the notebook is run.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)

num_merchants = 100
merchant_profiles = [generate_merchant_profile(f"M{1000 + i}") for i in range(num_merchants)]
all_transactions = []

for profile in merchant_profiles:
    merchant_id = profile["merchant_id"]
    normal_data = generate_normal_transactions(merchant_id, num_txns=80)
    fraud_data = generate_fraudulent_transactions(merchant_id, num_txns=20)

    # Combine normal and fraud data for this merchant
    all_transactions.extend(normal_data + fraud_data)

# Shuffle the combined dataset so fraud rows are not grouped per merchant
random.shuffle(all_transactions)

# Saving is done in the next cell through pandas: the records hold datetime
# objects, which json.dump cannot serialise without conversion.

# Display the first 5 transactions (not the whole list, which would flood
# the cell output)
all_transactions[:5]

[{'transaction_id': '4b070b85-1f84-4cb8-b113-ceefa107649e',
  'merchant_id': 'M1025',
  'timestamp': datetime.datetime(2024, 11, 15, 1, 19, 34),
  'amount': 215.49,
  'customer_id': '174d4f19-c408-40b0-a605-c6dbaf661a1b',
  'device_id': '1c58be65-c7aa-47bb-8c0a-f67e4147c6ed',
  'customer_location': 'South Aaronburgh',
  'payment_method': 'Net Banking',
  'product_category': 'Clothing',
  'platform': 'Mobile',
  'velocity_flag': False,
  'amount_flag': False,
  'time_flag': False,
  'label': 0},
 {'transaction_id': '8a44ce71-2dcd-40be-8c05-594af0dadfbf',
  'merchant_id': 'M1040',
  'timestamp': datetime.datetime(2024, 11, 5, 23, 1, 43),
  'amount': 101.1,
  'customer_id': '77ae89f3-1e6d-43a7-bbea-b605f27b2b0e',
  'device_id': 'c116f041-64a9-4579-bab0-700a9c33c6d1',
  'customer_location': 'Heatherchester',
  'payment_method': 'Credit Card',
  'product_category': 'Electronics',
  'platform': 'Mobile',
  'velocity_flag': False,
  'amount_flag': False,
  'time_flag': False,
  'label': 0},
 

In [73]:
import pandas as pd
import json

# Build a tabular view of the generated transactions
df = pd.DataFrame(all_transactions)

# Normalise timestamps to ISO-style strings (second resolution) when present
if 'timestamp' in df.columns:
    parsed_ts = pd.to_datetime(df['timestamp'])
    df['timestamp'] = parsed_ts.dt.strftime('%Y-%m-%dT%H:%M:%S')

# Persist the frame: JSON Lines (one record per line) and CSV without the index
df.to_json("transaction_dataset.json", orient='records', lines=True)
df.to_csv("train.csv", index=False)

# Keep `transactions` bound to the same frame
transactions = df

# Show the frame as the cell result
df

Unnamed: 0,transaction_id,merchant_id,timestamp,amount,customer_id,device_id,customer_location,payment_method,product_category,platform,velocity_flag,amount_flag,time_flag,label
0,4b070b85-1f84-4cb8-b113-ceefa107649e,M1025,2024-11-15T01:19:34,215.49,174d4f19-c408-40b0-a605-c6dbaf661a1b,1c58be65-c7aa-47bb-8c0a-f67e4147c6ed,South Aaronburgh,Net Banking,Clothing,Mobile,False,False,False,0
1,8a44ce71-2dcd-40be-8c05-594af0dadfbf,M1040,2024-11-05T23:01:43,101.10,77ae89f3-1e6d-43a7-bbea-b605f27b2b0e,c116f041-64a9-4579-bab0-700a9c33c6d1,Heatherchester,Credit Card,Electronics,Mobile,False,False,False,0
2,6e32d150-de51-42e4-b6a8-790050c55fe2,M1073,2024-11-14T20:12:19,411.83,88a62f54-1f95-4913-a12d-085c1412274f,dec389b4-875a-4b0f-b199-a3457d5f8b0d,Michaelburgh,UPI,Electronics,Mobile,False,False,False,0
3,2ef25388-68e9-4fc3-ac02-3da9e61e5218,M1062,2024-11-28T23:05:43,363.12,dad807a7-de71-4542-bfa6-b5c1c86eccc6,189c9cf4-fd57-4a0c-aeb9-c02ee2b1adbc,South Paul,Net Banking,Electronics,Mobile,False,False,True,1
4,124ed179-74c0-431f-819e-82b3bd2f8de2,M1093,2024-11-06T19:18:51,127.04,c4466c5e-ee96-4f02-919d-98c28ea3c405,cf76f8f6-d3ff-4a2b-8b15-1b347f3e7490,Lake Jeremy,Net Banking,Groceries,Mobile,False,False,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,669847af-6a1c-4451-aea7-a3b44e287e76,M1045,2024-11-28T23:27:19,199.09,545b75a8-c1e6-460b-af67-9edf70386375,0d480170-0628-4965-99b2-da1f8fa52a71,Reginahaven,Credit Card,Electronics,Web,False,False,True,1
9996,16cfcf5d-dd54-4790-be2d-f5266ed777ff,M1029,2024-11-02T12:01:12,293.91,9ebdf1de-b926-4030-829a-cb2ba14c81a9,5c896174-af0d-404e-ba16-954bb75a7ee1,Cartershire,Debit Card,Electronics,Web,False,False,False,0
9997,e6a65211-5e66-4c26-88c3-67195f66d25c,M1035,2024-10-31T15:06:25,188.88,ee08ff3b-97a1-4855-acf3-5e32fecccecf,aace00a2-e4d2-4c15-ac05-672133cc91b0,Lake Darlene,Credit Card,Groceries,Web,False,False,False,0
9998,5d052618-12ba-49e3-917b-1b40dd0bc5b9,M1036,2024-11-04T23:02:16,68.68,48a5d616-4fa5-46b6-a123-4e514acc649d,3e54d84f-844a-4507-8dc5-63992bbea745,Pennyburgh,Net Banking,Clothing,Mobile,False,False,False,0
