In [21]:
# Install the Faker package; the %pip magic installs into the running kernel's environment.
%pip install faker

Note: you may need to restart the kernel to use updated packages.


In [22]:
import os
import random
from datetime import datetime, timedelta

import pandas as pd
from faker import Faker

# Seed both random sources so repeated runs produce the same synthetic values.
# Generated dates are still relative to the day the notebook is executed.
SEED = 42
random.seed(SEED)
Faker.seed(SEED)  # class-level call: seeds the generator shared by Faker instances

# Shared Faker instance used by the generator functions below.
fake = Faker()

# Generate synthetic data for order details
def generate_order_details(num_orders, num_customers):
    """Build the order header table with one row per order.

    Args:
        num_orders: Number of orders to generate. OrderID runs from 1 to
            num_orders inclusive.
        num_customers: Size of the customer pool. Each order gets a
            CustomerID drawn uniformly from 1..num_customers, so one
            customer may own several orders.

    Returns:
        pandas.DataFrame with the columns OrderID, CustomerID and OrderDate,
        where OrderDate is requested from Faker between one year ago and today.

    Raises:
        ValueError: If orders are requested but num_customers is below 1.
    """
    if num_orders > 0 and num_customers < 1:
        raise ValueError("num_customers must be at least 1 to generate orders")

    order_details_data = {
        "OrderID": list(range(1, num_orders + 1)),
        # Draw from the bounded customer pool so num_customers actually
        # controls how many distinct customers can appear.
        "CustomerID": [random.randint(1, num_customers) for _ in range(num_orders)],
        "OrderDate": [fake.date_between(start_date='-1y', end_date='today') for _ in range(num_orders)]
    }
    return pd.DataFrame(order_details_data)

# Generate synthetic data for order item details
def generate_order_item_details(num_orders, num_products):
    """Build the order line-item table with 1 to 5 rows per order.

    Args:
        num_orders: Number of orders; items are generated for OrderID
            1..num_orders inclusive.
        num_products: Size of the product catalogue. Each item gets a
            ProductID drawn uniformly from 1..num_products.

    Returns:
        pandas.DataFrame with the columns OrderID, ProductID, Quantity
        (1-10) and UnitPrice (10-100, rounded to 2 decimals). The columns
        are present even when no rows are generated.

    Raises:
        ValueError: If orders are requested but num_products is below 1.
    """
    columns = ["OrderID", "ProductID", "Quantity", "UnitPrice"]
    if num_orders > 0 and num_products < 1:
        raise ValueError("num_products must be at least 1 to generate order items")

    order_item_details_data = []
    for order_id in range(1, num_orders + 1):
        num_items = random.randint(1, 5)  # Random number of items per order
        for _ in range(num_items):
            order_item_details_data.append({
                "OrderID": order_id,
                # Draw from the bounded catalogue so num_products is honoured.
                "ProductID": random.randint(1, num_products),
                "Quantity": random.randint(1, 10),
                "UnitPrice": round(random.uniform(10, 100), 2)
            })
    # Passing columns explicitly keeps the schema stable for an empty result.
    return pd.DataFrame(order_item_details_data, columns=columns)

# Generate synthetic data for accounts
def generate_accounts_data(num_accounts):
    """Build the accounts table with one row per account.

    Args:
        num_accounts: Number of accounts to generate. Values below 1 yield
            an empty table.

    Returns:
        pandas.DataFrame with the columns AccountNumber (distinct 8-digit
        integers), AccountType ("Savings", "Checking" or "Credit") and
        Balance (100-10000, rounded to 2 decimals).

    Raises:
        ValueError: If more accounts are requested than there are 8-digit
            numbers (raised by random.sample).
    """
    count = max(num_accounts, 0)
    accounts_data = {
        # Sampling without replacement guarantees that AccountNumber is
        # unique; independent draws can repeat a number.
        "AccountNumber": random.sample(range(10_000_000, 100_000_000), count),
        "AccountType": [random.choice(["Savings", "Checking", "Credit"]) for _ in range(count)],
        "Balance": [round(random.uniform(100, 10000), 2) for _ in range(count)]
    }
    return pd.DataFrame(accounts_data)

# Generate synthetic data for transactions
def generate_transactions_data(num_transactions, account_numbers):
    """Build the transactions table with one row per transaction.

    Args:
        num_transactions: Number of transactions to generate.
        account_numbers: Non-empty sequence of account numbers; each
            transaction is assigned one of them at random.

    Returns:
        pandas.DataFrame with the columns TransactionID (Faker uuid4),
        AccountNumber, Amount (signed, -5000 to 5000, rounded to 2
        decimals), TransactionType and Date (a datetime within the 365 days
        before the call). TransactionType follows the sign of Amount:
        "Deposit" for amounts >= 0, "Withdrawal" for negative amounts.
    """
    amounts = [round(random.uniform(-5000, 5000), 2) for _ in range(num_transactions)]
    transactions_data = {
        "TransactionID": [fake.uuid4() for _ in range(num_transactions)],
        "AccountNumber": [random.choice(account_numbers) for _ in range(num_transactions)],
        "Amount": amounts,
        # Derive the type from the sign so a row can never be a negative
        # "Deposit" or a positive "Withdrawal".
        "TransactionType": ["Deposit" if amount >= 0 else "Withdrawal" for amount in amounts]
    }

    # Generate dates within the past year
    end_date = datetime.now()
    start_date = end_date - timedelta(days=365)
    dates = [fake.date_time_between(start_date=start_date, end_date=end_date) for _ in range(num_transactions)]
    transactions_data["Date"] = dates

    return pd.DataFrame(transactions_data)

# Generate synthetic data
# Table sizes for the synthetic dataset.
num_orders = 100000
num_customers = 50000
num_products = 200000
num_accounts = 100000
num_transactions = 500000

order_details_df = generate_order_details(num_orders, num_customers)
order_item_details_df = generate_order_item_details(num_orders, num_products)
accounts_df = generate_accounts_data(num_accounts)
# Pass the distinct account numbers as a plain Python list so random.choice
# works on an ordinary sequence rather than a NumPy array.
transactions_df = generate_transactions_data(
    num_transactions, accounts_df["AccountNumber"].unique().tolist()
)

# Write data to CSV files
# Create the output folder first: to_csv does not create missing directories.
OUTPUT_DIR = "Dataset"
os.makedirs(OUTPUT_DIR, exist_ok=True)
order_details_df.to_csv(f"{OUTPUT_DIR}/orderdetails.csv", index=False)
order_item_details_df.to_csv(f"{OUTPUT_DIR}/orderitemdetails.csv", index=False)
accounts_df.to_csv(f"{OUTPUT_DIR}/accounts.csv", index=False)
transactions_df.to_csv(f"{OUTPUT_DIR}/transactions.csv", index=False)

In [23]:
# Read the accounts CSV back from disk into a DataFrame.
df1 = pd.read_csv('Dataset/accounts.csv')

In [24]:
# Print a summary of the frame: columns, non-null counts, dtypes and memory usage.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   AccountNumber  100000 non-null  int64  
 1   AccountType    100000 non-null  object 
 2   Balance        100000 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 2.3+ MB
