In [1]:
# FinForensix Generator:

# Import necessary libraries
from google.colab import drive
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Function to generate names and genders
def generate_name_gender(name_list):
    """Map each full name to a gender code derived from its first name.

    Args:
        name_list: Iterable of full-name strings. The first
            whitespace-separated token of each entry is used as the first name.

    Returns:
        dict mapping every name to 'M' when its first name is in the known
        male-name set and to 'F' otherwise, in the order of ``name_list``.

    Raises:
        IndexError: If an entry is empty or contains only whitespace.
    """
    # Any first name missing from this set is classified as 'F', so the set
    # has to cover every male first name that appears in the customer roster.
    male_names = {
        'Anthony', 'Arthur', 'Bobby', 'Bruce', 'Charles', 'Douglas', 'Frank',
        'Gary', 'Gregory', 'Henry', 'James', 'Jason', 'Jeffrey', 'Jeremy',
        'Jerry', 'Jesse', 'John', 'Jordan', 'Kyle', 'Lawrence', 'Noah', 'Paul',
        'Robert', 'Russell', 'Ryan', 'Samuel', 'Scott', 'Sean', 'Stephen',
        'Steven', 'Thomas', 'Vincent', 'Walter', 'Wayne', 'William'
    }
    return {
        name: 'M' if name.split()[0] in male_names else 'F'
        for name in name_list
    }

# Function to generate a unique invoice number
def generate_unique_invoice_number(existing_numbers, length=10):
    """Return a random invoice number that is not in ``existing_numbers``.

    The number is ``length`` characters drawn (with replacement) from the
    uppercase letters A-Z and the digits 0-9. Candidates are redrawn until one
    is found that is absent from ``existing_numbers``.

    The collection is only read, never modified: the caller is responsible for
    recording the returned number. The function does not return if every
    possible number of the requested length is already taken.
    """
    alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'
    candidate = ''.join(random.choices(alphabet, k=length))
    # Redraw on collision until an unused number turns up.
    while candidate in existing_numbers:
        candidate = ''.join(random.choices(alphabet, k=length))
    return candidate


def benford_law_numbers(count, min_amount, max_amount):
    """Return ``count`` round amounts whose leading digit follows Benford's Law.

    Each amount is ``d * 10**k``: ``d`` is a digit 1-9 drawn with probability
    ``log10(1 + 1/d)`` and ``k`` is the order of magnitude of a value sampled
    uniformly between ``min_amount`` and ``max_amount``. The result is a NumPy
    array rounded to two decimals; it is not clamped to the given range.
    """
    leading_digits = np.arange(1, 10)
    benford_weights = np.log10(1 + 1/leading_digits)

    chosen_digits = np.random.choice(leading_digits, size=count, p=benford_weights)

    # The uniform sample only decides the power of ten for each amount.
    amount_span = max_amount - min_amount
    magnitude_samples = np.random.uniform(low=0, high=1, size=count) * amount_span + min_amount
    exponents = np.floor(np.log10(magnitude_samples))

    return np.round(chosen_digits * 10**exponents, 2)

def generate_invoice_data(num_invoices, start_date, end_date, min_amount, max_amount, name_list):
    """Generate synthetic invoices, seeding fraud patterns for two customers.

    Every iteration creates one primary invoice for a randomly chosen
    customer. Regular customers get a Benford-style base amount plus a random
    offset. The two hard-coded fraudsters get a uniformly random amount (a
    whole-dollar figure half of the time) and, each with 50% probability:

    * a repeat of the same invoice under a new invoice number, and/or
    * a further invoice dated one day before or after the original.

    Args:
        num_invoices: Number of primary invoices to create; the fraud extras
            are appended in addition to these.
        start_date: First allowed invoice date, formatted 'YYYY-MM-DD'.
        end_date: Last allowed invoice date, formatted 'YYYY-MM-DD'.
        min_amount: Lower amount bound. Should be an integer because it is
            passed to ``random.randint``.
        max_amount: Upper amount bound (integer, same reason).
        name_list: Customer full names to draw from.

    Returns:
        List of dicts with the keys InvoiceID, InvoiceNumber, InvoiceDate,
        InvoiceDueDate, InvoiceAmount, PaymentDate, CustomerName and
        CustomerGender. InvoiceID is the 1-based position in the list, so it
        is unique across primary and extra invoices.

    Raises:
        ValueError: If a date string does not match '%Y-%m-%d', or if invoices
            are requested and ``end_date`` precedes ``start_date``.
    """
    date_format = '%Y-%m-%d'
    name_gender = generate_name_gender(name_list)
    # Build the (name, gender) pairs once instead of on every iteration.
    customers = list(name_gender.items())
    first_day = datetime.strptime(start_date, date_format)
    last_day = datetime.strptime(end_date, date_format)
    date_range = (last_day - first_day).days
    existing_invoice_numbers = set()

    # Generate initial Benford's Law compliant amounts
    benford_amounts = benford_law_numbers(num_invoices, min_amount, max_amount)

    fraudsters = {"Gregory Y Rogers", "Hannah U Powell"}
    invoices = []

    def next_invoice_number():
        # Reserve the number immediately so later draws cannot reuse it.
        number = generate_unique_invoice_number(existing_invoice_numbers)
        existing_invoice_numbers.add(number)
        return number

    for i in range(num_invoices):
        invoice_date = first_day + timedelta(days=random.randint(0, date_range))
        due_date = invoice_date + timedelta(days=30)
        payment_date = due_date + timedelta(days=random.randint(-2, 5))
        customer_name, customer_gender = random.choice(customers)
        invoice_number = next_invoice_number()
        is_fraudulent = customer_name in fraudsters

        # Generate invoice amount
        if is_fraudulent:
            if random.choice([True, False]):  # 50% chance to have even dollar amounts
                amount = float(random.randint(min_amount, max_amount))
            else:
                amount = round(random.uniform(min_amount, max_amount), 2)
        else:
            # Add a random value between $0.01 and $99.99 to the Benford amount
            random_cents = round(random.uniform(0.01, 99.99), 2)
            amount = round(float(benford_amounts[i]) + random_cents, 2)
            # Make sure the amount does not exceed the max_amount
            if amount > max_amount:
                amount = round(max_amount - random.uniform(0.01, 0.99), 2)

        invoice = {
            # Use the list position rather than the loop index: fraud extras
            # appended earlier would otherwise share IDs with later invoices.
            'InvoiceID': len(invoices) + 1,
            'InvoiceNumber': invoice_number,
            'InvoiceDate': invoice_date.strftime(date_format),
            'InvoiceDueDate': due_date.strftime(date_format),
            'InvoiceAmount': amount,
            'PaymentDate': payment_date.strftime(date_format),
            'CustomerName': customer_name,
            'CustomerGender': customer_gender
        }
        invoices.append(invoice)

        if not is_fraudulent:
            continue

        # Randomly add duplicate transactions
        if random.choice([True, False]):
            duplicate_invoice = invoice.copy()
            duplicate_invoice['InvoiceID'] = len(invoices) + 1
            duplicate_invoice['InvoiceNumber'] = next_invoice_number()
            invoices.append(duplicate_invoice)

        # Randomly add frequent transactions
        if random.choice([True, False]):
            # Prefer a neighbouring day that stays inside the requested range;
            # fall back to either direction if neither one fits.
            day_shifts = [
                shift for shift in (1, -1)
                if first_day <= invoice_date + timedelta(days=shift) <= last_day
            ] or [1, -1]
            nearby_date = invoice_date + timedelta(days=random.choice(day_shifts))
            nearby_due_date = nearby_date + timedelta(days=30)
            # Payment is measured from the due date, as for primary invoices.
            nearby_payment_date = nearby_due_date + timedelta(days=random.randint(-2, 5))

            additional_invoice = invoice.copy()
            additional_invoice['InvoiceID'] = len(invoices) + 1
            additional_invoice['InvoiceNumber'] = next_invoice_number()
            additional_invoice['InvoiceDate'] = nearby_date.strftime(date_format)
            additional_invoice['InvoiceDueDate'] = nearby_due_date.strftime(date_format)
            additional_invoice['PaymentDate'] = nearby_payment_date.strftime(date_format)
            invoices.append(additional_invoice)

    return invoices

# Parameters for invoice generation
# Requested invoice count, passed to generate_invoice_data as num_invoices.
num_invoices_to_generate = 500000  # Adjusted for demonstration purposes
# Invoice date window; both strings are parsed with the '%Y-%m-%d' format.
start_date = '2015-01-01'
end_date = '2024-06-30'
# Amount bounds handed to generate_invoice_data as min_amount / max_amount.
# They are kept as integers because the fraud branch passes them to
# random.randint.
min_invoice_amount = 100
max_invoice_amount = 1000

# Name list
# Customer roster: 100 full names in "First M Last" form. generate_name_gender
# reads only the first token of each name, and the roster contains the two
# names that generate_invoice_data hard-codes as fraudsters
# ("Gregory Y Rogers" and "Hannah U Powell").
name_list = [
    "Alexis N Thompson", "Amanda M Turner", "Amy A Powell", "Amy F White",
    "Anna C Walker", "Anthony T Foster", "Arthur D Jenkins", "Ashley A Walker",
    "Ashley D Gutierrez", "Ashley I Russell", "Barbara O Howard", "Betty H Walker",
    "Betty V Hill", "Bobby T Sanders", "Bruce B Turner", "Bruce J Adams",
    "Carol H Watson", "Catherine G Hill", "Charles L Gonzalez", "Christina X Reyes",
    "Christine F Stewart", "Danielle V Butler", "Debra P Watson", "Debra R Johnson",
    "Diane P Harris", "Donna I Morales", "Donna N Bell", "Douglas P Cox",
    "Emma B Allen", "Emma L Stewart", "Frances S Smith", "Frank Q Wright",
    "Gary L Hall", "Gregory Y Rogers", "Hannah U Powell", "Heather Z Cooper",
    "Henry L Sanchez", "Henry O Gomez", "Henry R Reyes", "Jacqueline D Walker",
    "Jacqueline G Hughes", "James W Adams", "Janet V Taylor", "Jason L Adams",
    "Jason L Price", "Jean A Hall", "Jeffrey F Sanders", "Jennifer Y Sullivan",
    "Jennifer Z Morales", "Jeremy J Gonzalez", "Jeremy R Johnson", "Jerry C Evans",
    "Jesse B Hughes", "John C Barnes", "Jordan B Garcia", "Judith V Edwards",
    "Julia C Watson", "Kathleen H King", "Kelly N Green", "Kyle I Ortiz",
    "Kyle W Bailey", "Kyle W Sullivan", "Lauren P Carter", "Lawrence V Morgan",
    "Linda R Taylor", "Lisa J Roberts", "Lisa T King", "Lori P Smith",
    "Lori X Anderson", "Maria H Lewis", "Melissa H Morales", "Noah Z Powell",
    "Patricia P Thompson", "Paul H Anderson", "Robert R Myers", "Robert S Reed",
    "Russell Y Fisher", "Ruth G Johnson", "Ruth S Garcia", "Ryan H Rodriguez",
    "Samantha N Gray", "Samuel Y Morgan", "Sandra K Flores", "Sara S Scott",
    "Scott X Ross", "Sean L Kelly", "Sean S Howard", "Sean X Wilson",
    "Shirley B Smith", "Stephen S King", "Steven Q Young", "Teresa F Rogers",
    "Teresa J Cox", "Thomas R Williams", "Tiffany O Bailey", "Vincent J Barnes",
    "Virginia A Brooks", "Walter M Clark", "Wayne S Cook", "William U Barnes"
]

# Build the synthetic ledger (regular invoices plus the seeded fraud cases)
invoices = generate_invoice_data(
    num_invoices=num_invoices_to_generate,
    start_date=start_date,
    end_date=end_date,
    min_amount=min_invoice_amount,
    max_amount=max_invoice_amount,
    name_list=name_list,
)

# One row per invoice dict, one column per key
df_invoices = pd.DataFrame(invoices)

# Destination of the CSV export inside the mounted Google Drive
file_path = '/content/drive/My Drive/UtahTech/ISA3020/Invoices.csv'

# Google Drive has to be mounted before the CSV can be written to it
drive.mount('/content/drive')
df_invoices.to_csv(file_path, index=False)

print(f"Invoice data saved to {file_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Invoice data saved to /content/drive/My Drive/UtahTech/ISA3020/Invoices.csv


In [2]:
# FinForensix Generator:

# Import necessary libraries
from google.colab import drive
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta

# Function to generate names and genders
def generate_name_gender(name_list):
    """Assign a gender code to every full name based on its first name.

    Args:
        name_list: Iterable of full-name strings; the text before the first
            whitespace is taken as the first name.

    Returns:
        dict of name -> 'M' (first name found in the male-name set) or 'F'
        (everything else), keyed in the order of ``name_list``.

    Raises:
        IndexError: If an entry is empty or contains only whitespace.
    """
    # Names not listed here default to 'F', so every male first name used in
    # the customer roster must be present.
    male_names = {
        'Stephen', 'Jeremy', 'Frank', 'Gregory', 'Henry', 'James', 'Jason',
        'Jeffrey', 'Jesse', 'John', 'Jordan', 'Paul', 'Robert', 'Russell', 'Ryan',
        'Samuel', 'Scott', 'Sean', 'Steven', 'Thomas', 'Vincent', 'Walter', 'Wayne', 'William',
        # Male first names from the roster that were previously missing:
        'Anthony', 'Arthur', 'Bobby', 'Bruce', 'Charles', 'Douglas', 'Gary',
        'Jerry', 'Kyle', 'Lawrence', 'Noah'
    }
    name_gender = {}
    for name in name_list:
        first_name = name.split()[0]
        name_gender[name] = 'M' if first_name in male_names else 'F'
    return name_gender

# Function to generate a unique invoice number
def generate_unique_invoice_number(existing_numbers, length=10):
    """Draw random invoice numbers until one is absent from ``existing_numbers``.

    An invoice number consists of ``length`` characters taken from A-Z and 0-9.
    ``existing_numbers`` is only consulted, not updated, so the caller must
    store the returned value. If no unused number of that length exists, the
    loop never ends.
    """
    symbols = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'

    def draw():
        # One candidate: `length` symbols picked with replacement.
        return ''.join(random.choices(symbols, k=length))

    number = draw()
    while number in existing_numbers:
        number = draw()
    return number


def benford_law_numbers(count, min_amount, max_amount):
    """Build ``count`` amounts with Benford-distributed first digits.

    A first digit 1-9 is drawn with probability ``log10(1 + 1/digit)`` and
    multiplied by the power of ten matching the magnitude of a value sampled
    uniformly between ``min_amount`` and ``max_amount``. Returns a NumPy array
    rounded to two decimals; the values are not clamped to the given range.
    """
    digit_choices = np.arange(1, 10)
    digit_probabilities = np.log10(1 + 1/digit_choices)
    first_digits = np.random.choice(digit_choices, size=count, p=digit_probabilities)

    # Uniform samples in the requested range; only their magnitude is used.
    unit_samples = np.random.uniform(low=0, high=1, size=count)
    range_samples = unit_samples * (max_amount - min_amount) + min_amount
    powers_of_ten = 10**(np.floor(np.log10(range_samples)))

    rounded_amounts = np.round(first_digits * powers_of_ten, 2)
    return rounded_amounts

def generate_invoice_data(num_invoices, start_date, end_date, min_amount, max_amount, name_list):
    """Generate synthetic invoices and append exact duplicates for two fraudsters.

    ``num_invoices`` primary invoices are created, each for a randomly chosen
    customer. Regular customers get a Benford-style base amount plus a random
    offset; the two hard-coded fraudsters get a uniformly random amount. The
    first five invoices of each fraudster are then copied to the end of the
    list with a new InvoiceID but the same InvoiceNumber and other fields.

    Args:
        num_invoices: Number of primary invoices to create; the duplicates
            are appended in addition to these.
        start_date: First allowed invoice date, formatted 'YYYY-MM-DD'.
        end_date: Last allowed invoice date, formatted 'YYYY-MM-DD'.
        min_amount: Lower amount bound.
        max_amount: Upper amount bound.
        name_list: Customer full names to draw from.

    Returns:
        List of dicts with the keys InvoiceID, InvoiceNumber, InvoiceDate,
        InvoiceDueDate, InvoiceAmount, PaymentDate, CustomerName and
        CustomerGender.

    Raises:
        ValueError: If a date string does not match '%Y-%m-%d', or if invoices
            are requested and ``end_date`` precedes ``start_date``.
    """
    date_format = '%Y-%m-%d'
    max_duplicates_per_fraudster = 5

    name_gender = generate_name_gender(name_list)
    # Build the (name, gender) pairs once instead of on every iteration.
    customers = list(name_gender.items())
    invoices = []
    first_day = datetime.strptime(start_date, date_format)
    last_day = datetime.strptime(end_date, date_format)
    date_range = (last_day - first_day).days
    existing_invoice_numbers = set()

    # Generate initial Benford's Law compliant amounts
    benford_amounts = benford_law_numbers(num_invoices, min_amount, max_amount)

    # Fraudster name -> invoices selected for duplication
    fraudsters = {"Gregory Y Rogers": [], "Hannah U Powell": []}

    for i in range(num_invoices):
        invoice_date = first_day + timedelta(days=random.randint(0, date_range))
        due_date = invoice_date + timedelta(days=30)
        payment_date = due_date + timedelta(days=random.randint(-2, 5))
        customer_name, customer_gender = random.choice(customers)

        invoice_number = generate_unique_invoice_number(existing_invoice_numbers)
        existing_invoice_numbers.add(invoice_number)

        is_fraudulent = customer_name in fraudsters

        # Generate invoice amount
        if is_fraudulent:
            amount = round(random.uniform(min_amount, max_amount), 2)
        else:
            # Add a random value between $0.01 and $99.99 to the Benford amount
            random_cents = round(random.uniform(0.01, 99.99), 2)
            amount = round(float(benford_amounts[i]) + random_cents, 2)
            # Make sure the amount does not exceed the max_amount
            if amount > max_amount:
                amount = round(max_amount - random.uniform(0.01, 0.99), 2)

        invoice = {
            'InvoiceID': i + 1,
            'InvoiceNumber': invoice_number,
            'InvoiceDate': invoice_date.strftime(date_format),
            'InvoiceDueDate': due_date.strftime(date_format),
            'InvoiceAmount': amount,
            'PaymentDate': payment_date.strftime(date_format),
            'CustomerName': customer_name,
            'CustomerGender': customer_gender
        }
        invoices.append(invoice)

        # Remember at most five invoices per fraudster for duplication. Only
        # the collection is capped: generation must carry on so that all
        # requested invoices are produced.
        if is_fraudulent and len(fraudsters[customer_name]) < max_duplicates_per_fraudster:
            fraudsters[customer_name].append(invoice)

    # Add duplicates for the collected fraudulent invoices
    for fraud_invoices in fraudsters.values():
        for fraud_invoice in fraud_invoices:
            duplicate_invoice = fraud_invoice.copy()
            duplicate_invoice['InvoiceID'] = len(invoices) + 1
            invoices.append(duplicate_invoice)

    return invoices


# Parameters for invoice generation
# Requested invoice count, passed to generate_invoice_data as num_invoices.
num_invoices_to_generate = 500000  # Adjusted for demonstration purposes
# Invoice date window; both strings are parsed with the '%Y-%m-%d' format.
start_date = '2015-01-01'
end_date = '2025-06-30'
# Amount bounds handed to generate_invoice_data as min_amount / max_amount.
min_invoice_amount = 100
max_invoice_amount = 1000

# Name list
# Customer roster: 100 full names in "First M Last" form. generate_name_gender
# reads only the first token of each name, and the roster contains the two
# names that generate_invoice_data hard-codes as fraudsters
# ("Gregory Y Rogers" and "Hannah U Powell").
name_list = [
    "Alexis N Thompson", "Amanda M Turner", "Amy A Powell", "Amy F White",
    "Anna C Walker", "Anthony T Foster", "Arthur D Jenkins", "Ashley A Walker",
    "Ashley D Gutierrez", "Ashley I Russell", "Barbara O Howard", "Betty H Walker",
    "Betty V Hill", "Bobby T Sanders", "Bruce B Turner", "Bruce J Adams",
    "Carol H Watson", "Catherine G Hill", "Charles L Gonzalez", "Christina X Reyes",
    "Christine F Stewart", "Danielle V Butler", "Debra P Watson", "Debra R Johnson",
    "Diane P Harris", "Donna I Morales", "Donna N Bell", "Douglas P Cox",
    "Emma B Allen", "Emma L Stewart", "Frances S Smith", "Frank Q Wright",
    "Gary L Hall", "Gregory Y Rogers", "Hannah U Powell", "Heather Z Cooper",
    "Henry L Sanchez", "Henry O Gomez", "Henry R Reyes", "Jacqueline D Walker",
    "Jacqueline G Hughes", "James W Adams", "Janet V Taylor", "Jason L Adams",
    "Jason L Price", "Jean A Hall", "Jeffrey F Sanders", "Jennifer Y Sullivan",
    "Jennifer Z Morales", "Jeremy J Gonzalez", "Jeremy R Johnson", "Jerry C Evans",
    "Jesse B Hughes", "John C Barnes", "Jordan B Garcia", "Judith V Edwards",
    "Julia C Watson", "Kathleen H King", "Kelly N Green", "Kyle I Ortiz",
    "Kyle W Bailey", "Kyle W Sullivan", "Lauren P Carter", "Lawrence V Morgan",
    "Linda R Taylor", "Lisa J Roberts", "Lisa T King", "Lori P Smith",
    "Lori X Anderson", "Maria H Lewis", "Melissa H Morales", "Noah Z Powell",
    "Patricia P Thompson", "Paul H Anderson", "Robert R Myers", "Robert S Reed",
    "Russell Y Fisher", "Ruth G Johnson", "Ruth S Garcia", "Ryan H Rodriguez",
    "Samantha N Gray", "Samuel Y Morgan", "Sandra K Flores", "Sara S Scott",
    "Scott X Ross", "Sean L Kelly", "Sean S Howard", "Sean X Wilson",
    "Shirley B Smith", "Stephen S King", "Steven Q Young", "Teresa F Rogers",
    "Teresa J Cox", "Thomas R Williams", "Tiffany O Bailey", "Vincent J Barnes",
    "Virginia A Brooks", "Walter M Clark", "Wayne S Cook", "William U Barnes"
]

# Build the synthetic ledger (regular invoices plus the duplicated fraud cases)
invoices = generate_invoice_data(
    num_invoices=num_invoices_to_generate,
    start_date=start_date,
    end_date=end_date,
    min_amount=min_invoice_amount,
    max_amount=max_invoice_amount,
    name_list=name_list,
)

# One row per invoice dict, one column per key
df_invoices = pd.DataFrame(invoices)

# Destination of the CSV export inside the mounted Google Drive
file_path = '/content/drive/My Drive/Invoices.csv'

# Google Drive has to be mounted before the CSV can be written to it
drive.mount('/content/drive')
df_invoices.to_csv(file_path, index=False)

print(f"Invoice data saved to {file_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Invoice data saved to /content/drive/My Drive/Invoices.csv
