In [58]:
# Prints a fixed greeting; presumably a quick check that the kernel is running.
print("hello world!")

hello world!


**What is `faker` in `import faker`**:  
   `faker` is a Python library that generates fake data, such as names, addresses, and emails. It is useful for testing and simulating realistic-looking random data in various domains.

**How `faker` relates to `Faker`**:  
   `faker` (lowercase) is the importable module name of the `Faker` package (installed with `pip install Faker`). The module exposes the `Faker` class, whose instances generate random fake data such as names, addresses, and emails for development and testing purposes.

**What are `datetime`, `timedelta` in `from datetime import datetime, timedelta`**:  
   `datetime` represents date and time, while `timedelta` represents a duration or difference between two dates. They are used to manipulate and generate dates for order history.

In [59]:
import random
import faker
from datetime import datetime, timedelta


**What is `random.random()`**:  
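   `random.random()` returns a random floating-point number in the half-open range [0.0, 1.0). Comparing it with a threshold gives a probability condition (for example, `random.random() < 0.5` is true about half the time, which is how we decide whether a product should have variants).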

In [60]:
# Constants
PRODUCT_CATEGORIES = ["Clothing", "Groceries", "Electronics"]
PRODUCTS = {
    "Clothing": ["T-shirt", "Jeans", "Jacket"],
    "Groceries": ["Rice", "Pasta", "Olive Oil"],
    "Electronics": ["Laptop", "Smartphone", "Headphones", "ipad"]
}
COLORS = ["Red", "Green", "Blue", "Black", "White"]
VARIANT_CHANCE = 0.5  # 50% chance of a product having variants

# Generate Product Data
def generate_products():
    """Build the product catalogue from ``PRODUCTS``.

    Every base product in ``PRODUCTS`` yields at least one entry. With
    probability ``VARIANT_CHANCE`` a non-grocery product is replaced by 1-3
    colour variants named ``"<color> <product>"`` (colours drawn without
    repetition from ``COLORS``); otherwise the base product is listed as is.
    Products in the "Groceries" category never get colour variants.

    No price is generated here: prices are assigned per order by
    ``generate_orders``.

    Returns:
        list[dict]: One dict per catalogue entry with the keys ``"id"``
        (sequential integers starting at 1), ``"category"`` and ``"name"``.
    """
    products = []
    product_id = 1

    for category, product_names in PRODUCTS.items():
        for product in product_names:
            # The random draw happens first, so it is made once per base
            # product, including groceries (which then ignore the result).
            has_variants = (random.random() < VARIANT_CHANCE) and (category != "Groceries")

            if has_variants:
                chosen_colors = random.sample(COLORS, random.randint(1, 3))
                names = [f"{color} {product}" for color in chosen_colors]
            else:
                # No variants, add the product itself
                names = [product]

            for name in names:
                products.append({"id": product_id, "category": category, "name": name})
                product_id += 1

    return products


**What is `faker.Faker()`**:  
   `faker.Faker()` initializes a `Faker` instance that provides access to methods for generating fake data. It acts as a generator for creating random names, addresses, emails, etc.


**What are `faker.Faker().name()`, `faker.Faker().email()`, `faker.Faker().address()`, `faker.Faker().phone_number()`**:  
   These methods from `Faker` generate a fake person’s name, email, address, and phone number. They are useful for simulating customer data in an e-commerce scenario.

In [61]:
# Initialize Faker
# One module-level Faker instance, used by generate_customers() and
# generate_orders() in the cells below.
fake = faker.Faker()

# Generate Customer Data
def generate_customers(num_customers):
    """Create ``num_customers`` fake customer records.

    Each record is a dict with the keys ``"id"``, ``"name"``, ``"email"``,
    ``"address"`` and ``"phone_number"``, filled from the module-level
    ``fake`` generator.

    Args:
        num_customers: Number of records to produce.

    Returns:
        list[dict]: The generated customer records.
    """
    def _make_customer():
        # Keyword order matches the column order of the resulting table.
        return dict(
            id=fake.uuid4(),
            name=fake.name(),
            email=fake.email(),
            address=fake.address(),
            phone_number=fake.phone_number(),
        )

    return [_make_customer() for _ in range(num_customers)]



**What are `random.choice` and `random.randint` in `generate_orders()`**:  
   `random.choice()` selects a random item from a list (e.g., a random product or customer). `random.randint()` generates a random integer within a specified range (e.g., the number of order items).

**What is `datetime.now() - timedelta(days=random.randint(1, 730))` and `random.randint(1, 5)`**:  
   `datetime.now() - timedelta(days=random.randint(1, 730))` generates a random order date between 1 and 730 days ago, i.e. within the last two years. `random.randint(1, 5)` sets a random order quantity between 1 and 5 (both ends inclusive).

  - Since price is dynamic and can change over time, the price at the time of purchase is stored in a `product_price` column on the orders table.

- `uuid4()` in `fake.uuid4()` generates a random unique identifier (UUID) based on random numbers, which is commonly used for creating unique IDs. The datatype is a string representing the UUID in its standard 36-character format (including hyphens).

- The line `order_date.strftime("%Y-%m-%d")` formats the `order_date` into a string that follows the `YYYY-MM-DD` format (e.g., `2023-09-30`).

In [62]:
# Generate Order Data
def generate_orders(products, customers, num_orders=100):
    """Generate random orders linking customers to products.

    Each order picks one customer and one product at random, an order date
    between 1 and 730 days before now, a quantity between 1 and 5, and a unit
    price between 10 and 500 rounded to 2 decimals.

    Args:
        products: Sequence of product dicts; only ``product["id"]`` is read.
        customers: Sequence of customer dicts; only ``customer["id"]`` is read.
        num_orders: Number of orders to generate (default 100).

    Returns:
        list[dict]: One dict per order with the keys ``"order_id"``,
        ``"customer_id"``, ``"product_id"``, ``"order_date"`` (string in
        ``YYYY-MM-DD`` format), ``"quantity"``, ``"product_price"`` and
        ``"total_price"``.

    Raises:
        IndexError: If ``products`` or ``customers`` is empty while
            ``num_orders`` is positive (raised by ``random.choice``).
    """
    orders = []
    # Taken once so every order in the batch is dated relative to the same instant.
    now = datetime.now()

    for _ in range(num_orders):
        customer = random.choice(customers)
        product = random.choice(products)
        order_date = now - timedelta(days=random.randint(1, 730))  # Random date in the last 2 years
        quantity = random.randint(1, 5)
        price = round(random.uniform(10, 500), 2)  # Random price between $10 and $500
        # Round again: multiplying a 2-decimal float can produce artefacts such
        # as 99.99000000000001, which would otherwise end up in the output.
        total_price = round(price * quantity, 2)

        orders.append({
            "order_id": fake.uuid4(),
            "customer_id": customer["id"],
            "product_id": product["id"],
            "order_date": order_date.strftime("%Y-%m-%d"),  # e.g. 2023-09-30
            "quantity": quantity,
            # Price is dynamic and can change over time, so the price at
            # purchase is stored on the order rather than on the product.
            "product_price": price,
            "total_price": total_price
        })

    return orders


In [63]:
# Main Function to Generate All Data
def generate_ecommerce_data(num_customers=10, num_orders=200):
    """Generate the full synthetic dataset: products, customers and orders.

    Args:
        num_customers: Number of customers to generate (default 10).
        num_orders: Number of orders to generate (default 200).

    Returns:
        tuple: ``(products, customers, orders)`` as returned by
        ``generate_products``, ``generate_customers`` and ``generate_orders``.
    """
    # Generate products and customers
    products = generate_products()
    customers = generate_customers(num_customers)

    # Orders are dated by generate_orders, which spreads them over the last 2 years
    orders = generate_orders(products, customers, num_orders=num_orders)

    return products, customers, orders

# Run the generation
# The three lists are kept as notebook-level names; later cells convert them
# to DataFrames.
products, customers, orders = generate_ecommerce_data()

# Print the data (For demonstration purposes)
# Only the first five products are shown to keep the output short.
print("\nGenerated Products:\n", products[:5])


Generated Products:
 [{'id': 1, 'category': 'Clothing', 'name': 'Green T-shirt'}, {'id': 2, 'category': 'Clothing', 'name': 'Red T-shirt'}, {'id': 3, 'category': 'Clothing', 'name': 'Jeans'}, {'id': 4, 'category': 'Clothing', 'name': 'Red Jacket'}, {'id': 5, 'category': 'Groceries', 'name': 'Rice'}]


- Convert these lists to pandas DataFrames and save them as CSV files.

In [64]:
import pandas as pd

# One DataFrame per generated list of records, built in a single unpacking.
df_products, df_customers, df_orders = (
    pd.DataFrame(records) for records in (products, customers, orders)
)

In [65]:
# Preview the first rows of the products table.
df_products.head()

Unnamed: 0,id,category,name
0,1,Clothing,Green T-shirt
1,2,Clothing,Red T-shirt
2,3,Clothing,Jeans
3,4,Clothing,Red Jacket
4,5,Groceries,Rice


In [66]:
# Preview the first rows of the customers table.
df_customers.head()

Unnamed: 0,id,name,email,address,phone_number
0,433aad0b-da9a-4843-b184-6caab49a5f18,David Hall,stephanie49@example.net,"90191 Anne Branch\nEast James, AR 94143",+1-587-384-6710x625
1,869ae21f-3c08-40e0-a79c-b3b850bb4f06,Michael Holt,rhart@example.org,"32998 Jason Shoals Apt. 299\nDustinbury, CO 68415",385.998.4140x32148
2,dc0be72e-5aa9-4850-9c3d-e8ba3296cbe2,Don Cochran,woodjason@example.com,"46187 Edwin Unions\nLake Randyport, TX 79259",001-480-330-4582x1875
3,e6dfe79b-d20f-4a03-be2b-9ca753e2dbcd,Jacob Kelley,fordaudrey@example.net,Unit 2584 Box 5162\nDPO AA 48058,(985)660-1676x405
4,ca1cea91-749f-41b7-a6af-5471bf375e22,Tammy Rowland,zvalenzuela@example.com,"82067 Nathan Pine\nPort Oliviastad, TN 45604",836.778.7476


In [67]:
# Preview the first rows of the orders table.
df_orders.head()

Unnamed: 0,order_id,customer_id,product_id,order_date,quantity,product_price,total_price
0,7e356cfd-5323-4375-9f94-5e9b8a60b748,1771b2c8-6764-47ea-9730-6e3cc20f9c9a,12,2024-04-07,1,410.02,410.02
1,0aae2a02-8be6-4bf9-8c04-e045c0852480,ca1cea91-749f-41b7-a6af-5471bf375e22,9,2022-12-06,2,402.95,805.9
2,6a905c72-62e2-4752-b4b9-fe1681f8d96f,1bfbdda5-acda-45b3-85e9-7fb8b6871990,9,2024-08-10,2,28.25,56.5
3,88e430c0-b245-4484-94eb-45231937982a,e6dfe79b-d20f-4a03-be2b-9ca753e2dbcd,10,2023-10-13,4,133.54,534.16
4,ac9e2dba-2bb2-4223-a8d1-d669984b6a17,53a7ccef-e87b-4463-89a5-af9f82b45224,8,2023-05-26,2,463.84,927.68


In [68]:
# Write each table to a CSV file in the current working directory.
# index=False leaves the DataFrame row index out of the files.
df_products.to_csv("products.csv", index=False)
df_customers.to_csv("customers.csv", index=False)
df_orders.to_csv("orders.csv", index=False)