In [172]:
# Print a fixed greeting to the cell output.
print("hello world!")

hello world!


**What is `faker` in `import faker`**:  
   `faker` is a Python library that generates fake data, such as names, addresses, and emails. It is useful for testing and simulating realistic-looking random data in various domains.

**What is `faker`**:  
   `faker` is the import name of the Faker package (installed with `pip install Faker`). Its `Faker` class is the generator that produces random fake data, such as names, addresses, and emails. It helps simulate realistic data for development and testing purposes.

**What are `datetime`, `timedelta` in `from datetime import datetime, timedelta`**:  
   `datetime` represents date and time, while `timedelta` represents a duration or difference between two dates. They are used to manipulate and generate dates for order history.

In [173]:
import random
import faker
from datetime import datetime, timedelta


**What is `random.random()`**:  
   `random.random()` returns a random floating-point number in the half-open range [0.0, 1.0). It’s used to create a probability condition (like deciding if a product should have variants).

In [174]:
# Constants

# Base product names, grouped under the category they belong to.
PRODUCTS = {
    "Clothing": ["T-shirt", "Jeans", "Jacket"],
    "Groceries": ["Rice", "Pasta", "Olive Oil"],
    "Electronics": ["Laptop", "Smartphone", "Headphones", "ipad"],
}

# Category names, listed in the same order as the keys of PRODUCTS.
PRODUCT_CATEGORIES = list(PRODUCTS)

# Colour prefixes used when a product is expanded into variants.
COLORS = ["Red", "Green", "Blue", "Black", "White"]

# Probability that a product is expanded into colour variants (0.5 = 50%).
VARIANT_CHANCE = 0.5

# Generate Product Data
def generate_products(catalog=None, colors=None, variant_chance=None,
                      no_variant_categories=("Groceries",), max_variants=3):
    """Build the product table as a list of dicts.

    Every base product is either listed once under its own name or expanded
    into colour variants named ``"<color> <product>"``. Product ids are
    assigned sequentially starting at 1, in catalogue order.

    Called with no arguments, the function reads the module-level
    ``PRODUCTS``, ``COLORS`` and ``VARIANT_CHANCE`` constants and never
    creates variants for the "Groceries" category.

    Args:
        catalog: Mapping of category name -> list of base product names.
            Defaults to ``PRODUCTS``.
        colors: Sequence of colour names to draw variants from.
            Defaults to ``COLORS``.
        variant_chance: Probability in [0, 1] that an eligible product is
            expanded into variants. Defaults to ``VARIANT_CHANCE``.
        no_variant_categories: Categories whose products are always listed
            without variants.
        max_variants: Upper bound on the number of colour variants created
            for one product (at least one is created when variants apply).

    Returns:
        list[dict]: Records with the keys ``product_id``, ``category`` and
        ``name``. No price is stored here; prices are attached per order.
    """
    catalog = PRODUCTS if catalog is None else catalog
    colors = COLORS if colors is None else colors
    variant_chance = VARIANT_CHANCE if variant_chance is None else variant_chance

    # random.sample() cannot draw more items than the population holds, so
    # the variant count is capped by the number of available colours.
    variant_cap = min(max_variants, len(colors))

    products = []

    def add_product(category, name):
        # Ids are sequential, so the next id is always the current length + 1.
        products.append({"product_id": len(products) + 1, "category": category, "name": name})

    for category, product_names in catalog.items():
        variants_allowed = variant_cap >= 1 and category not in no_variant_categories
        for product in product_names:
            if variants_allowed and random.random() < variant_chance:
                # Pick between 1 and variant_cap distinct colours.
                for color in random.sample(colors, random.randint(1, variant_cap)):
                    add_product(category, f"{color} {product}")
            else:
                # No variants: list the product under its own name.
                add_product(category, product)

    return products


**What is `faker.Faker()`**:  
   `faker.Faker()` initializes a `Faker` instance that provides access to methods for generating fake data. It acts as a generator for creating random names, addresses, emails, etc.


**What are `faker.Faker().name()`, `faker.Faker().email()`, `faker.Faker().address()`, `faker.Faker().phone_number()`**:  
   These methods from `Faker` generate a fake person’s name, email, address, and phone number. They are useful for simulating customer data in an e-commerce scenario.

In [175]:
# Initialize Faker
# Module-level generator instance; generate_customers() draws every fake
# field (name, email, address, phone number) from it.
fake = faker.Faker()

# Generate Customer Data
def generate_customers(num_customers):
    """Create ``num_customers`` fake customer records.

    Customer ids run from 1 to ``num_customers``. The remaining fields are
    drawn from the module-level ``fake`` generator, in the order name,
    email, address, phone number.

    Args:
        num_customers: Number of records to create; zero or a negative
            value yields an empty list.

    Returns:
        list[dict]: Records with the keys ``customer_id``, ``name``,
        ``email``, ``address`` and ``phone_number``.
    """
    return [
        {
            "customer_id": next_id,
            "name": fake.name(),
            "email": fake.email(),
            "address": fake.address(),
            "phone_number": fake.phone_number(),
        }
        for next_id in range(1, num_customers + 1)
    ]



**What are `random.choice` and `random.randint` in `generate_orders()`**:  
   `random.choice()` selects a random item from a list (e.g., a random product or customer). `random.randint()` generates a random integer within a specified range (e.g., the number of order items).

**What is `datetime.now() - timedelta(days=random.randint(1, 730))` and `random.randint(1, 5)`**:  
   `datetime.now() - timedelta(days=random.randint(1, 730))` generates a random order date within the last two years. `random.randint(1, 5)` sets a random order quantity between 1 and 5.

  - Since price is dynamic and can change over time, I've put the price column in the orders table.

- `uuid4()` in `fake.uuid4()` generates a random unique identifier (UUID) based on random numbers, which is commonly used for creating unique IDs. The datatype is a string representing the UUID in its standard 36-character format (including hyphens). Note that the code in this notebook assigns sequential integer IDs rather than UUIDs.

- The line `order_date.strftime("%Y-%m-%d")` formats the `order_date` into a string that follows the `YYYY-MM-DD` format (e.g., `2023-09-30`).

In [176]:
# Generate Order Data
def generate_orders(products, customers, num_orders=100):
    """Create ``num_orders`` random order records.

    Each order picks one random customer and one random product, a date
    between 1 and 730 days before the moment the function is called, a
    quantity between 1 and 5, and a unit price between 10 and 500 rounded
    to two decimals.

    Args:
        products: Sequence of dicts that each contain a ``product_id`` key.
        customers: Sequence of dicts that each contain a ``customer_id`` key.
        num_orders: Number of orders to create; zero or a negative value
            yields an empty list.

    Returns:
        list[dict]: Records with the keys ``order_id`` (sequential from 1),
        ``customer_id``, ``product_id``, ``order_date`` (``YYYY-MM-DD``
        string), ``quantity``, ``product_price`` and ``total_price``.

    Raises:
        IndexError: If ``products`` or ``customers`` is empty while
            ``num_orders`` is positive (raised by ``random.choice``).
    """
    orders = []
    # Read the clock once so every order date is offset from the same moment.
    now = datetime.now()

    for order_id in range(1, num_orders + 1):
        customer = random.choice(customers)
        product = random.choice(products)
        order_date = now - timedelta(days=random.randint(1, 730))  # Random date in the last 2 years
        quantity = random.randint(1, 5)
        price = round(random.uniform(10, 500), 2)  # Random price between $10 and $500

        orders.append({
            "order_id": order_id,
            "customer_id": customer["customer_id"],
            "product_id": product["product_id"],
            # Stored as a YYYY-MM-DD string (e.g. 2023-09-30).
            "order_date": order_date.strftime("%Y-%m-%d"),
            "quantity": quantity,
            # Price is dynamic and can change over time, so it is recorded per order.
            "product_price": price,
            # Rounded again: multiplying a 2-decimal float can produce
            # binary floating-point artifacts such as 1554.6000000000001.
            "total_price": round(price * quantity, 2),
        })

    return orders


In [177]:
# Main Function to Generate All Data
def generate_ecommerce_data(num_customers=10, num_orders=200):
    """Generate the full synthetic dataset: products, customers and orders.

    Products are generated first, then customers, then orders that refer to
    both.

    Args:
        num_customers: Number of customer records to create (default 10).
        num_orders: Number of order records to create (default 200).

    Returns:
        tuple: ``(products, customers, orders)``, each a list of dicts.
    """
    products = generate_products()
    customers = generate_customers(num_customers)

    # Orders reference the ids of the products and customers created above.
    orders = generate_orders(products, customers, num_orders=num_orders)

    return products, customers, orders

# Run the generation
# The returned (products, customers, orders) tuple is unpacked into
# module-level names that the later cells turn into DataFrames.
products, customers, orders = generate_ecommerce_data()

# Print the data (For demonstration purposes)
# Only the first five product records are shown.
print("\nGenerated Products:\n", products[:5])


Generated Products:
 [{'product_id': 1, 'category': 'Clothing', 'name': 'Green T-shirt'}, {'product_id': 2, 'category': 'Clothing', 'name': 'Jeans'}, {'product_id': 3, 'category': 'Clothing', 'name': 'Blue Jacket'}, {'product_id': 4, 'category': 'Groceries', 'name': 'Rice'}, {'product_id': 5, 'category': 'Groceries', 'name': 'Pasta'}]


- Convert these lists to DataFrames and save them to CSV files.

In [178]:
import pandas as pd

# Build one DataFrame per generated record list; each dict becomes a row.
df_products, df_customers, df_orders = (
    pd.DataFrame(records) for records in (products, customers, orders)
)

In [179]:
# Preview the first rows of the products table.
df_products.head()

Unnamed: 0,product_id,category,name
0,1,Clothing,Green T-shirt
1,2,Clothing,Jeans
2,3,Clothing,Blue Jacket
3,4,Groceries,Rice
4,5,Groceries,Pasta


- Replace all commas and newline characters in the `address` column of the `df_customers` DataFrame with spaces.
- This avoids complications when importing the CSV in SQL Workbench.

In [180]:
# Swap every comma and embedded newline in the address for a space
# (literal matching, not regex).
df_customers['address'] = (
    df_customers['address']
    .str.replace(',', ' ', regex=False)
    .str.replace('\n', ' ', regex=False)
)

In [181]:
# Preview the first rows of the customers table after the address cleanup.
df_customers.head()

Unnamed: 0,customer_id,name,email,address,phone_number
0,1,Tracy Chavez,mharris@example.net,8932 Michael Mills Johnport PA 61538,266-255-8610x6944
1,2,Marie Thomas,jhernandez@example.net,452 Daniel Avenue Christopherside DC 28520,835-922-6123x899
2,3,Jason Jefferson,richardashley@example.net,PSC 2123 Box 9037 APO AP 76384,(928)353-1084
3,4,Michael Gibbs,omartinez@example.com,5675 Davis Forges Suite 968 Port Thomaschester...,(503)816-3507x2943
4,5,Alexander York,mclaughlinryan@example.net,4600 Mary Street Suite 277 Port Rebecca HI 38069,696-806-0403x753


In [182]:
# Count missing values per column of the customers table.
df_customers.isnull().sum()

customer_id     0
name            0
email           0
address         0
phone_number    0
dtype: int64

In [183]:
# Preview the first rows of the orders table.
df_orders.head()

Unnamed: 0,order_id,customer_id,product_id,order_date,quantity,product_price,total_price
0,1,4,3,2024-08-08,1,426.62,426.62
1,2,3,5,2024-06-19,4,301.37,1205.48
2,3,4,6,2023-04-12,1,463.27,463.27
3,4,4,6,2023-01-29,4,388.65,1554.6
4,5,6,11,2024-04-06,5,269.91,1349.55


In [184]:
# Write each table to its own CSV file (relative paths, so they are created
# in the current working directory). index=False keeps the DataFrame row
# index out of the files.
df_products.to_csv("products.csv", index=False)
df_customers.to_csv("customers.csv", index=False)
df_orders.to_csv("orders.csv", index=False)