In [1]:
import pandas as pd
from faker import Faker
import random
import datetime

# Fixed seed so that "Restart Kernel -> Run All" regenerates the same data.
# Both generators are seeded because the cells below draw from the stdlib
# `random` module as well as from Faker.
# NOTE: dates are generated relative to 'today', so they still shift with the
# day on which the notebook is executed.
SEED = 42

fake = Faker('de_DE')
random.seed(SEED)
Faker.seed(SEED)

In [2]:
# 1. Customers Data
#
# Builds one record per customer and writes the table to customers.csv.
num_customers = 5000

# Customer ids are consecutive integers 1..num_customers. Each record gets a
# join date within the last three years plus a Faker-generated city and
# postal code.
customers = [
    {
        'customer_id': customer_id,
        'join_date': fake.date_between(start_date='-3y', end_date='today'),
        'city': fake.city(),
        'postal_code': fake.postcode(),
    }
    for customer_id in range(1, num_customers + 1)
]

customers_df = pd.DataFrame(customers)
customers_df.to_csv('customers.csv', index=False)

print("customers.csv generated successfully!")

customers.csv generated successfully!


In [3]:
# 2. Products Data
#
# Builds a catalogue of `num_products` synthetic products and writes it to
# products.csv. Each product gets a two-word Faker name, a random category,
# a price between 5.00 and 150.00 and a sustainability rating from 1 to 5.

products = []
num_products = 100
# NOTE(review): 'Kuche' looks like a typo for 'Küche' -- confirm before
# changing, since the value ends up in the generated data.
categories = ['Kuche', 'Badezimmer', 'Lebensmittel', 'Kleidung', 'Garten']
for i in range(1, num_products + 1):  # product ids run 1..num_products
    products.append({
        'product_id' : i,
        'product_name' : fake.word().capitalize() + ' ' + fake.word(),
        'category' : random.choice(categories),
        'price' : round(random.uniform(5.0, 150.0), 2),
        'sustainability_rating' : random.randint(1, 5)
    })

products_df = pd.DataFrame(products)
products_df.to_csv('products.csv', index = False)

# Report the file that was actually written (the message previously named
# customers.csv).
print("products.csv generated successfully!")

customers.csv generated successfully!


In [4]:
# 3. Orders data generation
#
# Builds `num_orders` orders and writes them to orders.csv. Every order is
# attached to one of the customers generated in the customers cell.
#
# NOTE: total_amount is drawn at random and is not derived from the order
# items / product prices generated elsewhere in this notebook.

orders = []
num_orders = 20000
for i in range(1, num_orders + 1):  # order ids run 1..num_orders
    # Pick the customer record itself rather than a hard-coded id range, so
    # the ids always match the customers table and the join date is at hand.
    customer = random.choice(customers)
    orders.append({
        'order_id' : i,
        'customer_id' : customer['customer_id'],
        # An order cannot be placed before the customer joined.
        'order_date' : fake.date_between(start_date = customer['join_date'], end_date = 'today'),
        'total_amount' : round(random.uniform(5.0, 1000.0), 2)
    })

orders_df = pd.DataFrame(orders)
orders_df.to_csv('orders.csv', index = False)

print("orders.csv successfully generated")

orders.csv successfully generated


In [6]:
# 4. Order Items Data Generation
#
# Builds the line items for every order and writes them to order_items.csv.
# Each order gets 1 to 5 distinct products, each with a quantity of 1 to 3.

order_items = []
order_item_id_counter = 1

# An order can never hold more distinct products than the catalogue offers;
# clamping keeps the sampling below valid for catalogues of fewer than 5.
max_items_per_order = min(5, num_products)
all_product_ids = range(1, num_products + 1)

for order_id in range(1, num_orders + 1):
    num_items_in_order = random.randint(1, max_items_per_order)

    # Sampling without replacement guarantees no product repeats in an order
    # and, unlike a retry loop, always terminates.
    products_in_this_order = random.sample(all_product_ids, num_items_in_order)

    # One record per distinct product in the order.
    for product_id in products_in_this_order:
        order_items.append({
            'order_item_id' : order_item_id_counter,
            'order_id' : order_id,
            'product_id' : product_id,
            'quantity' : random.randint(1, 3)
        })
        order_item_id_counter += 1

order_items_df = pd.DataFrame(order_items)

order_items_df.to_csv('order_items.csv', index = False)

print('order_items.csv successfully generated')
    

order_items.csv successfully generated


In [15]:
# 5. Marketing data generation
#
# Builds `num_campaigns` seasonal marketing campaigns and writes them, ordered
# by start date, to marketing.csv.
from datetime import date, timedelta

num_campaigns = 25
marketing_data = []

current_year = date.today().year
campaign_types = ['Rabatt', 'Aktion', 'Sale', 'Angebote']
# Each template ties a seasonal event to the calendar months in which a
# campaign for that event may start.
campaign_templates = [
    {'event': 'Oster', 'months': [3, 4]},
    {'event': 'Sommer', 'months': [6, 7, 8]},
    {'event': 'Herbst', 'months': [9, 10]},
    {'event': 'Winter', 'months': [12, 1, 2]},
    {'event': 'Black Friday', 'months': [11]},
    {'event': 'Weihnachts', 'months': [11, 12]}
]

for i in range(1, num_campaigns + 1):  # campaign ids run 1..num_campaigns
    template = random.choice(campaign_templates)
    # NOTE: the year may be the current one and the month later than today,
    # so some campaigns can lie in the future.
    year = random.randint(current_year - 2, current_year)
    month = random.choice(template['months'])
    # Capping the day at 28 keeps the date valid in every month, February
    # included.
    day = random.randint(1, 28)
    start_date = date(year, month, day)
    campaign_duration = timedelta(days = random.randint(7, 30))
    end_date = start_date + campaign_duration
    campaign_name = f"{template['event']}-{random.choice(campaign_types)} {year}"

    marketing_data.append({
        'campaign_id' : i,
        'campaign_name' : campaign_name,
        'start_date' : start_date,
        'end_date' : end_date
    })

marketing_df = pd.DataFrame(marketing_data)

# Chronological order; the original row index is kept, so it no longer lines
# up with row position after the sort.
marketing_df = marketing_df.sort_values(by='start_date')

marketing_df.to_csv('marketing.csv', index = False)

print('marketing.csv successfully generated')

marketing.csv successuflly generated


In [17]:
# Display the generated campaigns (sorted by start_date in the cell above).
marketing_df

Unnamed: 0,campaign_id,campaign_name,start_date,end_date
24,25,Winter-Angebote 2023,2023-01-14,2023-02-12
18,19,Winter-Sale 2023,2023-01-24,2023-02-05
14,15,Oster-Aktion 2023,2023-04-03,2023-04-24
15,16,Oster-Aktion 2023,2023-04-10,2023-05-07
19,20,Oster-Angebote 2023,2023-04-24,2023-05-04
12,13,Sommer-Aktion 2023,2023-07-05,2023-07-12
13,14,Sommer-Sale 2023,2023-07-18,2023-08-05
22,23,Black Friday-Aktion 2023,2023-11-01,2023-11-19
11,12,Black Friday-Aktion 2023,2023-11-04,2023-11-28
2,3,Black Friday-Aktion 2023,2023-11-14,2023-12-11
