In [29]:
import numpy as np
import pandas as pd
import random
import string
from datetime import datetime, timedelta
from faker import Faker
fake = Faker()

In [30]:
# Helper functions for numeric data
def generate_integers(size, low=0, high=100):
    """Draw random integers from the half-open interval ``[low, high)``.

    Args:
        size: Output shape passed to ``np.random.randint`` (an int gives a
            1-D array of that length).
        low: Smallest value that can be drawn (inclusive).
        high: Upper bound (exclusive).

    Returns:
        A NumPy array of integers drawn from NumPy's global random state.
    """
    drawn = np.random.randint(low, high, size=size)
    return drawn

def generate_floats(size, low=0.0, high=1.0, distribution='uniform'):
    """Draw random floats using NumPy's global random state.

    Args:
        size: Output shape passed to the NumPy sampler.
        low: Lower bound of the target range.
        high: Upper bound of the target range.
        distribution: ``'uniform'`` samples from ``[low, high)``;
            ``'normal'`` samples from a normal distribution centred on the
            midpoint of the range.

    Returns:
        A NumPy array of floats.

    Raises:
        ValueError: If ``distribution`` is not ``'uniform'`` or ``'normal'``.
            (Previously such a value silently returned ``None``.)

    Note:
        ``'normal'`` samples are NOT clipped: the standard deviation is a
        sixth of the range, so roughly 0.3% of values land outside
        ``[low, high]``.
    """
    if distribution == 'uniform':
        return np.random.uniform(low, high, size)
    if distribution == 'normal':
        mean = (high + low) / 2
        stddev = (high - low) / 6  # +/- 3 sigma spans the range (~99.7% inside)
        return np.random.normal(mean, stddev, size)
    raise ValueError(
        f"Unsupported distribution {distribution!r}; expected 'uniform' or 'normal'"
    )

In [31]:
# Helper functions for string data
def generate_random_strings(size, length=10):
    """Return ``size`` random strings of ``length`` ASCII letters each.

    Letters are drawn with replacement from ``string.ascii_letters`` using
    the ``random`` module's global generator.
    """
    alphabet = string.ascii_letters
    result = []
    for _ in range(size):
        picked = random.choices(alphabet, k=length)
        result.append(''.join(picked))
    return result

def generate_names(size):
    """Return a list of ``size`` values produced by ``fake.name()``."""
    names = []
    for _ in range(size):
        names.append(fake.name())
    return names

def generate_emails(size):
    """Return a list of ``size`` values produced by ``fake.email()``."""
    emails = []
    for _ in range(size):
        emails.append(fake.email())
    return emails

def generate_addresses(size):
    """Return ``size`` single-line addresses from ``fake.address()``.

    Newlines in each generated address are replaced with ``', '`` so the
    value fits on one line.
    """
    addresses = []
    for _ in range(size):
        address_lines = fake.address().split('\n')
        addresses.append(', '.join(address_lines))
    return addresses

def generate_phone_numbers(size):
    """Return a list of ``size`` values produced by ``fake.phone_number()``."""
    numbers = []
    for _ in range(size):
        numbers.append(fake.phone_number())
    return numbers

In [32]:
# Helper functions for categorical data
def generate_categories(size, categories):
    """Sample ``size`` values from ``categories`` with replacement.

    Uses ``np.random.choice`` with its default uniform probabilities and
    NumPy's global random state.

    Returns:
        A NumPy array of the sampled values.
    """
    sampled = np.random.choice(categories, size=size)
    return sampled

In [33]:
# Helper functions for datetime data
def generate_datetimes(size, start, end):
    """Return ``size`` random datetimes between ``start`` and ``end``.

    Each value is drawn uniformly on the POSIX-timestamp axis using the
    ``random`` module's global generator.

    Args:
        size: Number of datetimes to generate.
        start: ``datetime`` at one end of the range.
        end: ``datetime`` at the other end of the range.

    Returns:
        A list of ``datetime`` objects. They are naive when ``start`` is
        naive, and carry ``start.tzinfo`` when ``start`` is timezone-aware.
    """
    start_u = start.timestamp()
    end_u = end.timestamp()
    # Passing tz keeps aware inputs aware; without it the result was always
    # converted to naive machine-local time. tz=None (naive start) is the
    # same as omitting the argument.
    tz = start.tzinfo
    return [
        datetime.fromtimestamp(random.uniform(start_u, end_u), tz=tz)
        for _ in range(size)
    ]

In [34]:
# Helper functions for generating customer data
def generate_customer_ids(size):
    """Return sequential customer IDs ``CUST00001`` .. for ``size`` customers.

    Numbering starts at 1 and is zero-padded to at least five digits.
    """
    ids = []
    for number in range(1, size + 1):
        ids.append(f"CUST{number:05d}")
    return ids

In [35]:
def generate_customer_data(num_customers):
    """Build a customer table with ``num_customers`` rows.

    Columns: ``customer_id``, ``name``, ``email``, ``address``,
    ``phone_number`` and ``registration_date`` (a random datetime between
    2000-01-01 and 2022-12-31).

    Returns:
        A ``pandas.DataFrame`` with one row per customer.
    """
    registration_window = (datetime(2000, 1, 1), datetime(2022, 12, 31))

    # Columns are filled in the same order they appear in the final frame.
    columns = {}
    columns['customer_id'] = generate_customer_ids(num_customers)
    columns['name'] = generate_names(num_customers)
    columns['email'] = generate_emails(num_customers)
    columns['address'] = generate_addresses(num_customers)
    columns['phone_number'] = generate_phone_numbers(num_customers)
    columns['registration_date'] = generate_datetimes(num_customers, *registration_window)
    return pd.DataFrame(columns)

In [36]:
# Helper functions for generating sales data
def generate_sales_data(num_sales, customer_ids):
    """Build a sales table with ``num_sales`` rows.

    Args:
        num_sales: Number of sales rows to generate.
        customer_ids: Sequence of customer IDs to sample from (with
            replacement) for the ``customer_id`` column.

    Returns:
        A ``pandas.DataFrame`` with columns ``sale_id``, ``customer_id``,
        ``sale_amount``, ``sale_date`` and ``product_category``.
    """
    min_amount, max_amount = 10.0, 1000.0
    data = {
        'sale_id': [f"SALE{str(i).zfill(5)}" for i in range(1, num_sales+1)],
        'customer_id': np.random.choice(customer_ids, num_sales),
        # The 'normal' sampler is unbounded (its stddev is a sixth of the
        # range), so a few draws fall outside the bounds and can even be
        # negative. Clip so every sale amount stays within the intended range.
        'sale_amount': np.clip(
            generate_floats(num_sales, min_amount, max_amount, 'normal'),
            min_amount,
            max_amount,
        ),
        'sale_date': generate_datetimes(num_sales, datetime(2020, 1, 1), datetime(2022, 12, 31)),
        'product_category': generate_categories(num_sales, ['Electronics', 'Clothing', 'Books', 'Home', 'Toys']),
    }
    return pd.DataFrame(data)

In [37]:
def generate_datasets(num_customers, num_sales):
    """Generate a customer table and a sales table that references it.

    The sales rows draw their ``customer_id`` values from the IDs of the
    customer table generated here.

    Returns:
        A ``(customers, sales)`` tuple of DataFrames.
    """
    customers = generate_customer_data(num_customers)
    known_ids = customers['customer_id'].tolist()
    sales = generate_sales_data(num_sales, known_ids)
    return customers, sales


In [38]:
# Dataset sizes.
num_customers = 100
num_sales = 800

# Seed every random source the helpers draw from, so that
# Restart Kernel -> Run All regenerates the same tables.
SEED = 42
random.seed(SEED)     # random strings and datetimes
np.random.seed(SEED)  # integers, floats, categories, customer sampling
Faker.seed(SEED)      # names, emails, addresses, phone numbers

In [39]:
# Build both tables in one call; the sales rows take their customer_id
# values from the customer table generated alongside them.
customer_df, sales_df = generate_datasets(num_customers, num_sales)


In [40]:
# Column names, non-null counts and dtypes of the customer table.
customer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   customer_id        100 non-null    object        
 1   name               100 non-null    object        
 2   email              100 non-null    object        
 3   address            100 non-null    object        
 4   phone_number       100 non-null    object        
 5   registration_date  100 non-null    datetime64[ns]
dtypes: datetime64[ns](1), object(5)
memory usage: 4.8+ KB


In [42]:
# Column names, non-null counts and dtypes of the sales table.
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800 entries, 0 to 799
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   sale_id           800 non-null    object        
 1   customer_id       800 non-null    object        
 2   sale_amount       800 non-null    float64       
 3   sale_date         800 non-null    datetime64[ns]
 4   product_category  800 non-null    object        
dtypes: datetime64[ns](1), float64(1), object(3)
memory usage: 31.4+ KB


In [None]:

# Plain-text preview of the first rows of each table.
print("Customer Data:")
print(customer_df.head())
print("\nSales Data:")
print(sales_df.head())

Customer Data:
  customer_id              name                       email  \
0   CUST00001  Heather Matthews     biancamoyer@example.net   
1   CUST00002  Brittany Burgess      jonathan46@example.com   
2   CUST00003     Kenneth Brown  hoffmanstephen@example.com   
3   CUST00004   Frederick Moore         ltaylor@example.net   
4   CUST00005       Sara Bright          dbowen@example.org   

                                             address        phone_number  \
0  26162 Gonzalez Rapids Suite 848, Larryshire, S...          9343570083   
1                   Unit 6922 Box 4320, DPO AP 89206    346.469.0181x784   
2  9585 Brown Extension Suite 300, Amyport, OK 24557       (701)245-3750   
3  37987 Ho Gateway Suite 262, South Christopher,...  326.485.2453x63981   
4                   Unit 3007 Box 1762, DPO AP 35378       (285)838-8374   

           registration_date  
0 2006-01-07 00:23:10.390119  
1 2007-02-10 06:10:56.693214  
2 2013-02-04 10:35:17.887422  
3 2014-06-04 02:16:55.388