In [1]:
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
import pandas as pd
import numpy as np

np.random.seed(42)

<IPython.core.display.Javascript object>

In [3]:
# Make n rows of data
n = 213
perc_nan = 0.15

<IPython.core.display.Javascript object>

In [4]:
# product_id & price --------------------------------------------------------

# Using less than n to have less product diversity
small_n = int(n * 0.05)

# generate zero padded ids (i.e. always display as 3 digits)
# if the number is '3' "pad" it with zeros on the left to be '003'
ids = [f"{i:03}" for i in range(small_n)]

# Random prices
prices = np.random.uniform(0.99, 24.99, size=small_n)

# Format prices as money (put dollar sign in front and format to 2 decimal places)
prices = np.array([f"${p:0.2f}" for p in prices])

# Sample to have a mix of products instead of 001 - 00n
df = pd.DataFrame({"product_id": ids, "price": prices})
df = df.sample(n=n, replace=True)

# Make some values nan at random
nan_idxs = np.random.choice(df.shape[0], size=int(n * perc_nan))
df.iloc[nan_idxs, 1] = np.nan

df.head(2)

Unnamed: 0,product_id,price
5,5,$4.73
4,4,


<IPython.core.display.Javascript object>

In [5]:
# state -----------------------------------------------------------------------
states = np.random.choice(
    ["CA", "OR", "NY", "FL", "TN", "AZ", "GA", "ID", "HI"], size=n, replace=True
)

df["state"] = states
df.head(2)

Unnamed: 0,product_id,price,state
5,5,$4.73,ID
4,4,,FL


<IPython.core.display.Javascript object>

In [6]:
# date ------------------------------------------------------------------------
# lazy way
days = np.random.choice(range(1, 26), size=n, replace=True)
months = np.random.choice(range(1, 13), size=n, replace=True)
years = np.random.choice(range(2010, 2020), size=n, replace=True)

dates = [f"{y}-{m:02}-{d:02}" for y, m, d in zip(years, months, days)]
dates = sorted(dates)

df["date"] = dates
df.head(2)

Unnamed: 0,product_id,price,state,date
5,5,$4.73,ID,2010-02-05
4,4,,FL,2010-02-07


<IPython.core.display.Javascript object>

In [7]:
# rating ----------------------------------------------------------------------
ratings = ["*" * np.random.randint(1, 6) for _ in range(n)]

df["rating"] = ratings
df.head(2)

Unnamed: 0,product_id,price,state,date,rating
5,5,$4.73,ID,2010-02-05,*
4,4,,FL,2010-02-07,****


<IPython.core.display.Javascript object>

In [8]:
# units_sold ------------------------------------------------------------------
# Randomness
units_sold = np.random.normal(5, 3, n)

# Make sure no value is below 1. cause you cant sell less than one units. prolly
units_sold = np.clip(units_sold, 1, np.inf)

# Add a trend
units_sold += np.arange(0, n)

# Round because you cant sell fractional units. prolly
units_sold = np.round(units_sold)

# Make some values nan
nan_idxs = np.random.choice(units_sold.size, size=int(n * perc_nan))
units_sold[nan_idxs] = np.nan

df["units_sold"] = units_sold
df.head(2)

Unnamed: 0,product_id,price,state,date,rating,units_sold
5,5,$4.73,ID,2010-02-05,*,6.0
4,4,,FL,2010-02-07,****,7.0


<IPython.core.display.Javascript object>

In [9]:
# order columns to match slides
df = df[["product_id", "state", "date", "rating", "units_sold", "price"]]
df.head(2)

Unnamed: 0,product_id,state,date,rating,units_sold,price
5,5,ID,2010-02-05,*,6.0,$4.73
4,4,FL,2010-02-07,****,7.0,


<IPython.core.display.Javascript object>

In [10]:
# Make all ratings for most popular product NaN
most_popular_prod = df["product_id"].value_counts().index[0]
df.loc[df["product_id"] == most_popular_prod, "rating"] = np.nan
df.head(2)

Unnamed: 0,product_id,state,date,rating,units_sold,price
5,5,ID,2010-02-05,*,6.0,$4.73
4,4,FL,2010-02-07,****,7.0,


<IPython.core.display.Javascript object>

In [11]:
df.to_csv("orders.csv")

<IPython.core.display.Javascript object>