In [None]:
# In this cell I import all the packages I will use to generate synthetic Amazon e-commerce sales data for the year 2024.
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta


In [None]:
# Seed both random number generators used in this notebook (Python's `random`
# module and NumPy's global generator) from one constant, so that re-running
# the notebook from a fresh kernel produces the same output and the two seeds
# cannot accidentally drift apart.
SEED = 52
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Ten product categories, each mapped to the (lowest, highest) price that a
# product in that category can be assigned. Written as a table of rows so the
# two bounds of every category line up.
category_list_price_range = {
    category_name: (lowest_price, highest_price)
    for category_name, lowest_price, highest_price in [
        ("Electronics", 100, 500),
        ("Home & Kitchen", 50, 300),
        ("Clothing, Shoes & Jewelry", 20, 100),
        ("Beauty & Personal Care", 15, 50),
        ("Books", 10, 30),
        ("Health & Wellness", 20, 150),
        ("Tools & Home Improvement", 40, 200),
        ("Pet Supplies", 10, 70),
        ("Sports & Outdoors", 60, 350),
        ("Toys & Games", 10, 60),
    ]
}

In [None]:
# Build the product catalog: 250 products for each of the 10 categories
# (2,500 in total). Every entry is a (product_id, category, price) tuple.
# IDs are numbered sequentially across categories ("PRD000001", "PRD000002", ...)
# and each price is drawn uniformly from the category's price range, rounded
# to two decimals.
product_catalog = []
product_counter = 1
for category, (lowest_price, highest_price) in category_list_price_range.items():
  for _ in range(250):
    product_catalog.append((
        f"PRD{product_counter:06d}",
        category,
        round(random.uniform(lowest_price, highest_price), 2),
    ))
    product_counter += 1

In [None]:
# Helper function that returns a random date in 2024, from January 1st through December 31st (inclusive).
def random_date_2024():
  """Return a random day of 2024 as a `datetime` at midnight.

  The day is picked with `random.randint`, so 1 January and 31 December are
  both possible results.
  """
  first_day = datetime(2024, 1, 1)
  # Number of days between the first and the last day of the year.
  span_in_days = (datetime(2024, 12, 31) - first_day).days
  day_offset = random.randint(0, span_in_days)
  return first_day + timedelta(days=day_offset)

In [None]:
# This function defines a fixed return probability for each category in each country.
def returnprobs_country():
  """Return the per-unit return probability for every country and category.

  Returns:
    A newly built dict keyed by country name ("USA", "Canada", "Mexico");
    each value is a dict mapping a category name to its return probability.
  """
  usa_probs = {
      "Electronics": 0.22,
      "Home & Kitchen": 0.10,
      "Clothing, Shoes & Jewelry": 0.25,
      "Beauty & Personal Care": 0.15,
      "Books": 0.05,
      "Health & Wellness": 0.12,
      "Tools & Home Improvement": 0.10,
      "Pet Supplies": 0.10,
      "Sports & Outdoors": 0.14,
      "Toys & Games": 0.12,
  }
  canada_probs = {
      "Electronics": 0.20,
      "Home & Kitchen": 0.09,
      "Clothing, Shoes & Jewelry": 0.23,
      "Beauty & Personal Care": 0.13,
      "Books": 0.04,
      "Health & Wellness": 0.10,
      "Tools & Home Improvement": 0.09,
      "Pet Supplies": 0.09,
      "Sports & Outdoors": 0.12,
      "Toys & Games": 0.10,
  }
  mexico_probs = {
      "Electronics": 0.25,
      "Home & Kitchen": 0.12,
      "Clothing, Shoes & Jewelry": 0.28,
      "Beauty & Personal Care": 0.17,
      "Books": 0.06,
      "Health & Wellness": 0.14,
      "Tools & Home Improvement": 0.11,
      "Pet Supplies": 0.11,
      "Sports & Outdoors": 0.16,
      "Toys & Games": 0.14,
  }
  return {"USA": usa_probs, "Canada": canada_probs, "Mexico": mexico_probs}

In [None]:
# Candidate order quantities for each category. One entry of a category's list
# is picked at random per order, so a value that appears more than once in the
# list is proportionally more likely to be chosen.
unit_distribution = {
    "Electronics": [1, 1, 2, 3],
    "Home & Kitchen": [1, 2, 1, 1],
    "Clothing, Shoes & Jewelry": [1, 1, 1, 2],
    "Beauty & Personal Care": [1, 2, 2, 3],
    "Books": [1, 2, 1, 1],
    "Health & Wellness": [1, 3, 1, 2],
    "Tools & Home Improvement": [1, 1, 2, 3],
    "Pet Supplies": [1, 2, 3, 1],
    "Sports & Outdoors": [1, 1, 2, 3],
    "Toys & Games": [1, 2, 2, 3],
}

In [None]:
# This function generates a random customer rating whose range depends on the category and the country.
# (lowest, highest) rating bounds for the categories that have their own range
# in each country.
_RATING_RANGES = {
    "USA": {
        "Books": (3.7, 4.9),
        "Electronics": (2.5, 4.5),
        "Clothing, Shoes & Jewelry": (2.0, 4.2),
    },
    "Canada": {
        "Books": (4.0, 5.0),
        "Electronics": (2.8, 4.8),
        "Clothing, Shoes & Jewelry": (2.2, 4.5),
    },
    "Mexico": {
        "Books": (3.5, 4.8),
        "Electronics": (2.0, 4.0),
        "Clothing, Shoes & Jewelry": (1.8, 4.0),
    },
}
# Bounds applied to every other category of a country.
_DEFAULT_RATING_RANGE = {
    "USA": (3.0, 5.0),
    "Canada": (3.2, 5.0),
    "Mexico": (2.8, 4.8),
}


def customer_rating(category,country):
  """Draw a random rating, rounded to one decimal, for a category/country pair.

  Args:
    category: Product category name. "Books", "Electronics" and
      "Clothing, Shoes & Jewelry" have dedicated ranges; any other value
      uses the country's default range.
    country: "USA", "Canada" or "Mexico".

  Returns:
    A float drawn uniformly from the applicable range and rounded to one
    decimal place, or None (without drawing a random number) when the
    country is not one of the three above.
  """
  if country not in _DEFAULT_RATING_RANGE:
    return None
  lowest, highest = _RATING_RANGES[country].get(category, _DEFAULT_RATING_RANGE[country])
  return round(random.uniform(lowest, highest), 1)

In [None]:
# Countries covered by the dataset, the probability with which each one is
# picked for an order, and the possible seller types. Each country is paired
# with its weight in one mapping so the two derived lists always stay aligned.
country_sales_share = {"USA": 0.50, "Canada": 0.28, "Mexico": 0.22}
countries = list(country_sales_share)
countries_distribution = list(country_sales_share.values())
seller_types = ["Amazon FBA", "Vendor"]

In [None]:
# Generate 1 million synthetic order rows from the catalog, the distributions
# and the helper functions defined in the cells above.

records = []
n = 1000000

# The return-probability table is the same for every order, so build it once
# here instead of rebuilding the nested dict on each of the `n` iterations.
return_probabilities = returnprobs_country()

for i in range(1, n + 1):
  order_id = f"ORD{i:08d}"
  # Pick a product; its category and price come from the catalog entry.
  product_id, category, price = random.choice(product_catalog)
  country = np.random.choice(countries, p=countries_distribution)
  order_date = random_date_2024().date()
  # Quantity is drawn from the category's unit list and kept at 1 or more.
  base_units = random.choice(unit_distribution[category])
  units_sold = max(1, int(base_units))
  revenue = round(units_sold * price, 2)
  return_probability = return_probabilities[country][category]
  # Each sold unit is returned independently with `return_probability`.
  return_units = sum(np.random.binomial(1, return_probability) for _ in range(units_sold))
  rating = customer_rating(category, country)
  seller_type = random.choice(seller_types)

  records.append([order_id, product_id, category, order_date, units_sold, price, revenue, return_units, rating, country, seller_type])

In [None]:
# Turn the generated rows into a DataFrame (column order matches the order of
# the values in each record) and write it to a CSV file without the index.
column_names = [
    "order_id",
    "product_id",
    "category",
    "order_date",
    "units_sold",
    "price",
    "revenue",
    "return_units",
    "rating",
    "country",
    "seller_type",
]
df = pd.DataFrame(records, columns=column_names)

df.to_csv("amazon_sales_data.csv", index=False)