# Step-by-Step Explanation of the Retail Store Dataset Generation Code

In [43]:
#1. Importing Necessary Libraries
import os  # Used to report the directory where the CSV file is saved.
import random  # Used for selecting random values from lists like product categories and store locations.

import numpy as np  # Imported, but not used by any of the cells visible in this notebook.
import pandas as pd  # For creating and manipulating the dataset
from faker import Faker  # Generates realistic fake data for fields like dates, UUIDs, etc.

In [44]:
#2. Initialize Faker and Seed for Reproducibility
SEED = 0  # Single seed value shared by Faker and the random module.

fake = Faker()  # Faker instance used by later cells to generate dates and UUIDs.
Faker.seed(SEED)  # Optional: fixes Faker's random stream so its generated values repeat across runs.
random.seed(SEED)  # Optional: fixes the random module's stream so the random picks repeat across runs.

In [56]:
# Number of rows (one per transaction) to generate for the dataset
num_rows = 3_000

In [57]:
# Product catalogue: maps each product category to the products sold under it.
# Key order and list order are kept stable because the seeded random.choice
# calls in the populate cell pick by position.
categories = {
    "Electronics": [
        "Smartphone",
        "Laptop",
        "Tablet",
        "Headphones",
        "Smartwatch",
    ],
    "Clothing": [
        "Shirt",
        "Jeans",
        "Jacket",
        "Dress",
        "Shoes",
    ],
    "Groceries": [
        "Milk",
        "Eggs",
        "Bread",
        "Fruits",
        "Vegetables",
    ],
    "Home Appliances": [
        "Mixer",
        "Vacuum Cleaner",
        "Air Conditioner",
        "Refrigerator",
        "Microwave",
    ],
}

In [58]:
# Payment methods available for transactions (one is picked at random per row).
payment_methods = [
    "Credit Card",
    "Cash",
    "Debit Card",
    "Mobile Payment",
]

In [59]:
# Store locations where a transaction can take place (one is picked at random per row).
store_locations = [
    "New York",
    "Los Angeles",
    "Chicago",
    "Houston",
    "Phoenix",
]

In [60]:
# Initialize dataset dictionary
def _make_column(make_value):
    """Return a list of num_rows values, calling make_value() once per row."""
    return [make_value() for _ in range(num_rows)]


# The columns are generated in the order written below (dict values are
# evaluated top to bottom), so the sequence of random draws stays the same
# from run to run once the seeds are set.
data = {
    # Date: random date between one year ago and today. The window is relative
    # to the day the cell runs, so the dates can shift between days even with
    # fixed seeds.
    "Date": _make_column(lambda: fake.date_between(start_date="-1y", end_date="today")),
    # Customer ID: first 8 characters of a generated UUID (uniqueness is not enforced).
    "Customer ID": _make_column(lambda: fake.uuid4()[:8]),
    # Store Location: randomly selected store location.
    "Store Location": _make_column(lambda: random.choice(store_locations)),
    # Category, Product and Price start empty and are populated in the next step.
    "Category": [],
    "Product": [],
    # Quantity: random whole number between 1 and 10 (inclusive).
    "Quantity": _make_column(lambda: random.randint(1, 10)),
    "Price": [],
    # Payment Method: randomly selected payment method.
    "Payment Method": _make_column(lambda: random.choice(payment_methods)),
}

In [61]:
# Populate category, product, and price columns
# Price range (low, high) for each category. A category that has no entry here
# raises KeyError instead of silently reusing the previous row's price.
price_ranges = {
    "Electronics": (100, 1000),
    "Clothing": (20, 200),
    "Groceries": (5, 50),
    "Home Appliances": (50, 500),
}
category_names = list(categories.keys())  # Loop-invariant, so built once.

# The rows are collected in fresh lists and assigned to the dictionary at the
# end. Appending directly to data[...] would add another num_rows entries each
# time this cell is re-run and make the column-length check fail.
picked_categories = []
picked_products = []
picked_prices = []
for _ in range(num_rows):
    # Category and Product: randomly pick a category, then one of its products.
    category = random.choice(category_names)
    product = random.choice(categories[category])
    picked_categories.append(category)
    picked_products.append(product)

    # Price: random value within the category's range, rounded to two decimal places.
    low, high = price_ranges[category]
    price = random.uniform(low, high)
    picked_prices.append(round(price, 2))

# Assigning to existing keys keeps the column order of the dictionary unchanged.
data["Category"] = picked_categories
data["Product"] = picked_products
data["Price"] = picked_prices

In [68]:
# Total Sales: Quantity multiplied by Price for each row, rounded to two decimal places.
# This cell has to run before the DataFrame is created; a column added to the
# dictionary afterwards does not appear in the DataFrame or the CSV file.
data["Total Sales"] = [
    round(quantity * unit_price, 2)
    for quantity, unit_price in zip(data["Quantity"], data["Price"])
]

In [62]:
# Verify all columns have the same length
# Stops at the first column whose length differs from num_rows and names it in the error.
for key in data:
    values = data[key]
    assert len(values) == num_rows, f"Column '{key}' has {len(values)} elements instead of {num_rows}."

In [63]:
# Create a DataFrame
# Converts the dictionary into a structured pandas DataFrame: each key becomes
# a column and each list supplies that column's values.
df = pd.DataFrame(data=data)

In [65]:
# Save the dataset to a CSV file
# File Name: built from the number of rows actually in the DataFrame, so the
# name always matches the contents. With the default 3000 rows this is
# retail_store_data_3000_rows.csv.
output_file = f"retail_store_data_{len(df)}_rows.csv"

# Index: Excluded from the CSV file by setting index=False.
df.to_csv(output_file, index=False)

In [66]:
# Confirmation message that echoes the file name stored in output_file.
print(f"Dataset created and saved as '{output_file}'.")

Dataset created and saved as 'retail_store_data_3000_rows.csv'.


In [67]:
# Report the directory that contains the saved CSV file.
# The directory is resolved from output_file rather than taken from
# os.getcwd(), so the message stays correct when output_file is changed to
# point to another folder. For a bare file name the result is the current
# working directory, as before.
print("File saved at:", os.path.dirname(os.path.abspath(output_file)))


File saved at: C:\Users\Admin\Desktop


# Summary of Output

In [1]:
#Dataset Details: The dataset contains 3000 rows with the following columns:
#Date: Transaction date.
#Customer ID: Unique identifier for each customer.
#Store Location: Location of the transaction.
#Category: Product category.
#Product: Specific product purchased.
#Quantity: Quantity of the product purchased.
#Price: Price of the product.
#Payment Method: Payment method used in the transaction.
#Total Sales: Total sales value (Quantity × Price).
#File Location: Saved as retail_store_data_3000_rows.csv in the current working directory. You can customize the file path by modifying the output_file variable.