In [1]:
import pandas as pd
import random
import faker
from datetime import datetime

# Initialize Faker and pin both random sources so that re-running the
# notebook from a fresh kernel regenerates the same values.
# - `random.seed` controls random.choice / randint / uniform.
# - `Faker.seed` controls the generator shared by all Faker instances,
#   which is separate from the global `random` module.
# NOTE: Faker calls that use relative dates (e.g. end_date="today") still
# depend on the day the notebook is executed.
SEED = 42
random.seed(SEED)
fake = faker.Faker()
faker.Faker.seed(SEED)

# How many synthetic transactions to generate
num_entries = 10_000

# Pools of allowed values; each categorical column is sampled from one of these
products = [
    "Laptop",
    "Mobile Phone",
    "Tablet",
    "Smartwatch",
    "Desktop",
    "Camera",
    "Printer",
    "Monitor",
    "Keyboard",
    "Headphones",
]
categories = [
    "Electronics",
    "Accessories",
    "Office Equipment",
]
regions = [
    "North",
    "South",
    "East",
    "West",
]
payment_methods = [
    "Credit Card",
    "Debit Card",
    "Cash",
    "Online Payment",
    "UPI",
]

# Generate random sales data: one list of `num_entries` values per column.
# Transaction IDs are sequential ("T00001", "T00002", ...); every other column
# is drawn at random, column by column, in the order listed here.
# NOTE(review): Product and Category are sampled independently of each other,
# so a row can pair e.g. "Laptop" with "Accessories" — confirm this is intended.
data = {
    "Transaction ID": [f"T{i:05d}" for i in range(1, num_entries + 1)],
    "Customer ID": [f"C{random.randint(1000, 9999)}" for _ in range(num_entries)],
    "Product": [random.choice(products) for _ in range(num_entries)],
    "Category": [random.choice(categories) for _ in range(num_entries)],
    "Region": [random.choice(regions) for _ in range(num_entries)],
    "Sales Amount": [round(random.uniform(50, 2000), 2) for _ in range(num_entries)],
    "Quantity": [random.randint(1, 10) for _ in range(num_entries)],
    "Date": [fake.date_between(start_date="-2y", end_date="today") for _ in range(num_entries)],
    "Payment Method": [random.choice(payment_methods) for _ in range(num_entries)],
}

# Create a DataFrame (one row per transaction, one column per dict key)
sales_data = pd.DataFrame(data)

# Save to CSV without the positional index so the file holds only the data columns
sales_data.to_csv("sales_data.csv", index=False)

# Report the real row count instead of a hard-coded figure, so the message
# stays correct when `num_entries` is changed.
print(f"Sales data with {len(sales_data):,} entries created successfully!")

# Read the dataset back from disk; this rebinds `sales_data` to the CSV contents.
# NOTE(review): no parse_dates is passed, so the Date column presumably comes
# back as plain strings — confirm before doing any date arithmetic.
sales_data = pd.read_csv("sales_data.csv")

# Preview the first 5 rows
first_rows = sales_data.head(5)
print(first_rows)

# Summary statistics, printed under a blank-line-separated heading
summary_stats = sales_data.describe()
print("\nSummary Statistics:", summary_stats, sep="\n")

# Total sales by region: sum of "Sales Amount" within each Region value
region_groups = sales_data.groupby("Region")
sales_by_region = region_groups["Sales Amount"].sum()
print("\nTotal Sales by Region:", sales_by_region, sep="\n")

# Total sales by category: same aggregation, keyed on Category
category_groups = sales_data.groupby("Category")
sales_by_category = category_groups["Sales Amount"].sum()
print("\nTotal Sales by Category:", sales_by_category, sep="\n")

# Top 5 products by sales: total per Product, largest first, keep the first five
sales_by_product = sales_data.groupby("Product")["Sales Amount"].sum()
top_products = sales_by_product.sort_values(ascending=False).iloc[:5]
print("\nTop 5 Products by Sales:", top_products, sep="\n")



Sales data with 10,000 entries created successfully!
  Transaction ID Customer ID       Product     Category Region  Sales Amount  \
0         T00001       C4449        Laptop  Accessories   West       1919.46   
1         T00002       C8912        Laptop  Accessories  South        794.40   
2         T00003       C8133  Mobile Phone  Electronics  South       1273.98   
3         T00004       C3677       Desktop  Electronics   West        859.47   
4         T00005       C4763        Tablet  Accessories  South       1124.79   

   Quantity        Date  Payment Method  
0         9  2023-04-10             UPI  
1        10  2023-01-17     Credit Card  
2         2  2023-08-10      Debit Card  
3         4  2024-06-08  Online Payment  
4         5  2024-07-26  Online Payment  

Summary Statistics:
       Sales Amount      Quantity
count  10000.000000  10000.000000
mean    1025.603027      5.494900
std      566.562516      2.890019
min       50.380000      1.000000
25%      534.692500    