In [1]:
import pandas as pd
import numpy as np
import random
import datetime  # to get the time
import calendar  # to get the month

In [2]:
# Product catalog used by the generator.
# Maps product name -> [unit price, relative sampling weight].
# The weight is the relative chance that the product is picked for an order;
# it is passed to random.choices() further down and is not a physical weight.
products = {
    name: [price, weight]
    for name, price, weight in (
        # (product name, price, weight)
        ('iPhone', 700, 10),
        ('Google Phone', 600, 8),
        ('Vareebadd Phone', 400, 3),
        ('20in Monitor', 109.99, 6),
        ('34in Ultrawide Monitor', 379.99, 9),
        ('27in 4K Gaming Monitor', 389.99, 9),
        ('27in FHD Monitor', 149.99, 11),
        ('Flatscreen TV', 300, 7),
        ('Macbook Pro Laptop', 1700, 7),
        ('ThinkPad Laptop', 999.99, 6),
        ('AA Batteries (4-pack)', 3.84, 30),
        ('AAA Batteries (4-pack)', 2.99, 30),
        ('USB-C Charging Cable', 11.95, 30),
        ('Lightning Charging Cable', 14.95, 30),
        ('Wired Headphones', 11.99, 26),
        ('Bose SoundSport Headphones', 99.99, 19),
        ('Apple Airpods Headphones', 150, 22),
        ('LG Washing Machine', 600.00, 1),
        ('LG Dryer', 600.00, 1),
    )
}


In [3]:
# Column headers of the generated sales DataFrame / CSV, in output order.
# Each row produced by write_row() follows this same order.
columns = [
    'Order ID',
    'Product',
    'Quantity Ordered',
    'Price Each',
    'Order Date',
    'Purchase Address',
]

In [4]:
def generate_random_address():
    
    #list of street names
    street_names = ['Main', '2nd', '1st', '4th', '5th', 'Park', '6th', '7th', 'Maple', 'Pine', 'Washington', '8th', 'Cedar', 'Elm', 'Walnut', '9th', '10th', 'Lake', 'Sunset', 'Lincoln', 'Jackson', 'Church', 'River', '11th', 'Willow', 'Jefferson', 'Center', '12th', 'North', 'Lakeview', 'Ridge', 'Hickory', 'Adams', 'Cherry', 'Highland', 'Johnson', 'South', 'Dogwood', 'West', 'Chestnut', '13th', 'Spruce', '14th', 'Wilson', 'Meadow', 'Forest', 'Hill', 'Madison']
    # list of city names
    cities = ['San Francisco', 'Boston', 'New York City', 'Austin', 'Dallas', 'Atlanta', 'Portland', 'Portland', 'Los Angeles', 'Seattle']
    # weights for each city
    weights = [9,4,5,2,3,3,2,0.5,6,0.7]
    # zip code for each city
    zips = ['893828', '898230', '029302', '877982', '909873', '23224', '23232', '343423', '323452', '342134']
    # state name of each city
    state = ['CA', 'MA', 'NY', 'TX', 'TX', 'GA', 'OR', 'ME', 'CA', 'WA']
    
    # function to get a random street name
    street = random.choice(street_names)
    
    # to get the value of range(len(cities)) depending upon the weights
    index = random.choices(range(len(cities)), weights= weights)[0]
    
    return f'{random.randint(1,999)} {street} St, {cities[index]}, {state[index]} {zips[index]}'


In [5]:
def generate_random_time(month):
    """Return a random order timestamp string for the given month of 2019.

    A random day of the month is picked first, then one of two peak times
    (12:00 or 20:00, each with probability 0.5). The peak is shifted by a
    normally distributed offset with mean 0 and a standard deviation of
    180 minutes. Because the offset is unbounded, the result can land on an
    adjacent day (and therefore occasionally in an adjacent month or year).

    Args:
        month: month number, 1..12.

    Returns:
        str: the timestamp formatted as 'MM/DD/YY HH:MM'.
    """
    day = generate_random_day(month)

    # Noon peak or evening peak, chosen with equal probability.
    peak_hour = 12 if random.random() < 0.5 else 20
    peak = datetime.datetime(2019, month, day, peak_hour, 0)

    # Spread orders around the peak; 180 is the standard deviation in minutes.
    offset_minutes = np.random.normal(loc=0.0, scale=180)
    shifted = peak + datetime.timedelta(minutes=offset_minutes)

    return shifted.strftime('%m/%d/%y %H:%M')
        
    

In [6]:
def generate_random_day(month):
    """Return a random valid day number for the given month of 2019.

    Args:
        month: month number, 1..12.

    Returns:
        int: a day drawn uniformly from 1 to the number of days in the month
        (inclusive).
    """
    # calendar.monthrange returns (weekday of the 1st, number of days in the
    # month); only the day count is needed here.
    _, days_in_month = calendar.monthrange(2019, month)
    return random.randint(1, days_in_month)
    

In [7]:
def write_row(order_number, product, order_date, address):
    """Build one sales row for the given product.

    The unit price is looked up in the module-level ``products`` dict. The
    quantity is drawn from a geometric distribution whose success probability
    is 1 - 1/price, so expensive products almost always get a quantity of 1
    while cheap products get larger quantities more often.

    Args:
        order_number: ID of the order this row belongs to.
        product: product name; must be a key of ``products``.
        order_date: order timestamp, stored as given.
        address: purchase address, stored as given.

    Returns:
        list: [order_number, product, quantity, unit price, order_date, address].
    """
    unit_price = products[product][0]

    # A higher price gives a probability closer to 1, hence a smaller quantity.
    success_prob = 1.0 - (1.0 / unit_price)
    (quantity,) = np.random.geometric(p=success_prob, size=1)

    return [order_number, product, quantity, unit_price, order_date, address]

In [8]:
# First order ID. It is incremented once per order (not per row), so every row
# of a multi-item order shares one ID and IDs stay unique across all months.
order_number = 138924

# Product names and their sampling weights, both in the dict's order.
product_list = list(products)
weights = [products[product][1] for product in product_list]

# Accessories that may be added to a phone order, as (probability, product)
# pairs. Each pair is rolled independently, in the order listed.
iphone_add_ons = [
    (0.15, 'Lightning Charging Cable'),
    (0.05, 'Apple Airpods Headphones'),
    (0.05, 'Wired Headphones'),
]
android_add_ons = [
    (0.18, 'USB-C Charging Cable'),
    (0.04, 'Bose SoundSport Headphones'),
    (0.07, 'Wired Headphones'),
]
add_ons_by_phone = {
    'iPhone': iphone_add_ons,
    'Google Phone': android_add_ons,
    'Vareebadd Phone': android_add_ons,
}

# Folder that receives the monthly CSV files. It is not created here, so it
# must already exist.
output_dir = 'C:\\Users\\Admin\\Desktop\\mock_data'

# Generate one CSV per month, August through December.
for month in range(8, 13):

    # Number of orders for the month: normally distributed, with a higher mean
    # in November and December.
    if month <= 10:
        order_amount = int(np.random.normal(loc=12000, scale=4000))
    elif month == 11:
        order_amount = int(np.random.normal(loc=20000, scale=3000))
    else:  # month == 12
        order_amount = int(np.random.normal(loc=26000, scale=2000))

    print(order_amount)

    # Collect rows in a plain list and build the DataFrame once per month.
    # Growing a DataFrame with df.loc[i] = ... copies the frame on every
    # insert, which is quadratic for tens of thousands of rows.
    rows = []

    # A non-positive draw yields an empty range, i.e. a month with no orders.
    for _ in range(order_amount):

        address = generate_random_address()
        order_date = generate_random_time(month)

        # random.choices returns a list; [0] takes the single picked product.
        product_choice = random.choices(product_list, weights)[0]
        rows.append(write_row(order_number, product_choice, order_date, address))

        # Phones may pull accessories into the same order.
        for chance, add_on in add_ons_by_phone.get(product_choice, ()):
            if random.random() < chance:
                rows.append(write_row(order_number, add_on, order_date, address))

        # Any order has a 2% chance of one more randomly chosen product.
        if random.random() <= 0.02:
            product_choice = random.choices(product_list, weights)[0]
            rows.append(write_row(order_number, product_choice, order_date, address))

        order_number += 1

    # dtype=object keeps every value exactly as write_row produced it, so
    # integer prices are not converted to floats when written to CSV.
    df = pd.DataFrame(rows, columns=columns, dtype=object)

    month_name = calendar.month_name[month]
    df.to_csv(f'{output_dir}\\Sales_{month_name}_2019.csv', index=False)
    print(f'{month_name} Finished')
                

9929
August Finished
10948
September Finished
13912
October Finished
17460
November Finished
23657
December Finished
