In [1]:
import pandas as pd
import numpy as np
import datetime
import random

In [2]:
columns = ['Order ID', 'Product', 'Quantity Ordered','Price Each', 'Order Date', 'Purchase Address']


In [3]:
products ={'Travel Electronics Organizer Bag':19.90, 'GoPro':400, 'Video Camera Camcorder':70,
          'eBook Reader':30, 'Oculus Quest 2':300, 'Stereo Gaming Headset for PS4':28, 'Flatscreen TV': 300,
  'Macbook Pro Laptop': 1700,
  'ThinkPad Laptop': 999.99,
  'AA Batteries (4-pack)': 3.84,
  'AAA Batteries (4-pack)': 2.99,
  'USB-C Charging Cable': 11.95,
  'Lightning Charging Cable': 14.95,
  'Wired Headphones': 11.99,
  'Bose SoundSport Headphones': 99.99,
  'Apple Airpods Headphones': 150,
  'LG Washing Machine': 600.00,
  'LG Dryer': 600.00}

In [4]:
df = pd.DataFrame(columns = columns)

In [5]:
df

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address


In [6]:
for i in range(1000):
    product = random.choice(list(products.keys()))
    price = products[product]
    df.loc[i] = [i, product, 1, price, 'NA', 'NA']
df

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,0,Wired Headphones,1,11.99,,
1,1,Bose SoundSport Headphones,1,99.99,,
2,2,LG Washing Machine,1,600.00,,
3,3,LG Dryer,1,600.00,,
4,4,ThinkPad Laptop,1,999.99,,
...,...,...,...,...,...,...
995,995,Macbook Pro Laptop,1,1700.00,,
996,996,ThinkPad Laptop,1,999.99,,
997,997,Lightning Charging Cable,1,14.95,,
998,998,ThinkPad Laptop,1,999.99,,


# Let's make our data more realistic.

## Selecting some products with higher probability



In [7]:
#Product name :[Price, popularity=weight]

products ={'Travel Electronics Organizer Bag':19.90, 'GoPro':400, 'Video Camera Camcorder':70,
          'eBook Reader':30, 'Oculus Quest 2':300, 'Stereo Gaming Headset for PS4':28, 'Flatscreen TV': 300,
  'Macbook Pro Laptop': 1700,
  'ThinkPad Laptop': 999.99,
  'AA Batteries (4-pack)': 3.84,
  'AAA Batteries (4-pack)': 2.99,
  'USB-C Charging Cable': 11.95,
  'Lightning Charging Cable': 14.95,
  'Wired Headphones': 11.99,
  'Bose SoundSport Headphones': 99.99,
  'Apple Airpods Headphones': 150,
  'LG Washing Machine': 600.00,
  'LG Dryer': 600.00}

In [8]:
# As a I am too lazy to make up 'popularity points' I will generate them.

for product in products:
    products[product] = [products[product], random.randint(0, 20)]

#sort products according popularuty-weight    
products = dict(sorted(products.items(), key=lambda item: item[1][1], reverse=True))
products

{'Wired Headphones': [11.99, 20],
 'USB-C Charging Cable': [11.95, 19],
 'Flatscreen TV': [300, 18],
 'Lightning Charging Cable': [14.95, 18],
 'LG Dryer': [600.0, 18],
 'Apple Airpods Headphones': [150, 17],
 'Video Camera Camcorder': [70, 14],
 'ThinkPad Laptop': [999.99, 11],
 'Travel Electronics Organizer Bag': [19.9, 9],
 'Oculus Quest 2': [300, 9],
 'AA Batteries (4-pack)': [3.84, 9],
 'eBook Reader': [30, 8],
 'Macbook Pro Laptop': [1700, 8],
 'GoPro': [400, 7],
 'LG Washing Machine': [600.0, 7],
 'Bose SoundSport Headphones': [99.99, 6],
 'AAA Batteries (4-pack)': [2.99, 5],
 'Stereo Gaming Headset for PS4': [28, 2]}

In [9]:
product_list = list(products.keys())
weight_list = [products[product][1] for product in product_list]
                    

for i in range(1000):
    product = random.choices(population=product_list, weights=weight_list)[0]
    price = products[product][0]
    df.loc[i] = [i, product, 1, price, 'NA', 'NA']
df

Unnamed: 0,Order ID,Product,Quantity Ordered,Price Each,Order Date,Purchase Address
0,0,Oculus Quest 2,1,300.00,,
1,1,Video Camera Camcorder,1,70.00,,
2,2,Macbook Pro Laptop,1,1700.00,,
3,3,USB-C Charging Cable,1,11.95,,
4,4,Video Camera Camcorder,1,70.00,,
...,...,...,...,...,...,...
995,995,Macbook Pro Laptop,1,1700.00,,
996,996,Apple Airpods Headphones,1,150.00,,
997,997,Wired Headphones,1,11.99,,
998,998,Travel Electronics Organizer Bag,1,19.90,,


In [11]:
#check that some products are more popular than other

df.groupby('Product').count()['Order ID'].sort_values(ascending = False)

Product
Wired Headphones                    108
USB-C Charging Cable                 98
Lightning Charging Cable             93
LG Dryer                             90
Flatscreen TV                        87
Video Camera Camcorder               79
Apple Airpods Headphones             67
Macbook Pro Laptop                   47
Travel Electronics Organizer Bag     46
ThinkPad Laptop                      46
AA Batteries (4-pack)                43
Oculus Quest 2                       37
eBook Reader                         33
Bose SoundSport Headphones           31
AAA Batteries (4-pack)               31
GoPro                                30
LG Washing Machine                   24
Stereo Gaming Headset for PS4        10
Name: Order ID, dtype: int64

In [12]:
[product for product in products]

['Wired Headphones',
 'USB-C Charging Cable',
 'Flatscreen TV',
 'Lightning Charging Cable',
 'LG Dryer',
 'Apple Airpods Headphones',
 'Video Camera Camcorder',
 'ThinkPad Laptop',
 'Travel Electronics Organizer Bag',
 'Oculus Quest 2',
 'AA Batteries (4-pack)',
 'eBook Reader',
 'Macbook Pro Laptop',
 'GoPro',
 'LG Washing Machine',
 'Bose SoundSport Headphones',
 'AAA Batteries (4-pack)',
 'Stereo Gaming Headset for PS4']

As we can see products with max weight are in the top after groupby.count

# Generating months

In [14]:
import calendar

In [18]:
#calendar.month_name - indexing from 1
print(calendar.month_name[1])
#return empty string
print(calendar.month_name[0])

January



In [20]:
product_list = list(products.keys())
weight_list = [products[product][1] for product in product_list]


for month_i in range(1,12+1):
    #for each month lets create a dataframe
    df_i = pd.DataFrame(columns=columns)
                    
    for i in range(1000):
        product = random.choices(population=product_list, weights=weight_list)[0]
        price = products[product][0]
        df_i.loc[i] = [i, product, 1, price, 'NA', 'NA']
        month_name = calendar.month_name[month_i]
        df_i.to_csv(f'{month_name}_data.csv')
        

# Make some months have more purchase than others

In [25]:
#I guess, that during holiday-season lake valentine's day, The 23 of Februrary,
# The 8 of March, and New year(+ November)  people purchase more.

holday_months = [2, 3, 11]
new_year_month = 12

product_list = list(products.keys())
weight_list = [products[product][1] for product in product_list]

for month_i in range(1,12+1):
    #for each month lets create a dataframe
    df_i = pd.DataFrame(columns=columns)
    
    if (month_i in holday_months):
        orders_number = int(np.random.normal(loc=1500, scale = 400))
    elif (month_i == new_year_month):
        orders_number = int(np.random.normal(loc = 2500, scale = 300))
    else:
        orders_number = int(np.random.normal(loc = 1300, scale = 200))
            
    for i in range(orders_number):
        product = random.choices(population=product_list, weights=weight_list)[0]
        price = products[product][0]
        df_i.loc[i] = [i, product, 1, price, 'NA', 'NA']
        month_name = calendar.month_name[month_i]
        df_i.to_csv(f'{month_name}_data.csv')
    print(len(df_i), month_name)




1515 January
1581 February
1218 March
1391 April
1252 May
1021 June
1480 July
1160 August
1421 September
1307 October
1483 November
2577 December


# Generating Random Address

In [33]:
street_names = ['Центральная', 'Молодежная', 'Школьная', 'Лесная', 'Садовая', 'Советская', 'Новая', 'Набережная', 'Заречная', 'Зеленая']
#Благовещенск в Амурской области и Благовещенск в Башкортостане
cities = {'Каза́нская о́бласть':'Казань','Сочинский район':'Сочи','Нижегородская область':'Нижний Новгород', 'Московская область':'Москва', 'Ленинградская область' : 'СПБ', 'Калинингра́дская о́бласть':'Калининград','Свердловская область':'Екатеринбург', 'Амурская область':'Благовещенск', 'Респу́блика Башкортоста́н':'Благовещенск'}
state = np.random.choice(list(cities.keys()))

# As a I am too lazy to make up 'popularity points' I will generate them.

for state in cities:
    cities[state] = [cities[state], random.randint(1, 10)]

#sort products according popularuty-weight    
cities = dict(sorted(cities.items(), key=lambda item: item[1][1], reverse=True))

cities



{'Сочинский район': ['Сочи', 10],
 'Нижегородская область': ['Нижний Новгород', 10],
 'Московская область': ['Москва', 7],
 'Респу́блика Башкортоста́н': ['Благовещенск', 7],
 'Ленинградская область': ['СПБ', 5],
 'Калинингра́дская о́бласть': ['Калининград', 5],
 'Амурская область': ['Благовещенск', 3],
 'Каза́нская о́бласть': ['Казань', 2],
 'Свердловская область': ['Екатеринбург', 2]}

In [44]:
def generate_random_adress():
    weight_list = [cities[state][1] for state in cities]
    state =  random.choices(list(cities.keys()),weight_list)
    return f"{state[0]}, город {cities[state[0]][0]}, {random.choice(street_names)} улица, дом {np.random.randint(100)}, квартира {np.random.randint(200)}"
generate_random_adress()

'Сочинский район, город Сочи, Центральная улица, дом 66, квартира 81'

In [47]:
#I guess, that during holiday-season lake valentine's day, The 23 of Februrary,
# The 8 of March, and New year(+ November)  people purchase more.

holday_months = [2, 3, 11]
new_year_month = 12

product_list = list(products.keys())
weight_list = [products[product][1] for product in product_list]

for month_i in range(1,12+1):
    #for each month lets create a dataframe
    df_i = pd.DataFrame(columns=columns)
    
    if (month_i in holday_months):
        orders_number = int(np.random.normal(loc=1500, scale = 400))
    elif (month_i == new_year_month):
        orders_number = int(np.random.normal(loc = 2500, scale = 300))
    else:
        orders_number = int(np.random.normal(loc = 1300, scale = 200))
            
    for i in range(orders_number):
        product = random.choices(population=product_list, weights=weight_list)[0]
        price = products[product][0]
        address = generate_random_adress()
        df_i.loc[i] = [i, product, 1, price, 'NA', address]
        month_name = calendar.month_name[month_i]
        df_i.to_csv(f'{month_name}_data.csv')
    print(len(df_i), month_name)




1533 January
1315 February
1319 March
1361 April
1372 May
1533 June
1459 July
1203 August
1210 September
997 October
1591 November
2650 December


# Generate order time

In [63]:
calendar.monthrange(2020, 1)[1]

31

In [85]:
#Format is dd.mm.yyyy H:m
def generate_random_time(month):
    #Returns weekday of first day of the month and number of days in month
    day_range = calendar.monthrange(2020, month)[1]
    random_day = random.randint(1, day_range)
    
    if(random.random() < 0.5):
        date = datetime.datetime(2020, month, random_day, 11, 0)
    else:
        date = datetime.datetime(2020, month, random_day, 20, 0)

    time_offset = int(np.random.normal(loc= 0, scale = 180))
    final_date = date + datetime.timedelta(minutes = time_offset)
    return final_date.strftime("%d.%m.%Y %H:%M")


In [86]:
generate_random_time(1)

'13.01.2020 18:25'

In [87]:
#I guess, that during holiday-season lake valentine's day, The 23 of Februrary,
# The 8 of March, and New year(+ November)  people purchase more.

holday_months = [2, 3, 11]
new_year_month = 12

product_list = list(products.keys())
weight_list = [products[product][1] for product in product_list]

for month_i in range(1,12+1):
    #for each month lets create a dataframe
    df_i = pd.DataFrame(columns=columns)
    
    if (month_i in holday_months):
        orders_number = int(np.random.normal(loc=1500, scale = 400))
    elif (month_i == new_year_month):
        orders_number = int(np.random.normal(loc = 2500, scale = 300))
    else:
        orders_number = int(np.random.normal(loc = 1300, scale = 200))
            
    for i in range(orders_number):
        product = random.choices(population=product_list, weights=weight_list)[0]
        price = products[product][0]
        address = generate_random_adress()
        order_time = generate_random_time(month_i)
        df_i.loc[i] = [i, product, 1, price, order_time, address]
        month_name = calendar.month_name[month_i]
        df_i.to_csv(f'{month_name}_data.csv')
    print(len(df_i), month_name)




1298 January
679 February
969 March
1415 April
896 May
1565 June
1275 July
1130 August
1484 September
1545 October
1155 November
2508 December


# Quantity using geometric distribution

Logic^

quantity_ordered = np.random.geometric(p = 1.0 - (1.0/price))

If the price of product is high, it is unlickly that somebody will buy many products of this type.

In [89]:
#I guess, that during holiday-season lake valentine's day, The 23 of Februrary,
# The 8 of March, and New year(+ November)  people purchase more.

holday_months = [2, 3, 11]
new_year_month = 12

product_list = list(products.keys())
weight_list = [products[product][1] for product in product_list]

for month_i in range(1,12+1):
    #for each month lets create a dataframe
    df_i = pd.DataFrame(columns=columns)
    
    if (month_i in holday_months):
        orders_number = int(np.random.normal(loc=1500, scale = 400))
    elif (month_i == new_year_month):
        orders_number = int(np.random.normal(loc = 2500, scale = 300))
    else:
        orders_number = int(np.random.normal(loc = 1300, scale = 200))
            
    for i in range(orders_number):
        product = random.choices(population=product_list, weights=weight_list)[0]
        price = products[product][0]
        address = generate_random_adress()
        order_time = generate_random_time(month_i)
        quantity_ordered = np.random.geometric(p = 1.0 - (1.0/price))
        df_i.loc[i] = [i, product, quantity_ordered, price, order_time, address]
        month_name = calendar.month_name[month_i]
        df_i.to_csv(f'{month_name}_data.csv')
    print(len(df_i), month_name)




1314 January
1828 February
1293 March
1511 April
1396 May
1434 June
1235 July
1162 August
1323 September
1374 October
1951 November
2254 December
