In [1]:
import pandas as pd
import numpy as np
import requests
import random
from datetime import datetime, timedelta

In [2]:
# GitHub repository information which contain csv files for database
repository_owner = 'AntonMiniazev'
repository_name = 'online_retail_reporting'
folder_path = 'initial_data_source'

In [3]:
# Use Github API to get source files from repository
# GitHub API URL to fetch directory contents
api_url = f'https://api.github.com/repos/{repository_owner}/{repository_name}/contents/{folder_path}'

# Send GET request to fetch directory contents
response = requests.get(api_url)

# Check if the request was successful
if response.status_code == 200:
    contents = response.json()
    
    # List to store CSV file links
    csv_links = []
    
    # Iterate over the contents
    for item in contents:
        if item['type'] == 'file' and item['name'].endswith('.csv'):
            # Construct the raw file link
            raw_link = item['download_url']
            
            # Append the raw file link to the CSV links list
            csv_links.append(raw_link)
else:
    print(f"Failed to fetch directory contents. Status Code: {response.status_code}")

In [4]:
# Iterate over the CSV files in the folder
csv_links

['https://raw.githubusercontent.com/AntonMiniazev/online_retail_reporting/main/initial_data_source/assortment.csv',
 'https://raw.githubusercontent.com/AntonMiniazev/online_retail_reporting/main/initial_data_source/delivery_types.csv',
 'https://raw.githubusercontent.com/AntonMiniazev/online_retail_reporting/main/initial_data_source/orders.csv',
 'https://raw.githubusercontent.com/AntonMiniazev/online_retail_reporting/main/initial_data_source/products.csv',
 'https://raw.githubusercontent.com/AntonMiniazev/online_retail_reporting/main/initial_data_source/resource.csv',
 'https://raw.githubusercontent.com/AntonMiniazev/online_retail_reporting/main/initial_data_source/store.csv',
 'https://raw.githubusercontent.com/AntonMiniazev/online_retail_reporting/main/initial_data_source/zone.csv']

In [5]:
df_assortment = pd.read_csv(csv_links[0], dtype=object, delimiter = ';', thousands=',')
df_products = pd.read_csv(csv_links[3], dtype=object, delimiter = ';', thousands=',')
df_delivery_types = pd.read_csv(csv_links[1], dtype=object, delimiter = ';', thousands=',')

In [6]:
df_assortment

Unnamed: 0,product_id,Category,name,selling_price,cost_of_sales,limit_min,limit_car_max,limit_bike_max
0,100125,Dairy,Milk,60,39.6,1,3,2
1,100190,Dairy,Cheese,250,180.0,1,3,2
2,100276,Dairy,Yogurt,40,27.6,2,6,4
3,100290,Dairy,Butter,350,234.5,1,2,1
4,100374,Dairy,Eggs,110,74.8,1,2,1
5,100407,Bakery,Bread,40,31.2,1,2,2
6,100488,Bakery,Bagels,200,156.0,1,2,1
7,100514,Bakery,Croissants,175,134.75,1,2,1
8,100548,Bakery,Muffins,230,151.8,1,2,1
9,100599,Bakery,Donuts,340,255.0,1,2,1


### Specs
1. Create orders for tables Products and Orders
2. Orders should have columns [delivery_date, delivery_type, product_id, order_id,quantity,selling_price, cost_of_sales,zone_id]
Limitations for orders:
- Every product has limits (min and max, max depends on delivery_type: 1 - Bike, 2 - Car)
- Every day has 20-35 orders
- Number of positions for cars 5 to 20 and for bike 2 to 12

In [7]:
# Sets 1 or 2 for a delivery type
def assign_values_within_day(df):
    num_orders = len(df)
    num_ones = int(num_orders * random.uniform(0.6, 0.85))  # Calculate number of 1s

    # Assign 1s and 2s randomly within the day
    values = random.choices([1, 2], k=num_orders)
    values[:num_ones] = [1] * num_ones  # Set the first num_ones elements to 1
    return values

def get_random_positions(delivery_type, assortment = df_assortment):
    if delivery_type == 1:
        num_ids = random.randint(2, 12)  # Bikes will have positions between 2 and 12
    else:
        num_ids = random.randint(5, 20)  # Cars will have positions between 5 and 20
    random_positions = df_assortment['product_id'].sample(n=num_ids).tolist()
    return random_positions

def add_values_to_column(df, values):
    num_rows = len(values)
    num_cols = len(df.columns)
    repeated_df = pd.concat([df] * num_rows, ignore_index=True)
    repeated_df['product_id'] = values[:num_rows]
    return repeated_df

def add_positions(df):
    df_with_positions = pd.DataFrame(columns=df.columns)
    for id in df['order_id']:
        tmp_df = df.query('order_id == @id')
        tmp_df = add_values_to_column(tmp_df,get_random_positions(tmp_df.iloc[0]['delivery_type']))
        df_with_positions = pd.concat([df_with_positions,tmp_df], ignore_index=True)
    return df_with_positions

def add_quantity(row):
    prod_id = row['product_id']
    product_limits = df_assortment.query('product_id == @prod_id')
    if row['delivery_type'] == 1:
        num_q = random.randint(int(product_limits.iloc[0]['limit_min']), int(product_limits.iloc[0]['limit_bike_max']))
    else:
        num_q = random.randint(int(product_limits.iloc[0]['limit_min']), int(product_limits.iloc[0]['limit_car_max']))
    return num_q

def orders_dates(start_date, end_date,df_assortment):
    # Convert start_date and end_date strings to datetime objects
    start_date = datetime.strptime(start_date, "%d.%m.%Y")
    end_date = datetime.strptime(end_date, "%d.%m.%Y")

    # Calculate the number of days between start_date and end_date
    num_days = (end_date - start_date).days + 1

    gen_orders = []
    first_order = 10000  # Initial order_id

    # Generate rows for each date in the range
    for i in range(num_days):
        current_date = start_date + timedelta(days=i)
        num_orders = random.randint(20, 35)  # Random number of orders between 20 and 35
        orders = [(first_order + j, current_date) for j in range(num_orders)]
        gen_orders.extend(orders)
        first_order += num_orders
    
    df = pd.DataFrame(data=gen_orders, columns=['order_id','delivery_date'])
    
    num_orders = len(df)
    zone_ids = [random.randint(1, 5) for _ in range(num_orders)]
    df['zone_id'] = zone_ids
    
    values = []
    for date in df['delivery_date'].unique():
        tmp_df = df.query('delivery_date == @date')
        x = assign_values_within_day(tmp_df)
        values.extend(x)
    df['delivery_type'] = values

    df_with_positions = add_positions(df)
    df_with_positions['quantity'] = df_with_positions.apply(add_quantity,axis=1)
    
    df_with_positions = df_with_positions.merge(df_assortment[['product_id','selling_price','cost_of_sales']], how='left',on='product_id')
    
    df_with_positions['total_price'] = df_with_positions['quantity'] * df_with_positions['selling_price'].astype('float16')
    df_with_positions['total_cost'] = df_with_positions['quantity'] * df_with_positions['cost_of_sales'].astype('float16')
    
    df_with_positions['delivery_date'] = pd.to_datetime(df_with_positions['delivery_date'],format="%d.%m.%Y").dt.date    
    
    return df_with_positions   

In [8]:
df_orders = orders_dates("01.04.2023", "15.06.2023",df_assortment)
df_orders

Unnamed: 0,order_id,delivery_date,zone_id,delivery_type,product_id,quantity,selling_price,cost_of_sales,total_price,total_cost
0,10000,2023-04-01,2,1,101370,1,100,70,100.0,70.000000
1,10000,2023-04-01,2,1,101035,1,900,666,900.0,666.000000
2,10000,2023-04-01,2,1,100792,2,160,107.2,320.0,214.375000
3,10000,2023-04-01,2,1,100927,2,180,135,360.0,270.000000
4,10000,2023-04-01,2,1,100125,1,60,39.6,60.0,39.593750
...,...,...,...,...,...,...,...,...,...,...
16011,12080,2023-06-15,5,2,100978,3,650,520,1950.0,1560.000000
16012,12080,2023-06-15,5,2,100514,2,175,134.75,350.0,269.500000
16013,12080,2023-06-15,5,2,101270,1,50,35,50.0,35.000000
16014,12080,2023-06-15,5,2,100927,2,180,135,360.0,270.000000


In [9]:
final_orders = df_orders.groupby(['order_id','delivery_date','delivery_type','zone_id']).agg({'total_price':'sum'}).reset_index().rename(columns={'total_price':'total_value'})
final_orders

Unnamed: 0,order_id,delivery_date,delivery_type,zone_id,total_value
0,10000,2023-04-01,1,2,2070.0
1,10001,2023-04-01,1,2,4615.0
2,10002,2023-04-01,1,5,750.0
3,10003,2023-04-01,1,3,3640.0
4,10004,2023-04-01,1,3,1350.0
...,...,...,...,...,...
2076,12076,2023-06-15,2,5,2110.0
2077,12077,2023-06-15,2,4,3695.0
2078,12078,2023-06-15,1,4,895.0
2079,12079,2023-06-15,2,5,2160.0


In [10]:
final_products = df_orders[['product_id'
                                     ,'order_id'
                                     ,'quantity'
                                     ,'selling_price'
                                     ,'cost_of_sales'
                                     ,'total_price'
                                     ,'total_cost']]
final_products

Unnamed: 0,product_id,order_id,quantity,selling_price,cost_of_sales,total_price,total_cost
0,101370,10000,1,100,70,100.0,70.000000
1,101035,10000,1,900,666,900.0,666.000000
2,100792,10000,2,160,107.2,320.0,214.375000
3,100927,10000,2,180,135,360.0,270.000000
4,100125,10000,1,60,39.6,60.0,39.593750
...,...,...,...,...,...,...,...
16011,100978,12080,3,650,520,1950.0,1560.000000
16012,100514,12080,2,175,134.75,350.0,269.500000
16013,101270,12080,1,50,35,50.0,35.000000
16014,100927,12080,2,180,135,360.0,270.000000


In [11]:
final_orders.to_csv('orders.csv', index=False, sep=";")
final_products.to_csv('products.csv', index=False,sep=";")