In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from random import randint

# CSV file with the order records, resolved relative to the notebook's working directory.
filepath = 'ECOMMRecords2020.csv'

# Parse the whole file into a DataFrame with pandas' default settings.
data = pd.read_csv(filepath_or_buffer=filepath)

# Show the first five rows as a quick sanity check of the parsed columns.
data.head(n=5)

Unnamed: 0,OrderDate,RowID,OrderID,ShipMode,CustomerID,Segment,Country,City,State,PostalCode,Region,ProductID,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit
0,2020-01-01,849,CA-2017-107503,Standard Class,GA-14725,Consumer,United States,Lorain,Ohio,44052,East,FUR-FU-10003878,Furniture,Furnishings,Linden 10 Round Wall Clock Black,48.896,4,0.2,8.5568
1,2020-01-01,4010,CA-2017-144463,Standard Class,SC-20725,Consumer,United States,Los Angeles,California,90036,West,FUR-FU-10001215,Furniture,Furnishings,Howard Miller 11-1/2 Diameter Brentwood Wall C...,474.43,11,0.0,199.2606
2,2020-01-01,6683,CA-2017-154466,First Class,DP-13390,Home Office,United States,Franklin,Wisconsin,53132,Central,OFF-BI-10002012,Office Supplies,Binders,Wilson Jones Easy Flow II Sheet Lifters,3.6,2,0.0,1.728
3,2020-01-01,8070,CA-2017-151750,Standard Class,JM-15250,Consumer,United States,Huntsville,Texas,77340,Central,OFF-ST-10002743,Office Supplies,Storage,SAFCO Boltless Steel Shelving,454.56,5,0.2,-107.958
4,2020-01-01,8071,CA-2017-151750,Standard Class,JM-15250,Consumer,United States,Huntsville,Texas,77340,Central,FUR-FU-10002116,Furniture,Furnishings,Tenex Carpeted Granite-Look or Clear Contempor...,141.42,5,0.6,-187.3815


In [5]:
# Mean "Sales" value per ProductID, stored under the column name "byggern".
# Naming the Series before reset_index() produces the same two-column frame
# (ProductID, byggern) as renaming the column afterwards.
df_comps = (
    data.groupby("ProductID")["Sales"]
    .mean()
    .rename("byggern")
    .reset_index()
)

df_comps.head()

Unnamed: 0,ProductID,byggern
0,FUR-BO-10000112,825.174
1,FUR-BO-10000330,241.96
2,FUR-BO-10000468,108.495333
3,FUR-BO-10000780,365.7836
4,FUR-BO-10001337,308.499


In [8]:
def fill_dataset(df, competitors, variation_percentage, base_column="byggern", seed=None):
    """Add one synthetic price column per competitor, derived from a base price column.

    Every competitor price is the base price multiplied by a random factor drawn
    from a normal distribution centred on 1.0 and clipped to
    ``[1 - variation_percentage/100, 1 + variation_percentage/100]``.

    Args:
        df: DataFrame holding the base prices in ``base_column``. It is not
            modified; the function works on a copy.
        competitors: Iterable of column names to create. A column that already
            exists in ``df`` under the same name is overwritten in the copy.
        variation_percentage: Maximum allowed deviation from the base price, in
            percent (e.g. ``20`` keeps factors within 0.8 .. 1.2). Must be >= 0.
        base_column: Name of the column the competitor prices are derived from.
            Defaults to ``"byggern"``.
        seed: Controls reproducibility. ``None`` (default) draws from NumPy's
            global legacy random state, so a prior ``np.random.seed(...)`` call
            still governs the result. Any other value (int, ``SeedSequence``,
            ``np.random.Generator``, ...) is passed to ``np.random.default_rng``
            and makes the call reproducible on its own.

    Returns:
        A copy of ``df`` with one additional (or overwritten) column per competitor.

    Raises:
        ValueError: If ``variation_percentage`` is negative.
        KeyError: If ``base_column`` is missing and at least one competitor
            column has to be generated.
    """
    if variation_percentage < 0:
        raise ValueError(
            f"variation_percentage must be non-negative, got {variation_percentage!r}"
        )

    result_df = df.copy()

    max_deviation = variation_percentage / 100
    # In a normal distribution, ±3 standard deviations cover 99.7% of the data,
    # so the requested maximum deviation is mapped onto 3 standard deviations.
    std_dev = max_deviation / 3

    # Both the ``np.random`` module (global legacy state) and a Generator expose
    # ``normal(loc, scale, size)``, so either can serve as the sampler.
    sampler = np.random if seed is None else np.random.default_rng(seed)

    for competitor in competitors:
        # Normally distributed multipliers centred at 1.0 (i.e. "same price").
        random_factors = sampler.normal(loc=1.0, scale=std_dev, size=len(result_df))

        # The remaining ~0.3% of draws beyond ±3 sigma are clipped so that no
        # price ever leaves the requested range.
        random_factors = np.clip(random_factors, 1 - max_deviation, 1 + max_deviation)

        result_df[competitor] = result_df[base_column] * random_factors

    return result_df

In [10]:
# fill_dataset, when called as below, draws from NumPy's global random state.
# Seeding that state here makes the synthetic competitor prices identical on
# every Restart & Run All instead of changing with each execution.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Competitor columns to generate next to the "byggern" base price.
competitors = ["bauhaus", "byggmakker", "byggmax", "byggtorget"]

# Maximum deviation of a competitor price from the byggern price, in percent.
VARIATION_PERCENTAGE = 20

df_comps = fill_dataset(df_comps, competitors, VARIATION_PERCENTAGE)

df_comps

Unnamed: 0,ProductID,byggern,bauhaus,byggmakker,byggmax,byggtorget
0,FUR-BO-10000112,825.174000,819.692340,903.774845,724.295838,882.411607
1,FUR-BO-10000330,241.960000,262.821675,255.131190,279.960412,215.528618
2,FUR-BO-10000468,108.495333,113.088005,107.101358,114.152729,106.790343
3,FUR-BO-10000780,365.783600,319.851006,361.123926,389.253784,390.667390
4,FUR-BO-10001337,308.499000,331.860706,296.484174,271.538967,354.887688
...,...,...,...,...,...,...
1520,TEC-PH-10004897,23.976000,23.545246,22.957511,24.294354,24.388320
1521,TEC-PH-10004908,67.992000,63.356768,61.230900,65.070408,69.074107
1522,TEC-PH-10004912,219.800000,240.860851,221.574793,219.956011,202.353759
1523,TEC-PH-10004922,160.776000,148.854262,166.768455,172.328025,177.378453
