In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from random import randint

# Path of the input CSV; it is relative, so it is resolved against the
# kernel's current working directory.
filepath = 'ECOMMRecords2020.csv'
# Read the whole CSV into a DataFrame using pandas' default parsing options.
data = pd.read_csv(filepath)
# Preview the first rows as the cell output.
data.head()

Unnamed: 0,OrderDate,RowID,OrderID,ShipMode,CustomerID,Segment,Country,City,State,PostalCode,Region,ProductID,Category,SubCategory,ProductName,Sales,Quantity,Discount,Profit
0,2020-01-01,849,CA-2017-107503,Standard Class,GA-14725,Consumer,United States,Lorain,Ohio,44052,East,FUR-FU-10003878,Furniture,Furnishings,Linden 10 Round Wall Clock Black,48.896,4,0.2,8.5568
1,2020-01-01,4010,CA-2017-144463,Standard Class,SC-20725,Consumer,United States,Los Angeles,California,90036,West,FUR-FU-10001215,Furniture,Furnishings,Howard Miller 11-1/2 Diameter Brentwood Wall C...,474.43,11,0.0,199.2606
2,2020-01-01,6683,CA-2017-154466,First Class,DP-13390,Home Office,United States,Franklin,Wisconsin,53132,Central,OFF-BI-10002012,Office Supplies,Binders,Wilson Jones Easy Flow II Sheet Lifters,3.6,2,0.0,1.728
3,2020-01-01,8070,CA-2017-151750,Standard Class,JM-15250,Consumer,United States,Huntsville,Texas,77340,Central,OFF-ST-10002743,Office Supplies,Storage,SAFCO Boltless Steel Shelving,454.56,5,0.2,-107.958
4,2020-01-01,8071,CA-2017-151750,Standard Class,JM-15250,Consumer,United States,Huntsville,Texas,77340,Central,FUR-FU-10002116,Furniture,Furnishings,Tenex Carpeted Granite-Look or Clear Contempor...,141.42,5,0.6,-187.3815


In [5]:
# One row per ProductID holding the mean of its "Sales" values; the mean
# column is renamed to "byggern" so it can serve as the base price column.
df_comps = (
    data.groupby("ProductID")["Sales"]
    .mean()
    .reset_index()
    .rename(columns={"Sales": "byggern"})
)

# Preview the first rows as the cell output.
df_comps.head()

Unnamed: 0,ProductID,byggern
0,FUR-BO-10000112,825.174
1,FUR-BO-10000330,241.96
2,FUR-BO-10000468,108.495333
3,FUR-BO-10000780,365.7836
4,FUR-BO-10001337,308.499


In [8]:
def fill_dataset(df, competitors, variation_percentage, base_column='byggern', rng=None):
    """Return a copy of ``df`` with one synthetic price column per competitor.

    Every competitor column equals the base price column multiplied
    element-wise by a random factor. The factors are drawn from a normal
    distribution centred at 1.0 whose standard deviation is one third of the
    allowed deviation (so roughly 99.7% of draws fall inside the range), and
    are then clipped to ``[1 - variation_percentage/100,
    1 + variation_percentage/100]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied and never modified. Must contain
        ``base_column``.
    competitors : iterable of str
        Names of the columns to create. A name that appears more than once is
        regenerated each time and the last draw wins.
    variation_percentage : float
        Maximum relative deviation from the base price, in percent. Must be
        non-negative. Note that values of 100 or more permit factors <= 0.
    base_column : str, default 'byggern'
        Column of ``df`` holding the prices the competitors are derived from.
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness (anything exposing ``normal(loc, scale, size)``).
        When omitted, NumPy's global random state is used, so
        ``np.random.seed`` controls reproducibility exactly as before.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the competitor columns added (or overwritten).

    Raises
    ------
    ValueError
        If ``variation_percentage`` is negative.
    KeyError
        If ``base_column`` is not a column of ``df`` (raised by pandas).
    """
    if variation_percentage < 0:
        raise ValueError(
            f"variation_percentage must be non-negative, got {variation_percentage!r}"
        )

    result_df = df.copy()

    # Loop-invariant quantities: allowed deviation, its 3-sigma standard
    # deviation and the clipping bounds.
    max_deviation = variation_percentage / 100
    std_dev = max_deviation / 3
    lower_bound = 1 - max_deviation
    upper_bound = 1 + max_deviation

    # Fall back to the global NumPy state to keep the historical behaviour.
    sampler = np.random if rng is None else rng

    # Read the base prices once from the untouched input so that a competitor
    # named like the base column cannot influence the columns generated after it.
    base_prices = df[base_column]
    n_rows = len(df)

    for competitor in competitors:
        # Normally distributed factors centred at 1.0, one per row.
        random_factors = sampler.normal(loc=1.0, scale=std_dev, size=n_rows)
        # Clip the rare draws beyond 3 sigma back into the allowed range.
        random_factors = np.clip(random_factors, lower_bound, upper_bound)
        result_df[competitor] = base_prices * random_factors

    return result_df

In [15]:
# Names of the competitor price columns to generate. Each name appears exactly
# once: a repeated name would only be generated twice and overwritten.
competitors = [
    "bauhaus", "byggmakker", "byggmax", "byggtorget", "elektroimportoren",
    "fargerike", "flisekompaniet", "gausdal_landhandleri", "industrivarer",
    "jemogfix", "jernia", "maxbo", "megaflis", "monter", "obs_bygg",
    "staypro", "torn", "verktoy", "xl_bygg",
]

# Seed NumPy's global random state so the synthetic prices are identical on
# every "Restart & Run All".
np.random.seed(42)

# Add one column per competitor, each within +-40% of the "byggern" price.
df_comps = fill_dataset(df_comps, competitors, 40)

df_comps


Unnamed: 0,ProductID,byggern,bauhaus,byggmakker,byggmax,byggtorget,elektroimportoren,fargerike,flisekompaniet,gausdal_landhandleri,...,jemogfix,jernia,maxbo,megaflis,monter,obs_bygg,staypro,torn,verktoy,xl_bygg
0,FUR-BO-10000112,825.174000,816.644132,787.736054,850.384690,758.793511,839.191876,781.876547,870.480632,579.217796,...,623.071977,720.341988,948.249398,815.974332,749.005062,869.807449,923.364042,889.176036,931.520626,890.338398
1,FUR-BO-10000330,241.960000,185.820554,251.524620,264.705068,223.405311,223.500549,263.352514,247.361945,246.130099,...,237.249252,221.100144,238.872229,229.959898,251.188298,291.344730,261.567154,147.473864,271.133313,264.138211
2,FUR-BO-10000468,108.495333,113.561493,92.566791,100.135879,105.452810,98.621137,115.147476,88.564725,126.207021,...,120.102030,96.648770,142.977563,123.362680,130.208667,116.544477,107.313587,125.319975,93.498474,106.794665
3,FUR-BO-10000780,365.783600,372.564224,422.506179,369.189009,280.011475,350.290798,372.079512,378.223204,294.506622,...,374.831342,309.475811,371.276431,432.458432,459.441309,389.361267,327.133831,319.679242,377.844960,362.445933
4,FUR-BO-10001337,308.499000,379.268650,352.460113,344.351327,346.215309,294.960091,275.728962,318.799806,400.758158,...,289.442469,345.953314,289.312528,322.933178,335.336342,290.721839,299.976163,261.090108,343.268369,270.561726
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1520,TEC-PH-10004897,23.976000,19.751607,20.403596,24.184396,26.936992,21.912114,28.628922,25.596469,25.778929,...,29.508884,21.224301,23.967238,25.943557,20.684456,27.549815,27.115018,19.770286,26.443182,22.863112
1521,TEC-PH-10004908,67.992000,78.577331,62.304725,57.684241,66.565759,69.637486,53.539529,79.501137,59.893246,...,73.853331,53.753728,49.320216,68.883638,70.170669,66.386499,73.039057,76.310663,57.727537,74.407657
1522,TEC-PH-10004912,219.800000,180.909870,222.887802,206.453636,217.574853,229.421531,218.562968,234.760026,210.727052,...,207.073124,212.792543,208.274468,274.546157,236.429708,186.788245,227.388206,231.171194,205.483075,239.807394
1523,TEC-PH-10004922,160.776000,133.405461,169.753951,173.217508,154.851264,167.228191,139.745511,170.563394,174.204471,...,139.137180,186.100125,144.804959,158.167364,154.135709,167.084535,137.663177,138.520198,152.237623,181.596620
