In [2]:
import pandas as pd
import statsmodels.formula.api as smf

# Load the transaction export into a DataFrame and preview the first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# row index that was written out when the CSV was saved -- confirm, and if so
# consider reading the file with index_col=0.
df = pd.read_csv(filepath_or_buffer='dataset_1654342.csv')
df.head()


Unnamed: 0,transaction_id,transaction_date,transaction_time,transaction_qty,store_id,store_location,unit_price,product_category,product_type
0,1,06/08/2023,17:41:54,1.0,2,Groningen,3.25,Tea,Chai tea
1,2,06/10/2023,11:30:46,2.0,2,Groningen,4.5,Chocolate,Hot chocolate
2,3,06/06/2023,09:19:26,3.0,2,Groningen,3.0,Tea,Black tea
3,4,06/10/2023,17:25:49,1.0,3,Breda,4.5,Chocolate,Hot chocolate
4,5,06/05/2023,14:55:12,1.0,1,Amsterdam,4.5,Chocolate,Hot chocolate


In [3]:
# B1 descriptive statistics
# Coerce quantities to numbers (anything unparseable becomes NaN), then keep
# only the rows that have a quantity, a product type and a product category.
df['transaction_qty'] = pd.to_numeric(df['transaction_qty'], errors='coerce')
required_columns = ['transaction_qty', 'product_type', 'product_category']
df_cleaned = df.dropna(subset=required_columns)

# Total quantity sold per (category, type) pair, as a flat table.
grouped = (
    df_cleaned
    .groupby(['product_category', 'product_type'])['transaction_qty']
    .sum()
    .reset_index()
)

# Categories in alphabetical order; within each category the best-selling
# product type comes first.
grouped_sorted = grouped.sort_values(
    by=['product_category', 'transaction_qty'],
    ascending=[True, False],
)

# grouped_sorted now holds the summed quantity of each product type within
# its category.
print(grouped_sorted)

# Interpretation: product types that consistently show lower quantities sold
# are candidates for extra marketing effort, or for TTR to re-evaluate
# whether to keep offering them.


   product_category            product_type  transaction_qty
0            Bakery                     Pie           1229.0
2            Bakery                   Scone           1201.0
1            Bakery                Sandwich           1174.0
3         Chocolate           Hot chocolate           3773.0
4            Coffee            Black coffee            956.0
5            Coffee             Milk coffee            947.0
6            Coffee  Organic blended coffee            922.0
7            Coffee            Syrup coffee            907.0
9               Tea                Chai tea            959.0
8               Tea               Black tea            941.0
10              Tea               Green tea            919.0
11              Tea              Herbal tea            888.0


In [9]:
#B1 Inferential statistics
# Summary table: total quantity sold per product type (one row per type).
total_sales = df_cleaned.groupby('product_type')['transaction_qty'].sum().reset_index()

# Attach each row's product-type total as `transaction_qty_total`.
# The previous self-merge with suffixes=('', '_total') was not safe to re-run:
# on a second execution df_cleaned already carries `transaction_qty_total`, so
# the suffixed column collides with it (MergeError on pandas 2.x, duplicate
# columns on older versions). assign() simply overwrites the column with the
# same values, so this cell can be executed any number of times.
# reset_index(drop=True) keeps the 0..n-1 row index the merge used to produce.
df_cleaned = (
    df_cleaned
    .assign(
        transaction_qty_total=df_cleaned.groupby('product_type')['transaction_qty'].transform('sum')
    )
    .reset_index(drop=True)
)

# One indicator column per product type, named after the product type itself
# (e.g. "Chai tea"); placed next to the cleaned data for the formula API.
product_type_dummies = pd.get_dummies(df_cleaned['product_type'])
df_product_type_dummies = pd.concat([df_cleaned, product_type_dummies], axis=1)
df_product_type_dummies.head()


Unnamed: 0,transaction_id,transaction_date,transaction_time,transaction_qty,store_id,store_location,unit_price,product_category,product_type,transaction_qty_total,...,Chai tea,Green tea,Herbal tea,Hot chocolate,Milk coffee,Organic blended coffee,Pie,Sandwich,Scone,Syrup coffee
0,1,06/08/2023,17:41:54,1.0,2,Groningen,3.25,Tea,Chai tea,959.0,...,True,False,False,False,False,False,False,False,False,False
1,2,06/10/2023,11:30:46,2.0,2,Groningen,4.5,Chocolate,Hot chocolate,3773.0,...,False,False,False,True,False,False,False,False,False,False
2,3,06/06/2023,09:19:26,3.0,2,Groningen,3.0,Tea,Black tea,941.0,...,False,False,False,False,False,False,False,False,False,False
3,4,06/10/2023,17:25:49,1.0,3,Breda,4.5,Chocolate,Hot chocolate,3773.0,...,False,False,False,True,False,False,False,False,False,False
4,5,06/05/2023,14:55:12,1.0,1,Amsterdam,4.5,Chocolate,Hot chocolate,3773.0,...,False,False,False,True,False,False,False,False,False,False


In [15]:
# OLS of the per-product-type total on the tea indicator columns. Q("...")
# quotes column names that contain spaces so the formula parser accepts them.
# The model is fitted on the whole frame, not only on tea rows.
# NOTE(review): transaction_qty_total is a product-type total repeated on every
# row, so the response is constant within each product type -- confirm this is
# the intended response rather than the per-transaction transaction_qty.
tea_types = ["Chai tea", "Black tea", "Green tea", "Herbal tea"]
tea_formula = 'transaction_qty_total ~ ' + ' + '.join(f'Q("{name}")' for name in tea_types)
model_tea = smf.ols(tea_formula, data=df_product_type_dummies).fit()
print(model_tea.summary())


                              OLS Regression Results                             
Dep. Variable:     transaction_qty_total   R-squared:                       0.145
Model:                               OLS   Adj. R-squared:                  0.145
Method:                    Least Squares   F-statistic:                     314.8
Date:                   Mon, 25 Mar 2024   Prob (F-statistic):          1.37e-250
Time:                           15:21:19   Log-Likelihood:                -62645.
No. Observations:                   7429   AIC:                         1.253e+05
Df Residuals:                       7424   BIC:                         1.253e+05
Df Model:                              4                                         
Covariance Type:               nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------

In [14]:
# OLS of the per-product-type total on the coffee indicator columns. Q("...")
# quotes column names that contain spaces so the formula parser accepts them.
# The model is fitted on the whole frame, not only on coffee rows.
# NOTE(review): transaction_qty_total is a product-type total repeated on every
# row, so the response is constant within each product type -- confirm this is
# the intended response rather than the per-transaction transaction_qty.
coffee_types = ["Black coffee", "Milk coffee", "Syrup coffee", "Organic blended coffee"]
coffee_formula = 'transaction_qty_total ~ ' + ' + '.join(f'Q("{name}")' for name in coffee_types)
model_coffee = smf.ols(coffee_formula, data=df_product_type_dummies).fit()
print(model_coffee.summary())

                              OLS Regression Results                             
Dep. Variable:     transaction_qty_total   R-squared:                       0.141
Model:                               OLS   Adj. R-squared:                  0.140
Method:                    Least Squares   F-statistic:                     304.4
Date:                   Mon, 25 Mar 2024   Prob (F-statistic):          7.71e-243
Time:                           15:20:43   Log-Likelihood:                -62663.
No. Observations:                   7429   AIC:                         1.253e+05
Df Residuals:                       7424   BIC:                         1.254e+05
Df Model:                              4                                         
Covariance Type:               nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------

In [16]:
# OLS of the per-product-type total on the bakery indicator columns. Q("...")
# wraps each column name so the formula parser takes it literally.
# The model is fitted on the whole frame, not only on bakery rows.
# NOTE(review): transaction_qty_total is a product-type total repeated on every
# row, so the response is constant within each product type -- confirm this is
# the intended response rather than the per-transaction transaction_qty.
bakery_types = ["Pie", "Scone", "Sandwich"]
bakery_formula = 'transaction_qty_total ~ ' + ' + '.join(f'Q("{name}")' for name in bakery_types)
model_bakery = smf.ols(bakery_formula, data=df_product_type_dummies).fit()
print(model_bakery.summary())

                              OLS Regression Results                             
Dep. Variable:     transaction_qty_total   R-squared:                       0.060
Model:                               OLS   Adj. R-squared:                  0.060
Method:                    Least Squares   F-statistic:                     158.1
Date:                   Mon, 25 Mar 2024   Prob (F-statistic):           2.50e-99
Time:                           15:22:04   Log-Likelihood:                -62997.
No. Observations:                   7429   AIC:                         1.260e+05
Df Residuals:                       7425   BIC:                         1.260e+05
Df Model:                              3                                         
Covariance Type:               nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
