In [5]:
import numpy as np
import pandas as pd

# Load the product catalogue and the raw transaction log from the data folder.
products = pd.read_csv("data/product.csv")
transactions = pd.read_csv("data/transaction_data.csv")

# Left join on PRODUCT_ID: every transaction row is kept and gains the
# product attributes (e.g. COMMODITY_DESC) where the product is known.
transactions = transactions.merge(products, on="PRODUCT_ID", how="left")

# Each row is a purchase event. Keep only the soft-drink purchases and order
# them chronologically (WEEK_NO, DAY, TRANS_TIME) within every
# (household_key, PRODUCT_ID) pair, so that the first row of each pair is the
# earliest purchase of that product by that household.
condition = transactions["COMMODITY_DESC"] == "SOFT DRINKS"

# sort_values() without inplace=True returns a new, independent DataFrame.
# Sorting the boolean-mask slice in place is what triggered pandas'
# SettingWithCopyWarning.
soft_drinks = transactions[condition].sort_values(
    by=["household_key", "PRODUCT_ID", "WEEK_NO", "DAY", "TRANS_TIME"]
)

# Group the purchases by household and product. This relies on soft_drinks
# already being sorted chronologically within each group.
grouped = soft_drinks.groupby(["household_key", "PRODUCT_ID"])

# Take the first *row* of each group. GroupBy.first() returns the first
# non-null value of every column independently, which can combine values from
# different rows when a column contains NaN; head(1) keeps one real row.
cols_to_keep = ["household_key", "PRODUCT_ID", "WEEK_NO", "DAY", "TRANS_TIME", "BASKET_ID"]
first_purchase = grouped.head(1)[cols_to_keep].reset_index(drop=True)

# Left-join the first-purchase rows back onto every purchase event.
# - BASKET_ID is not a join key, so it comes back as BASKET_ID_x / BASKET_ID_y.
# - validate guards against the right-hand side matching a row more than once.
# - indicator records, per row, whether a first-purchase row matched.
soft_drinks2 = pd.merge(
    soft_drinks,
    first_purchase,
    on=["household_key", "PRODUCT_ID", "WEEK_NO", "DAY", "TRANS_TIME"],
    how="left",
    validate="many_to_one",
    indicator=True,
)

# "first_purchase" is True when the row matched its group's first purchase.
# The flag is derived from the merge indicator rather than from "no NaN in any
# column", so a missing value in an unrelated column (e.g. a product
# attribute) can no longer turn a genuine first purchase into False.
soft_drinks2["first_purchase"] = soft_drinks2["_merge"] == "both"
soft_drinks2 = soft_drinks2.drop(columns="_merge")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  (soft_drinks


In [6]:
# Preview the first rows of the merged frame, including the BASKET_ID_y and first_purchase columns.
soft_drinks2.head()

Unnamed: 0,household_key,BASKET_ID_x,DAY,PRODUCT_ID,QUANTITY,SALES_VALUE,STORE_ID,RETAIL_DISC,TRANS_TIME,WEEK_NO,COUPON_DISC,COUPON_MATCH_DISC,MANUFACTURER,DEPARTMENT,BRAND,COMMODITY_DESC,SUB_COMMODITY_DESC,CURR_SIZE_OF_PRODUCT,BASKET_ID_y,first_purchase
0,1,28318295376,111,849066,1,5.99,31401,-2.0,1157,17,0.0,0.0,103,GROCERY,National,SOFT DRINKS,SOFT DRINKS 20PK&24PK CAN CARB,12 OZ,28318300000.0,True
1,1,32931810580,400,849066,1,4.88,436,-3.11,1144,58,0.0,0.0,103,GROCERY,National,SOFT DRINKS,SOFT DRINKS 20PK&24PK CAN CARB,12 OZ,,False
2,1,34010015588,472,888104,1,4.99,436,0.0,1125,68,0.0,0.0,869,GROCERY,National,SOFT DRINKS,TEA SWEETENED,16 OZ,34010020000.0,True
3,1,34226870754,488,888104,1,4.99,436,0.0,1242,70,0.0,0.0,869,GROCERY,National,SOFT DRINKS,TEA SWEETENED,16 OZ,,False
4,1,34338535512,494,888104,1,4.99,436,0.0,1055,71,0.0,0.0,869,GROCERY,National,SOFT DRINKS,TEA SWEETENED,16 OZ,,False


In [7]:
# Count how many purchase rows are flagged as a first purchase (True) versus not (False).
soft_drinks2['first_purchase'].value_counts()

False    72353
True     45179
Name: first_purchase, dtype: int64

In [8]:
# List the column labels of the merged frame.
soft_drinks2.columns

Index(['household_key', 'BASKET_ID_x', 'DAY', 'PRODUCT_ID', 'QUANTITY',
       'SALES_VALUE', 'STORE_ID', 'RETAIL_DISC', 'TRANS_TIME', 'WEEK_NO',
       'COUPON_DISC', 'COUPON_MATCH_DISC', 'MANUFACTURER', 'DEPARTMENT',
       'BRAND', 'COMMODITY_DESC', 'SUB_COMMODITY_DESC', 'CURR_SIZE_OF_PRODUCT',
       'BASKET_ID_y', 'first_purchase'],
      dtype='object')

In [17]:
# Create the dummy columns retail_disc_d and coupon_disc_d:
# 1 when the corresponding RETAIL_DISC / COUPON_DISC amount is negative, 0 otherwise.
# A vectorized comparison replaces the per-row Python lambda; a missing value
# compares as False and therefore maps to 0, exactly as the lambda did.
soft_drinks2["retail_disc_d"] = (soft_drinks2["RETAIL_DISC"] < 0).astype("int64")
soft_drinks2["coupon_disc_d"] = (soft_drinks2["COUPON_DISC"] < 0).astype("int64")

In [22]:
# Convert the boolean first_purchase flag to 0/1 integers for the regression.
# The column is replaced outright with an explicit int64 cast instead of writing
# integers through .loc[:, ...] into the existing boolean column, so the
# resulting dtype does not depend on pandas' in-place coercion rules.
# Re-running this cell on an already-converted column is a no-op.
soft_drinks2["first_purchase"] = soft_drinks2["first_purchase"].astype("int64")

In [23]:
# Keep only the columns used by the regression: the two discount dummies
# and the first-purchase indicator.
data = soft_drinks2.loc[:, ['retail_disc_d', 'coupon_disc_d', 'first_purchase']]

In [24]:
# Check dtypes and non-null counts of the modelling columns before fitting.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 117532 entries, 0 to 117531
Data columns (total 3 columns):
 #   Column          Non-Null Count   Dtype
---  ------          --------------   -----
 0   retail_disc_d   117532 non-null  int64
 1   coupon_disc_d   117532 non-null  int64
 2   first_purchase  117532 non-null  int64
dtypes: int64(3)
memory usage: 3.6 MB


In [25]:
# Re-count the first_purchase values after the 0/1 conversion.
soft_drinks2['first_purchase'].value_counts()

0    72353
1    45179
Name: first_purchase, dtype: int64

In [26]:
import statsmodels.formula.api as smf

In [27]:
# Logistic regression of the first-purchase indicator on the two discount dummies.
# The model is specified first, then fitted; log_reg holds the fitted results.
logit_model = smf.logit(formula="first_purchase ~ retail_disc_d + coupon_disc_d", data=data)
log_reg = logit_model.fit()

Optimization terminated successfully.
         Current function value: 0.666077
         Iterations 4


In [28]:
# Print the fitted model's summary table (coefficients, standard errors, fit statistics).
print(log_reg.summary())

                           Logit Regression Results                           
Dep. Variable:         first_purchase   No. Observations:               117532
Model:                          Logit   Df Residuals:                   117529
Method:                           MLE   Df Model:                            2
Date:                Wed, 10 May 2023   Pseudo R-squ.:               0.0001488
Time:                        10:18:31   Log-Likelihood:                -78285.
converged:                       True   LL-Null:                       -78297.
Covariance Type:            nonrobust   LLR p-value:                 8.730e-06
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Intercept        -0.4718      0.010    -46.655      0.000      -0.492      -0.452
retail_disc_d    -0.0040      0.013     -0.317      0.751      -0.029       0.021
coupon_disc_d     0.2459      0.051     