In [1]:
import pandas as pd
import os
import numpy as np
import time
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

data = pd.read_csv("/content/drive/My Drive/data/imputed_data.csv", parse_dates=[1], names=['Panel ID', 'Date', 'Category', 'Pack Size', 'Volume', 'Spend'], skiprows=1)
panel_demo = pd.read_excel("/content/drive/My Drive/data/DSA3101_Hackathon_Panelists_Demographics.xlsx")
panel_demo.columns = ["ID", "BMI", "Income", "Ethnicity", "Lifestage", "Stata", "HH", "Location"]

data.sort_values("Date", inplace=True)

le = preprocessing.LabelEncoder()
data['Week'] = le.fit_transform(data.Date)

data = data[["Panel ID", "Category", "Spend", "Week"]].merge(data.groupby(["Panel ID"]).agg({"Week": min}).reset_index(), on="Panel ID")
data.columns = ["Panel ID", "Category", "Spend", "Week", "First Week"]
data.tail()

Unnamed: 0,Panel ID,Category,Spend,Week,First Week
1300582,Panel 801865301,Cordials,3.0,147,136
1300583,Panel 801865301,Liquid Milk,4.2,147,136
1300584,Panel 801865301,Condensed/Evap Milk,5.1,147,136
1300585,Panel 801865301,Milk Powder-Adult,17.5,147,136
1300586,Panel 801865301,Chocolate,8.7,147,136


In [2]:
train = data[(data["First Week"] < 52) & (data["Week"] < 52)]    # 2575 customers
val = data[( 52 <= data["First Week"]) & (data["First Week"] < 104) & (52 <= data["Week"]) & (data["Week"] < 104)]    # 373 customers
test = data[(data["First Week"] >= 104) & (data["Week"] >= 104)]    # 288 customers
print(train["Panel ID"].unique().shape, val["Panel ID"].unique().shape, test["Panel ID"].unique().shape)

(2575,) (373,) (288,)


In [3]:
train.head()

Unnamed: 0,Panel ID,Category,Spend,Week,First Week
0,Panel 317007101,Frozen Food,5.8,0,0
1,Panel 317007101,Cooking Sauces,8.5,0,0
2,Panel 317007101,Snack,8.0,0,0
3,Panel 317007101,Snack,5.0,0,0
4,Panel 317007101,Cake,6.0,0,0


In [4]:
%%time
def week_EMA(x):
    lst = x.shift(-1)-x
    return lst.ewm(alpha=1e-3).mean().tail(1).values

def Monetary_EMA(x):
    return x.ewm(alpha=1e-3).mean().tail(1).values

def frequency(data, cutoff_date):
    """
    cutoff_date = 52, 104 or 156
    """
    df = data.groupby(["Panel ID", "Week"]) \
        .agg({'Spend': sum}) \
        .reset_index()

    df_last_week = data[["Panel ID"]].drop_duplicates()
    df_last_week["Week"] = cutoff_date - 1

    df = df.merge(df_last_week, on=["Panel ID", "Week"], how="outer") \
        .fillna(0) \
        .groupby("Panel ID") \
        .agg({"Week": [min, max, week_EMA]}) \
        .reset_index()
    
    df.columns = ["Panel ID", "First Week", "Last Week", "Frequency_EMA"]
    df = df[df["First Week"] <= cutoff_date-3]    # remove customers that newly joined close to our cutoff date
    return df[["Panel ID", "Frequency_EMA"]]

def monetary(data, cutoff_date):
    """
    cutoff_date = 52, 104 or 156
    """
    df = data.groupby(["Panel ID", "Week"]) \
        .agg({'Spend': sum}) \
        .reset_index() \
        .pivot_table(index=['Panel ID'], columns='Week', values='Spend') \
        .fillna(0) \
        .reset_index() \
        .melt(id_vars=["Panel ID"], value_vars=list(range(cutoff_date-52, cutoff_date)))

    df = df.merge(data[["Panel ID", "First Week"]], on="Panel ID", how="left")
    df = df[df["First Week"] <= df["Week"]]
    df = df.groupby("Panel ID") \
        .agg({"value": Monetary_EMA}) \
        .reset_index()

    df.columns = ["Panel ID", "Monetary_EMA"]
    return df

def first_purchases(df):
    pivot = data.pivot_table(index=['Panel ID', "First Week"], columns='Category', values='Spend')
    df = df[["Panel ID", "First Week"]] \
        .merge(data[["Panel ID", "Week", "Category", "Spend"]], how="left", left_on=["Panel ID", "First Week"], right_on=["Panel ID", "Week"]) \
        .pivot_table(index=['Panel ID', 'First Week'], columns='Category', values='Spend') \
        .reset_index()
    df = pd.concat([df, pd.DataFrame(columns=list(pivot.columns[~pivot.columns.isin(df.columns)]))]) \
        .fillna(0) \
        .drop(['First Week'], axis=1)
    return df

train_df = frequency(train, cutoff_date=52) \
    .merge(monetary(train, cutoff_date=52), on="Panel ID", how="left") \
    .merge(first_purchases(train), on=["Panel ID"], how="left") \
    .merge(panel_demo, how="left", left_on="Panel ID", right_on="ID") \
    .drop(['ID'], axis=1)
val_df = frequency(val, cutoff_date=104) \
    .merge(monetary(val, cutoff_date=104), on="Panel ID", how="left") \
    .merge(first_purchases(val), on=["Panel ID"], how="left") \
    .merge(panel_demo, how="left", left_on="Panel ID", right_on="ID") \
    .drop(['ID'], axis=1)
test_df = frequency(test, cutoff_date=156) \
    .merge(monetary(test, cutoff_date=156), on="Panel ID", how="left") \
    .merge(first_purchases(test), on=["Panel ID"], how="left") \
    .merge(panel_demo, how="left", left_on="Panel ID", right_on="ID") \
    .drop(['ID'], axis=1)

print(train_df.shape, val_df.shape, test_df.shape)

# train_df.to_csv("/content/drive/My Drive/data/train_df.csv", index=False)
# test_df.to_csv("/content/drive/My Drive/data/test_df.csv", index=False)
# val_df.to_csv("/content/drive/My Drive/data/val_df.csv", index=False)

(2575, 72) (364, 72) (288, 72)
CPU times: user 11.7 s, sys: 655 ms, total: 12.4 s
Wall time: 12.4 s


In [5]:
train_df.Frequency_EMA.value_counts(normalize=True, bins=range(52))
# ~76.8% of the panels in training have Frequency_EMA <= 2 so we can consider a customer to be frequent if the Frequency_EMA <= 2

(1.0, 2.0]       0.690097
(2.0, 3.0]       0.084660
(-0.001, 1.0]    0.078058
(3.0, 4.0]       0.041553
(4.0, 5.0]       0.020971
(5.0, 6.0]       0.018252
(7.0, 8.0]       0.009320
(6.0, 7.0]       0.008544
(12.0, 13.0]     0.006602
(8.0, 9.0]       0.005437
(17.0, 18.0]     0.005049
(10.0, 11.0]     0.004660
(11.0, 12.0]     0.004660
(15.0, 16.0]     0.003883
(25.0, 26.0]     0.003495
(9.0, 10.0]      0.003107
(23.0, 24.0]     0.002330
(14.0, 15.0]     0.001942
(13.0, 14.0]     0.001553
(21.0, 22.0]     0.001165
(16.0, 17.0]     0.000777
(19.0, 20.0]     0.000777
(50.0, 51.0]     0.000777
(48.0, 49.0]     0.000777
(22.0, 23.0]     0.000388
(18.0, 19.0]     0.000388
(38.0, 39.0]     0.000388
(24.0, 25.0]     0.000388
(42.0, 43.0]     0.000000
(41.0, 42.0]     0.000000
(43.0, 44.0]     0.000000
(44.0, 45.0]     0.000000
(45.0, 46.0]     0.000000
(46.0, 47.0]     0.000000
(40.0, 41.0]     0.000000
(47.0, 48.0]     0.000000
(39.0, 40.0]     0.000000
(35.0, 36.0]     0.000000
(37.0, 38.0]

In [6]:
train_df.Monetary_EMA.value_counts(normalize=True, bins=range(0, int(train_df.Monetary_EMA.max())+1, 40))
# ~70.6% of the panels in training have Monetary_EMA <= 40 so we can consider a customer to be frequent if the Monetary_EMA <= 40

(-0.001, 40.0]    0.705631
(40.0, 80.0]      0.238447
(80.0, 120.0]     0.040000
(120.0, 160.0]    0.009709
(160.0, 200.0]    0.003495
(200.0, 240.0]    0.001942
(240.0, 280.0]    0.000388
Name: Monetary_EMA, dtype: float64

# Modeling

In [7]:
scaler = None
def dataloader(df, scalar, target, scale=True):
    columns = list(panel_demo.columns)
    columns.remove("ID")
    data_df = df.copy()
    data_df["Frequency_EMA"] = (data_df["Frequency_EMA"] <= 2).map(int)
    data_df["Monetary_EMA"] = (data_df["Monetary_EMA"] > 50).map(int)
    data_df = pd.get_dummies(data_df, columns=columns)
    data = data_df.drop(['Panel ID', 'Frequency_EMA', 'Monetary_EMA'], axis=1)
    if scale:
        if scaler == None:
            X = pd.DataFrame(MinMaxScaler().fit_transform(data), 
                                columns=data.columns)
        else:
            X = pd.DataFrame(scaler.transform(data), 
                                columns=data.columns)
    else:
        X = data.copy()
    y = data_df[target]
    data = pd.concat([X, y], axis=1)
    return X, y, data, scalar
    
# train_X, train_y, train, scaler = dataloader(train_df, scaler, target="Frequency_EMA")
# val_X, val_y, val, scaler = dataloader(val_df, scaler, target="Frequency_EMA")
# test_X, test_y, test, scaler = dataloader(test_df, scaler, target="Frequency_EMA")
train_X, train_y, train, scaler = dataloader(train_df, scaler, target="Monetary_EMA")
val_X, val_y, val, scaler = dataloader(val_df, scaler, target="Monetary_EMA")
test_X, test_y, test, scaler = dataloader(test_df, scaler, target="Monetary_EMA")