# Libraries

In [83]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from sklearn.linear_model import LogisticRegression

## Data Prep

In [84]:
df_purchase = pd.read_csv(
    "/home/angelo/repos/vscode_repos/customer_analytics_2022/Data/purchase data.csv"
)

In [85]:
scaler = pickle.load(
    open(
        "/home/angelo/repos/vscode_repos/customer_analytics_2022/pickle_data_models/scaler.pickle",
        "rb",
    )
)

pca = pickle.load(
    open(
        "/home/angelo/repos/vscode_repos/customer_analytics_2022/pickle_data_models/pca_3.pickle",
        "rb",
    )
)

kmeans_pca = pickle.load(
    open(
        "/home/angelo/repos/vscode_repos/customer_analytics_2022/pickle_data_models/kmeans_pca_4.pickle",
        "rb",
    )
)

#### Apply imported models 



In [86]:
# only apply transform; fit would mean that the model is retrained; here we only want to apply it on the data --> so only transform
features = df_purchase.iloc[:, -7:]

# apply the scaler
df_purchase_segm_std = scaler.transform(features)

In [87]:
#### apply the already trained scaler on the new data 
# remember that customers may be duplicates
df_purchase_segm_pca = pca.transform(df_purchase_segm_std)

In [88]:
#### apply the already trained pca_3 (here just pca) on the data 
# now apply the kmeans on it: use the predict method here
purchase_segm_kmeans_pca = kmeans_pca.predict(df_purchase_segm_pca)

In [89]:
#### apply the already trained kmeans model which was trained on the 3 components of the pca model and had k = 4 groups
df_purchase_predictors = df_purchase.copy()
df_purchase_predictors["Segment"] = purchase_segm_kmeans_pca

In [90]:
#### this is now the new dataframe containing the predicted segments for the customers
df_purchase_predictors = df_purchase.copy()
df_purchase_predictors["Segment"] = purchase_segm_kmeans_pca

In [91]:
segment_dummies = pd.get_dummies(purchase_segm_kmeans_pca, 
                                prefix = 'Segment',
                                prefix_sep = '_')

In [92]:
df_purchase_predictors = pd.concat([df_purchase_predictors, segment_dummies], axis = 1)

In [93]:
df_pa = df_purchase_predictors.copy()

In [94]:
df_pa.head()

Unnamed: 0,ID,Day,Incidence,Brand,Quantity,Last_Inc_Brand,Last_Inc_Quantity,Price_1,Price_2,Price_3,...,Age,Education,Income,Occupation,Settlement size,Segment,Segment_0,Segment_1,Segment_2,Segment_3
0,200000001,1,0,0,0,0,0,1.59,1.87,2.01,...,47,1,110866,1,0,2,0,0,1,0
1,200000001,11,0,0,0,0,0,1.51,1.89,1.99,...,47,1,110866,1,0,2,0,0,1,0
2,200000001,12,0,0,0,0,0,1.51,1.89,1.99,...,47,1,110866,1,0,2,0,0,1,0
3,200000001,16,0,0,0,0,0,1.52,1.89,1.98,...,47,1,110866,1,0,2,0,0,1,0
4,200000001,18,0,0,0,0,0,1.52,1.89,1.99,...,47,1,110866,1,0,2,0,0,1,0


## Purchase Probability Model

The model task:
dependent: Incidence varaible

independent: Price

We want to know whether a purchase has happened based on the purchase price of a product assuming that purchase probabaility is influenced by price.

In [95]:
Y = df_pa['Incidence']


create price variable irrespective of the brand;

possible choices to consider here:
- i) min and max price which are good indicators how expensive a new product is
- ii) mean or median price


NOTE: the choice here is mean of all prices (while in a store, the customer has the choice to grab 5 different brands... we just average these 5 prices together into one per purchase opportunity in the dataset 8per instance) to see how much of the variance in Incidence can be explained by the average price of all brands together )

In [96]:
# create price variable irrespective of the brand
## mean price is chosen for all prices (Price_1 to Price_5) per row
X = pd.DataFrame()

X['Mean_Price'] = (df_pa['Price_1'] +
                    df_pa['Price_2'] +
                    df_pa['Price_3'] +
                    df_pa['Price_4'] +
                    df_pa['Price_5'] ) / 5



a solver in a models such as LogisticRegression is the technique for the iteration-optimization problem in logisitc regression such as 'sag'

In [97]:
model_purchase = LogisticRegression(solver = 'sag')
model_purchase.fit(X, Y)

by fitting a logistic regression on this data, this command estimates the purchase probabaiility and stores the results in the model_purchase instance of the trained LogisticRegression model.



In [98]:
# coef_ holds the coefficients for a model; here univariate price
model_purchase.coef_

array([[-2.34814301]])

Analysis: negative coefficient indicates that an increase in price will decrease the probabaility of purchase and vice versa

However, logistic regression can do more: this model also quantifies the exact  relationship between pruice and probability of purchase in addition to magnitude (elasiticity)

## Price Elasticity of Purchase Probability

Price elasitiy of purchase probabauility is the percentage change in prucahse probability in response to a one percent change in price in the product category


the coefficient for price here array([[-2.34904564]]) is the change in probabaility given the price; this is what the coefficient shows:

As such we want to see different values for mean price and see how the probabaility 

In [99]:
# create a price range to test
df_pa.loc[:, ['Price_1', 'Price_2', 'Price_3', 'Price_4', 'Price_5']].describe()

Unnamed: 0,Price_1,Price_2,Price_3,Price_4,Price_5
count,58693.0,58693.0,58693.0,58693.0,58693.0
mean,1.392074,1.780999,2.006789,2.159945,2.654798
std,0.091139,0.170868,0.046867,0.089825,0.098272
min,1.1,1.26,1.87,1.76,2.11
25%,1.34,1.58,1.97,2.12,2.63
50%,1.39,1.88,2.01,2.17,2.67
75%,1.47,1.89,2.06,2.24,2.7
max,1.59,1.9,2.14,2.26,2.8


minprice is 1.1€; and maxprice is 2.8€

--> expand this range a little got get a better understanding of the elasiticity such as [0.5; 3.5] or even [0 ; 5]

In [100]:
# add a 0.01 price change 
price_range = np.arange(0.5, 3.5, 0.01)

In [101]:
df_price_range = pd.DataFrame(price_range)

In [102]:
# now predict the purchase probabability using the trained model
Y_pr = model_purchase.predict_proba(df_price_range)



In [103]:
Y_pr

array([[0.08212568, 0.91787432],
       [0.08391319, 0.91608681],
       [0.08573598, 0.91426402],
       [0.08759457, 0.91240543],
       [0.08948951, 0.91051049],
       [0.09142134, 0.90857866],
       [0.09339059, 0.90660941],
       [0.09539781, 0.90460219],
       [0.09744353, 0.90255647],
       [0.09952829, 0.90047171],
       [0.10165263, 0.89834737],
       [0.10381709, 0.89618291],
       [0.10602219, 0.89397781],
       [0.10826847, 0.89173153],
       [0.11055646, 0.88944354],
       [0.11288668, 0.88711332],
       [0.11525965, 0.88474035],
       [0.11767588, 0.88232412],
       [0.12013589, 0.87986411],
       [0.12264018, 0.87735982],
       [0.12518924, 0.87481076],
       [0.12778357, 0.87221643],
       [0.13042364, 0.86957636],
       [0.13310994, 0.86689006],
       [0.13584292, 0.86415708],
       [0.13862304, 0.86137696],
       [0.14145074, 0.85854926],
       [0.14432646, 0.85567354],
       [0.14725062, 0.85274938],
       [0.15022362, 0.84977638],
       [0.

the predicted output above for the different prices shows the class probabilities for 0 & 1 (so unsucessful and successfull); each row == 100%

Probabbility of pruchase (== 1) is the second column

In [104]:
# get only the second column
purchase_pr = Y_pr[:][:, 1]

Question now is:

How is demand for the product change for a given change in price? --> elasiticies will answer this


Definition of price elasiticiy 

E = (delta_Pr(purchase) / (Pr(purchase)) / (delta_Price/Price))

(=) E = beta * price * (1 - Pr(purchase))

In [112]:
# price elastiticity == price coeff of the model
pe = model_purchase.coef_[:, 0] * price_range * (1 - purchase_pr)

#### The above array contains the price elasticity at each price point

In [114]:
df_price_elasticities = pd.DataFrame(price_range)

In [115]:
df_price_elasticities = df_price_elasticities.rename(columns = {0: 'Price_Point'})

In [117]:
df_price_elasticities['Mean_PE'] = pe

each row we have the price point and the price elastitiy of that price point (so a one percent change in price at price point x will increase/decrease the purchase probabaility by xxx%)

In [118]:
df_price_elasticities

Unnamed: 0,Price_Point,Mean_PE
0,0.50,-0.096421
1,0.51,-0.100490
2,0.52,-0.104687
3,0.53,-0.109013
4,0.54,-0.113472
...,...,...
295,3.45,-8.013243
296,3.46,-8.038493
297,3.47,-8.063708
298,3.48,-8.088890


In [119]:
pd.options.display.max_rows = None
df_price_elasticities

Unnamed: 0,Price_Point,Mean_PE
0,0.5,-0.096421
1,0.51,-0.10049
2,0.52,-0.104687
3,0.53,-0.109013
4,0.54,-0.113472
5,0.55,-0.118069
6,0.56,-0.122805
7,0.57,-0.127684
8,0.58,-0.132711
9,0.59,-0.137887


## Analyse the elastitiy result