### BASE (Papermill) >> Analysis of cannibalisation during PROMOTIONAL periods using Causal Impact (per store)


* Read the department/store data
* Read the calculated availability data or calculate it on demand
* Compare PROMO sales vs NON PROMO given the availability
* Find the products that change their average sales
* Pick a pair of products and try the sales weighted distance analysis
* Saves the pairs in 'sku_pair_analysis'

This notebook comes from the experiments in `CFAV_simulate_cannibalisation_CausalImpact.ipynb`


Updates:

26.10.2020 - First attempt to put it all together

31.10.2020 - Adding post-promotional checks

06.11.2020 - Add the trend of the total sales

25.11.2020 - (Apart from Mom's bday) Adding the covariates for robust estimation

In [1]:
import os
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import fcn_helpers as fhelp
import fcn_simulation as fsim
from causalimpact import CausalImpact
import promotional_helpers as promies
from statsmodels.tsa.seasonal import STL

# Folder named 'results' (relative to the working directory)
fhelp.makeFolder('results')

# Do not truncate the rows when a DataFrame is rendered
pd.set_option('display.max_rows', None)

# Colours of the active matplotlib property cycle (v2.0 colour cycle)
def_colours = plt.rcParams['axes.prop_cycle'].by_key()['color']

# Default figure size
# (alternative kept for reference: plt.rcParams['figure.figsize'] = [14, 7])
fig_h, fig_w = 10, 18


def fcn_compare(a, b):
    """Return the absolute difference between a and b, relative to the larger of the two."""
    return abs(a-b)/max(a,b)


def fcn_compare_snap_vs_regular(snap, reg):
    """Return the ratio snap/reg."""
    return snap/reg


# Mapping queried below with a department id to obtain its category id
mapper_family_to_category = fhelp.cfav_get_map_dept_to_cat()

In [2]:
# Root folder of the Corporacion Favorita data ('~' is expanded to the user's home)
dataFolder = os.path.expanduser('~/Google Drive/order/Machine Learning Part/data/CorporacionFavorita')
# Same location as the data root
xlsx_path = dataFolder
# Sub-folder 'graphs_cannibalisation' of the data root
graphsFolder = fhelp.fullfile(dataFolder, 'graphs_cannibalisation')
# Name of the sub-folder (under the data root) that receives the Causal Impact outputs
results_folder = 'CausalImpactResultsCovariates'

In [3]:
# ---------------------------------------------------------------------------
# Run parameters
# ---------------------------------------------------------------------------

# Save the plots
save_all_cannibals = False

# Store under analysis (alternative: 'Pichincha_49_A_11')
store_name = 'Pichincha_47_A_14'

# Department under analysis. Other values that have been used in this cell:
# 'LIQUOR,WINE,BEER', 'BREAD_BAKERY', 'FROZEN+FOODS', 'DAIRY', 'GROCERY_I'
dept_id = 'PRODUCE'

# --- Potential cannibals ----------------------------------------------------
# Threshold used to detect uplifters based on the difference in average sales
sales_threshold = 1/3
# Minimum average sales for a SKU to be considered
min_avg_sales = 10

# --- SKU analysis -----------------------------------------------------------
# Minimum bump between regular and promo
# and promo back to regular *0.25
min_diff_in_units_from_reg_to_promo = 10
# Minimum length of the promo/regular windows when analysing the SKUs
min_promo_days = 3
min_regular_days = 6

# --- Causal Impact analysis -------------------------------------------------
min_ratio_change = 0.4
do_exclude_promos_SKU_B = True
# Handy while debugging/developing
be_verbose = False

# --- Decomposition ----------------------------------------------------------
# LOESS - weekly
period_in_days = 7
# If true, use the sales without the weekly pattern
do_decomposition = False

# --- Cannibalisation --------------------------------------------------------
# Threshold used to detect the reduction in sales that one uplifter causes
cannibalisation_threshold = 1/3
# Minimum days/times that both products overlap during the promotions of the cannibal
min_snap_days = 10

# N/A
#price_threshold = 2

#### Read the store level data

In [4]:
# Load the department sales of the current store
category_id = mapper_family_to_category.get(dept_id, '')
foldername = os.path.join(dataFolder, category_id, dept_id, 'store_sales')
filename = os.path.join(foldername, f'{store_name}.pickle')
df_store = fhelp.readPickleFile(filename)

# Split the columns by role, using the marker contained in their name
sales_vars = [
    column for column in df_store.columns
    if 'sales-' in column
]
promotion_vars = [
    column for column in df_store.columns
    if 'promotion_flag-' in column
]

df_store.head(2)

Unnamed: 0,date,weekdays,sales-PRODUCE_1149069-Pichincha_47_A_14,promotion_flag-PRODUCE_1149069-Pichincha_47_A_14,sales-PRODUCE_1473393-Pichincha_47_A_14,promotion_flag-PRODUCE_1473393-Pichincha_47_A_14,sales-PRODUCE_1473394-Pichincha_47_A_14,promotion_flag-PRODUCE_1473394-Pichincha_47_A_14,sales-PRODUCE_1473396-Pichincha_47_A_14,promotion_flag-PRODUCE_1473396-Pichincha_47_A_14,...,promotion_flag-PRODUCE_2037487-Pichincha_47_A_14,sales-PRODUCE_2040637-Pichincha_47_A_14,promotion_flag-PRODUCE_2040637-Pichincha_47_A_14,sales-PRODUCE_2042210-Pichincha_47_A_14,promotion_flag-PRODUCE_2042210-Pichincha_47_A_14,sales-PRODUCE_2049081-Pichincha_47_A_14,promotion_flag-PRODUCE_2049081-Pichincha_47_A_14,sales-PRODUCE_2111870-Pichincha_47_A_14,promotion_flag-PRODUCE_2111870-Pichincha_47_A_14,total_units
0,2017-01-01,Sunday,0.0,False,0.0,False,0.0,False,0.0,False,...,False,0.0,False,0.0,False,0.0,False,0.0,False,0.0
1,2017-01-02,Monday,19.0,False,94.504,False,79.284,False,16.726,False,...,False,0.0,False,0.0,False,0.0,False,0.0,False,10746.39


#### Add weather data

In [5]:
# Daily weather: keep the date plus the selected weather variables
weather_file = os.path.join(dataFolder, 'weather_data', 'daily_weather_in_Quito2017_no_header.pickle')
df_weather = pd.read_pickle(weather_file).loc[
    :, ['date', 'avg_temp', 'wind_speed', 'total_precipitation', 'T2M_MAX_adj', 'T2M_MIN_adj']
]

# Inner join on the date: only the days present in both tables are kept
df_store = df_store.merge(df_weather, how='inner', on='date')

#### Add the trend of the department sales

In [6]:
# STL decomposition of the department's total units (period given by
# period_in_days); only the trend component is stored next to the sales
sales_decomposition_LOESS = STL(
    df_store['total_units'],
    period=period_in_days,
).fit()
df_store['total_units_trend'] = sales_decomposition_LOESS.trend

df_store.head(2)

Unnamed: 0,date,weekdays,sales-PRODUCE_1149069-Pichincha_47_A_14,promotion_flag-PRODUCE_1149069-Pichincha_47_A_14,sales-PRODUCE_1473393-Pichincha_47_A_14,promotion_flag-PRODUCE_1473393-Pichincha_47_A_14,sales-PRODUCE_1473394-Pichincha_47_A_14,promotion_flag-PRODUCE_1473394-Pichincha_47_A_14,sales-PRODUCE_1473396-Pichincha_47_A_14,promotion_flag-PRODUCE_1473396-Pichincha_47_A_14,...,promotion_flag-PRODUCE_2049081-Pichincha_47_A_14,sales-PRODUCE_2111870-Pichincha_47_A_14,promotion_flag-PRODUCE_2111870-Pichincha_47_A_14,total_units,avg_temp,wind_speed,total_precipitation,T2M_MAX_adj,T2M_MIN_adj,total_units_trend
0,2017-01-01,Sunday,0.0,False,0.0,False,0.0,False,0.0,False,...,False,0.0,False,0.0,11.03,3.62,6.29,20.139417,8.599041,5665.431807
1,2017-01-02,Monday,19.0,False,94.504,False,79.284,False,16.726,False,...,False,0.0,False,10746.39,11.13,4.15,4.42,20.100002,8.515361,6219.79884


In [7]:


# Resolve the category of the department (empty string when the department is not in the map)
category_id = mapper_family_to_category.get(dept_id, '')
# Results folder for this category/department, under <dataFolder>/<results_folder>
foldername = os.path.join(dataFolder, results_folder, category_id, dept_id)
fhelp.makeFolder(foldername)
# NOTE(review): `filename` is re-bound by the decomposition cell below before
# any visible use of this value - confirm whether this path is still needed
filename = os.path.join(foldername, f'{dept_id}-{store_name}.pickle')

#### Exclude the holiday periods from the analysis
Include it in the SKUs' heartbeat in the analysis below

In [8]:
# Christmas window: from 2016-12-21 (inclusive) up to 2017-01-04 (exclusive)
preXmas = df_store['date'].ge('2016-12-21')
postXmas = df_store['date'].lt('2017-01-04')
idx_Xmas = (preXmas & postXmas).values

# Holidays to exclude: only Christmas at the moment
idx_holiday_to_exclude = idx_Xmas

#### Read the store level sales decomposition
Generate the file if required
- (sales)
- heartbeat
- trend
- seasonal 
- residual

In [9]:
# Per-SKU sales decomposition: computed on demand and cached as a pickle
folderDecomposition = os.path.join(os.path.expanduser(dataFolder), 'sku_decomposition', category_id, dept_id)
filename = os.path.join(folderDecomposition, f'{store_name}.pickle')

if not os.path.exists(filename):
    decomposed_skus = []

    # Season-Trend decomposition using LOESS, one SKU at a time
    for sku_name in sales_vars:
        # Second dash-separated token of the column name
        item_id = sku_name.split('-')[1]

        df_decomposition = fhelp.decompose_signal(
            df_store[sku_name],
            period_in_days=period_in_days,
            minimum_heartbeat=0.85,
        )

        # Tag every component column with the item and the store
        decomposed_skus.append(df_decomposition.add_suffix(f'-{item_id}-{store_name}'))

    # Components of all the SKUs side by side, plus the calendar columns
    df_components = pd.concat(decomposed_skus, axis=1)
    df_components['date'] = df_store.date
    df_components['weekdays'] = df_store.weekdays

    fhelp.makeFolder(folderDecomposition)
    fhelp.toPickleFile(df_components, filename)
    print(f'Saving {filename}...')
else:
    df_components = fhelp.readPickleFile(filename)

df_components.head(3)

Unnamed: 0,heartbeat_flag-PRODUCE_1149069-Pichincha_47_A_14,trend-PRODUCE_1149069-Pichincha_47_A_14,seasonal-PRODUCE_1149069-Pichincha_47_A_14,residual-PRODUCE_1149069-Pichincha_47_A_14,heartbeat_flag-PRODUCE_1473393-Pichincha_47_A_14,trend-PRODUCE_1473393-Pichincha_47_A_14,seasonal-PRODUCE_1473393-Pichincha_47_A_14,residual-PRODUCE_1473393-Pichincha_47_A_14,heartbeat_flag-PRODUCE_1473394-Pichincha_47_A_14,trend-PRODUCE_1473394-Pichincha_47_A_14,...,heartbeat_flag-PRODUCE_2049081-Pichincha_47_A_14,trend-PRODUCE_2049081-Pichincha_47_A_14,seasonal-PRODUCE_2049081-Pichincha_47_A_14,residual-PRODUCE_2049081-Pichincha_47_A_14,heartbeat_flag-PRODUCE_2111870-Pichincha_47_A_14,trend-PRODUCE_2111870-Pichincha_47_A_14,seasonal-PRODUCE_2111870-Pichincha_47_A_14,residual-PRODUCE_2111870-Pichincha_47_A_14,date,weekdays
0,True,9.043584,-0.235571,-8.808014,True,42.800951,-19.710199,-23.090753,True,45.687401,...,False,-6.474897e-07,-2.045778e-07,8.520675e-07,False,-6.596094e-20,5.0592089999999997e-20,1.536885e-20,2017-01-01,Sunday
1,True,10.519302,-0.036504,8.517203,True,47.821638,32.343363,14.338999,True,47.717297,...,False,-5.478036e-07,3.179813e-07,2.298223e-07,False,-5.103465e-20,4.78901e-20,3.1445449999999998e-21,2017-01-02,Monday
2,True,11.956221,-6.637622,2.681401,True,52.482605,-1.802934,6.035328,True,49.525383,...,False,-4.244715e-07,1.013116e-06,-5.886443e-07,False,-3.592699e-20,4.515809e-20,-9.231098e-21,2017-01-03,Tuesday


## Compare PROMOS sales vs NON PROMOS given the availability (comparing windows of non-promos/promos)

Given the inferred availability of the SKU, calculate the following parameters for SNAP and regular periods: average sales and the standard deviation, the median and the number of days within each category.

```python
{'num_promo_slots': 2,
 'avg_promo_sales': 55.51020408163265,
 'avg_regular_sales': 41.74285714285714,
 'difference_averages_promo_to_regular': 13.76734693877551,
 'cum_difference_sales_promo_to_regular': -202.0,
 'slot_promo_avg_sales': array([56., 52.]),
 'slot_regular_avg_sales': array([37.38461538, 42.73684211]),
 'availability_value_sku_A': 0.986784140969163}
```


Then we simply select the uplifters as those SKUs that during SNAP periods have sales greater than:

'median_snap_sales'>=('median_regular_sales'*(1+sales_threshold))

### Comparison only valid if sales larger than zero
idx_B = (df_snap_stats['median_snap_sales']>0) & (df_snap_stats['median_regular_sales']>0)


### Meaning of uplifters and downlifters

Uplifters are SKUs that simply sell more when on promotion. Downlifters here are a bit of a special case, as they sell less when they are on promo. The promos are SKU-level, unlike SNAP or NATIONAL events.

In [10]:
# Promo vs regular sales statistics per SKU: computed on demand and cached as a pickle
folderPromoStats = os.path.join(os.path.expanduser(dataFolder), 'sku_promo_slot_analysis', category_id, dept_id)
filename = os.path.join(folderPromoStats, f'{store_name}.pickle')

if os.path.exists(filename):
    df_snap_stats = fhelp.readPickleFile(filename)
else:
    # One dictionary of statistics per SKU, in the same order as sales_vars.
    # The notebook-level `store_name` is intentionally not re-bound in this loop.
    sku_analysis = [
        promies.compare_promo_regular_sales(
            df_store, df_components, sku_A, idx_holiday_to_exclude,
            min_promo_days=min_promo_days, min_regular_days=min_regular_days)
        for sku_A in sales_vars
    ]

    # Stick the dicts into a DF, one row per SKU
    df_snap_stats = pd.DataFrame(sku_analysis)
    df_snap_stats.index = sales_vars

    # Drop the SKUs without a single promo slot (missing or < 1)
    idx_nonsense = df_snap_stats.num_promo_slots.isna() | (df_snap_stats.num_promo_slots < 1)
    df_snap_stats = df_snap_stats[~idx_nonsense].copy()

    # for backwards compatibility
    df_snap_stats['mu_difference'] = df_snap_stats['difference_averages_promo_to_regular']

    # Ratio of the promo average to the regular average. The small offset avoids
    # 0-divisions. Computed column-wise: a row-wise apply that reads the values by
    # position (row[0], row[1]) relies on positional indexing of a label-indexed
    # Series, which is deprecated in pandas 2.x.
    zero_division_offset = 0.01
    df_snap_stats['mu_delta'] = (
        (df_snap_stats['avg_promo_sales'] + zero_division_offset)
        / (df_snap_stats['avg_regular_sales'] + zero_division_offset)
    )

    # Largest ratios first, then cache the table
    df_snap_stats = df_snap_stats.sort_values(by=['mu_delta'], ascending=False)

    fhelp.makeFolder(folderPromoStats)
    fhelp.toPickleFile(df_snap_stats, filename)

In [11]:
# Preview the first three rows of the promo statistics (the table is sorted by
# mu_delta, descending, when it is computed in the previous cell)
df_snap_stats.head(3)

Unnamed: 0,num_promo_slots,avg_promo_sales,avg_regular_sales,promo_days,regular_days,difference_averages_promo_to_regular,cum_difference_sales_promo_to_regular,slot_promo_avg_sales,slot_regular_avg_sales,availability_value_sku_A,mu_difference,mu_delta
sales-PRODUCE_2040637-Pichincha_47_A_14,1.0,12.857143,0.0,7.0,47.0,12.857143,90.0,[12.857142857142858],[0.0],0.171806,12.857143,1286.714286
sales-PRODUCE_1966629-Pichincha_47_A_14,1.0,9.333333,0.0,3.0,47.0,9.333333,28.0,[9.333333333333334],[0.0],0.277533,9.333333,934.333333
sales-PRODUCE_1501544-Pichincha_47_A_14,1.0,5.5,0.00625,10.0,160.0,5.49375,54.0,[5.5],[0.00625],0.303965,5.49375,339.076923


#### Uplifters

In [12]:
# Uplifters: SKUs whose promo-to-regular ratio reaches (1 + sales_threshold)...
idx_A = df_snap_stats['mu_delta'].ge(1 + sales_threshold)

# ...provided that both the promo and the regular averages exceed min_avg_sales
idx_B = (
    df_snap_stats['avg_promo_sales'].gt(min_avg_sales)
    & df_snap_stats['avg_regular_sales'].gt(min_avg_sales)
)
df_snap_stats['uplift_in_median'] = idx_A & idx_B

# Keep the flagged SKUs apart: potential cannibals (and haloers)
df_snap_uplifters = df_snap_stats.loc[df_snap_stats['uplift_in_median']].copy()

In [13]:
# Strongest uplifters first: by ratio, then by difference in average sales
df_snap_uplifters = df_snap_uplifters.sort_values(by=['mu_delta', 'mu_difference'], ascending=False)

num_uplifters = len(df_snap_uplifters)
valid_uplifters = num_uplifters > 0

#fhelp.to_random_excel_file(df_snap_uplifters, writeIndex=True)
df_snap_uplifters.head(3)

Unnamed: 0,num_promo_slots,avg_promo_sales,avg_regular_sales,promo_days,regular_days,difference_averages_promo_to_regular,cum_difference_sales_promo_to_regular,slot_promo_avg_sales,slot_regular_avg_sales,availability_value_sku_A,mu_difference,mu_delta,uplift_in_median
sales-PRODUCE_1489895-Pichincha_47_A_14,1.0,29.533333,11.166667,15.0,6.0,18.366667,376.0,[29.533333333333335],[11.166666666666666],0.973568,18.366667,2.643305,True
sales-PRODUCE_1695874-Pichincha_47_A_14,1.0,24.666667,12.666667,9.0,6.0,12.0,146.0,[24.666666666666668],[12.666666666666666],0.986784,12.0,1.946621,True
sales-PRODUCE_1693647-Pichincha_47_A_14,1.0,51.466667,28.333333,15.0,6.0,23.133333,602.0,[51.46666666666667],[28.333333333333332],0.986784,23.133333,1.816183,True


#### Find connections between cannibals and victims during PROMOTIONAL periods



Up to here, we have demonstrated that there are uplifters. Now let's look for a connection between every uplifter and the rest of the SKUs (`sales_vars`).

We are using Causal Impact to analyse the SKUs (within the current `category_id` and `dept_id`).

In [14]:
# Causal Impact analysis of every (potential cannibal, potential victim) pair:
# computed on demand and cached as a pickle (plus an Excel copy)
folderCausalImpactResults = os.path.join(os.path.expanduser(dataFolder), results_folder, category_id, dept_id)
filenameCI = os.path.join(folderCausalImpactResults, f'{store_name}.pickle')

# The uplifters are the potential cannibals
sku_potential_cannibals = df_snap_uplifters.index.tolist()

if os.path.exists(filenameCI):
    df_CI_analysis = fhelp.readPickleFile(filenameCI)
else:
    causal_impact_analysis = []

    total_cannibals = len(sku_potential_cannibals)

    for idx, sku_A in enumerate(sku_potential_cannibals):

        print(f'{idx}/{total_cannibals}-{sku_A}')

        # Local names on purpose: the notebook-level `store_name` is not re-bound here
        _, _, sku_id_A, store_name_A = fhelp.get_taxonomy_from_sku_name_CFAV(sku_A)

        # Get the promotions and split them into slots
        promo_sku_A = df_store[f'promotion_flag-{sku_id_A}-{store_name_A}']
        idx_pre_intervention, idx_post_intervention = fhelp.split_promos_into_sequences(
            promo_sku_A, min_promo_days=min_promo_days, min_regular_days=min_regular_days)

        # Share of days flagged in the heartbeat of SKU A
        availability_sku_A = df_components[f'heartbeat_flag-{sku_id_A}-{store_name_A}']
        availability_value_sku_A = availability_sku_A.sum()/len(availability_sku_A)
        flag_min_availability_sku_A = availability_value_sku_A > 0.9

        # TO-DO: Decomposition should be done according to the SKU's patterns
        if flag_min_availability_sku_A and do_decomposition:
            # Sales without the seasonal component
            sales_sku_A = df_components[f'residual-{sku_id_A}-{store_name_A}'] + df_components[f'trend-{sku_id_A}-{store_name_A}']
        else:
            sales_sku_A = df_store[sku_A]

        # Go through all the other SKUs in the store. A fresh list is built on
        # every iteration: removing sku_A from `sales_vars` itself would shrink
        # the shared list, so the cannibals already processed would never be
        # evaluated as victims of the following ones.
        sku_potential_victims = [sku for sku in sales_vars if sku != sku_A]

        for sku_B in sku_potential_victims:
            _, _, sku_id_B, store_name_B = fhelp.get_taxonomy_from_sku_name_CFAV(sku_B)
            availability_sku_B = df_components[f'heartbeat_flag-{sku_id_B}-{store_name_B}']
            promo_sku_B = df_store[f'promotion_flag-{sku_id_B}-{store_name_B}']

            # Decide what to do with sku_B on promo
            # - Remove the promo days? when? outside the cannibalisation window?
            # - We should compare windows of pre/post promo, not the entire year
            availability_value_sku_B = availability_sku_B.sum()/len(availability_sku_B)
            flag_min_availability_sku_B = availability_value_sku_B > 0.9

            # Sales of SKU B (first column) followed by the covariates. Explicit
            # copy, so that the optional overwrite below never writes into df_store.
            df_sales_covariates = df_store[[f'sales-{sku_id_B}-{store_name_B}', 'total_units_trend', 'T2M_MAX_adj']].copy()

            if flag_min_availability_sku_B and do_decomposition:
                # Overwrite the sales of SKU B with the version without the seasonal component
                sales_sku_B = df_components[f'residual-{sku_id_B}-{store_name_B}'] + df_components[f'trend-{sku_id_B}-{store_name_B}']
                df_sales_covariates.iloc[:, 0] = sales_sku_B

            ci_analysis = promies.calculate_causal_impact_with_covariates(
                sku_id_A, promo_sku_A,
                availability_sku_A, sales_sku_A,
                sku_id_B, promo_sku_B, availability_sku_B,
                df_sales_covariates,
                idx_pre_intervention, idx_post_intervention,
                idx_holiday_to_exclude,
                min_diff_in_units_from_reg_to_promo,
                min_ratio_change=min_ratio_change,
                do_exclude_promos_SKU_B=do_exclude_promos_SKU_B,
                be_verbose=be_verbose)

            # Falsy results (None/empty) are skipped
            if ci_analysis:
                causal_impact_analysis.extend(ci_analysis)

    # Save the analysis
    df_CI_analysis = pd.DataFrame(causal_impact_analysis)
    fhelp.makeFolder(folderCausalImpactResults)
    fhelp.toPickleFile(df_CI_analysis, filenameCI)
    # Also save it in Excel
    fhelp.to_excel_file(df_CI_analysis, filenameCI.replace('.pickle', '.xlsx'))

df_CI_analysis.head(3)

0/7-sales-PRODUCE_1489895-Pichincha_47_A_14
Running Causal Impact...




CausalImpact >> Probability of a causal event 100.00
Running Causal Impact...




CausalImpact >> Probability of a causal event 100.00
1/7-sales-PRODUCE_1695874-Pichincha_47_A_14
Running Causal Impact...




CausalImpact >> Probability of a causal event 100.00
Running Causal Impact...




CausalImpact >> Probability of a causal event 88.31
Running Causal Impact...




CausalImpact >> Probability of a causal event 99.40
Running Causal Impact...




CausalImpact >> Probability of a causal event 91.81
Running Causal Impact...




CausalImpact >> Probability of a causal event 100.00
2/7-sales-PRODUCE_1693647-Pichincha_47_A_14
Running Causal Impact...




CausalImpact >> Probability of a causal event 100.00
Running Causal Impact...




CausalImpact >> Probability of a causal event 100.00
3/7-sales-PRODUCE_1945991-Pichincha_47_A_14
Running Causal Impact...




CausalImpact >> Probability of a causal event 93.81
Running Causal Impact...




CausalImpact >> Probability of a causal event 100.00
Running Causal Impact...




CausalImpact >> Probability of a causal event 100.00
Running Causal Impact...




CausalImpact >> Probability of a causal event 100.00
Running Causal Impact...




CausalImpact >> Probability of a causal event 99.20
4/7-sales-PRODUCE_1960806-Pichincha_47_A_14
Running Causal Impact...




CausalImpact >> Probability of a causal event 97.00
5/7-sales-PRODUCE_1960944-Pichincha_47_A_14
Running Causal Impact...




CausalImpact >> Probability of a causal event 70.93
6/7-sales-PRODUCE_1642401-Pichincha_47_A_14
Running Causal Impact...




CausalImpact >> Probability of a causal event 100.00
Running Causal Impact...




CausalImpact >> Probability of a causal event 100.00
Running Causal Impact...




CausalImpact >> Probability of a causal event 100.00
Pickle file saved to /Users/carlos.aguilar/Google Drive/order/Machine Learning Part/data/CorporacionFavorita/CausalImpactResults/FOOD/PRODUCE/Pichincha_47_A_14.pickle


Unnamed: 0,cannibal,victim,slot_number,idx_regular_days,idx_promo_days,total_overlapping_days_regular,regular_to_promo_gap,total_overlapping_days_promo,competing_promo_days,sku_B_regular_avg_sales,sku_B_avg_sales_during_promo_sku_A,diff_in_units_from_reg_to_promo,diff_in_units_from_promo_to_pos_promo,ratio_change,avg_actual,avg_predicted,avg_abs_effect,cum_abs_effect,posterior_tail_prob,prob_causal_effect
0,PRODUCE_1489895,PRODUCE_1695965,0,"[172, 177]","[178, 192]",6,0,15,3,41.759167,19.028067,22.7311,-4.960648,0.544338,19.028067,-34.731208,53.759275,806.389121,0.0,100.0
1,PRODUCE_1489895,PRODUCE_1695989,0,"[172, 177]","[178, 192]",6,0,13,2,28.333333,14.769231,13.564103,-11.659341,0.478733,12.8,-31.674831,44.474831,667.122466,0.0,100.0
2,PRODUCE_1695874,PRODUCE_1695837,0,"[200, 205]","[206, 214]",6,0,9,1,20.906333,7.652,13.254333,-11.192429,0.633987,7.652,40.329843,-32.677843,-294.100589,0.0,100.0


In [15]:
### If things go South

Now I have to do a decent amount of work to differentiate between effects: promo>non-promo or promo>partial_promo, etc

### Post mortem analysis

### post mortem - compare two products

In [18]:
# Optional debugging cell: when run_post_mortem is True, the Causal Impact
# analysis is re-run, verbosely, for one hand-picked (sku_A -> sku_B) pair.
run_post_mortem = False
#run_post_mortem = True
if run_post_mortem: 
    # NOTE(review): this re-binds the notebook-level threshold that the main
    # analysis cell also reads; the lower value stays in place after this cell
    min_diff_in_units_from_reg_to_promo = 5
    # NOTE(review): idx is not read anywhere in this cell
    idx =0 
    # Pair under inspection (the names are built with the current store_name)
    sku_A = f'sales-PRODUCE_1693647-{store_name}'
    sku_B = f'sales-PRODUCE_1695965-{store_name}'
    print(f'{sku_A}->{sku_B}')

    # store_name is re-bound here with the value extracted from the SKU name
    category_id_A, dept_id_A, sku_id_A, store_name = fhelp.get_taxonomy_from_sku_name_CFAV(sku_A)
    # NOTE(review): state_name is not read anywhere in this cell
    state_name = store_name.split('_')[0]

    # Get the promotions and split them into slots
    promo_sku_A = df_store[f'promotion_flag-{sku_id_A}-{store_name}']
    idx_pre_intervention, idx_post_intervention = \
        fhelp.split_promos_into_sequences(promo_sku_A, min_promo_days=min_promo_days, min_regular_days=min_regular_days)

    # Share of days flagged in the heartbeat of SKU A; above 0.9 counts as available enough
    availability_sku_A = df_components[f'heartbeat_flag-{sku_id_A}-{store_name}']
    availability_value_sku_A = availability_sku_A.sum()/len(availability_sku_A)
    flag_min_availability_sku_A = availability_value_sku_A > 0.9

    # TO-DO: Decomposition should be done according to the SKU's patterns
    if flag_min_availability_sku_A & do_decomposition:
        # Sales without the seasonal component (residual + trend)
        sales_sku_A = df_components[f'residual-{sku_id_A}-{store_name}'] + df_components[f'trend-{sku_id_A}-{store_name}']
    else:
        sales_sku_A = df_store[sku_A]

    category_id_B, dept_id_B, sku_id_B, store_name = fhelp.get_taxonomy_from_sku_name_CFAV(sku_B)
    availability_sku_B = df_components[f'heartbeat_flag-{sku_id_B}-{store_name}']
    promo_sku_B = df_store[f'promotion_flag-{sku_id_B}-{store_name}']


    # Decide what to do with sku_B on promo
    # - Remove the promo days? when? outside the cannibalisation window?
    # - We should compare windows of pre/post promo, not the entire year
    availability_value_sku_B = availability_sku_B.sum()/len(availability_sku_B)
    flag_min_availability_sku_B = availability_value_sku_B > 0.9

    if flag_min_availability_sku_B & do_decomposition:
        sales_sku_B = df_components[f'residual-{sku_id_B}-{store_name}'] + df_components[f'trend-{sku_id_B}-{store_name}']
    else:
        sales_sku_B = df_store[sku_B]

    # NOTE(review): this calls calculate_causal_impact with the plain sales of
    # SKU B, whereas the main analysis cell calls
    # calculate_causal_impact_with_covariates - confirm that this is intended
    ci_analysis = promies.calculate_causal_impact(sku_id_A, promo_sku_A, availability_sku_A, sales_sku_A, \
        sku_id_B, promo_sku_B, availability_sku_B, sales_sku_B, \
        idx_pre_intervention, idx_post_intervention, \
        idx_holiday_to_exclude, \
        min_diff_in_units_from_reg_to_promo, \
        min_ratio_change = min_ratio_change,\
        do_exclude_promos_SKU_B = do_exclude_promos_SKU_B, \
        be_verbose=True)

    # NOTE(review): this expression sits inside the `if`, so it is presumably
    # not rendered as the cell output - confirm
    ci_analysis

### Save the analysis

### Save all the plots

In [19]:
# Destination of the plots (announced even when nothing is going to be saved)
folder_to_save_plots = os.path.join(dataFolder, results_folder)
print(f'Saving the plots in {folder_to_save_plots}')

if save_all_cannibals:
    # One plot per row of the Causal Impact results
    for _, ci_row in df_CI_analysis.iterrows():
        fhelp.plot_causal_pairs_exogenous(
            ci_row, df_store,
            fig_h=10, fig_w=18,
            folder_to_save_plots=folder_to_save_plots,
            save_to_file=True,
        )

Saving the plots in /Users/carlos.aguilar/Google Drive/order/Machine Learning Part/data/CorporacionFavorita/CausalImpactResults


### Create a directed graph of the interactions

In [20]:
# Directed graph of the interactions: the cells below add one edge per row of
# the Causal Impact results, going from the cannibal to the victim.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell
import networkx as nx
DG = nx.DiGraph()

In [21]:
def _format_edge_value(value):
    """Render value with two decimals when it supports float formatting, otherwise as plain text."""
    try:
        return f'{value:3.2f}'
    except (TypeError, ValueError):
        # Non-numeric values (e.g. strings or None) cannot take the 'f' format
        return str(value)


def add_graph_relationship(node_A, node_B, edge_properties: dict, graph=None):
    """Add two nodes and the directed edge node_A -> node_B to a graph.

    Parameters
    ----------
    node_A, node_B : dict
        Each one holds the node identifier under 'name' and the node
        attributes (a dict) under 'properties'.
    edge_properties : dict
        Attributes stored on the edge. They are also rendered as one
        'key: value' line each into the extra edge attribute 'label'.
    graph : optional
        Object exposing add_node/add_edge. When omitted, the notebook-level
        graph DG is used.

    Raises
    ------
    TypeError
        If edge_properties already contains the key 'label', as it would
        clash with the generated one.
    """
    if graph is None:
        graph = DG

    graph.add_node(node_A['name'], **node_A['properties'])
    graph.add_node(node_B['name'], **node_B['properties'])

    edge_label = '\n'.join(
        f'{key}: {_format_edge_value(value)}' for key, value in edge_properties.items()
    )
    graph.add_edge(node_A['name'], node_B['name'], **edge_properties, label=edge_label)

In [22]:
# Columns of the Causal Impact results that are attached to every edge
# (earlier candidates: 'snap_days', 'regular_days', 'sku_B_regular',
#  'sku_B_snap', 'cannibal_price', 'victim_price')
vars_edges = ['sku_B_regular_avg_sales','avg_predicted', 'slot_number']

# One edge per result row, from the cannibal to the victim; the nodes carry no attributes
for _, ci_row in df_CI_analysis.iterrows():
    add_graph_relationship(
        {'name': ci_row.cannibal, 'properties': {}},
        {'name': ci_row.victim, 'properties': {}},
        ci_row[vars_edges].to_dict(),
    )

In [23]:
if save_all_cannibals:
    # Vertical shift of the node names with respect to the node position
    text_offset = 3e-4
    # Larger canvas for the graph (fig_h/fig_w are re-bound at notebook level)
    fig_h, fig_w = 20, 36
    fig = plt.figure(figsize=(fig_w, fig_h))

    # Node positions computed by graphviz ('sfdp' program)
    pos = nx.nx_agraph.graphviz_layout(DG, prog="sfdp")
    nx.draw(
        DG, pos,
        node_color='lightblue', linewidths=0.5, font_size=10,
        font_weight='bold', with_labels=False, edge_color='r',
    )

    # Edge labels as stored in the 'label' attribute of every edge
    labels = nx.get_edge_attributes(DG, 'label')
    nx.draw_networkx_edge_labels(DG, pos, edge_labels=labels, font_size=8)

    # Node names written by hand, slightly above each node
    for node_name, node_xy in pos.items():
        plt.text(node_xy[0], node_xy[1] + text_offset, s=node_name, horizontalalignment='center')

    # Save the figure as a PDF under <results>/<category>/<department>/graphs
    foldername = os.path.join(dataFolder, results_folder, category_id, dept_id, 'graphs')
    fhelp.makeFolder(foldername)
    plt_filename = os.path.join(foldername, f'cannibals-{store_name}.pdf')
    plt.savefig(plt_filename, format='pdf', dpi=300, bbox_inches='tight')