This notebook generates the results table that goes into the paper.


Tip:
save the generated table as `test_table.tex` and include it from within LaTeX as follows:
```
\begin{table*}[h]
\centering
\resizebox{1.1\textwidth}{!}{
\input{tables/test_table.tex}
}
\caption{Explanation} 
\label{tab:ssss}
\end{table*}
```

In [1]:
import numpy as np
import uuid
import pandas as pd
import os
import pickle
from datetime import datetime
import matplotlib.pyplot as plt
import matplotlib
import fcn_helpers as fhelp
import glob

# Root data folder used by the cells below; '~' is expanded to the current user's home directory.
# NOTE(review): machine-specific location - adjust before running on another machine.
dataFolder = os.path.expanduser('~/Google Drive/order/Machine Learning Part/data/CorporacionFavorita')



In [2]:
def writeTextFile(thisStr, thisFile):
    '''
    Write the string thisStr to the file at path thisFile.

    The file is opened in text mode 'w', so any existing content is replaced.
    '''
    with open(thisFile, 'w') as text_handle:
        text_handle.write(thisStr)

In [3]:
def get_results_from_analysis(path_to_file,
                              save_all_cannibals=True,
                              min_diff_in_units_from_reg_to_promo=10,
                              min_promo_days=3,
                              min_regular_days=6,
                              min_ratio_change=0.5,
                              do_exclude_promos_SKU_B=True,
                              be_verbose=False,
                              sales_threshold=1/3,
                              min_avg_sales=10,
                              period_in_days=7,
                              do_decomposition=False,
                              cannibalisation_threshold=1/3,
                              min_snap_days=10):
    '''
    Summarise the results stored for one store into a flat dictionary.

    Reads the pickled results frame at path_to_file, then the store sales
    frame and the promotion statistics frame located below the module-level
    dataFolder, and condenses them into one row for the results table.

    Parameters
    ----------
    path_to_file : str
        Path to the pickled results DataFrame. The file name up to its first
        '.' is used as the store name.
    sales_threshold : float
        A SKU counts as a potential cannibal only if its 'mu_delta' is at
        least 1 + sales_threshold.
    min_avg_sales : float
        Both 'avg_promo_sales' and 'avg_regular_sales' must exceed this value
        for a SKU to count as a potential cannibal.
    save_all_cannibals, min_diff_in_units_from_reg_to_promo, min_promo_days,
    min_regular_days, min_ratio_change, do_exclude_promos_SKU_B, be_verbose,
    period_in_days, do_decomposition, cannibalisation_threshold, min_snap_days
        Not read by this function; kept so that existing calls keep working.

    Returns
    -------
    dict or None
        None when the results frame is empty or when no promotion statistics
        file exists for the store. Otherwise a dictionary with the keys:
        * category
        * department
        * store
        * number skus > total_number_skus
        * number skus that have got promotions and are worth investigating > num_skus_promotions
        * number skus that could be cannibals > num_sku_potential_cannibals
        * number of combinations analysed > num_combinations_analysed
        * number of cannibals > number_cannibals
        * number of victims > number_victims
        * number of cannibalisation episodes > num_cannibalisation_episodes
        * percentage of cannibalisation > avg_can_percentage, std_can_percentage
        * average daily effect (units) > avg_abs_effect, std_abs_effect
        * cumulative effect (units) > avg_cum_abs_effect, total_cum_abs_effect
        * average probability of causal effect > avg_prob_causal_effect
    '''
    # NOTE(review): unpickling can execute arbitrary code - only load trusted files.
    causal_impact_results = pd.read_pickle(path_to_file)

    if causal_impact_results.empty:
        return None

    # basename() honours the platform's path separator instead of assuming '/'
    current_store_name = os.path.basename(path_to_file).split('.')[0]

    # Positional access: take the first row whatever the index labels are
    # (a label lookup of 0 fails when the stored frame's index does not contain 0).
    first_cannibal = causal_impact_results['cannibal'].iloc[0]
    sku_A = f'sales-{first_cannibal}-{current_store_name}'
    category_id, dept_id, _, store_name = fhelp.get_taxonomy_from_sku_name_CFAV(sku_A)

    # Store-level frame: every column whose name contains 'sales-' counts as one SKU
    store_sales_file = os.path.join(dataFolder, category_id, dept_id, 'store_sales',
                                    f'{store_name}.pickle')
    df_store = fhelp.readPickleFile(store_sales_file)
    total_number_skus = sum(1 for column_name in df_store.columns if 'sales-' in column_name)

    # Promotion statistics of the store; without them there is nothing to report
    promo_stats_file = os.path.join(os.path.expanduser(dataFolder), 'sku_promo_slot_analysis',
                                    category_id, dept_id, f'{store_name}.pickle')
    if not os.path.exists(promo_stats_file):
        return None
    df_snap_stats = fhelp.readPickleFile(promo_stats_file)

    num_skus_promotions = len(df_snap_stats.index)

    # Potential cannibals: mu_delta at or above 1 + threshold ...
    has_uplift = df_snap_stats['mu_delta'] >= (1 + sales_threshold)
    # ... and the comparison is only trusted when both averages are above the minimum
    has_enough_sales = (df_snap_stats['avg_promo_sales'] > min_avg_sales) \
        & (df_snap_stats['avg_regular_sales'] > min_avg_sales)
    num_sku_potential_cannibals = int((has_uplift & has_enough_sales).sum())

    # Number of combinations analysed:
    #   potential cannibals * potential victims * promotional slots
    # with potential victims = total_number_skus - 1 and the number of
    # promotional slots approximated by the mean of num_promo_slots.
    num_combinations_analysed = round(
        num_sku_potential_cannibals * (total_number_skus - 1) * df_snap_stats.num_promo_slots.mean())

    # Final numbers: one row of the results frame is one episode
    number_cannibals = len(causal_impact_results['cannibal'].unique())
    number_victims = len(causal_impact_results['victim'].unique())
    num_cannibalisation_episodes = causal_impact_results.shape[0]

    percentage_cannibalisation = 100 * causal_impact_results['sku_B_avg_sales_during_promo_sku_A'] \
        / causal_impact_results['sku_B_regular_avg_sales']

    return {
        'category': category_id,
        'department': dept_id,
        'store': store_name,
        'total_number_skus': total_number_skus,
        'num_skus_promotions': num_skus_promotions,
        'num_sku_potential_cannibals': num_sku_potential_cannibals,
        'num_combinations_analysed': num_combinations_analysed,
        'number_cannibals': number_cannibals,
        'number_victims': number_victims,
        'num_cannibalisation_episodes': num_cannibalisation_episodes,
        'avg_can_percentage': percentage_cannibalisation.mean(),
        'std_can_percentage': percentage_cannibalisation.std(),
        # average daily effect (units)
        'avg_abs_effect': causal_impact_results['avg_abs_effect'].mean(),
        'std_abs_effect': causal_impact_results['avg_abs_effect'].std(),
        # cumulative effect (units)
        'avg_cum_abs_effect': causal_impact_results['cum_abs_effect'].mean(),
        'total_cum_abs_effect': causal_impact_results['cum_abs_effect'].sum(),
        'avg_prob_causal_effect': causal_impact_results['prob_causal_effect'].mean(),
    }

In [4]:
# One results pickle per store, two directory levels below 'CausalImpactResults'
# (presumably <category>/<department> - confirm against the folder layout).
causal_impact_root = os.path.join(os.path.expanduser(dataFolder), 'CausalImpactResults')
glob_pattern = os.path.join(causal_impact_root, '*', '*', '*.pickle')
store_level_files = glob.glob(glob_pattern)

In [5]:
# Collect one summary dictionary per store file; files that yield nothing
# (None or an empty result) are left out.
all_results = []
for this_store in store_level_files:
    current_results = get_results_from_analysis(this_store)
    if not current_results:
        continue
    all_results.append(current_results)

In [6]:
# One row per store, ordered by category, department and store
df_results = pd.DataFrame(all_results).sort_values(by=['category', 'department', 'store'])
df_results.head()

Unnamed: 0,category,department,store,total_number_skus,num_skus_promotions,num_sku_potential_cannibals,num_combinations_analysed,number_cannibals,number_victims,num_cannibalisation_episodes,avg_can_percentage,std_can_percentage,avg_abs_effect,std_abs_effect,avg_cum_abs_effect,total_cum_abs_effect,avg_prob_causal_effect
8,DRINKS,BEVERAGES,Pichincha_3_D_8,613,173,52,48748.0,56,59,165,29.921336,16.138178,-24.94963,29.19892,-317.300571,-52354.594201,95.746678
10,DRINKS,BEVERAGES,Pichincha_44_A_5,613,180,63,58262.0,78,95,308,29.566408,15.375405,-34.093438,48.249196,-477.423366,-147046.396805,96.140872
1,DRINKS,BEVERAGES,Pichincha_45_A_11,613,180,58,54230.0,72,70,270,31.375844,14.482227,-27.992059,21.867931,-402.133477,-108576.038828,94.464794
9,DRINKS,BEVERAGES,Pichincha_46_A_14,613,175,40,35391.0,40,47,134,31.830912,15.494096,-25.720304,17.485207,-438.090819,-58704.169728,95.584266
2,DRINKS,BEVERAGES,Pichincha_47_A_14,613,179,68,65795.0,77,77,244,31.381385,15.411028,-34.971578,40.34102,-497.78763,-121460.181719,96.798284


In [11]:
# Headline figures across all stores.
# Uses the original column names, so it has to run before the columns are renamed.
total_cannibals = df_results['number_cannibals'].sum()
total_episodes = df_results['num_cannibalisation_episodes'].sum()
total_cum_abs_effect = df_results['total_cum_abs_effect'].sum()
avg_percentage = df_results['avg_can_percentage'].mean()

(total_cannibals, total_episodes, total_cum_abs_effect, avg_percentage)

(675, 1965, -719271.1879647669, 30.993504783481097)

Adapt the following list of columns to choose what the table shows:

['category',
 'department',
 'store',
 'total_number_skus',
 'num_skus_promotions',
 'num_sku_potential_cannibals',
 'num_combinations_analysed',
 'number_cannibals',
 'number_victims',
 'num_cannibalisation_episodes',
 'avg_can_percentage',
 'std_can_percentage',
 'avg_abs_effect',
 'std_abs_effect',
 'avg_cum_abs_effect',
 'total_cum_abs_effect',
 'avg_prob_causal_effect']

# Print one "'name':'name'," line per column, ready to paste into a rename dictionary
for iCol in df_results.columns.tolist():
    print("'{0}':'{0}',".format(iCol))

In [7]:
def format_mean_pm_std(mean_value, std_value):
    '''
    Return '$<mean> PLUSMINUS <std>$' with one decimal for each number.

    PLUSMINUS is a placeholder that is replaced by the LaTeX plus-minus
    macro once the table has been exported.
    '''
    return f'${mean_value:3.1f} PLUSMINUS {std_value:3.1f}$'


# Columns are paired up by name; positional look-ups such as row[0] on a
# label-indexed row are deprecated in recent pandas versions.
df_results['percentage'] = [
    format_mean_pm_std(mean_value, std_value)
    for mean_value, std_value in zip(df_results['avg_can_percentage'], df_results['std_can_percentage'])
]
df_results['average_daily_losses'] = [
    format_mean_pm_std(mean_value, std_value)
    for mean_value, std_value in zip(df_results['avg_abs_effect'], df_results['std_abs_effect'])
]
df_results['cat_dep'] = [
    f'{category}-{department}'
    for category, department in zip(df_results['category'], df_results['department'])
]
# The store number is the second '_'-separated token of the store name
df_results['store_num'] = [store_name.split('_')[1] for store_name in df_results['store']]

df_results['sku_promo'] = [
    f'{n_skus:d}/{n_promo:d}'
    for n_skus, n_promo in zip(df_results['total_number_skus'], df_results['num_skus_promotions'])
]

df_results.head()

Unnamed: 0,category,department,store,total_number_skus,num_skus_promotions,num_sku_potential_cannibals,num_combinations_analysed,number_cannibals,number_victims,num_cannibalisation_episodes,...,avg_abs_effect,std_abs_effect,avg_cum_abs_effect,total_cum_abs_effect,avg_prob_causal_effect,percentage,average_daily_losses,cat_dep,store_num,sku_promo
8,DRINKS,BEVERAGES,Pichincha_3_D_8,613,173,52,48748.0,56,59,165,...,-24.94963,29.19892,-317.300571,-52354.594201,95.746678,$29.9 PLUSMINUS 16.1$,$-24.9 PLUSMINUS 29.2$,DRINKS-BEVERAGES,3,613/173
10,DRINKS,BEVERAGES,Pichincha_44_A_5,613,180,63,58262.0,78,95,308,...,-34.093438,48.249196,-477.423366,-147046.396805,96.140872,$29.6 PLUSMINUS 15.4$,$-34.1 PLUSMINUS 48.2$,DRINKS-BEVERAGES,44,613/180
1,DRINKS,BEVERAGES,Pichincha_45_A_11,613,180,58,54230.0,72,70,270,...,-27.992059,21.867931,-402.133477,-108576.038828,94.464794,$31.4 PLUSMINUS 14.5$,$-28.0 PLUSMINUS 21.9$,DRINKS-BEVERAGES,45,613/180
9,DRINKS,BEVERAGES,Pichincha_46_A_14,613,175,40,35391.0,40,47,134,...,-25.720304,17.485207,-438.090819,-58704.169728,95.584266,$31.8 PLUSMINUS 15.5$,$-25.7 PLUSMINUS 17.5$,DRINKS-BEVERAGES,46,613/175
2,DRINKS,BEVERAGES,Pichincha_47_A_14,613,179,68,65795.0,77,77,244,...,-34.971578,40.34102,-497.78763,-121460.181719,96.798284,$31.4 PLUSMINUS 15.4$,$-35.0 PLUSMINUS 40.3$,DRINKS-BEVERAGES,47,613/179


In [8]:
# Average number of SKUs and of promoted SKUs per category/department
aggregations = {'total_number_skus': 'mean', 'num_skus_promotions': 'mean'}
df_results_agg = (
    df_results
    .groupby(['category', 'department'], as_index=False)
    .agg(aggregations)
    .sort_values(by=['category', 'department'])
)
# Build the label from the named columns; positional look-ups such as row[0]
# on a label-indexed row are deprecated in recent pandas versions.
df_results_agg['cat_dep'] = [
    f'{category}-{department}'
    for category, department in zip(df_results_agg['category'], df_results_agg['department'])
]
vars_to_save = ['cat_dep', 'total_number_skus', 'num_skus_promotions']
str_latex = df_results_agg[vars_to_save].to_latex(index=False, float_format='{:3.2f}'.format)
print(str_latex)

\begin{tabular}{lrr}
\toprule
           cat\_dep &  total\_number\_skus &  num\_skus\_promotions \\
\midrule
  DRINKS-BEVERAGES &                613 &               171.73 \\
 FOOD-BREAD\_BAKERY &                134 &                44.50 \\
        FOOD-DAIRY &                242 &               136.20 \\
         FOOD-DELI &                 91 &                14.50 \\
      FOOD-POULTRY &                 54 &                 1.00 \\
      FOOD-PRODUCE &                306 &                40.55 \\
\bottomrule
\end{tabular}



In [9]:
# NOTE(review): presumably exports df_results to an Excel file with a generated name - confirm in fcn_helpers.
fhelp.to_random_excel_file(df_results)

In [10]:
# Maps result column names to the headers used in the table.
# Entries that map a name to itself leave that column name unchanged.
d = dict(
    # counts
    total_number_skus='skus',
    num_skus_promotions='promotions',
    num_sku_potential_cannibals='potential cannibals',
    num_combinations_analysed='combinations analysed',
    number_cannibals='cannibals',
    number_victims='victims',
    num_cannibalisation_episodes='episodes',
    # percentage statistics
    avg_can_percentage='avg_can_percentage',
    std_can_percentage='std_can_percentage',
    # daily effect statistics
    avg_abs_effect='avg_abs_effect',
    std_abs_effect='std_abs_effect',
    # cumulative effect and probability
    total_cum_abs_effect='Cum abs effect',
    avg_prob_causal_effect='P_{CE}',
)

In [11]:
# Switch to the table headers; columns without an entry in d keep their name
df_results = df_results.rename(columns=d)
df_results.head()

Unnamed: 0,category,department,store,skus,promotions,potential cannibals,combinations analysed,cannibals,victims,episodes,...,avg_abs_effect,std_abs_effect,avg_cum_abs_effect,Cum abs effect,P_{CE},percentage,average_daily_losses,cat_dep,store_num,sku_promo
8,DRINKS,BEVERAGES,Pichincha_3_D_8,613,173,52,48748.0,56,59,165,...,-24.94963,29.19892,-317.300571,-52354.594201,95.746678,$29.9 PLUSMINUS 16.1$,$-24.9 PLUSMINUS 29.2$,DRINKS-BEVERAGES,3,613/173
10,DRINKS,BEVERAGES,Pichincha_44_A_5,613,180,63,58262.0,78,95,308,...,-34.093438,48.249196,-477.423366,-147046.396805,96.140872,$29.6 PLUSMINUS 15.4$,$-34.1 PLUSMINUS 48.2$,DRINKS-BEVERAGES,44,613/180
1,DRINKS,BEVERAGES,Pichincha_45_A_11,613,180,58,54230.0,72,70,270,...,-27.992059,21.867931,-402.133477,-108576.038828,94.464794,$31.4 PLUSMINUS 14.5$,$-28.0 PLUSMINUS 21.9$,DRINKS-BEVERAGES,45,613/180
9,DRINKS,BEVERAGES,Pichincha_46_A_14,613,175,40,35391.0,40,47,134,...,-25.720304,17.485207,-438.090819,-58704.169728,95.584266,$31.8 PLUSMINUS 15.5$,$-25.7 PLUSMINUS 17.5$,DRINKS-BEVERAGES,46,613/175
2,DRINKS,BEVERAGES,Pichincha_47_A_14,613,179,68,65795.0,77,77,244,...,-34.971578,40.34102,-497.78763,-121460.181719,96.798284,$31.4 PLUSMINUS 15.4$,$-35.0 PLUSMINUS 40.3$,DRINKS-BEVERAGES,47,613/179


In [12]:
# Store numbers are strings, so they are listed in lexicographic order
store_numbers_sorted = sorted(df_results['store_num'].unique().tolist())
print(','.join(store_numbers_sorted))
    

3,4,44,45,46,47,48,49,6,7,8


In [13]:
# Columns of the exported table, in display order
vars_to_save = [
    'cat_dep',
    'store_num',
    'sku_promo',
    'combinations analysed',
    'cannibals',
    'victims',
    'episodes',
    'P_{CE}',
    'percentage',
    'avg_cum_abs_effect',
    'average_daily_losses',
]
# Independent copy so later changes do not touch df_results
df_latex = df_results.loc[:, vars_to_save].copy()
df_latex.head()

Unnamed: 0,cat_dep,store_num,sku_promo,combinations analysed,cannibals,victims,episodes,P_{CE},percentage,avg_cum_abs_effect,average_daily_losses
8,DRINKS-BEVERAGES,3,613/173,48748.0,56,59,165,95.746678,$29.9 PLUSMINUS 16.1$,-317.300571,$-24.9 PLUSMINUS 29.2$
10,DRINKS-BEVERAGES,44,613/180,58262.0,78,95,308,96.140872,$29.6 PLUSMINUS 15.4$,-477.423366,$-34.1 PLUSMINUS 48.2$
1,DRINKS-BEVERAGES,45,613/180,54230.0,72,70,270,94.464794,$31.4 PLUSMINUS 14.5$,-402.133477,$-28.0 PLUSMINUS 21.9$
9,DRINKS-BEVERAGES,46,613/175,35391.0,40,47,134,95.584266,$31.8 PLUSMINUS 15.5$,-438.090819,$-25.7 PLUSMINUS 17.5$
2,DRINKS-BEVERAGES,47,613/179,65795.0,77,77,244,96.798284,$31.4 PLUSMINUS 15.4$,-497.78763,$-35.0 PLUSMINUS 40.3$


In [14]:
# Export the table, then undo the parts of the export that break the math cells.
str_latex = df_latex.to_latex(index=False, float_format='{:3.2f}'.format)

# Raw strings are required here: '\p' and '\$' are not valid escape sequences
# in a normal Python string literal and trigger a warning on recent versions.
str_latex = str_latex.replace('PLUSMINUS', r'\pm')
# Turn escaped dollar signs back into math delimiters
str_latex = str_latex.replace(r'\$', '$')

In [15]:
# Destination of the exported table: <baseFolder>/<prefix>_table.tex
prefix = 'results'
baseFolder = '~/Google Drive/order/Machine Learning Part/Preparing the 3rd paper/examples for the paper/results_excel'
tex_file_name = os.path.join(os.path.expanduser(baseFolder), f'{prefix}_table.tex')
writeTextFile(str_latex, tex_file_name)

In [16]:
# Show the LaTeX source held in str_latex for a visual check
print(str_latex)

\begin{tabular}{lllrrrrrlrl}
\toprule
           cat\_dep & store\_num & sku\_promo &  combinations analysed &  cannibals &  victims &  episodes &  P\_\{CE\} &             percentage &  avg\_cum\_abs\_effect &    average\_daily\_losses \\
\midrule
  DRINKS-BEVERAGES &         3 &   613/173 &               48748.00 &         56 &       59 &       165 &   95.75 &  $29.9 \pm 16.1$ &             -317.30 &  $-24.9 \pm 29.2$ \\
  DRINKS-BEVERAGES &        44 &   613/180 &               58262.00 &         78 &       95 &       308 &   96.14 &  $29.6 \pm 15.4$ &             -477.42 &  $-34.1 \pm 48.2$ \\
  DRINKS-BEVERAGES &        45 &   613/180 &               54230.00 &         72 &       70 &       270 &   94.46 &  $31.4 \pm 14.5$ &             -402.13 &  $-28.0 \pm 21.9$ \\
  DRINKS-BEVERAGES &        46 &   613/175 &               35391.00 &         40 &       47 &       134 &   95.58 &  $31.8 \pm 15.5$ &             -438.09 &  $-25.7 \pm 17.5$ \\
  DRINKS-BEVERAGES &        47 &   613/1