# Quilmes - Lift analysis - Denis Trosman

# Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# Display and plotting configuration
pd.set_option('display.float_format', '{:.2f}'.format)  # two decimals for floats
pd.set_option('display.max_colwidth', None)  # never truncate cell contents
pd.set_option('display.max_columns', None)   # show every column of wide frames
# Captured BEFORE the style change on purpose: with no arguments,
# sns.color_palette() returns whatever colour cycle is active right now.
color_pal = sns.color_palette()
plt.style.use('fivethirtyeight')

In [2]:
import sys
sys.path.insert(0, '../')
from src.eda import analyze_df

# Read files

#### Client data

● La columna “channel_segmentation” se refiere al tipo de negocio donde se venden los
productos: Autoservicios, kioscos y tradicional (almacén).

● La columna "business” se refiere al tipo de producto al que hace referencia la campaña (CZA:
Cerveza y NABS: bebidas sin alcohol).

In [5]:
# Load the raw client table and the channel_id -> channel_segmentation lookup.
df_clients = pd.read_csv('../data/clients.csv')
df_channels = pd.read_csv('../data/channel_segmentation.csv')

In [6]:
# Profile the raw clients table with the project EDA helper (src.eda.analyze_df).
analyze_df(df_clients)

Number of rows: 49458, Number of columns:3

Duplicated amount: 4

------------------------------Number of null values------------------------------
cliente_id    0
channel_id    0
region        0
dtype: int64
------------------------------Type of variables------------------------------
cliente_id     int64
channel_id     int64
region        object
dtype: object


Unnamed: 0,cliente_id,channel_id,region
0,-1726485189,1759010777,COSTA ATLANTICA
1,-928788236,1625004744,GBA MINORISTAS
2,1641917107,1625004744,COSTA ATLANTICA
3,1273086795,1625004744,GBA MINORISTAS
4,-1708080869,1625004744,COSTA ATLANTICA


In [7]:
# Profile the channel lookup table with the project EDA helper.
analyze_df(df_channels)

Number of rows: 38, Number of columns:2

Duplicated amount: 0

------------------------------Number of null values------------------------------
channel_id              0
channel_segmentation    0
dtype: int64
------------------------------Type of variables------------------------------
channel_id               int64
channel_segmentation    object
dtype: object


Unnamed: 0,channel_id,channel_segmentation
0,565501238,NO
1,1377949257,NO
2,1059805429,OTROS
3,-808273538,NO
4,-1995572528,RESTAURANTE


#### Promotions data

● El data set “coupons.csv” tiene una columna que se llama “sales_with_coupons”. Esta
columna tiene la cantidad de ventas que se generaron bajo alguna promoción efectuada
durante alguna campaña.

● Según el área de inteligencia comercial, se considera que el cliente fue partícipe de la
campaña si efectuó más de 5 compras (“sales_with_coupons” > 5).

● El data set “blacklist.csv” indica los clientes que no hay que considerar en el estudio por
motivos varios.

In [8]:
# Load coupon-driven sales and the list of clients to exclude from the study.
df_coupons = pd.read_csv('../data/coupons.csv')
df_blacklist = pd.read_csv('../data/blacklist.csv')

In [9]:
# Profile the raw coupons table with the project EDA helper.
analyze_df(df_coupons)

Number of rows: 805757, Number of columns:5

Duplicated amount: 1

------------------------------Number of null values------------------------------
cliente_id            0
business              0
brand                 0
yearmonth             0
sales_with_coupons    0
dtype: int64
------------------------------Type of variables------------------------------
cliente_id             int64
business              object
brand                 object
yearmonth              int64
sales_with_coupons     int64
dtype: object


Unnamed: 0,cliente_id,business,brand,yearmonth,sales_with_coupons
0,-1133775360,CZA,brahma_dorada,202307,53
1,-2013629997,CZA,andes,202304,483
2,-1623564940,CZA,quilmes,202310,41
3,2137524281,CZA,quilmes,202212,1
4,-301975142,CZA,doble_malta,202210,2


In [10]:
# Profile the blacklist table with the project EDA helper.
analyze_df(df_blacklist)

Number of rows: 5937, Number of columns:1

Duplicated amount: 0

------------------------------Number of null values------------------------------
cliente_id    0
dtype: int64
------------------------------Type of variables------------------------------
cliente_id    int64
dtype: object


Unnamed: 0,cliente_id
0,2076569343
1,486609981
2,-2133238226
3,1501782976
4,-1672885383


#### Sales data

In [11]:
# Load the three semester-level sales extracts (2022 S1, 2022 S2, 2023 S1).
# NOTE(review): 'sementre' in the first filename looks like a typo; presumably it
# matches the file on disk -- confirm before renaming anything.
df_sales_2022_1 = pd.read_csv('../data/sales_2022_sementre_1.csv')
df_sales_2022_2 = pd.read_csv('../data/sales_2022_semestre_2.csv')
df_sales_2023_1 = pd.read_csv('../data/sales_2023_semestre_1.csv')

In [12]:
# Profile the first-semester 2022 sales with the project EDA helper.
analyze_df(df_sales_2022_1)

Number of rows: 588683, Number of columns:5

Duplicated amount: 0

------------------------------Number of null values------------------------------
cliente_id    0
business      0
brand         0
sales         0
yearmonth     0
dtype: int64
------------------------------Type of variables------------------------------
cliente_id      int64
business       object
brand          object
sales         float64
yearmonth       int64
dtype: object


Unnamed: 0,cliente_id,business,brand,sales,yearmonth
0,1690894282,CZA,quilmes,1.82,202201
1,-985525432,CZA,brahma,0.74,202201
2,-674489716,CZA,brahma,4.55,202201
3,1026124970,CZA,brahma,1.26,202201
4,-2030441626,CZA,quilmes,0.2,202201


In [13]:
# Profile the second-semester 2022 sales with the project EDA helper.
analyze_df(df_sales_2022_2)

Number of rows: 613864, Number of columns:5

Duplicated amount: 0

------------------------------Number of null values------------------------------
cliente_id    0
business      0
brand         0
sales         0
yearmonth     0
dtype: int64
------------------------------Type of variables------------------------------
cliente_id      int64
business       object
brand          object
sales         float64
yearmonth       int64
dtype: object


Unnamed: 0,cliente_id,business,brand,sales,yearmonth
0,-1488116429,CZA,quilmes,290.57,202211
1,-1833555306,CZA,brahma,1.59,202211
2,-1369064580,CZA,quilmes,0.73,202211
3,2008344780,CZA,quilmes,0.73,202211
4,647173935,CZA,andes_origen,0.34,202211


In [14]:
# Profile the first-semester 2023 sales with the project EDA helper.
analyze_df(df_sales_2023_1)

Number of rows: 571918, Number of columns:5

Duplicated amount: 0

------------------------------Number of null values------------------------------
cliente_id    0
business      0
brand         0
sales         0
yearmonth     0
dtype: int64
------------------------------Type of variables------------------------------
cliente_id      int64
business       object
brand          object
sales         float64
yearmonth       int64
dtype: object


Unnamed: 0,cliente_id,business,brand,sales,yearmonth
0,883821370,CZA,quilmes,3.24,202303
1,1602484641,CZA,brahma,80.11,202303
2,-124654407,CZA,quilmes,1.02,202303
3,2095908164,CZA,quilmes,14.53,202303
4,-1143586715,CZA,quilmes,0.15,202303


# Filters and unification

Drop fully duplicated rows from the clients (4 rows) and coupons (1 row) datasets

In [20]:
# Remove exact duplicate rows; rebinding the names keeps the cell chainable
# and avoids mutating the loaded frames in place.
df_clients = df_clients.drop_duplicates()
df_coupons = df_coupons.drop_duplicates()

Datasets have no null values

Delete clients in blacklist: 1040 removed

In [41]:
# Anti-join against the blacklist: tag every client row with its merge origin,
# then keep only the rows that found no match in df_blacklist.
clients_flagged = df_clients.merge(
    df_blacklist,
    how='left',
    on='cliente_id',
    indicator=True,
)
df_clients = (
    clients_flagged
    .query('_merge == "left_only"')
    .drop(columns='_merge')
)

Add channel name. Even though there are clients with more than one channel segmentation, those duplicates won't be removed.

In [79]:
# Attach the channel segmentation label to every client via its channel_id.
# The guard makes the cell safe to re-run: merging a second time would create
# 'channel_segmentation_x' / 'channel_segmentation_y' columns and break the
# channel filter that follows.
if 'channel_segmentation' not in df_clients.columns:
    df_clients = df_clients.merge(df_channels, on='channel_id', how='left')

In [69]:
# Channel segmentations that are in scope for the study.
channels_list = [
    'TRADICIONAL',
    'AUTOSERVICIOS',
    'KIOSCOS',
]

In [80]:
# Keep only clients whose channel segmentation is one of the in-scope channels.
df_clients = df_clients.loc[df_clients['channel_segmentation'].isin(channels_list)]

After this filter, 38,326 client records remain

In [81]:
# Sanity check: rows/columns left in the clients table after the channel filter.
df_clients.shape

(38326, 4)

Create campaign attribute - Lift

In [50]:
# Business rule: a row counts as campaign participation when it has MORE than
# this many coupon sales (strict inequality, i.e. sales_with_coupons > 5).
MIN_COUPON_SALES = 5

# Vectorized comparison instead of a Python-level loop over every row.
# The flag is set per coupon row, not aggregated per client.
# Cast to int64 so the column holds 0/1 exactly as before.
df_coupons['campaign'] = (
    df_coupons['sales_with_coupons'] > MIN_COUPON_SALES
).astype('int64')

In [74]:
# Share of coupon rows flagged (1) vs. not flagged (0) as campaign participation.
df_coupons.campaign.value_counts(normalize=True)

campaign
0   0.68
1   0.32
Name: proportion, dtype: float64

In [76]:
# Eyeball the coupon rows ordered by client to see how one client spans
# several brands/months; the sorted frame is only displayed, not stored.
df_coupons.sort_values('cliente_id')

Unnamed: 0,cliente_id,business,brand,yearmonth,sales_with_coupons,campaign
160978,-2147472583,CZA,andes,202308,2,0
476262,-2147472583,CZA,stella_artois,202308,3,0
192926,-2147472583,CZA,quilmes,202205,1,0
345829,-2147376406,CZA,quilmes,202205,3,0
17847,-2147376406,CZA,quilmes,202204,4,0
...,...,...,...,...,...,...
300181,2147477644,CZA,stella_artois,202303,1,0
36462,2147477644,CZA,stella_artois,202209,28,1
55789,2147477644,CZA,quilmes,202207,15,1
268304,2147477644,CZA,quilmes,202212,15,1


Merge with clients dataset

In [83]:
# Row/column count of the coupons table right before merging with clients.
df_coupons.shape

(805756, 6)

In [86]:
# Bring the client attributes onto every coupon row. A left join keeps coupon
# rows whose client is missing from df_clients (their client columns are NaN).
# NOTE(review): if a cliente_id appears more than once in df_clients, its
# coupon rows are repeated once per match -- confirm this is intended.
df = pd.merge(
    df_coupons,
    df_clients,
    how='left',
    on='cliente_id',
)

In [90]:
# Discard coupon rows that found no client in the left join (region is NaN).
df = df[df['region'].notna()]

In [91]:
# Row/column count after discarding coupon rows without a matching client.
df.shape

(128013, 9)

In [92]:
# Preview the merged coupons + clients frame.
df.head()

Unnamed: 0,cliente_id,business,brand,yearmonth,sales_with_coupons,campaign,channel_id,region,channel_segmentation
7,-189677826,CZA,quilmes,202305,4,0,1625004744.0,COSTA ATLANTICA,TRADICIONAL
23,-1380570683,CZA,stella_artois,202310,18,1,1625004744.0,NOA,TRADICIONAL
29,780479776,CZA,quilmes,202304,314,1,1625004744.0,PATAGONIA,TRADICIONAL
36,-1848249141,CZA,brahma,202310,43,1,-1756013582.0,GBA MINORISTAS,AUTOSERVICIOS
37,-593571610,CZA,andes_origen,202301,17,1,-1756013582.0,CENTRAL,AUTOSERVICIOS


There are only CZA (beer) products in the merged data

In [97]:
# List the distinct business values present in the merged frame.
df.business.unique()

array(['CZA'], dtype=object)

In [102]:
# The channel code is no longer needed once the segmentation label is attached.
# Rebinding a new frame (instead of inplace=True) avoids mutating a filtered
# slice, and errors='ignore' lets the cell be re-run without a KeyError once
# the column is already gone.
df = df.drop(columns=['channel_id'], errors='ignore')

Split the DataFrame into one frame per channel (3 channels)

In [138]:
# One independent copy per channel so later edits to one frame cannot touch df.
df_tradicional = df.loc[df['channel_segmentation'].eq('TRADICIONAL')].copy()
df_autoservicio = df.loc[df['channel_segmentation'].eq('AUTOSERVICIOS')].copy()
df_kiosco = df.loc[df['channel_segmentation'].eq('KIOSCOS')].copy()

In [139]:
# Sizes of the three per-channel frames (tradicional, autoservicio, kiosco).
df_tradicional.shape, df_autoservicio.shape, df_kiosco.shape

((63267, 8), (47488, 8), (17258, 8))

In [119]:
# Split the TRADICIONAL channel rows by the campaign flag (1 = in campaign).
df_t_camp = df_tradicional.loc[df_tradicional['campaign'].eq(1)]
df_t_no_camp = df_tradicional.loc[df_tradicional['campaign'].eq(0)]

In [129]:
# Spot-check one client's campaign rows, ordered by brand and then month.
(
    df_t_camp
    .loc[df_t_camp['cliente_id'] == -1380570683]
    .sort_values(by=['brand', 'yearmonth'])
)

Unnamed: 0,cliente_id,business,brand,yearmonth,sales_with_coupons,campaign,region,channel_segmentation
734940,-1380570683,CZA,andes_origen,202308,9,1,NOA,TRADICIONAL
57842,-1380570683,CZA,andes_origen,202309,20,1,NOA,TRADICIONAL
468587,-1380570683,CZA,brahma,202308,12,1,NOA,TRADICIONAL
532568,-1380570683,CZA,brahma,202309,6,1,NOA,TRADICIONAL
16162,-1380570683,CZA,budweiser,202308,17,1,NOA,TRADICIONAL
475296,-1380570683,CZA,budweiser,202309,45,1,NOA,TRADICIONAL
686593,-1380570683,CZA,budweiser,202310,9,1,NOA,TRADICIONAL
534442,-1380570683,CZA,patagonia,202205,6,1,NOA,TRADICIONAL
9046,-1380570683,CZA,quilmes,202110,6,1,NOA,TRADICIONAL
640466,-1380570683,CZA,quilmes,202205,6,1,NOA,TRADICIONAL


In [123]:
# Same client's campaign rows in their original (unsorted) order.
df_t_camp.loc[df_t_camp['cliente_id'] == -1380570683]

Unnamed: 0,cliente_id,business,brand,yearmonth,sales_with_coupons,campaign,region,channel_segmentation
23,-1380570683,CZA,stella_artois,202310,18,1,NOA,TRADICIONAL
9046,-1380570683,CZA,quilmes,202110,6,1,NOA,TRADICIONAL
16162,-1380570683,CZA,budweiser,202308,17,1,NOA,TRADICIONAL
57842,-1380570683,CZA,andes_origen,202309,20,1,NOA,TRADICIONAL
68516,-1380570683,CZA,stella_artois,202309,11,1,NOA,TRADICIONAL
113217,-1380570683,CZA,stella_artois,202308,9,1,NOA,TRADICIONAL
117040,-1380570683,CZA,quilmes,202302,10,1,NOA,TRADICIONAL
206898,-1380570683,CZA,stella_artois,202205,10,1,NOA,TRADICIONAL
288878,-1380570683,CZA,quilmes,202309,119,1,NOA,TRADICIONAL
289042,-1380570683,CZA,quilmes,202310,75,1,NOA,TRADICIONAL


In [112]:
# Match each TRADICIONAL coupon row with the sales of the same client, month,
# brand and business. Only the first-semester 2022 sales file is joined here;
# rows without a matching sale (NaN after the left join) are dropped.
temp = (
    df_tradicional
    .merge(
        df_sales_2022_1,
        how='left',
        on=['cliente_id', 'yearmonth', 'brand', 'business'],
    )
    .dropna()
)

In [135]:
# Total matched sales for campaign (1) vs. non-campaign (0) rows.
temp.groupby('campaign')[['sales']].sum()

Unnamed: 0_level_0,sales
campaign,Unnamed: 1_level_1
0,5940.27
1,1592.27


In [137]:
# Drill into one client: total matched sales per brand and campaign flag.
(
    temp
    .loc[temp['cliente_id'] == 185550033]
    .groupby(['brand', 'campaign'])['sales']
    .sum()
    .to_frame('total_sales')
)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_sales
brand,campaign,Unnamed: 2_level_1
andes_origen,1,10.91
brahma,0,0.41
brahma,1,1.21
budweiser,0,0.05
budweiser,1,0.75
patagonia,1,1.71
quilmes,0,0.57
quilmes,1,1.09
stella_artois,1,5.04


In [116]:
# Raw first-semester 2022 sales of the same client, in month order, to
# cross-check the aggregated figures.
(
    df_sales_2022_1
    .loc[df_sales_2022_1['cliente_id'] == 185550033]
    .sort_values(by='yearmonth')
)

Unnamed: 0,cliente_id,business,brand,sales,yearmonth
518773,185550033,CZA,brahma,0.24,202201
477063,185550033,CZA,andes_origen,1.44,202201
300248,185550033,CZA,corona,0.43,202201
300192,185550033,CZA,otras,0.09,202201
421647,185550033,CZA,patagonia,0.9,202201
277824,185550033,CZA,stella_artois,0.23,202201
255172,185550033,CZA,quilmes,0.51,202201
306182,185550033,CZA,stella_artois,0.04,202202
428250,185550033,CZA,budweiser,0.17,202202
118106,185550033,CZA,andes_origen,1.56,202202
