In [2]:
import pandas as pd
import numpy as np
import os as os
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
!pip install awswrangler

import awswrangler as wr

[33mDEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 23.3 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [4]:
from sqlalchemy import create_engine
import os
import boto3

### Obtenção de dados - fonte da tabela na pasta sql

In [5]:
# Athena SQL that returns one row per customer CPF with the features used by the
# segmentation further down (the final select groups by cli2.cpf).
#
# CTEs:
#   cobranca       - per id_customer, nr_days_paste_due_current taken from the row
#                    with the highest cd_yearmonth in ca_book_cobranca.
#   transacional   - per id_customer, average vl_total_spending plus two purchase
#                    counters; the two counters are not selected by the final query.
#   features_conta - per cpf, the maximum of (cash-in + pix cash-out + ted cash-out)
#                    transaction counts.
#   eventos        - per cpf, number of distinct dt_event values (overall and for
#                    events whose name contains lojawill / loja-will / marketplace)
#                    and flag_mgm = 1 when any of the listed mgm share/invite events
#                    was seen; four login/home event names are excluded up front.
#
# NOTE(review): the final WHERE filters on columns of `pr` and `cob`, which are
# LEFT JOINed; rows without a match in those tables are therefore discarded, so
# both joins behave like inner joins.
# NOTE(review): the final select uses sum() over values that are already
# aggregated per customer. If ca_book_status_cliente or proposal_analysis hold more
# than one row per customer, these sums are multiplied by the number of joined
# rows - confirm the cardinality of those tables.
query = """
  
with cobranca as (
select
id_customer
, max(nr_days_paste_due_current) as nr_days_paste_due_current
from (
	select
	id_customer , nr_days_paste_due_current 
	, row_number() over (partition by id_customer order by cd_yearmonth desc) as num_ordem
	from customer_curated_zone.ca_book_cobranca 
)
where num_ordem = 1
group by 1
)
, transacional as (
	select 
	id_customer
	, avg(vl_total_spending) as avg_vl_total_spending
	, sum(nr_purchases_virtual_card_online + nr_purchases_virtual_card_presentially) as nr_purchases_virtual
	, sum(nr_purchases_online) as nr_purchases_online
	from  customer_curated_zone.ca_book_cartao
	group by 1
)
, features_conta as (
	select 
	cpf 
	, max(nr_transacoes_cashin + nr_transacoes_pix_cashout + nr_transacoes_ted_cashout) as nr_trans_mov_conta
	from customer_analytics_zone.ca_base_calculo_ie_conta
	group by 1
)
, eventos as (
	select 
	cpf
    , count(distinct dt_event) as nr_acessos
    , count(distinct case when lower(ds_event_name) like '%lojawill%' or lower(ds_event_name) like '%loja-will%' or lower(ds_event_name) like '%marketplace%' then dt_event end) as nr_acessos_lojawill
	, max(case 
		when lower(ds_event_name) like '%click_button_mgmshare_cardactivation%' 
		or lower(ds_event_name) like '%click_button_mgmshare_invoicepayment%' 
		or lower(ds_event_name) like '%open_modal_mgmdeeplink%' 
		or lower(ds_event_name) like '%click_button_facebook_screenmgmhome%'
		or lower(ds_event_name) like '%click_button_instagram_screenmgmhome%' 
		or lower(ds_event_name) like '%click_button_more_screenmgmhome%'
		or lower(ds_event_name) like '%click_button_sendinvite_screenmgmhome%' 
		or lower(ds_event_name) like '%click_button_whatsapp_screenmgmhome%' 
		or lower(ds_event_name) like '%click_button_mgmshare_onbnoresultmgm%'
		or lower(ds_event_name) like '%click_button_mgmshare_onbwaitresultmgm%'
		or lower(ds_event_name) like '%click_button_mgmwhats_onbwaitresultmgm%'
		or lower(ds_event_name) like '%mgm_home_share%'
	then 1 else 0 end) as flag_mgm
	from customer_curated_zone.ca_analitico_frontend
	where lower(ds_event_name) <> 'login_knownuser'
	and lower(ds_event_name) <> 'insert_textfield_password_loginknownuser'
	and lower(ds_event_name) <> 'home'
	and lower(ds_event_name) <> 'click_box_tracking_card_home'
	group by 1
)
 select 
	cli2.cpf
    , sum(nr_acessos) as nr_acessos
    , sum(e.nr_acessos_lojawill) / cast(sum(e.nr_acessos) as double) as pc_acessos_lojawill
	, max(date_diff('day', cast(pr.dt_cfi_account as date), current_date)) as tempo_relacionamento
	, sum(ic.nr_trans_mov_conta) as nr_trans_mov_conta
    , sum(t.avg_vl_total_spending) as avg_vl_total_spending
	, max(e.flag_mgm) as flag_mgm
	, max(cob.nr_days_paste_due_current) as dias_atraso
from customer_curated_zone.ca_book_status_cliente cli1
	inner join (
		select distinct id_customer, cpf from 
		customer_curated_zone.ca_book_cliente) cli2 
		on (cli1.id_customer = cli2.id_customer)
	left join growth_curated_zone.proposal_analysis pr on (pr.cpf = cli2.cpf)
	left join features_conta ic on (ic.cpf = cli2.cpf)
	left join cobranca cob on (cob.id_customer = cli1.id_customer)
	left join eventos e on (e.cpf = cli2.cpf)
    left join transacional t on (t.id_customer = cli1.id_customer)
where pr.ds_approval_type = 'credito' and cob.nr_days_paste_due_current <= 5
group by 1

"""

In [6]:
# S3 prefix used as scratch space for the Athena query output.
staging_path = 's3://will-prod-ml-platform-sagemaker-studio/staging/flavia-costa/'
boto3_session = boto3.Session(region_name='sa-east-1')

# Clean temporary folder so leftovers from a previous run cannot interfere
wr.s3.delete_objects(staging_path, boto3_session=boto3_session)

# Run query
print('Started query')

try:
    df = wr.athena.read_sql_query(
        sql=query,
        database=None,
        unload_approach=True,
        ctas_approach=False,
        boto3_session=boto3_session,
        s3_output=staging_path
    )
finally:
    # Always remove the staged files, even when the query fails, so the
    # staging prefix is not left populated after an error.
    wr.s3.delete_objects(staging_path, boto3_session=boto3_session)

Started query


In [7]:
# Number of rows returned by the query
df.shape[0]

2885524

In [8]:
# Column dtypes as loaded from Athena
df.dtypes

cpf                       string
nr_acessos                 Int64
pc_acessos_lojawill      float64
tempo_relacionamento       Int64
nr_trans_mov_conta       float64
avg_vl_total_spending    float64
flag_mgm                   Int32
dias_atraso                Int32
dtype: object

In [9]:
# Missing values per column before any treatment
df.isna().sum()

cpf                            0
nr_acessos                953609
pc_acessos_lojawill       953609
tempo_relacionamento       10678
nr_trans_mov_conta       1026362
avg_vl_total_spending     343567
flag_mgm                  953609
dias_atraso                    0
dtype: int64

### Definindo tipos de var e lidando com missings

In [10]:
# Variables whose missing values are recoded as zero
for zero_fill_col in ('flag_mgm', 'nr_trans_mov_conta'):
    df[zero_fill_col] = df[zero_fill_col].fillna(0.0)


In [11]:
# Missing values per column after the zero recoding
df.isna().sum()

cpf                           0
nr_acessos               953609
pc_acessos_lojawill      953609
tempo_relacionamento      10678
nr_trans_mov_conta            0
avg_vl_total_spending    343567
flag_mgm                      0
dias_atraso                   0
dtype: int64

In [12]:
# Impute the remaining missing values column by column:
#   - object / string columns -> most frequent value (mode)
#   - every other dtype       -> median
# The result is assigned back to the frame instead of calling
# `df[column].fillna(..., inplace=True)`: that form mutates a temporary Series,
# is deprecated in pandas 2.x and silently does nothing under Copy-on-Write.
for column in df.columns:
    serie = df[column]

    # Nothing to impute: skip, which also avoids computing the mode/median of
    # fully populated columns (e.g. an identifier column with all-unique values).
    if not serie.isna().any():
        continue

    if serie.dtype == 'object' or serie.dtype == 'string':
        candidatos = serie.mode()
        if candidatos.empty:
            # Column is entirely null: there is no mode to fill with.
            continue
        valor = candidatos[0]
    else:
        valor = serie.median()

    df[column] = serie.fillna(valor)

In [13]:
# Missing values per column after imputation (expected to be all zero)
df.isna().sum()

cpf                      0
nr_acessos               0
pc_acessos_lojawill      0
tempo_relacionamento     0
nr_trans_mov_conta       0
avg_vl_total_spending    0
flag_mgm                 0
dias_atraso              0
dtype: int64

In [14]:
# Absolute frequency of the target flag
df.flag_mgm.value_counts()

0    2082872
1     802652
Name: flag_mgm, dtype: Int64

In [15]:
# Relative frequency of the target flag
df.flag_mgm.value_counts(normalize=True)

0    0.721835
1    0.278165
Name: flag_mgm, dtype: Float64

## Início modelagem

In [16]:
# Relationship time converted from days to (approximate, 30-day) months
df['tempo_relacionamento_meses'] = (df['tempo_relacionamento'] / 30).round(0)

In [17]:
# Cut-off points are computed once up front; the original expression recomputed
# the same quantiles several times over the whole frame.
acessos_q25 = df.nr_acessos.quantile(0.25)
acessos_q75 = df.nr_acessos.quantile(0.75)
spending_q50 = df.avg_vl_total_spending.quantile(0.50)
spending_q75 = df.avg_vl_total_spending.quantile(0.75)
conta_q50 = df.nr_trans_mov_conta.quantile(0.50)

# Access-count band: A = up to the 1st quartile, B = up to the 3rd quartile, C = above.
df['fx_nr_acessos'] = np.where(df.nr_acessos <= acessos_q25, 'A',
                      np.where(df.nr_acessos <= acessos_q75, 'B', 'C'))

# Relationship-time band in months: A = up to 5, B = up to 20, C = above 20.
df['fx_tempo_relacionamento'] = np.where(df.tempo_relacionamento_meses <= 5, 'A',
                                np.where(df.tempo_relacionamento_meses <= 20, 'B', 'C'))

# A = share of accesses in the store events is positive, B = otherwise.
df['fx_loja_will'] = np.where(df.pc_acessos_lojawill > 0, 'A', 'B')

# Spending x account-movement quadrant, both split at their medians:
#   A = low spending / low movement     B = high spending / low movement
#   C = low spending / high movement    D = high spending / high movement
spending_baixo = df.avg_vl_total_spending <= spending_q50
spending_alto = df.avg_vl_total_spending > spending_q50
conta_baixa = df.nr_trans_mov_conta <= conta_q50
conta_alta = df.nr_trans_mov_conta > conta_q50

df['fx_spending_ou_conta'] = np.where(spending_baixo & conta_baixa, 'A',
                             np.where(spending_alto & conta_baixa, 'B',
                             np.where(spending_baixo & conta_alta, 'C',
                             np.where((df.avg_vl_total_spending >= spending_q50) & (df.nr_trans_mov_conta >= conta_q50), 'D',
                             # The four quadrants above cover every non-missing pair, so the
                             # fallback below is only reached when a value is missing.
                             np.where(df.avg_vl_total_spending >= spending_q75, 'D', 'Ni')))))




In [21]:
# Four-letter group code: one letter per band, in the order
# accesses / relationship time / store usage / spending-account quadrant.
df['grupos_propensao'] = (
    df.fx_nr_acessos
    + df.fx_tempo_relacionamento
    + df.fx_loja_will
    + df.fx_spending_ou_conta
)

In [23]:
# Mean of flag_mgm (share of flagged customers) per group code
df.groupby('grupos_propensao')[['flag_mgm']].mean()

Unnamed: 0_level_0,flag_mgm
grupos_propensao,Unnamed: 1_level_1
AAAA,0.359727
AAAB,0.46975
AAAC,0.476712
AAAD,0.54713
AABA,0.202773
...,...
CCAD,0.58803
CCBA,0.216157
CCBB,0.20231
CCBC,0.368815


In [62]:
# Customers per relationship-time band
df.fx_tempo_relacionamento.value_counts()

C    1191906
B     891215
A     802403
Name: fx_tempo_relacionamento, dtype: int64

In [37]:
# Final segment for each four-letter group code. Any code not listed here falls
# into the lowest segment.
# The original chain ended with an extra branch listing 'ACBC' and several
# five-character codes; group codes are always four characters, and that branch
# returned the same label as the default, so it had no effect and is omitted.
# NOTE(review): '03.  Média propensao' is written with two spaces after "03." and
# shares its wording with segment 02; labels are kept byte-for-byte because the
# column is persisted downstream - confirm the intended names.
segmento_por_grupo = {
    **dict.fromkeys(['CAAB', 'CAAD', 'CBAD'], '05. Alta propensao'),
    **dict.fromkeys(['CBAB', 'CABD', 'CBBD', 'CAAC'], '03.  Média propensao'),
    **dict.fromkeys(['CCAC', 'BBAD', 'BAAD', 'CABB', 'CBAC', 'CAAA', 'CCAD'], '04. Média-Alta propensao'),
    **dict.fromkeys(['BAAA', 'BBAA', 'BBBD', 'CBBA', 'CCAB', 'CCBC', 'AABD', 'BABB', 'BABC', 'BBBB', 'BBBC', 'BCAC'], '02. Média propensao'),
}

df['segmento_final'] = df['grupos_propensao'].map(segmento_por_grupo).fillna('01. Baixa propensao')

In [38]:
# Mean of flag_mgm per final segment
df.groupby('segmento_final')[['flag_mgm']].mean()

Unnamed: 0_level_0,flag_mgm
segmento_final,Unnamed: 1_level_1
01. Baixa propensao,0.17686
02. Média propensao,0.397004
03. Média propensao,0.573904
04. Média-Alta propensao,0.653588
05. Alta propensao,0.728432


In [51]:
# Number of non-null flag_mgm values per final segment
df.groupby('segmento_final')[['flag_mgm']].count()

Unnamed: 0_level_0,flag_mgm
segmento_final,Unnamed: 1_level_1
01. Baixa propensao,2115952
02. Média propensao,287005
03. Média propensao,122463
04. Média-Alta propensao,241975
05. Alta propensao,118129


In [39]:
# Customers per final segment
df.segmento_final.value_counts()

01. Baixa propensao         2115952
02. Média propensao          287005
04. Média-Alta propensao     241975
03.  Média propensao         122463
05. Alta propensao           118129
Name: segmento_final, dtype: int64

In [46]:
# Share of customers per final segment
df.segmento_final.value_counts(normalize=True)

01. Baixa propensao         0.733299
02. Média propensao         0.099464
04. Média-Alta propensao    0.083858
03.  Média propensao        0.042440
05. Alta propensao          0.040938
Name: segmento_final, dtype: float64

In [50]:
# Segment sizes restricted to customers with flag_mgm == 0
df.loc[df['flag_mgm'] == 0, 'segmento_final'].value_counts()

01. Baixa propensao         1741725
02. Média propensao          173063
04. Média-Alta propensao      83823
03.  Média propensao          52181
05. Alta propensao            32080
Name: segmento_final, dtype: int64

In [43]:
# Access-count profile of each final segment
df.groupby('segmento_final')[['nr_acessos']].agg(['mean', 'min', 'max', 'median'])

Unnamed: 0_level_0,nr_acessos,nr_acessos,nr_acessos,nr_acessos
Unnamed: 0_level_1,mean,min,max,median
segmento_final,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
01. Baixa propensao,55.888682,1,2406,56.0
02. Média propensao,86.473469,2,3216,64.0
03. Média propensao,160.267836,89,1764,144.0
04. Média-Alta propensao,171.438835,35,3252,142.0
05. Alta propensao,183.859061,89,1566,165.0


In [42]:
# Relationship-time (months) profile of each final segment
df.groupby('segmento_final')[['tempo_relacionamento_meses']].agg(['mean', 'min', 'max', 'median'])

Unnamed: 0_level_0,tempo_relacionamento_meses,tempo_relacionamento_meses,tempo_relacionamento_meses,tempo_relacionamento_meses
Unnamed: 0_level_1,mean,min,max,median
segmento_final,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
01. Baixa propensao,25.756206,0.0,78.0,23.0
02. Média propensao,11.602104,0.0,78.0,8.0
03. Média propensao,13.178299,2.0,20.0,14.0
04. Média-Alta propensao,14.212948,1.0,78.0,11.0
05. Alta propensao,12.318787,1.0,20.0,13.0


In [44]:
# Average-spending profile of each final segment
df.groupby('segmento_final')[['avg_vl_total_spending']].agg(['mean', 'min', 'max', 'median'])

Unnamed: 0_level_0,avg_vl_total_spending,avg_vl_total_spending,avg_vl_total_spending,avg_vl_total_spending
Unnamed: 0_level_1,mean,min,max,median
segmento_final,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
01. Baixa propensao,706.107643,-294.564,110837.165,555.905
02. Média propensao,669.331688,0.0,38088.6185,471.772
03. Média propensao,1028.264245,0.0,17042.6225,860.54875
04. Média-Alta propensao,666.227398,0.0,20865.433423,450.341538
05. Alta propensao,1056.6012,555.906,32639.46,879.948


In [45]:
# Account-movement profile of each final segment
df.groupby('segmento_final')[['nr_trans_mov_conta']].agg(['mean', 'min', 'max', 'median'])

Unnamed: 0_level_0,nr_trans_mov_conta,nr_trans_mov_conta,nr_trans_mov_conta,nr_trans_mov_conta
Unnamed: 0_level_1,mean,min,max,median
segmento_final,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
01. Baixa propensao,17.214154,0.0,39064.0,1.0
02. Média propensao,14.627017,0.0,7583.0,6.0
03. Média propensao,40.330737,0.0,8269.0,14.0
04. Média-Alta propensao,54.610431,0.0,8472.0,23.0
05. Alta propensao,62.378188,0.0,11942.0,26.0


In [52]:
# Preview of the first five rows of the final frame
df.head(5)

Unnamed: 0,cpf,nr_acessos,pc_acessos_lojawill,tempo_relacionamento,nr_trans_mov_conta,avg_vl_total_spending,flag_mgm,dias_atraso,tempo_relacionamento_meses,fx_nr_acessos,fx_tempo_relacionamento,fx_loja_will,fx_spending_ou_conta,segmento_final,grupos_propensao
0,12803041707,56,0.0,1205,0.0,555.905,0,0,40.0,B,C,B,A,01. Baixa propensao,BCBA
1,99234467515,76,0.092105,449,77.0,198.29,1,0,15.0,B,B,A,C,01. Baixa propensao,BBAC
2,67192246549,2,0.0,19,0.0,555.905,0,0,1.0,A,A,B,A,01. Baixa propensao,AABA
3,2327033373,95,0.084211,124,47.0,538.212,0,0,4.0,C,A,A,C,03. Média propensao,CAAC
4,41375962809,59,0.016949,151,15.0,477.174,1,0,5.0,B,A,A,C,01. Baixa propensao,BAAC


In [61]:
# Persist the segmented frame as a Parquet dataset and register it as a table;
# mode='overwrite' replaces whatever the dataset held before.
# NOTE(review): the frame still carries the cpf column - confirm that writing it
# to this location is acceptable.
wr.s3.to_parquet(
    df=df,
    path='s3://data-athena-query-result-will-prod/flavia-costa',
    database='customer_sandbox_zone',
    table='segmentacao_mgm',
    dataset=True,
    mode='overwrite',
    boto3_session=boto3_session,
)

{'paths': ['s3://data-athena-query-result-will-prod/flavia-costa/00122297f367472e9e8381e98904024d.snappy.parquet'],
 'partitions_values': {}}