In [1]:
import pandas as pd
import numpy as np
import aux.acessos as ac

In [14]:
# Athena SQL that builds the analysis base: a random sample of up to 1,000,000 rows,
# intended to hold one row per customer profile.
#
# CTEs:
#   cx_data          - most recent support chat per cpf_customer since 2021-11-01
#                      (chat month and the three tag levels joined with '|').
#   tipo_ativacao    - per id_customer flags derived from ds_first_purchase_card
#                      (virtual / fisico / never activated), for accounts opened
#                      before 2021-11-01.
#   perfil           - most recent account per CPF with gender, declared-income band,
#                      occupation, state/region, cohort month, age band and city.
#   indicadores_churn- per-CPF churn flags for 2022-02, 2022-03 and 2022-04.
#
# The final select left-joins everything onto `perfil`, labels each row with a
# `segmento_churn`, assigns a random order (`ordem`) and keeps the first 1,000,000 rows.
#
# Both "latest row" dedups use row_number() rather than rank(): rank() gives every tied
# row rank 1, so ties (e.g. several tag rows with the same chat timestamp) would
# multiply rows in the final joins and repeat the same customer in the sample.
# NOTE(review): the sample is ordered by random() and is therefore different on every run.
query = """

with cx_data as (
    select * from (
        select
            cpf_customer
            , substring(cast(n.dt_criacao_chat as varchar), 1, 7) as mes_chat
            , concat(substring(nm_tag_n1, 5, 100), '|', substring(nm_tag_n2, 5, 100), '|', substring(nm_tag_n3, 5, 100)) as issue
            -- exactly one row per customer; extra sort keys make the pick deterministic on ties
            , row_number() over (
                partition by cpf_customer
                order by n.dt_criacao_chat desc, n.id_issue desc, nm_tag_n1, nm_tag_n2, nm_tag_n3
              ) as rank_
        from cx_curated_zone.helpshift_tag_niveis n
        left join cx_curated_zone.helpshift_issues i on (i.id_issue = n.id_issue)
        where n.dt_criacao_chat >= to_date('2021-11-01', 'yyyy-mm-dd')
    ) where rank_ = 1
)
, tipo_ativacao as (
    select
        id_customer
        , max(case when ds_first_purchase_card = 'virtual' then 1 else 0 end) as ativou_virtual
        , max(case when ds_first_purchase_card is null then 1 else 0 end) as nunca_ativou
        , max(case when ds_first_purchase_card = 'fisico' then 1 else 0 end) as ativou_fisico
    from customer_curated_zone.ca_book_growth
    -- alternative cohort filter (disabled):
    --where dt_opening_account between to_date('2021-11-01', 'yyyy-mm-dd') and to_date('2022-01-31', 'yyyy-mm-dd')
    where dt_opening_account < to_date('2021-11-01', 'yyyy-mm-dd')
    group by 1
)
, perfil as (
    select * from (
        select
            p.cd_cpf as cpf,
            c.id_customer,
            p.ds_origin as pf_origem_cliente,
            -- exactly one row per CPF (latest account); remaining ties are broken arbitrarily
            row_number() over (
                partition by p.cd_cpf
                order by dt_cfi_account_created desc, p.ds_origin, c.id_customer
            ) as base_perfil,
            case
                when p.ds_origin = 'will' and g.ds_gender in ('F', 'FEMALE') then 'FEMALE'
                when p.ds_origin = 'will' and g.ds_gender in ('M', 'MALE') then 'MALE'
                when p.ds_origin <> 'will' and p.ds_gender in ('F', 'FEMALE') then 'FEMALE'
                when p.ds_origin <> 'will' and p.ds_gender in ('M', 'MALE') then 'MALE'
                else '' end as pf_genero,
            case
                when nr_declared_income <= 1000 then 'A - ate R$ 1000'
                when nr_declared_income <= 2000 then 'B - ate R$ 2000'
                when nr_declared_income <= 3000 then 'C - ate R$ 3000'
                when nr_declared_income <= 4000 then 'D - ate R$ 4000'
                when nr_declared_income <= 5000 then 'E - ate R$ 5000'
                when nr_declared_income > 5000 then 'F - mais de R$ 5000'
                end as pf_renda_declarada_will,
            replace(upper(ds_occupation), '(A)', '') as pf_profissao,
            p.ds_state_abbreviation as pf_estado,
            case
                when p.ds_state_abbreviation in ('DF','GO','MS','MT') then 'CENTRO-OESTE'
                when p.ds_state_abbreviation in ('AL','BA','CE','MA','PB','PE','PI','RN','SE') then 'NORDESTE'
                when p.ds_state_abbreviation in ('AC','AM','AP','PA','RO','RR','TO') then 'NORTE'
                when p.ds_state_abbreviation in ('ES','MG','RJ','SP') then 'SUDESTE'
                when p.ds_state_abbreviation in ('PR','RS','SC') then 'SUL'
                else 'Z-OUTROS' end as pf_regiao,
            c.dt_cfi_account_created,
            substring(cast(c.dt_cfi_account_created as varchar), 1, 7) as pf_safra,
            -- age in decades, approximated as days / 30.5 / 12 / 10 and rounded
            case round((extract(day from current_date - cast(p.dt_birth as date))/30.5/12 / 10),0)
                when 2 then 'a - ate 24 anos'
                when 3 then 'b - 25-34 anos'
                when 4 then 'c - 35-44 anos'
                when 5 then 'd - 45-54 anos'
                when 6 then 'e - 55-64 anos'
                when 7 then 'f - 65-74 anos'
                when 8 then 'g - 75-84 anos'
                else 'h - idade NI' end as pf_faixa_idade,
            p.nm_locality as pf_cidade
        from growth_curated_zone.proposal_general p
        inner join growth_curated_zone.clientes c on (p.cd_cpf = c.cpf and p.ds_origin = c.ds_origin)
        left join platform_curated_zone.gender_will g on (g.cd_cpf = p.cd_cpf)
    ) where base_perfil = 1 and dt_cfi_account_created is not null
)
, indicadores_churn as (
    select
        cpf,
        max(case when cast(dt_mes_churn as varchar) = '2022-02-01' then 1 else 0 end) as churn_fev,
        max(case when cast(dt_mes_churn as varchar) = '2022-03-01' then 1 else 0 end) as churn_mar,
        max(case when cast(dt_mes_churn as varchar) = '2022-04-01' then 1 else 0 end) as churn_abr
    from platform_sandbox_zone.clientes_churn
    group by 1
)
select * from (
    select
        p.*, a.*, c.*, ch.churn_fev, ch.churn_mar, ch.churn_abr,
        rank() over (partition by 1 order by random() desc) as ordem,
        -- flags are null when the CPF has no row in indicadores_churn; treat that as 0 everywhere
        case
            when coalesce(ch.churn_fev, 0) = 1 and coalesce(ch.churn_mar, 0) = 1 and coalesce(ch.churn_abr, 0) = 1 then 'a - churn alto'
            when coalesce(ch.churn_fev, 0) = 1 and coalesce(ch.churn_mar, 0) = 0 and coalesce(ch.churn_abr, 0) = 0 then 'c - churn baixo'
            when coalesce(ch.churn_fev, 0) = 0 and coalesce(ch.churn_mar, 0) = 0 and coalesce(ch.churn_abr, 0) = 1 then 'c - churn baixo'
            when coalesce(ch.churn_fev, 0) = 0 and coalesce(ch.churn_mar, 0) = 1 and coalesce(ch.churn_abr, 0) = 0 then 'c - churn baixo'
            when coalesce(ch.churn_fev, 0) = 1 and coalesce(ch.churn_mar, 0) = 0 and coalesce(ch.churn_abr, 0) = 1 then 'c - churn baixo'
            when coalesce(ch.churn_fev, 0) = 1 and coalesce(ch.churn_mar, 0) = 1 and coalesce(ch.churn_abr, 0) = 0 then 'b - churn medio'
            when coalesce(ch.churn_fev, 0) = 0 and coalesce(ch.churn_mar, 0) = 1 and coalesce(ch.churn_abr, 0) = 1 then 'b - churn medio'
            when a.nunca_ativou = 1 then 'e - nunca ativou'
            when coalesce(ch.churn_fev, 0) = 0 and coalesce(ch.churn_mar, 0) = 0 and coalesce(ch.churn_abr, 0) = 0 then 'd - sem churn'
        end as segmento_churn
    from perfil p
    left join tipo_ativacao a on (a.id_customer = p.id_customer)
    left join cx_data c on (c.cpf_customer = p.cpf)
    left join indicadores_churn ch on (ch.cpf = p.cpf)
) order by ordem limit 1000000

"""

In [15]:
# Run the SQL through the project helper and keep the result as `base`;
# the cells below index it like a pandas DataFrame.
# NOTE(review): the first argument looks like a user/profile name hard-coded in the
# notebook — confirm, and consider moving it to a configuration cell.
base = ac.df_athena('flavia-costa', query)

Failed to execute query.
Traceback (most recent call last):
  File "/home/flavia.costa/.local/lib/python3.9/site-packages/pyathena/common.py", line 307, in _execute
    query_id = retry_api_call(
  File "/home/flavia.costa/.local/lib/python3.9/site-packages/pyathena/util.py", line 84, in retry_api_call
    return retry(func, *args, **kwargs)
  File "/home/flavia.costa/.local/lib/python3.9/site-packages/tenacity/__init__.py", line 430, in __call__
    do = self.iter(retry_state=retry_state)
  File "/home/flavia.costa/.local/lib/python3.9/site-packages/tenacity/__init__.py", line 367, in iter
    return fut.result()
  File "/usr/lib/python3.9/concurrent/futures/_base.py", line 433, in result
    return self.__get_result()
  File "/usr/lib/python3.9/concurrent/futures/_base.py", line 389, in __get_result
    raise self._exception
  File "/home/flavia.costa/.local/lib/python3.9/site-packages/tenacity/__init__.py", line 433, in __call__
    result = fn(*args, **kwargs)
  File "/home/flavia.

In [9]:
# Project-local exploratory-analysis helpers.
import aux.aed as aed

# Re-import the module so edits made to aux/aed.py are picked up without restarting the kernel.
from importlib import reload
reload(aed)

<module 'aux.aed' from '/home/flavia.costa/work/git/codigos/aux/aed.py'>

In [20]:
# Binary target: 0 for rows labelled 'd - sem churn', 1 for every other row.
# NOTE(review): 'e - nunca ativou' and rows with a missing segment also get 1 — confirm this is intended.
base['flag_churn'] = (base['segmento_churn'] != 'd - sem churn').astype(int)

In [8]:
# Correlation scan over the profile columns plus the churn flag, using the project helper.
# presumably the second argument names a reference column and 0.7 is the reporting
# threshold on the correlation — verify against aux/aed.py
aed.find_correl(
    base[
        [
            'pf_origem_cliente',
            'flag_churn',
            'pf_genero',
            'pf_renda_declarada_will',
            'pf_regiao',
            'pf_faixa_idade',
            'issue',
        ]
    ],
    'pf_origem_cliente',
    0.7,
)

pf_genero_FEMALE ---> pf_genero_MALE -> correlacao de -0.9872144351961786
__________________________________________
pf_genero_MALE ---> pf_genero_FEMALE -> correlacao de -0.9872144351961786
__________________________________________


In [16]:
# Constant column of ones; it is passed by name ('cont') to the aed.vi and
# pf.report_perfil_grupos calls in this notebook.
# presumably it acts as a per-row counter to be summed within groups — verify in the helper modules
base['cont'] = 1

In [None]:
# Run the project helper `vi` on the profile columns, the churn flag and the 'cont' column.
# presumably 'pf_origem_cliente' is the feature under study and 'flag_churn' the target —
# verify the argument meaning against aux/aed.py
aed.vi(
    base[
        [
            'pf_origem_cliente',
            'flag_churn',
            'pf_genero',
            'pf_renda_declarada_will',
            'pf_regiao',
            'cont',
            'pf_faixa_idade',
            'issue',
        ]
    ],
    'pf_origem_cliente',
    'flag_churn',
)

In [18]:
# Number of rows in each churn segment (value_counts leaves out missing values by default).
base['segmento_churn'].value_counts()

d - sem churn       522882
a - churn alto      262887
c - churn baixo     148936
b - churn medio      52962
e - nunca ativou     12333
Name: segmento_churn, dtype: int64

In [12]:
# Project-local profile-report helpers.
import aux.perfil as pf

# Re-import the module so edits made to aux/perfil.py are picked up without restarting the kernel.
from importlib import reload
reload(pf)

<module 'aux.perfil' from '/home/flavia.costa/work/git/codigos/aux/perfil.py'>

In [21]:
# Profile columns to be compared between the two values of 'flag_churn'.
campos = [
    'pf_origem_cliente',
    'pf_genero',
    'pf_renda_declarada_will',
    'pf_regiao',
    'pf_faixa_idade',
    'issue',
]

# Build the profile report with the project helper, using 'cont' as the count column.
# NOTE(review): the meaning of 0.07 and 100 is not visible here — presumably a
# significance/difference threshold and a minimum group size; confirm in aux/perfil.py.
pf.report_perfil_grupos(
    base,
    'flag_churn',
    'cont',
    0.07,
    100,
    campos,
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[coluna_contagem] = df[coluna_contagem].astype('int')
  casos_signif_final = casos_signif_final.append(casos_signif).sort_values('diferenca_abs', ascending = False)
  casos_signif_final = casos_signif_final.append(casos_signif).sort_values('diferenca_abs', ascending = False)
  casos_signif_final = casos_signif_final.append(casos_signif).sort_values('diferenca_abs', ascending = False)
  casos_signif_final = casos_signif_final.append(casos_signif).sort_values('diferenca_abs', ascending = False)
  casos_signif_final = casos_signif_final.append(casos_signif).sort_values('diferenca_abs', ascending = False)
  casos_signif_final = casos_signif_final.append(casos_signif).sort_values('diferenca_abs', ascending = False)
  casos_signi

report dos perfis finalizado!


  casos_signif_final = casos_signif_final.append(casos_signif).sort_values('diferenca_abs', ascending = False)
