In [79]:
import pandas as pd
import time
import os
import boto3
import io
import numpy as np
import string
import re
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
from matplotlib import pyplot as plt
import seaborn as sns
import shap
import joblib
import acessos as ac

In [80]:
# Analysis window for chat creation dates: CHAT_START_DATE is inclusive,
# CHAT_END_DATE is exclusive. Both are ISO 'YYYY-MM-DD' strings and are
# substituted into every date filter of the query below, so the period only
# has to be changed here.
CHAT_START_DATE = '2023-06-01'
CHAT_END_DATE = '2023-12-25'

# SQL sent to Athena. Outline of the CTEs:
#   base_chats             - distinct (issue id, message timestamp, customer CPF,
#                            chat day) rows from Helpshift and Intercom for chats
#                            created inside the analysis window.
#   nao_reconhece          - the chats whose level-4 tag is
#                            'Não reconhecimento de compra no crédito'.
#   estabelecimentos_nrc   - those chats joined to the same CPF's transactions
#                            dated from 14 days before the chat day up to the
#                            chat day itself.
#   estabelecimentos_total - distinct customers per establishment per day,
#                            starting one month before CHAT_START_DATE.
#   agg_dia_estab          - per establishment/day: classifier attributes,
#                            distinct chat customers, total customers, issues.
# The final select sums the daily figures per establishment, keeps those with
# more than 20 summed chat customers and orders by the chat/total ratio.
# NOTE: the template goes through str.format, so a literal '{' or '}' added to
# the SQL must be doubled.
query_string = '''
with base_chats as  (
-----------------HELPSHIFT -----------------
select distinct 
cast(issue.id_issue as varchar) as id_issue
	, dt_envio_mensagem
	, issue.cpf_customer
	, cast(issue.dt_criacao_chat as date) as dia_chat
	from cx_curated_zone.helpshift_issues_messages msg
	left join cx_curated_zone.helpshift_issues issue 
	on (issue.id_issue  = msg.id_issue)
	left join customer_curated_zone.ca_book_cliente ci 
	on (ci.cpf = issue.cpf_customer)
	where cast(dt_criacao_chat as date) >= to_date('{start_date}', 'yyyy-mm-dd')
	and   cast(dt_criacao_chat as date) < to_date('{end_date}', 'yyyy-mm-dd')
union all
---------------INTERCOM -------------------
select distinct 
m.id_issue 
	, m.dt_envio_mensagem
	, ci.cpf as cpf_customer
	, cast(i.dt_criacao as date) as dia_chat
	from cx_curated_zone.intercom_issues_messages m
left join cx_curated_zone.intercom_issues_id i on (i.id_issue = m.id_issue)
left join customer_curated_zone.ca_book_cliente ci on (ci.id_customer = i.id_customer)
	where cast(i.dt_criacao as timestamp) >= to_date('{start_date}', 'yyyy-mm-dd')
	and   cast(i.dt_criacao as timestamp) < to_date('{end_date}', 'yyyy-mm-dd')
)
, nao_reconhece as (
 select distinct o.*
   from base_chats o
    left join cx_curated_zone.helpshift_tag_niveis AS ht ON cast(o.id_issue as varchar) = cast(ht.id_issue as varchar)
    left join cx_curated_zone.tags_helpshift_intercom as ic on cast(o.id_issue as varchar) = cast(ic.id_issue as varchar)
   where coalesce(ht.nm_tag_hierarquia_4, ic.nm_tag_hierarquia_4) in ('Não reconhecimento de compra no crédito')
)
, estabelecimentos_nrc as (
select nr.*, tr.dt_dia , tr.ds_estabelecimento
from nao_reconhece nr 
inner join platform_curated_zone.transaction_dedicada_spending tr on (tr.cpf = nr.cpf_customer 
									and nr.dia_chat >= date_add('day', 0, cast(tr.dt_dia as date)) 
									and nr.dia_chat < date_add('day', 15, cast(tr.dt_dia as date)))
)
, estabelecimentos_total as (
select 
ds_estabelecimento
, dt_dia
, count(distinct cpf) as clientes_total
from platform_curated_zone.transaction_dedicada_spending
where cast(dt_dia as date) >= date_add('month', -1, date('{start_date}'))
and cast(dt_dia as date) < to_date('{end_date}', 'yyyy-mm-dd')
group by 1, 2
)
, agg_dia_estab as (
select
 e.ds_estabelecimento
, e.dt_dia
, max(t.ds_nome_estabelecimento_tratado) as ds_nome_estabelecimento_tratado
, max(t.ds_mcc_frequente) as ds_mcc_frequente
, max(t.ds_classificacao_nivel_4) as ds_classificacao_nivel_4
, max(t.ds_classificacao_nivel_3) as ds_classificacao_nivel_3
, count(distinct cpf_customer) as clientes_chat_nrc
, max(tot.clientes_total) as clientes_total_estab
, count(distinct id_issue) as issues
from estabelecimentos_nrc e
left join customer_curated_zone.ca_dict_merchant_classifier t on (e.ds_estabelecimento = t.ds_nome_estabelecimento)
left join estabelecimentos_total tot on (tot.ds_estabelecimento = e.ds_estabelecimento and tot.dt_dia = e.dt_dia)
where e.ds_estabelecimento is not null
group by 1, 2
)
select
ds_estabelecimento
, max(ds_nome_estabelecimento_tratado) as ds_nome_estabelecimento_tratado
, max(ds_mcc_frequente) as ds_mcc_frequente
, max(ds_classificacao_nivel_4) as ds_classificacao_nivel_4
, max(ds_classificacao_nivel_3) as ds_classificacao_nivel_3
, sum(issues) as issues
, sum(clientes_chat_nrc) as clientes_chat_nrc
, sum(clientes_chat_nrc) / cast(sum(clientes_total_estab) as double) as perc_chat_nrc
from agg_dia_estab
group by 1
having  sum(clientes_chat_nrc) > 20
order by perc_chat_nrc desc
limit 50000

 '''.format(start_date=CHAT_START_DATE, end_date=CHAT_END_DATE)

In [81]:
# User identifier handed to `ac.df_athena_q`. Set the ATHENA_USER environment
# variable to run the notebook as someone else without editing this cell; the
# previous hard-coded value remains the default.
user = os.environ.get('ATHENA_USER', 'flavia-costa')

In [99]:
# Execute the query through the project-local `acessos` module (imported as `ac`).
# NOTE(review): `df_athena_q` is not visible here; presumably it runs the SQL on
# Athena on behalf of `user` and returns a pandas DataFrame — confirm.
data_to_predict = ac.df_athena_q(user, query_string)

In [100]:
# Number of rows (one per establishment) returned by the query.
data_to_predict.shape[0]

1733

In [101]:
# Preview the first 50 rows of the raw query result.
data_to_predict.iloc[:50]

Unnamed: 0,ds_estabelecimento,ds_nome_estabelecimento_tratado,ds_mcc_frequente,ds_classificacao_nivel_4,ds_classificacao_nivel_3,issues,clientes_chat_nrc,perc_chat_nrc
0,AUTO POSTO GABIROBA,AUTO POSTO GABIROBA,Postos Combustiveis,MOBILIDADE - COMBUSTIVEL,MOBILIDADE - COMBUSTIVEL,21,21,1.0
1,RECARGA*JULIANASOUZ,RECARGA*JULIANASOUZ,CASAS DE CAMBIO COMERCIANTE E CRIPTOMOEDAS,- -,-,145,26,0.962963
2,PG *SPORTBAY,PG *SPORTBAY,Suprimentos Industriais não Classificados,E-COMMERCE - VESTUARIO - GERAL,E-COMMERCE - VESTUARIO,68,31,0.939394
3,DL *GOOGLE ADS,GOOGLE,Comércio de Eletrônicos e Informática,E-COMMERCE - ELETRONICOS,E-COMMERCE - ELETRONICOS,93,34,0.918919
4,PG *TON HELENA RAMAL,PG *TON HELENA RAMAL,Conveniência / Delicatessens,SUPERMERCADO - DELICATESSEN,SUPERMERCADO - DELICATESSEN,101,34,0.918919
5,IMPERIO DA R*IMPERIODA,IMPERIO DA R*IMPERIODA,Pet Shop,PET - SAUDE PET,PET - SAUDE PET,307,119,0.901515
6,GOOGLE ADS,GOOGLE,Processamento de Dados e Serviços,SERVICOS - TI,SERVICOS - TI,135,61,0.884058
7,DL *GOOGLE ADS,GOOGLE,SERVIÇOS DE PUBLICIDADE (ANUNCIOS E PROPAGANDA),SERVICOS - PUBLICIDADE / ANUNCIOS,SERVICOS - PUBLICIDADE / ANUNCIOS,43,21,0.875
8,DicksSportingGoods.com,DicksSportingGoods.com,LOJAS DE ARTIGOS ESPORTIVOS,E-COMMERCE - ESPORTIVO,E-COMMERCE - ESPORTIVO,60,38,0.844444
9,PG *BALLROOM,PG *BALLROOM,Fornecedores,SERVICOS - DISTRIBUIDORES,SERVICOS - DISTRIBUIDORES,57,27,0.84375


In [102]:
#data_to_predict['indice'] = data_to_predict['ds_estabelecimento'].str.find('*')
#tratado2

In [103]:
# Break every raw establishment name into the pieces found around each '*';
# the 'indice' column therefore holds one list of strings per row.
data_to_predict['indice'] = data_to_predict['ds_estabelecimento'].str.split(pat='*')

In [104]:
# Text between the first and second '*' of the raw name (missing when the name
# has no '*'). The split pieces keep the blanks that surrounded the asterisk
# (e.g. 'GOOGLE ADS ' vs 'GOOGLE ADS'), so strip them; otherwise the same
# merchant ends up with several distinct derived names.
data_to_predict['pos_asterisco'] = (
    data_to_predict['indice']
    .str[1]
    # isinstance guard: rows without '*' hold a missing value, not a string.
    .map(lambda part: part.strip() if isinstance(part, str) else part)
)

In [105]:
# Choose the treated establishment name:
#   1. the classifier's name, when it exists and differs from the raw name;
#   2. otherwise the text after the first '*' of the raw name, if there is one;
#   3. otherwise the raw name itself.
raw_name = data_to_predict['ds_estabelecimento']
classifier_name = data_to_predict['ds_nome_estabelecimento_tratado']

# The classifier columns come from a LEFT JOIN in the query, so the name can be
# missing. A missing value compares as "different" from the raw name, which used
# to keep the null instead of falling back; require a real value explicitly.
has_classifier_name = classifier_name.notna() & (classifier_name != raw_name)
has_asterisk = raw_name.str.find('*') != -1

data_to_predict['ds_nome_estabelecimento_tratado'] = np.where(
    has_classifier_name,
    classifier_name,
    np.where(has_asterisk, data_to_predict['pos_asterisco'], raw_name),
)

In [106]:
# Upper-cased copy of the treated name, so that spellings differing only in
# letter case (e.g. '99App' / '99APP') become identical.
data_to_predict['ds_nome_estabelecimento_tratado2'] = (
    data_to_predict['ds_nome_estabelecimento_tratado'].str.upper()
)

In [110]:
# Check the derived name columns on the first 10 rows.
data_to_predict.iloc[:10]

Unnamed: 0,ds_estabelecimento,ds_nome_estabelecimento_tratado,ds_mcc_frequente,ds_classificacao_nivel_4,ds_classificacao_nivel_3,issues,clientes_chat_nrc,perc_chat_nrc,indice,pos_asterisco,ds_nome_estabelecimento_tratado2
0,AUTO POSTO GABIROBA,AUTO POSTO GABIROBA,Postos Combustiveis,MOBILIDADE - COMBUSTIVEL,MOBILIDADE - COMBUSTIVEL,21,21,1.0,[AUTO POSTO GABIROBA ],,AUTO POSTO GABIROBA
1,RECARGA*JULIANASOUZ,JULIANASOUZ,CASAS DE CAMBIO COMERCIANTE E CRIPTOMOEDAS,- -,-,145,26,0.962963,"[RECARGA, JULIANASOUZ]",JULIANASOUZ,JULIANASOUZ
2,PG *SPORTBAY,SPORTBAY,Suprimentos Industriais não Classificados,E-COMMERCE - VESTUARIO - GERAL,E-COMMERCE - VESTUARIO,68,31,0.939394,"[PG , SPORTBAY ]",SPORTBAY,SPORTBAY
3,DL *GOOGLE ADS,GOOGLE,Comércio de Eletrônicos e Informática,E-COMMERCE - ELETRONICOS,E-COMMERCE - ELETRONICOS,93,34,0.918919,"[DL , GOOGLE ADS ]",GOOGLE ADS,GOOGLE
4,PG *TON HELENA RAMAL,TON HELENA RAMAL,Conveniência / Delicatessens,SUPERMERCADO - DELICATESSEN,SUPERMERCADO - DELICATESSEN,101,34,0.918919,"[PG , TON HELENA RAMAL ]",TON HELENA RAMAL,TON HELENA RAMAL
5,IMPERIO DA R*IMPERIODA,IMPERIODA,Pet Shop,PET - SAUDE PET,PET - SAUDE PET,307,119,0.901515,"[IMPERIO DA R, IMPERIODA ]",IMPERIODA,IMPERIODA
6,GOOGLE ADS,GOOGLE,Processamento de Dados e Serviços,SERVICOS - TI,SERVICOS - TI,135,61,0.884058,[GOOGLE ADS ],,GOOGLE
7,DL *GOOGLE ADS,GOOGLE,SERVIÇOS DE PUBLICIDADE (ANUNCIOS E PROPAGANDA),SERVICOS - PUBLICIDADE / ANUNCIOS,SERVICOS - PUBLICIDADE / ANUNCIOS,43,21,0.875,"[DL , GOOGLE ADS]",GOOGLE ADS,GOOGLE
8,DicksSportingGoods.com,DicksSportingGoods.com,LOJAS DE ARTIGOS ESPORTIVOS,E-COMMERCE - ESPORTIVO,E-COMMERCE - ESPORTIVO,60,38,0.844444,[DicksSportingGoods.com],,DICKSSPORTINGGOODS.COM
9,PG *BALLROOM,BALLROOM,Fornecedores,SERVICOS - DISTRIBUIDORES,SERVICOS - DISTRIBUIDORES,57,27,0.84375,"[PG , BALLROOM ]",BALLROOM,BALLROOM


In [107]:
# Show establishments ordered by their 'clientes_chat_nrc' count, largest first.
data_to_predict.sort_values('clientes_chat_nrc', ascending=False)

Unnamed: 0,ds_estabelecimento,ds_nome_estabelecimento_tratado,ds_mcc_frequente,ds_classificacao_nivel_4,ds_classificacao_nivel_3,issues,clientes_chat_nrc,perc_chat_nrc,indice,pos_asterisco,ds_nome_estabelecimento_tratado2
1683,Uber *UBER *TRIP,UBER,TAXI E LIMUSINE,MOBILIDADE - TRANSPORTE PRIVADO - APP TRANSPORTE,MOBILIDADE - TRANSPORTE PRIVADO,34352,22752,0.008458,"[Uber , UBER , TRIP]",UBER,UBER
1694,99APP *99App,99App,TAXI E LIMUSINE,MOBILIDADE - TRANSPORTE PRIVADO - TAXIS,MOBILIDADE - TRANSPORTE PRIVADO,21203,13790,0.008161,"[99APP , 99App]",99App,99APP
1682,UBER *UBER *TRIP,UBER,Taxi / Limousines,MOBILIDADE - TRANSPORTE PRIVADO - APP TRANSPORTE,MOBILIDADE - TRANSPORTE PRIVADO,17372,9912,0.008465,"[UBER , UBER , TRIP ]",UBER,UBER
1579,APPLE.COM/BILL,APPLE,Discos,ENTRETENIMENTO - STREAMING - AUDIO,ENTRETENIMENTO - STREAMING,14326,8679,0.012457,[APPLE.COM/BILL],,APPLE
1673,99APP *99APP,99APP,Taxi / Limousines,MOBILIDADE - TRANSPORTE PRIVADO - TAXIS,MOBILIDADE - TRANSPORTE PRIVADO,10812,6134,0.008851,"[99APP , 99APP ]",99APP,99APP
...,...,...,...,...,...,...,...,...,...,...,...
425,PANIFICADORA E LANCHO,PANIFICADORA E LANCHO,Padarias,SERVICOS DE ALIMENTACAO - PADARIAS,SERVICOS DE ALIMENTACAO - PADARIAS,27,21,0.069536,[PANIFICADORA E LANCHO],,PANIFICADORA E LANCHO
421,PADARIA PAO NOSSO,PADARIA PAO NOSSO,Padarias,SERVICOS DE ALIMENTACAO - PADARIAS,SERVICOS DE ALIMENTACAO - PADARIAS,25,21,0.070000,[PADARIA PAO NOSSO],,PADARIA PAO NOSSO
414,GAZIN,GAZIN,Lojas de Departamento,E-COMMERCE - MULTICATEGORIA,E-COMMERCE - MULTICATEGORIA,23,21,0.071918,[GAZIN],,GAZIN
408,LOJAS LE BISCUIT SA,LOJAS LE BISCUIT,Lojas de Departamento,LOJAS DE DEPARTAMENTO - MULTICATEGORIA,LOJAS DE DEPARTAMENTO - MULTICATEGORIA,29,21,0.072917,[LOJAS LE BISCUIT SA],,LOJAS LE BISCUIT


In [89]:
#data_to_predict.query('ds_estabelecimento.str.contains("AHESUPCOM")')