In [1]:
import pandas as pd
import time
import os
import boto3
import io
import numpy as np
import string
import re
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
from matplotlib import pyplot as plt
import seaborn as sns
import shap
import joblib
import acessos as ac

In [14]:
user = 'flavia-costa'

class QueryAthena:

    def __init__(self, query, database):
        self.database = database
        self.folder = 'maria-carvalho/'
        self.bucket = 'data-athena-query-result-will-prod'
        self.s3_input = 's3://' + self.bucket + '/' + user
        self.s3_output =  's3://' + self.bucket + '/' + self.folder
        self.region_name = 'sa-east-1'
        self.aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID_WILL')
        self.aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY_WILL')
        self.query = query

    def load_conf(self, q):
        try:
            self.client = boto3.client('athena', 
                              region_name = self.region_name, 
                              aws_access_key_id = self.aws_access_key_id,
                              aws_secret_access_key= self.aws_secret_access_key)
            response = self.client.start_query_execution(
                QueryString = q,
                    QueryExecutionContext={
                    'Database': self.database
                    },
                    ResultConfiguration={
                    'OutputLocation': self.s3_output,
                    }
            )
            self.filename = response['QueryExecutionId']
            print('Execution ID: ' + response['QueryExecutionId'])

        except Exception as e:
            print(e)
        return response                

    def run_query(self):
        queries = [self.query]
        for q in queries:
            res = self.load_conf(q)
        try:              
            query_status = None
            while query_status == 'QUEUED' or query_status == 'RUNNING' or query_status is None:
                query_status = self.client.get_query_execution(QueryExecutionId=res["QueryExecutionId"])['QueryExecution']['Status']['State']
                print(query_status)
                if query_status == 'FAILED' or query_status == 'CANCELLED':
                    raise Exception('Athena query with the string "{}" failed or was cancelled'.format(self.query))
                time.sleep(10)
            print('Query "{}" finished.'.format(self.query))

            df = self.obtain_data()
            return df

        except Exception as e:
            print(e)      

    def obtain_data(self):
        try:
            self.resource = boto3.resource('s3', 
                                  region_name = self.region_name, 
                                  aws_access_key_id = self.aws_access_key_id,
                                  aws_secret_access_key= self.aws_secret_access_key)

            response = self.resource \
            .Bucket(self.bucket) \
            .Object(key= self.folder + self.filename + '.csv') \
            .get()

            return pd.read_csv(io.BytesIO(response['Body'].read()), encoding='utf8')   
        except Exception as e:
            print(e)  

In [15]:
text_file = open(os.path.abspath('dados_teste_contestacao.sql'), "r")
query = text_file.read()
text_file.close()
#qa = QueryAthena(query=query, database='myAthenaDb')
#data_to_predict = qa.run_query()

data_to_predict = ac.df_athena_q(user, query)


In [16]:
len(data_to_predict)

377528

In [17]:
data_to_predict

Unnamed: 0,nr_cpf,dt_autorizacao,vl_real,ds_nome_estabelecimento,ds_cidade_estabelecimento,ds_mcc,is_compra_parcelada,ds_compra,ds_transacao,ds_cartao,cd_moeda,cd_cluster_sociodemographic,nr_score_ie,vl_real_maior_media_persona,vl_real_maior_percentil_75_persona
0,05202720564,2023-05-29 09:35:35,24.99,HIPER MIGUELENSE 2,SANTO ANTONIO,SUPERMERCADO - VAREJISTAS,False,contactless,credito,STANDARD,986,12,54.380704,0,0
1,06830893585,2023-05-29 04:36:57,4.50,MERCADINHO ND,VARZEDO,SUPERMERCADO - VAREJISTAS,False,presencial,credito,STANDARD,986,15,65.115167,0,0
2,03066668207,2023-05-29 17:28:36,7.80,PAG*EdimilsonDaSilva,MARITUBA,SERVICOS DE ALIMENTACAO - LANCHONETES,False,contactless,credito,STANDARD,986,27,47.857453,0,0
3,00217445306,2023-05-29 08:58:37,800.00,NOVA ONDA,ARACATI,MOBILIDADE - TRANSPORTE PRIVADO,True,presencial,credito,STANDARD,986,23,63.728814,1,1
4,11753466679,2023-05-29 06:32:04,29.00,GERSON & D`PAULA MODAS,JANUARIA,EDUCACAO - COLEGIOS,False,contactless,credito,STANDARD,986,18,44.745763,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
377523,10221829431,2023-05-29 18:23:06,11.38,ATACADAO 152 AS,CAMARAGIBE,SUPERMERCADO - ATACADISTA,False,contactless,credito,STANDARD,986,26,75.154281,0,0
377524,01111290105,2023-05-29 13:58:40,14.50,SUMUP*Elas Tres Avi,SAO VALERIO D,LOJAS DE DEPARTAMENTO - VESTUARIO,False,presencial,credito,STANDARD,986,24,,0,0
377525,03108931199,2023-05-29 20:13:18,118.80,MAQUINA DE CARTAO,SAO PAULO,SERVICOS - NAO ESPECIFICADO,True,online,credito,VIRTUAL,986,27,,1,1
377526,08606970530,2023-05-29 10:15:54,81.00,CAMILA FARIAS BATISTA,CATU,SERVICOS - CASA E CONSTRUCAO,True,contactless,credito,STANDARD,986,15,44.798535,0,0


In [18]:
estabelecimentos_alto = pd.read_csv('estabelecimentos_alto.csv')
del estabelecimentos_alto['Unnamed: 0']

In [19]:
def pre_processing(data_to_predict,estabelecimentos_alto):
    ## Caracteristicas dos nomes dos estabelecimentos
    
    data_to_predict['dt_autorizacao'] = pd.to_datetime(data_to_predict['dt_autorizacao'], format='%Y-%m-%d')
    
    data_to_predict['ds_nome_estabelecimento'] = data_to_predict['ds_nome_estabelecimento'].astype(str)
    data_to_predict['ctg_total'] = data_to_predict['ds_nome_estabelecimento'].str.len()
    data_to_predict['ctg_espacos'] = data_to_predict['ds_nome_estabelecimento'].apply(lambda x: x.count(' ')) 
    data_to_predict['ctg_numbers'] = data_to_predict['ds_nome_estabelecimento'].apply(lambda s: sum(c.isdigit() for c in s)) 
    count = lambda l1,l2: sum([1 for x in l1 if x in l2])
    data_to_predict['ctg_punct'] = data_to_predict.ds_nome_estabelecimento.apply(lambda s: count(s, string.punctuation))
    data_to_predict['ctg_upper'] = data_to_predict.ds_nome_estabelecimento.apply(lambda s: len(re.findall(r'[A-Z]',s)))
    data_to_predict['ctg_low'] = data_to_predict.ds_nome_estabelecimento.apply(lambda s: len(re.findall(r'[a-z]',s)))
    
    data_to_predict['ctg_espacos'] = np.where(data_to_predict['ctg_espacos']>=6,6,data_to_predict['ctg_espacos'])
    
    data_to_predict = data_to_predict.merge(estabelecimentos_alto, on = 'ds_nome_estabelecimento', how = 'left')
    data_to_predict['flag_alto_vl_contestacao_estabelecimento'] = data_to_predict['flag_alto_vl_contestacao_estabelecimento'].fillna(0)
    
    ## Ajustando variável de moeda
    data_to_predict['cd_moeda'] = np.where(data_to_predict['cd_moeda']==986,1,0)
    
    ## Adiconando variável de hora da compra
    data_to_predict['hora_autorizacao'] = data_to_predict['dt_autorizacao'].dt.hour
        
    ## Ajustando a variável do tipo do estabelecimento, tornando-a mais geral
    data_to_predict['ds_mcc'] = data_to_predict['ds_mcc'].str.split(n = 1, expand = True)[0]
    
    ## Ajustando variáveis de caracteristica dos nomes do estabelecimento
    data_to_predict["ctg_numbers"] = np.where(data_to_predict["ctg_numbers"]>0,1,0)
    data_to_predict["ctg_punct"] = np.where(data_to_predict["ctg_punct"]>0,1,0)
    
    data_to_predict.drop(['nr_cpf', 'dt_autorizacao', 'vl_real_maior_percentil_75_persona', 'ds_nome_estabelecimento', 'ds_cidade_estabelecimento'], axis=1, errors='ignore')
    
    data_to_predict['nr_score_ie'] = data_to_predict['nr_score_ie'].fillna(0)
    
    ## Escalonando
    data_to_predict['vl_real'] = np.where(data_to_predict['vl_real']<=13.2, 'valores_baixos', np.where((data_to_predict['vl_real']>13.2)&(data_to_predict['vl_real']<=44.9), 'valores_medios', 'valores_altos'))
    data_to_predict['ctg_total'] = (data_to_predict['ctg_total']-2)/(22-2)
    data_to_predict['ctg_espacos'] = (data_to_predict['ctg_espacos']-0)/(6-0)
    data_to_predict['ctg_upper'] = (data_to_predict['ctg_upper']-0)/(22-0)
    data_to_predict['ctg_low'] = (data_to_predict['ctg_low']-0)/(19-0)
    data_to_predict['nr_score_ie'] = (data_to_predict['nr_score_ie']-0)/(100-0)
    
    ##Dummies
    data_to_predict = pd.get_dummies(data=data_to_predict, columns=['vl_real'])
    data_to_predict = pd.get_dummies(data=data_to_predict, columns=['is_compra_parcelada'])
    data_to_predict = pd.get_dummies(data=data_to_predict, columns=['ds_transacao'])
    data_to_predict = pd.get_dummies(data=data_to_predict, columns=['ds_compra'])
    data_to_predict = pd.get_dummies(data=data_to_predict, columns=['ds_cartao'])
    data_to_predict = pd.get_dummies(data=data_to_predict, columns=['hora_autorizacao'])
    data_to_predict = pd.get_dummies(data=data_to_predict, columns=['cd_cluster_sociodemographic'])
    data_to_predict = pd.get_dummies(data=data_to_predict, columns=['ds_mcc'])
    
    
    data_to_predict.drop(['ds_compra_nao_especificado', ## correlação de 1 com ds_transacao_debito
                         'is_compra_parcelada_True', ## correlação de 1 com is_compra_parcelada_false 
                         'ds_transacao_debito', ## correlação de -.91 com is_transacao_credito 
                         'ds_cartao_STANDARD', ## correlação de -.86 com ds_cartao_VIRTUAL 
                         'vl_real_maior_media_persona', ## correlação de -.74 com valores_altos 
                         'ctg_low', # correlação com low_upper 
                         'ds_transacao_cancelamento', 
                         'ds_mcc_-', 
                         'ds_mcc_VIAGEM', # menos de 50  
                         'ds_mcc_SEGURO', # menos de 50  
                         'ds_mcc_PET', # menos de 50  
                         'ds_compra_contactless', # correlação com compra online 
                         'ds_mcc_MOBILIDADE'],  # correlação com compra online
                        axis=1, errors='ignore')
    
    
    df_ = pd.DataFrame(columns=['cd_moeda', 'nr_score_ie', 'ctg_total', 'ctg_espacos', 'ctg_numbers','ctg_punct', 'ctg_upper', 'flag_alto_vl_contestacao_estabelecimento','vl_real_valores_baixos', 'vl_real_valores_medios','vl_real_valores_altos', 'is_compra_parcelada_False','ds_transacao_ajustes_internos', 'ds_transacao_credito','ds_compra_online', 'ds_compra_presencial', 'ds_cartao_STANDARD_PAG','ds_cartao_VIRTUAL', 'ds_cartao_YELLOW_PAG', 'hora_autorizacao_0','hora_autorizacao_1', 'hora_autorizacao_2', 'hora_autorizacao_3','hora_autorizacao_4', 'hora_autorizacao_5', 'hora_autorizacao_6','hora_autorizacao_7', 'hora_autorizacao_8', 'hora_autorizacao_9','hora_autorizacao_10', 'hora_autorizacao_11', 'hora_autorizacao_12','hora_autorizacao_13', 'hora_autorizacao_14', 'hora_autorizacao_15','hora_autorizacao_16', 'hora_autorizacao_17', 'hora_autorizacao_18','hora_autorizacao_19', 'hora_autorizacao_20', 'hora_autorizacao_21','hora_autorizacao_22', 'hora_autorizacao_23','cd_cluster_sociodemographic_0.0', 'cd_cluster_sociodemographic_1.0','cd_cluster_sociodemographic_2.0', 'cd_cluster_sociodemographic_3.0','cd_cluster_sociodemographic_4.0', 'cd_cluster_sociodemographic_5.0','cd_cluster_sociodemographic_6.0', 'cd_cluster_sociodemographic_7.0','cd_cluster_sociodemographic_8.0', 'cd_cluster_sociodemographic_10.0','cd_cluster_sociodemographic_11.0', 'cd_cluster_sociodemographic_12.0','cd_cluster_sociodemographic_13.0', 'cd_cluster_sociodemographic_14.0','cd_cluster_sociodemographic_15.0', 'cd_cluster_sociodemographic_16.0','cd_cluster_sociodemographic_17.0', 'cd_cluster_sociodemographic_18.0','cd_cluster_sociodemographic_19.0', 'cd_cluster_sociodemographic_20.0','cd_cluster_sociodemographic_21.0', 'cd_cluster_sociodemographic_22.0','cd_cluster_sociodemographic_23.0', 'cd_cluster_sociodemographic_24.0','cd_cluster_sociodemographic_25.0', 'cd_cluster_sociodemographic_26.0','cd_cluster_sociodemographic_27.0', 'cd_cluster_sociodemographic_28.0','cd_cluster_sociodemographic_29.0', 'cd_cluster_sociodemographic_30.0','cd_cluster_sociodemographic_31.0', 'cd_cluster_sociodemographic_32.0','ds_mcc_DELIVERY', 'ds_mcc_E-COMMERCE', 'ds_mcc_EDUCACAO','ds_mcc_ENTRETENIMENTO', 'ds_mcc_FINANCAS', 'ds_mcc_LOJAS','ds_mcc_SAUDE', 'ds_mcc_SERVICOS', 'ds_mcc_SUPERMERCADO','ds_mcc_SUPRIMENTOS', 'ds_mcc_TELECOM'])
    
    new_cols = list(df_.columns.difference(data_to_predict.columns))
    new_vals = [0]*len(df_.columns.difference(data_to_predict.columns))
    df = data_to_predict.reindex(columns=data_to_predict.columns.tolist() + new_cols)   # add empty cols
    df[new_cols] = new_vals  # multi-column assignment works for existing cols
    
    df_ = pd.DataFrame(columns=['cd_moeda', 'nr_score_ie', 'ctg_total', 'ctg_espacos', 'ctg_numbers','ctg_punct', 'ctg_upper', 'flag_alto_vl_contestacao_estabelecimento','vl_real_valores_baixos', 'vl_real_valores_medios','vl_real_valores_altos', 'is_compra_parcelada_False','ds_transacao_ajustes_internos', 'ds_transacao_credito','ds_compra_online', 'ds_compra_presencial', 'ds_cartao_STANDARD_PAG','ds_cartao_VIRTUAL', 'ds_cartao_YELLOW_PAG', 'hora_autorizacao_0','hora_autorizacao_1', 'hora_autorizacao_2', 'hora_autorizacao_3','hora_autorizacao_4', 'hora_autorizacao_5', 'hora_autorizacao_6','hora_autorizacao_7', 'hora_autorizacao_8', 'hora_autorizacao_9','hora_autorizacao_10', 'hora_autorizacao_11', 'hora_autorizacao_12','hora_autorizacao_13', 'hora_autorizacao_14', 'hora_autorizacao_15','hora_autorizacao_16', 'hora_autorizacao_17', 'hora_autorizacao_18','hora_autorizacao_19', 'hora_autorizacao_20', 'hora_autorizacao_21','hora_autorizacao_22', 'hora_autorizacao_23','cd_cluster_sociodemographic_0.0', 'cd_cluster_sociodemographic_1.0','cd_cluster_sociodemographic_2.0', 'cd_cluster_sociodemographic_3.0','cd_cluster_sociodemographic_4.0', 'cd_cluster_sociodemographic_5.0','cd_cluster_sociodemographic_6.0', 'cd_cluster_sociodemographic_7.0','cd_cluster_sociodemographic_8.0', 'cd_cluster_sociodemographic_10.0','cd_cluster_sociodemographic_11.0', 'cd_cluster_sociodemographic_12.0','cd_cluster_sociodemographic_13.0', 'cd_cluster_sociodemographic_14.0','cd_cluster_sociodemographic_15.0', 'cd_cluster_sociodemographic_16.0','cd_cluster_sociodemographic_17.0', 'cd_cluster_sociodemographic_18.0','cd_cluster_sociodemographic_19.0', 'cd_cluster_sociodemographic_20.0','cd_cluster_sociodemographic_21.0', 'cd_cluster_sociodemographic_22.0','cd_cluster_sociodemographic_23.0', 'cd_cluster_sociodemographic_24.0','cd_cluster_sociodemographic_25.0', 'cd_cluster_sociodemographic_26.0','cd_cluster_sociodemographic_27.0', 'cd_cluster_sociodemographic_28.0','cd_cluster_sociodemographic_29.0', 'cd_cluster_sociodemographic_30.0','cd_cluster_sociodemographic_31.0', 'cd_cluster_sociodemographic_32.0','ds_mcc_DELIVERY', 'ds_mcc_E-COMMERCE', 'ds_mcc_EDUCACAO','ds_mcc_ENTRETENIMENTO', 'ds_mcc_FINANCAS', 'ds_mcc_LOJAS','ds_mcc_SAUDE', 'ds_mcc_SERVICOS', 'ds_mcc_SUPERMERCADO','ds_mcc_SUPRIMENTOS', 'ds_mcc_TELECOM'])
    inference_data = df[['cd_moeda', 'nr_score_ie', 'ctg_total', 'ctg_espacos', 'ctg_numbers','ctg_punct', 'ctg_upper', 'flag_alto_vl_contestacao_estabelecimento','vl_real_valores_baixos', 'vl_real_valores_medios','vl_real_valores_altos', 'is_compra_parcelada_False','ds_transacao_ajustes_internos', 'ds_transacao_credito','ds_compra_online', 'ds_compra_presencial', 'ds_cartao_STANDARD_PAG','ds_cartao_VIRTUAL', 'ds_cartao_YELLOW_PAG', 'hora_autorizacao_0','hora_autorizacao_1', 'hora_autorizacao_2', 'hora_autorizacao_3','hora_autorizacao_4', 'hora_autorizacao_5', 'hora_autorizacao_6','hora_autorizacao_7', 'hora_autorizacao_8', 'hora_autorizacao_9','hora_autorizacao_10', 'hora_autorizacao_11', 'hora_autorizacao_12','hora_autorizacao_13', 'hora_autorizacao_14', 'hora_autorizacao_15','hora_autorizacao_16', 'hora_autorizacao_17', 'hora_autorizacao_18','hora_autorizacao_19', 'hora_autorizacao_20', 'hora_autorizacao_21','hora_autorizacao_22', 'hora_autorizacao_23','cd_cluster_sociodemographic_0.0', 'cd_cluster_sociodemographic_1.0','cd_cluster_sociodemographic_2.0', 'cd_cluster_sociodemographic_3.0','cd_cluster_sociodemographic_4.0', 'cd_cluster_sociodemographic_5.0','cd_cluster_sociodemographic_6.0', 'cd_cluster_sociodemographic_7.0','cd_cluster_sociodemographic_8.0', 'cd_cluster_sociodemographic_10.0','cd_cluster_sociodemographic_11.0', 'cd_cluster_sociodemographic_12.0','cd_cluster_sociodemographic_13.0', 'cd_cluster_sociodemographic_14.0','cd_cluster_sociodemographic_15.0', 'cd_cluster_sociodemographic_16.0','cd_cluster_sociodemographic_17.0', 'cd_cluster_sociodemographic_18.0','cd_cluster_sociodemographic_19.0', 'cd_cluster_sociodemographic_20.0','cd_cluster_sociodemographic_21.0', 'cd_cluster_sociodemographic_22.0','cd_cluster_sociodemographic_23.0', 'cd_cluster_sociodemographic_24.0','cd_cluster_sociodemographic_25.0', 'cd_cluster_sociodemographic_26.0','cd_cluster_sociodemographic_27.0', 'cd_cluster_sociodemographic_28.0','cd_cluster_sociodemographic_29.0', 'cd_cluster_sociodemographic_30.0','cd_cluster_sociodemographic_31.0', 'cd_cluster_sociodemographic_32.0','ds_mcc_DELIVERY', 'ds_mcc_E-COMMERCE', 'ds_mcc_EDUCACAO','ds_mcc_ENTRETENIMENTO', 'ds_mcc_FINANCAS', 'ds_mcc_LOJAS','ds_mcc_SAUDE', 'ds_mcc_SERVICOS', 'ds_mcc_SUPERMERCADO','ds_mcc_SUPRIMENTOS', 'ds_mcc_TELECOM']]
    return inference_data

In [20]:
def model_fn(model_dir):  
    model = joblib.load(os.path.join(model_dir))
    return model

In [21]:
inference_data = pre_processing(data_to_predict,estabelecimentos_alto)

In [22]:
MODEL_NAME = 'modelo_contestacao.joblib'
MODEL_PATH = os.path.join(MODEL_NAME)
INFERENCE_DATA = inference_data.copy()
contestacao_model = model_fn(model_dir = MODEL_PATH)

Trying to unpickle estimator LabelEncoder from version 1.2.0 when using version 1.1.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [23]:
contestacao_model

In [24]:
data_to_predict['prob_contestar'] = pd.DataFrame(contestacao_model.predict_proba(inference_data)).iloc[:,1]

In [25]:
MODEL_NAME = 'modelo_fraude.joblib'
MODEL_PATH = os.path.join(MODEL_NAME)
INFERENCE_DATA = inference_data.copy()
fraude_model = model_fn(model_dir = MODEL_PATH)

Trying to unpickle estimator LabelEncoder from version 1.2.0 when using version 1.1.2. This might lead to breaking code or invalid results. Use at your own risk. For more info please refer to:
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [26]:
fraude_model

In [27]:
inference_data['prob_contestar'] = data_to_predict['prob_contestar']

In [28]:
inference_data_fraude = inference_data[inference_data['prob_contestar']>0.5]
del inference_data_fraude['prob_contestar']

In [29]:
inference_data_fraude['prob_fraude'] = [item[1] for item in fraude_model.predict_proba(inference_data_fraude)]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [30]:
data_to_predict = data_to_predict.join(inference_data_fraude[['prob_fraude']], lsuffix="_left", rsuffix="_right")
data_to_predict = data_to_predict.fillna(0)

In [31]:
data_to_predict.head(100)

Unnamed: 0,nr_cpf,dt_autorizacao,vl_real,ds_nome_estabelecimento,ds_cidade_estabelecimento,ds_mcc,is_compra_parcelada,ds_compra,ds_transacao,ds_cartao,...,vl_real_maior_media_persona,vl_real_maior_percentil_75_persona,ctg_total,ctg_espacos,ctg_numbers,ctg_punct,ctg_upper,ctg_low,prob_contestar,prob_fraude
0,05202720564,2023-05-29 09:35:35,24.99,HIPER MIGUELENSE 2,SANTO ANTONIO,SUPERMERCADO - VAREJISTAS,False,contactless,credito,STANDARD,...,0,0,18,2,1,0,15,0,0.263682,0.000000
1,06830893585,2023-05-29 04:36:57,4.50,MERCADINHO ND,VARZEDO,SUPERMERCADO - VAREJISTAS,False,presencial,credito,STANDARD,...,0,0,13,1,0,0,12,0,0.065676,0.000000
2,03066668207,2023-05-29 17:28:36,7.80,PAG*EdimilsonDaSilva,MARITUBA,SERVICOS DE ALIMENTACAO - LANCHONETES,False,contactless,credito,STANDARD,...,0,0,20,0,0,1,6,13,0.120620,0.000000
3,00217445306,2023-05-29 08:58:37,800.00,NOVA ONDA,ARACATI,MOBILIDADE - TRANSPORTE PRIVADO,True,presencial,credito,STANDARD,...,1,1,9,1,0,0,8,0,0.089761,0.000000
4,11753466679,2023-05-29 06:32:04,29.00,GERSON & D`PAULA MODAS,JANUARIA,EDUCACAO - COLEGIOS,False,contactless,credito,STANDARD,...,0,0,22,3,0,2,17,0,0.143220,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,70588988120,2023-05-29 04:53:34,5.00,50008852 PAULA FERNAND,GOIANIA,SUPERMERCADO - VAREJISTAS,False,nao_especificado,debito,STANDARD,...,0,0,22,2,8,0,12,0,0.105373,0.000000
96,06770736667,2023-05-29 11:40:32,3.50,PAG*LIMPEXHIGIENIZACA,ARAGUARI,SERVICOS - NAO ESPECIFICADO,False,contactless,credito,STANDARD,...,0,0,21,0,0,1,20,0,0.222293,0.000000
97,70696182475,2023-05-29 08:07:02,139.68,EBN *SHEIN,SAO PAULO,E-COMMERCE - VESTUARIO,False,online,credito,VIRTUAL,...,1,1,18,6,0,1,8,0,0.617654,0.699447
98,03852348307,2023-05-29 11:44:42,4.50,MP *MERCEARIANATAVENID,OSASCO,0,False,nao_especificado,debito,STANDARD,...,0,0,22,1,0,1,20,0,0.036080,0.000000


In [37]:
len(data_to_predict.query('prob_contestar > 0.5 & prob_fraude < 0.5'))

25810

In [38]:
len(data_to_predict.query('prob_contestar > 0.5 & prob_fraude > 0.5'))

37136

In [39]:
len(data_to_predict.query('prob_contestar < 0.5 & prob_fraude > 0.5'))

0

In [36]:
len(data_to_predict.query('prob_contestar > 0.9'))

9064

In [40]:
len(data_to_predict.query('prob_contestar > 0.5 | prob_fraude > 0.5'))

62946

In [41]:
data_to_predict.query('prob_contestar > 0.5 | prob_fraude > 0.5').to_csv('base_clientes_propensos.csv')