In [1]:
import pandas as pd
import time
import os
import boto3
import io
import numpy as np
import string
import re
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
import lightgbm as lgbm
from matplotlib import pyplot as plt
import seaborn as sns
import shap
import joblib
import acessos as ac

In [2]:
# User identifier: QueryAthena appends it to the S3 input path, and it is also
# the first argument passed to ac.df_athena_q when the query is executed.
user = "flavia-costa"

class QueryAthena:
    """Run one SQL query on AWS Athena and load its CSV result from S3.

    Credentials are read from the ``AWS_ACCESS_KEY_ID_WILL`` and
    ``AWS_SECRET_ACCESS_KEY_WILL`` environment variables. Athena is asked to
    write the result under ``s3://<bucket>/<folder>``; the result is then read
    back from ``<folder><QueryExecutionId>.csv`` in the same bucket.

    Parameters
    ----------
    query : str
        SQL text to execute.
    database : str
        Athena database used as the query execution context.
    """

    # States in which Athena has not finished the query yet.
    _PENDING_STATES = ('QUEUED', 'RUNNING')
    # States after which no result will ever be produced.
    _FAILED_STATES = ('FAILED', 'CANCELLED')

    def __init__(self, query, database):
        self.database = database
        self.folder = 'maria-carvalho/'
        self.bucket = 'data-athena-query-result-will-prod'
        # Depends on the module-level ``user`` name being defined beforehand.
        self.s3_input = 's3://' + self.bucket + '/' + user
        self.s3_output = 's3://' + self.bucket + '/' + self.folder
        self.region_name = 'sa-east-1'
        self.aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID_WILL')
        self.aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY_WILL')
        self.query = query

    def load_conf(self, q):
        """Create the Athena client and start executing ``q``.

        Stores the client in ``self.client`` and the execution id in
        ``self.filename`` (the id is later used as the result file name).

        Returns
        -------
        dict
            The ``start_query_execution`` response.

        Raises
        ------
        Exception
            Whatever the client creation or the submission raised. It is
            printed and re-raised unchanged; previously it was swallowed and
            the method then crashed with an unrelated ``UnboundLocalError``
            on ``return response``.
        """
        try:
            self.client = boto3.client('athena',
                                       region_name=self.region_name,
                                       aws_access_key_id=self.aws_access_key_id,
                                       aws_secret_access_key=self.aws_secret_access_key)
            response = self.client.start_query_execution(
                QueryString=q,
                QueryExecutionContext={
                    'Database': self.database
                },
                ResultConfiguration={
                    'OutputLocation': self.s3_output,
                }
            )
            self.filename = response['QueryExecutionId']
            print('Execution ID: ' + response['QueryExecutionId'])
        except Exception as e:
            print(e)
            raise
        return response

    def run_query(self, poll_interval=10):
        """Submit ``self.query``, wait for it to finish and return its result.

        Parameters
        ----------
        poll_interval : int or float, default 10
            Seconds to wait between two status checks while the query is
            still queued or running.

        Returns
        -------
        pandas.DataFrame or None
            The query result. ``None`` when the query failed, was cancelled,
            or polling/downloading raised; the error is printed in that case.
        """
        res = self.load_conf(self.query)
        try:
            while True:
                query_status = self.client.get_query_execution(
                    QueryExecutionId=res["QueryExecutionId"]
                )['QueryExecution']['Status']['State']
                print(query_status)
                if query_status in self._FAILED_STATES:
                    raise Exception('Athena query with the string "{}" failed or was cancelled'.format(self.query))
                # Leave as soon as a terminal state is seen, so no time is
                # wasted sleeping after the query has already finished.
                if query_status is not None and query_status not in self._PENDING_STATES:
                    break
                time.sleep(poll_interval)
            print('Query "{}" finished.'.format(self.query))

            return self.obtain_data()

        except Exception as e:
            # Best-effort contract kept for existing callers: report and return None.
            print(e)
            return None

    def obtain_data(self):
        """Download the result CSV of the last submitted query from S3.

        Reads ``<folder><execution id>.csv`` from ``self.bucket`` and parses it
        as UTF-8 CSV.

        Returns
        -------
        pandas.DataFrame or None
            The parsed result, or ``None`` when the download or parsing raised
            (the error is printed).
        """
        try:
            self.resource = boto3.resource('s3',
                                           region_name=self.region_name,
                                           aws_access_key_id=self.aws_access_key_id,
                                           aws_secret_access_key=self.aws_secret_access_key)

            response = self.resource \
                .Bucket(self.bucket) \
                .Object(key=self.folder + self.filename + '.csv') \
                .get()

            return pd.read_csv(io.BytesIO(response['Body'].read()), encoding='utf8')
        except Exception as e:
            print(e)
            return None

In [30]:
# Read the SQL text from disk. The context manager closes the file even when
# reading raises, which the manual open()/close() pair did not guarantee.
with open(os.path.abspath('dados_teste_contestacao.sql'), "r") as sql_file:
    query = sql_file.read()

# Alternative execution path through the QueryAthena class, kept for reference:
#qa = QueryAthena(query=query, database='myAthenaDb')
#data_to_predict = qa.run_query()

# NOTE(review): ac.df_athena_q is a project helper not shown here; presumably it
# runs the query on Athena for `user` and returns a DataFrame — confirm.
data_to_predict = ac.df_athena_q(user, query)


In [31]:
# Number of rows returned by the query (an earlier note in this cell recorded 377528).
len(data_to_predict.index)

379769

In [32]:
# Store nr_cpf as text, left-padded with zeros to 11 characters, so it can be
# matched against the other tables on the same key format.
data_to_predict['nr_cpf'] = (
    data_to_predict['nr_cpf']
    .astype('string')
    .str.zfill(11)
)

In [56]:
# Distinct nr_cpf values in the queried base (missing values count as one value,
# exactly like len(Series.unique())).
data_to_predict['nr_cpf'].nunique(dropna=False)

227970

In [33]:
# Chats export; its CPF column is called cpf_customer, so a normalised copy is
# stored under nr_cpf (text, zero-padded to 11 characters) to serve as join key.
chats_apos = pd.read_csv('Chats dia 30-mai em diante.csv')
chats_apos['nr_cpf'] = (
    chats_apos['cpf_customer']
    .astype('string')
    .str.zfill(11)
)

In [34]:
# Scored transactions export; nr_cpf is rewritten as text zero-padded to 11
# characters so it matches the key format used in the other frames.
transacoes_propensas = pd.read_csv('base_clientes_propensos - transacoes dia 29-maio.csv')
transacoes_propensas['nr_cpf'] = (
    transacoes_propensas['nr_cpf']
    .astype('string')
    .str.zfill(11)
)

In [47]:
# Number of rows in the scored-transactions file.
len(transacoes_propensas.index)

62946

In [57]:
# Distinct nr_cpf values among the scored transactions.
transacoes_propensas['nr_cpf'].nunique(dropna=False)

47746

In [58]:
# Distinct CPFs in the scored file as a fraction of distinct CPFs in the queried base.
(
    transacoes_propensas['nr_cpf'].nunique(dropna=False)
    / data_to_predict['nr_cpf'].nunique(dropna=False)
)

0.20943983857525114

In [40]:
# Index the chats by CPF once and reuse it for both inner joins.
chats_por_cpf = chats_apos.set_index('nr_cpf')

# Scored transactions (with their scores) whose CPF appears in the chats file.
transacoes_propensas_contestação = (
    transacoes_propensas[['nr_cpf', 'prob_contestar', 'prob_fraude']]
    .join(chats_por_cpf, how='inner', on='nr_cpf')
)

# Rows of the whole queried base whose CPF appears in the chats file.
transacoes_total_contestação = (
    data_to_predict[['nr_cpf']]
    .join(chats_por_cpf, how='inner', on='nr_cpf')
)

In [60]:
# Distinct CPFs of the queried base that also appear in the chats file.
transacoes_total_contestação['nr_cpf'].nunique(dropna=False)

2761

In [61]:
# Distinct CPFs of the scored file that also appear in the chats file.
transacoes_propensas_contestação['nr_cpf'].nunique(dropna=False)

927

In [55]:
# Ratio between the row counts of the two joined frames (rows, not distinct CPFs).
transacoes_propensas_contestação.shape[0] / transacoes_total_contestação.shape[0]

0.2844656336909019

In [48]:
# Fraction of distinct CPFs in the queried base that appear in the chats file.
(
    transacoes_total_contestação['nr_cpf'].nunique(dropna=False)
    / data_to_predict['nr_cpf'].nunique(dropna=False)
)

0.012111242707373777

In [59]:
# Numerator of the base-wide rate: distinct CPFs of the queried base found in
# the chats file (the same expression is also evaluated in an earlier cell).
transacoes_total_contestação['nr_cpf'].nunique(dropna=False)

2761

In [49]:
# Fraction of distinct CPFs in the scored file that appear in the chats file.
(
    transacoes_propensas_contestação['nr_cpf'].nunique(dropna=False)
    / transacoes_propensas['nr_cpf'].nunique(dropna=False)
)

0.019415238972898253

In [41]:
# Add a score band to both frames: prob_contestar rounded to one decimal place.
for tabela_scores in (transacoes_propensas_contestação, transacoes_propensas):
    tabela_scores['fx_score'] = np.round(tabela_scores['prob_contestar'], 1)

In [42]:
# Rows per score band among the scored transactions matched to chats.
(
    transacoes_propensas_contestação['fx_score']
    .value_counts()
)

0.9    558
0.8    478
0.7    464
0.6    368
0.5    165
1.0    115
Name: fx_score, dtype: int64

In [43]:
# Rows per score band among all scored transactions.
(
    transacoes_propensas['fx_score']
    .value_counts()
)

0.7    14346
0.8    13418
0.9    13178
0.6    12913
0.5     6640
1.0     2451
Name: fx_score, dtype: int64

In [50]:
# Among CPFs with prob_contestar > 0.7: fraction that appear in the chats file.
(
    transacoes_propensas_contestação
    .query('prob_contestar > 0.7')['nr_cpf']
    .nunique(dropna=False)
    / transacoes_propensas
    .query('prob_contestar > 0.7')['nr_cpf']
    .nunique(dropna=False)
)

0.021093617741662435

In [51]:
# Among CPFs with prob_contestar > 0.8: fraction that appear in the chats file.
(
    transacoes_propensas_contestação
    .query('prob_contestar > 0.8')['nr_cpf']
    .nunique(dropna=False)
    / transacoes_propensas
    .query('prob_contestar > 0.8')['nr_cpf']
    .nunique(dropna=False)
)

0.0219235836627141

In [54]:
# Denominator of the > 0.8 rate: distinct CPFs scored above 0.8.
(
    transacoes_propensas
    .query('prob_contestar > 0.8')['nr_cpf']
    .nunique(dropna=False)
)

18975

In [53]:
# Numerator of the > 0.8 rate: distinct CPFs scored above 0.8 that appear in the chats file.
(
    transacoes_propensas_contestação
    .query('prob_contestar > 0.8')['nr_cpf']
    .nunique(dropna=False)
)

416

In [52]:
# Among CPFs with prob_contestar > 0.9: fraction that appear in the chats file.
(
    transacoes_propensas_contestação
    .query('prob_contestar > 0.9')['nr_cpf']
    .nunique(dropna=False)
    / transacoes_propensas
    .query('prob_contestar > 0.9')['nr_cpf']
    .nunique(dropna=False)
)

0.02220558882235529