In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import boto3
import io
from io import StringIO

from urllib.parse import quote_plus  # PY2: from urllib import quote_plus
from sqlalchemy.engine import create_engine
from sqlalchemy.sql.expression import select
from sqlalchemy.sql.functions import func
from sqlalchemy.sql.schema import Table, MetaData
#pyathena
#pyathenajdbc

## REDSHIFT

In [None]:
# Build the SQLAlchemy engine for the Redshift cluster from credentials kept
# in environment variables.
user_redshift = os.getenv('USER_REDSHIFT')
senha_redshift = os.getenv('SENHA_REDSHIFT')
# Fail early with a clear message; concatenating None would only raise an
# opaque TypeError.
if user_redshift is None or senha_redshift is None:
    raise RuntimeError('USER_REDSHIFT and SENHA_REDSHIFT must be set in the environment')
# URL-escape both parts: characters such as '@', ':' or '/' in a password
# would otherwise be parsed as URL delimiters.
str_conn = ('postgresql://' + quote_plus(user_redshift) + ":" + quote_plus(senha_redshift)
            + "@datalake-cluster.ckkb9lvch2lp.us-east-1.redshift.amazonaws.com:5439/grupoavista")
engine = create_engine(str_conn)

In [None]:
# Load client id, mailing city and mailing state for every row of
# replication.cliente into a DataFrame, using the `engine` defined earlier.
dadoscliente = pd.read_sql("""
                         
                         select cli_cd_cliente, cli_ds_cidade_corresp, cli_ds_uf_corresp
                         from replication.cliente
                         
                         """, engine)

## AMAZON PAG

In [None]:
# Settings used by the S3 helpers; the key pair comes from the environment.
REGION = "us-east-1"
BUCKET_NAME = "datalake-grupoavista"
ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID_PAG")
SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY_PAG")

In [None]:
# Object key (inside BUCKET_NAME) of the CSV to download.
path = "data_science/team/Flavia/VisaoUnica_add.csv"

In [None]:
def read_from_s3(KEY, delim=',', bucket=None, encoding='utf8', **read_csv_kwargs):
    """Download a delimited text object from S3 and parse it with pandas.

    Parameters
    ----------
    KEY : str
        Object key inside the bucket.
    delim : str, default ','
        Field delimiter forwarded to ``pd.read_csv``.
    bucket : str, optional
        Bucket to read from; falls back to the module-level ``BUCKET_NAME``.
    encoding : str, default 'utf8'
        Text encoding of the object.
    **read_csv_kwargs
        Any extra keyword arguments for ``pd.read_csv`` (``dtype``, ``usecols``, ...).

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the object.
    """
    s3c = boto3.client(
        's3',
        region_name=REGION,
        aws_access_key_id=ACCESS_KEY_ID,
        aws_secret_access_key=SECRET_ACCESS_KEY,
    )

    target_bucket = BUCKET_NAME if bucket is None else bucket
    obj = s3c.get_object(Bucket=target_bucket, Key=KEY)
    # The whole object is read into memory before parsing.
    payload = io.BytesIO(obj['Body'].read())
    return pd.read_csv(payload, encoding=encoding, delimiter=delim, **read_csv_kwargs)

In [None]:
# Fetch the semicolon-separated CSV named by `path` into a DataFrame.
visaounica = read_from_s3(KEY=path, delim=';')

## AMAZON WILL SPARK

In [6]:
# import findspark
# findspark.init()
from pyspark.sql import SparkSession
from pyspark import SparkConf

In [None]:
# Key pair for the WILL AWS account, taken from the environment (None when unset).
ACCESS_KEY_ID_WILL = os.environ.get("AWS_ACCESS_KEY_ID_WILL")
SECRET_ACCESS_KEY_WILL = os.environ.get("AWS_SECRET_ACCESS_KEY_WILL")

In [15]:
# Create (or reuse) a Spark session configured with the WILL key pair for
# s3a:// access and with the hadoop-aws package on the classpath.
spark = (
    SparkSession.builder
    .appName("appName")
    .config("fs.s3a.access.key", ACCESS_KEY_ID_WILL)
    .config("fs.s3a.secret.key", SECRET_ACCESS_KEY_WILL)
    .config('spark.jars.packages', 'org.apache.hadoop:hadoop-aws:3.1.1')
    .getOrCreate()
)

In [18]:
# Location of the parquet dataset to read through the s3a connector.
file_read = "s3a://data-curated-zone-will-prod/platform_curated_zone/authorization_will_snapshot/"

#file_read = "s3a://data-sandbox-zone-will-prod/customer/"
# I could not find a way to access a table created with CREATE TABLE - it is not accessible. To read one through Spark, upload it to the sandbox zone instead.

# Read the dataset and mark it for caching, then preview five rows.
df = spark.read.parquet(file_read).cache()
df.show(5)
pandasDF = df.limit(50).toPandas() # pulling only 50 rows as a test; remove the limit call to bring everything
pandasDF.head()

+-----------+--------------------+--------------+-----------+--------------+-------------------+-------------------+-------------------+--------------+-------------+--------+-------+-----------+------+-----------------------+-------------------------+-----------------------+------------+-------------+----------+-------------+------+----------------+------+--------------------+----------+--------------------+--------+--------+----------+---------------+------------+-------------------+----------+------------+-------------------+----------------+------------+--------+----------+-----------------------+-------------+
|ds_customer|         id_customer|id_transaction|     nr_cpf|id_pag_account|     dt_autorizacao|          dt_insert|          dt_update|vl_limite_disp|vl_referencia|vl_dolar|vl_real|nr_parcelas|cd_nsu|ds_nome_estabelecimento|ds_cidade_estabelecimento|ds_pais_estabelecimento|      cd_rrn|cd_adquirente|cd_retorno|cd_entry_mode|cd_mti|          nr_pan|cd_mcc|              ds_mcc

## AMAZON WILL PYATHENA

In [9]:
# Key pair for the WILL AWS account, taken from the environment (None when unset).
ACCESS_KEY_ID_WILL = os.environ.get("AWS_ACCESS_KEY_ID_WILL")
SECRET_ACCESS_KEY_WILL = os.environ.get("AWS_SECRET_ACCESS_KEY_WILL")

In [12]:
# Athena connection through the `awsathena+rest` SQLAlchemy dialect.
STAGING_DIR = 's3://data-athena-query-result-will-prod/flavia-costa'
SCHEMA = 'FLAVIA-COSTA'
conn_str = (
    "awsathena+rest://{aws_access_key_id}:{aws_secret_access_key}"
    "@athena.{region_name}.amazonaws.com:443/"
    "{schema_name}?s3_staging_dir={s3_staging_dir}"
)

# Keys and staging dir are URL-escaped before being placed in the URL.
athena_url = conn_str.format(
    aws_access_key_id=quote_plus(ACCESS_KEY_ID_WILL),
    aws_secret_access_key=quote_plus(SECRET_ACCESS_KEY_WILL),
    region_name="sa-east-1",
    schema_name=SCHEMA,
    s3_staging_dir=quote_plus(STAGING_DIR),
)
engine = create_engine(athena_url)

In [13]:
# Pull every column of "curated-zone-fixed-upload".base_testes_migracao
# into a DataFrame through the current `engine`.
clientes_teste = pd.read_sql("""
            
            select  *
            from "curated-zone-fixed-upload".base_testes_migracao
                         """, engine)

In [14]:
# Preview the first rows of the query result.
# NOTE(review): the saved output of this cell shows CPF numbers and e-mail
# addresses - clear the output before sharing or committing the notebook.
clientes_teste.head()

Unnamed: 0,conta_cartao,cpf,classificacao,grupo,email,abriu_email,alterou_endereco,email_mkt,teste
0,48280220,12646635443,ausente,alvo,rocharyan820@gmail.com,sim,nao,opt-in,onboarding_maio
1,45259400,83223550097,ativo,alvo,pruilialdoalmeida1620@gmail.com,sim,nao,opt-in,onboarding_maio
2,3775196,1733551379,inativo,alvo,igorcpin@gmail.com,sim,nao,opt-in,onboarding_maio
3,48258260,14853422790,ativo,alvo,lumdxx@gmail.com,sim,nao,opt-in,onboarding_maio
4,3825100,40337235449,ativo,alvo,eleonoramaria1964@gmail.com,sim,nao,opt-in,onboarding_maio


## Salvar no S3

In [None]:
def save_to_s3(path, filename, df, delim):
    """Serialize a DataFrame as delimited text and upload it to ``BUCKET_NAME``.

    Parameters
    ----------
    path : str
        Key prefix; it is concatenated with ``filename`` as-is, so include
        the trailing '/' when one is wanted.
    filename : str
        Object name appended to ``path``.
    df : pandas.DataFrame
        Data to upload; the index is not written.
    delim : str
        Field separator used in the generated file.

    Returns
    -------
    dict
        The ``put_object`` response, so callers can inspect the result of
        the upload (previously it was discarded).
    """
    client = boto3.client(
        's3',
        region_name=REGION,
        aws_access_key_id=ACCESS_KEY_ID,
        aws_secret_access_key=SECRET_ACCESS_KEY,
    )

    csv_buffer = StringIO()
    df.to_csv(csv_buffer, index=False, sep=delim)
    # Encode explicitly so the stored bytes are UTF-8 regardless of how the
    # client would treat a str body.
    content = csv_buffer.getvalue().encode('utf-8')

    return client.put_object(
        Bucket=BUCKET_NAME,
        Body=content,
        Key=path + filename,
    )

In [None]:
# Usage template - fill in the four arguments and uncomment before running.
# (The bare `save_to_s3(, , )` placeholder was a SyntaxError and also left
# out the fourth argument, the delimiter.)
# save_to_s3('<key/prefix/>', '<file_name>.csv', <dataframe>, ';')

## Carrega S3

In [2]:
# Settings used by the S3 helpers; the key pair comes from the environment.
REGION = "us-east-1"
BUCKET_NAME = "datalake-grupoavista"
ACCESS_KEY_ID = os.environ.get("AWS_ACCESS_KEY_ID_PAG")
SECRET_ACCESS_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY_PAG")

In [3]:
def read_from_s3(KEY, delim=',', bucket=None, encoding='utf8', **read_csv_kwargs):
    """Download a delimited text object from S3 and parse it with pandas.

    Parameters
    ----------
    KEY : str
        Object key inside the bucket.
    delim : str, default ','
        Field delimiter forwarded to ``pd.read_csv``.
    bucket : str, optional
        Bucket to read from; falls back to the module-level ``BUCKET_NAME``.
    encoding : str, default 'utf8'
        Text encoding of the object.
    **read_csv_kwargs
        Any extra keyword arguments for ``pd.read_csv`` (``dtype``, ``usecols``, ...).

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the object.
    """
    s3c = boto3.client(
        's3',
        region_name=REGION,
        aws_access_key_id=ACCESS_KEY_ID,
        aws_secret_access_key=SECRET_ACCESS_KEY,
    )

    target_bucket = BUCKET_NAME if bucket is None else bucket
    obj = s3c.get_object(Bucket=target_bucket, Key=KEY)
    # The whole object is read into memory before parsing.
    payload = io.BytesIO(obj['Body'].read())
    return pd.read_csv(payload, encoding=encoding, delimiter=delim, **read_csv_kwargs)

In [4]:
# Object key (inside BUCKET_NAME) of the CSV to download.
path_to_csv = "marketing/visao_unica_cliente/visao_unica/visao_unica.csv"

In [6]:
# Fetch the semicolon-separated CSV named by `path_to_csv` into a DataFrame.
visaounica = read_from_s3(KEY=path_to_csv, delim=';')

In [7]:
# Show the column labels of the frame loaded from S3.
visaounica.columns

Index(['conta_cartao', 'limitecartao', 'dataentrada', 'tickets',
       'valor_transacao', 'primeiracompra', 'ultimacompra', 'ciclodias',
       'recenciadias', 'vlm1', 'vlm2', 'vlm3', 'vlm4', 'vlm5', 'vlm6', 'vlm7',
       'vlm8', 'vlm9', 'vlm10', 'vlm11', 'vlm12', 'tktm1', 'tktm2', 'tktm3',
       'tktm4', 'tktm5', 'tktm6', 'tktm7', 'tktm8', 'tktm9', 'tktm10',
       'tktm11', 'tktm12', 'snapshot_date', 'cluster_desc', 'ever30', 'ever90',
       'ever1800', 'cli_nm_cliente', 'cli_nr_cpf', 'cli_nr_celular',
       'cli_fl_sexo', 'cli_cd_cliente', 'cli_ds_cidade_corresp',
       'cli_ds_uf_corresp', 'idade', 'cli_ds_e_mail', 'segmento_rfm',
       'coc_vl_limite_cartao', 'media_dias_entre_compras',
       'min_dias_entre_compras', 'max_dias_entre_compras',
       'med_dias_entre_compras', 'lojafavorita', 'mccfavorito',
       'diasemanafavorito', 'lojas', 'grupos', 'categorias',
       'percparcelas2xmais', 'percparcelas3xmais', 'percparcelas4xmais',
       'perctransinter', 'percweb',

In [8]:
# Row count of the loaded frame.
len(visaounica)

1594642

In [None]:
# Usage template - put the DataFrame in front and uncomment before running.
# (A bare `.to_sql(...)` with nothing before the dot is a SyntaxError.)
# The schema goes in `schema=`: passing 'public.cli_rfm' as the name makes
# pandas treat the whole string, dot included, as the table name.
# <dataframe>.to_sql('cli_rfm', con=engine, schema='public', if_exists='append', index=False)

In [None]:
# Create a table by loading it from S3: drop/recreate sandbox.cpfs_consultados
# and COPY a semicolon-delimited CSV (header row skipped) into it.
# NOTE(review): the FROM path is a masked placeholder ('xxxxxx/xxxxx.csv') and
# must be replaced before running.
# NOTE(review): the TRUNCATE right after CREATE TABLE looks redundant - confirm.
engine.execute(""" 

            DROP TABLE IF EXISTS sandbox.cpfs_consultados;
            CREATE TABLE sandbox.cpfs_consultados(
                    -- cpfs já consultados anteriormente
                    id int8,
                    nome VARCHAR(200),
                    cpf VARCHAR(15),
                    tipo_pessoa VARCHAR(5)
            );
            TRUNCATE sandbox.cpfs_consultados;
            COPY sandbox.cpfs_consultados 
            FROM 's3://datalake-grupoavista/data_analytics/xxxxxx/xxxxx.csv'
            iam_role 'arn:aws:iam::739007973549:role/RedShift-S3FullAccess'
            csv quote as '"'
            delimiter ';'
            ignoreheader 1
            ;

        """)

In [None]:
# Save to S3: UNLOAD the result of the inner SELECT with a header row and
# ';' as delimiter, overwriting any existing object.
# NOTE(review): the TO path is a masked placeholder ('xxxx/.csv') and must be
# replaced before running.
engine.execute(""" unload(
                        $$
                        select * from base_temp_3
                        $$
                        )
                        TO 's3://datalake-grupoavista/xxxx/.csv'
                        iam_role 'arn:aws:iam::739007973549:role/RedShift-S3FullAccess'
                        HEADER
                        DELIMITER ';'
                        PARALLEL OFF
                        ALLOWOVERWRITE
                        ;
                        
                          """)

In [6]:
#!conda install -c anaconda psycopg2 -y
#!conda install -c anaconda sqlalchemy -y
#!conda install -c anaconda pandas -y
#!conda install -c conda-forge sqlalchemy-redshift -y

import psycopg2
from sqlalchemy import create_engine
import os
from urllib.parse import quote_plus

#>>>>>>>> MAKE CHANGES HERE <<<<<<<<<<<<<
DATABASE = 'grupoavista'
USER = os.getenv('USER_REDSHIFT')
PASSWORD = os.getenv('SENHA_REDSHIFT')

HOST = 'datalake-cluster.ckkb9lvch2lp.us-east-1.redshift.amazonaws.com'
PORT = "5439"

# Without this check a missing variable would be formatted into the URL as
# the literal text "None" and only fail later, at connect time.
if USER is None or PASSWORD is None:
    raise RuntimeError('USER_REDSHIFT and SENHA_REDSHIFT must be set in the environment')

# URL-escape the credentials: '@', ':' or '/' inside a password would
# otherwise be parsed as URL delimiters.
connection_string = "redshift+psycopg2://%s:%s@%s:%s/%s" % (
    quote_plus(USER), quote_plus(PASSWORD), HOST, str(PORT), DATABASE)
engine = create_engine(connection_string)

In [None]:
import pandas as pd

# One row per card account joined to its client: days since the account was
# opened, days since coc_dt_atraso, the client's CPF, the raw situation flag,
# and two derived booleans (fl_aptos: situation = 1; fl_inadimplencia:
# coc_dt_atraso is not null).
basegeral = pd.read_sql("""
                         
                    select
                    coc.coc_cd_conta_cartao as cd_conta_cartao,
                    extract(day from current_timestamp - coc.coc_dt_abertura_conta) as dias_criacao_conta,
                    extract(day from current_timestamp - coc.coc_dt_atraso) as dias_atraso,
                    cli.cli_nr_cpf as cpf,
                    coc.coc_fl_situacao,
                    case when coc.coc_fl_situacao = 1 then True else False end as fl_aptos,
                    case when coc.coc_dt_atraso is null then False else True end as fl_inadimplencia
                    from replication.conta_cartao coc
                    inner join replication.cliente cli
                    on coc.coc_cd_cliente = cli.cli_cd_cliente
                         
                         """, engine)

In [None]:
import pag_redshift
from pag_redshift import engine
import pandas as pd
# Smoke test for the `engine` imported from pag_redshift: fetch a single row
# and display it.
query = "Select * from replication.conta_cartao limit 1"
df = pd.read_sql(query, engine)
df

In [None]:
import psycopg2

# Direct psycopg2 connection (no SQLAlchemy); credentials come from the environment.
dbname = 'grupoavista'
user = os.getenv('USER_REDSHIFT')
password = os.getenv('SENHA_REDSHIFT')
host = 'datalake-cluster.ckkb9lvch2lp.us-east-1.redshift.amazonaws.com'
port = "5439"
# connection_string = "redshift+psycopg2://%s:%s@%s:%s/%s" % (USER,PASSWORD,HOST,str(PORT),DATABASE)
# engine = create_engine(connection_string)

# Fixes over the previous version: pass the `dbname`/`password` variables
# (not the literal 'dbname' and an undefined `pwd`), use one connection name
# throughout, and actually create the cursor before using it.
conn = psycopg2.connect(dbname=dbname, host=host, port=port, user=user, password=password)
try:
    with conn.cursor() as cur:
        # Example query - replace with the table you need. Backtick quoting
        # is MySQL syntax and is rejected by PostgreSQL/Redshift.
        cur.execute("SELECT * FROM replication.conta_cartao LIMIT 1;")
        rows = cur.fetchall()
finally:
    # Always release the connection, even when the query fails.
    conn.close()

rows

In [None]:
import os
from decouple import config
import psycopg2
from psycopg2.extras import RealDictCursor
import configparser

def db_connection():
    """Open a psycopg2 connection from the [Redshift] section of py/my_credentials.ini."""
    ini = configparser.ConfigParser()
    ini.read('py/my_credentials.ini')  # template
    # (psycopg2 keyword, option name in the ini section), in lookup order.
    option_by_kwarg = (
        ('dbname', 'database'),
        ('user', 'username'),
        ('password', 'password'),
        ('host', 'host'),
        ('port', 'port'),
    )
    connect_kwargs = {
        kwarg: ini.get('Redshift', option) for kwarg, option in option_by_kwarg
    }
    return psycopg2.connect(**connect_kwargs)

def load_query(path_query):
    """Return the complete text of the SQL file at ``path_query``."""
    with open(path_query, 'r') as sql_file:
        text = sql_file.read()
    return text

def run_and_fetch(sql_filename, params = None):
    """Run the query stored in ``sql_filename`` and return every row.

    Parameters
    ----------
    sql_filename : str
        Path of the file holding the SQL text (read with ``load_query``).
    params : sequence or mapping, optional
        Values bound to the query placeholders by ``cursor.execute``.

    Returns
    -------
    list
        All rows, as produced by a ``RealDictCursor``.
    """
    sql = load_query(sql_filename)
    conn = db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute(sql, params)
            return cur.fetchall()
    finally:
        # The connection used to be left open on every call; release it
        # whether or not the query succeeded.
        conn.close()

def run(sql_filename, params = None):
    """Execute the statement(s) stored in ``sql_filename`` without fetching rows.

    Parameters
    ----------
    sql_filename : str
        Path of the file holding the SQL text (read with ``load_query``).
    params : sequence or mapping, optional
        Values bound to the statement placeholders by ``cursor.execute``.

    Raises
    ------
    Exception
        Whatever the driver raises is propagated after the transaction is
        rolled back.
    """
    sql = load_query(sql_filename)
    conn = db_connection()
    try:
        with conn.cursor(cursor_factory=RealDictCursor) as cur:
            cur.execute(sql, params)
        # psycopg2 runs statements inside a transaction; without an explicit
        # commit the work is discarded when the connection goes away.
        conn.commit()
    except Exception:
        conn.rollback()
        raise
    finally:
        conn.close()

In [None]:
# Run the query stored in sql/query_autorizacoes.sql and wrap the fetched
# rows in a DataFrame.
df = pd.DataFrame(run_and_fetch('sql/query_autorizacoes.sql'))