
# Projeto Interdisciplinar — Entrega 1 (PicMoney)




In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

# Directory the CSV files below are read from.
BASE_DIR = Path('/content')

# Short dataset key -> CSV file name (joined onto BASE_DIR when loading).
FILES = dict(
    players='PicMoney-Base_Cadastral_de_Players-10_000 linhas.csv',
    cupons='PicMoney-Base_de_Transa__es_-_Cupons_Capturados-100000 linhas.csv',
    pedestres='PicMoney-Base_Simulada_-_Pedestres_Av__Paulista-100000 linhas.csv',
    lojas='PicMoney-Massa_de_Teste_com_Lojas_e_Valores-10000 linhas.csv',
)

def normalize_columns(df):
    """Return a copy of *df* whose column names are ASCII snake_case.

    Each name is stripped, lower-cased, has accents folded to the base
    letter, spaces turned into underscores, every character outside
    ``[0-9a-zA-Z_]`` removed, and runs of underscores collapsed to one.

    Args:
        df: DataFrame whose columns should be normalised. It is not modified.

    Returns:
        A copy of ``df`` with the cleaned column labels.
    """
    df = df.copy()
    df.columns = (
        # Cast first so non-string labels do not break the .str accessor.
        df.columns.astype(str)
        .str.strip()
        .str.lower()
        # Decompose accented letters so the cleanup below removes only the
        # combining mark ("número" -> "numero") instead of the whole letter.
        .str.normalize('NFKD')
        .str.replace(' ', '_', regex=False)
        .str.replace('[^0-9a-zA-Z_]+', '', regex=True)
        # Collapse last: deleting symbols can leave adjacent underscores
        # ("a - b" -> "a__b"), and a single '__' -> '_' pass misses longer runs.
        .str.replace('_+', '_', regex=True)
    )
    return df

def safe_read_csv(path):
    """Read a CSV whose encoding and delimiter are not known in advance.

    Encodings are tried in order (utf-8, latin-1, cp1252). For each encoding
    the candidate separators are tried in order and the first parse that
    yields more than one column is returned. If every separator yields a
    single column, the first of those parses is returned, so genuinely
    one-column files still load.

    Args:
        path: Anything ``pd.read_csv`` accepts as its first argument.

    Returns:
        The parsed DataFrame.

    Raises:
        RuntimeError: If the file is missing or no encoding/separator
            combination can be parsed. The underlying error is chained as
            ``__cause__``.
    """
    last_error = None
    for enc in ('utf-8', 'latin-1', 'cp1252'):
        single_column = None
        for sep in (',', ';', '\t', '|'):
            try:
                frame = pd.read_csv(path, encoding=enc, sep=sep, low_memory=False)
            except FileNotFoundError as exc:
                # No other encoding or separator can fix a missing file.
                raise RuntimeError('Falha ao ler CSV') from exc
            except UnicodeDecodeError as exc:
                # Wrong encoding: changing the separator will not help.
                last_error = exc
                break
            except Exception as exc:  # best effort: try the next separator
                last_error = exc
                continue
            # A wrong separator usually still parses, but as a single column,
            # so only a multi-column result is accepted right away.
            if frame.shape[1] > 1:
                return frame
            if single_column is None:
                single_column = frame
        if single_column is not None:
            return single_column
    raise RuntimeError('Falha ao ler CSV') from last_error

# Read every configured CSV and normalise its column names in one pass.
loaded = {
    key: normalize_columns(safe_read_csv(BASE_DIR / filename))
    for key, filename in FILES.items()
}
# Cell output: (row count, column count) for each dataset.
{name: frame.shape for name, frame in loaded.items()}



In [None]:

# Take independent copies so edits below do not touch the frames kept in `loaded`.
players, cupons, pedestres, lojas = (
    loaded[key].copy() for key in ('players', 'cupons', 'pedestres', 'lojas')
)

def to_datetime_safe(s):
    """Convert *s* to datetimes, turning unparseable entries into NaT.

    Thin wrapper over ``pandas.to_datetime`` with ``errors='coerce'`` so bad
    values never raise.
    """
    import pandas as pd  # local import: the helper does not rely on a global `pd`

    parsed = pd.to_datetime(s, errors='coerce')
    return parsed

# Parse the date columns, but only where they exist. `df.get(col)` returns
# None for a missing column, and assigning that result would create an
# all-None column, which makes later `if col in df.columns` checks pass
# on data that was never there.
if 'data_nascimento' in players.columns:
    players['data_nascimento'] = to_datetime_safe(players['data_nascimento'])
if 'data' in cupons.columns:
    cupons['data'] = to_datetime_safe(cupons['data'])
if 'data_captura' in lojas.columns:
    lojas['data_captura'] = to_datetime_safe(lojas['data_captura'])

# Rename the phone column in `lojas` to `celular`, the name used for
# `players` and `cupons` in the KPI cell. Skip the rename when `celular`
# already exists, to avoid two columns with the same label.
if 'numero_celular' in lojas.columns and 'celular' not in lojas.columns:
    lojas = lojas.rename(columns={'numero_celular': 'celular'})


In [None]:

# Basic KPIs
kpis = {
    # Distinct phone numbers in the player registry.
    'total_players': players['celular'].nunique(),
    # One row per captured coupon.
    'total_cupons': len(cupons),
    # Distinct phone numbers that appear in the coupon transactions.
    'total_usuarios_transacionando': cupons['celular'].nunique(),
    'total_lojas': lojas['nome_loja'].nunique(),
    'total_estabelecimentos_transacoes': cupons['nome_estabelecimento'].nunique(),
    # Cast to plain floats so the dict displays without NumPy scalar wrappers.
    'valor_cupom_total': float(cupons['valor_cupom'].sum()),
    'ticket_medio_compra_estimado': float(lojas['valor_compra'].mean()),
}
kpis


In [None]:

# Qualidade (nulos e duplicados)
def quality(df):
    """Summarise basic data-quality figures for *df*.

    Returns a dict with:
        rows: number of rows.
        cols: number of columns.
        duplicates: count of rows that repeat an earlier row.
        missing: Series of null counts per column, highest first, limited
            to the ten columns with the most nulls.
    """
    n_rows, n_cols = len(df), df.shape[1]
    repeated_rows = int(df.duplicated().sum())
    nulls_per_column = df.isna().sum()
    most_missing = nulls_per_column.sort_values(ascending=False).head(10)
    return {
        'rows': n_rows,
        'cols': n_cols,
        'duplicates': repeated_rows,
        'missing': most_missing,
    }

# Run the same quality summary over each dataset, in a fixed order.
quality_players, quality_cupons, quality_pedestres, quality_lojas = (
    quality(frame) for frame in (players, cupons, pedestres, lojas)
)

# Cell output: the four summaries as one tuple.
(quality_players, quality_cupons, quality_pedestres, quality_lojas)


In [None]:

# Exploration — simple charts
# Histograms: (source frame, numeric column, chart title). A chart is skipped
# when its column is absent from the frame.
for frame, column, title in (
    (cupons, 'valor_cupom', 'Distribuição — Valor de Cupom'),
    (lojas, 'valor_compra', 'Distribuição — Valor de Compra'),
):
    if column not in frame.columns:
        continue
    plt.figure()
    frame[column].dropna().hist(bins=30)
    plt.title(title)
    plt.xlabel(column)
    plt.ylabel('frequência')
    plt.show()

# Line chart: number of coupon rows per calendar day.
if 'data' in cupons.columns:
    # Re-parse on a copy so `cupons` itself is left untouched.
    tmp = cupons.assign(data=pd.to_datetime(cupons['data'], errors='coerce'))
    daily = tmp.groupby(tmp['data'].dt.date).size()
    plt.figure()
    daily.plot(kind='line')
    plt.title('Transações (Cupons) por Dia')
    plt.xlabel('data')
    plt.ylabel('qtd_cupons')
    plt.show()
