In [1]:
import pandas as pd
import numpy as np

# Seed NumPy's global (legacy) RNG so that code drawing from np.random.* after
# this point yields the same values on every run that executes the same
# sequence of draws.
np.random.seed(2025)

In [2]:
# 2. Categories
# Weekday labels (Monday-Friday only).
# NOTE(review): create_dataset_contexto builds its own label map that also
# contains 'sab' and 'dom', and no visible cell reads this list -- confirm
# whether it is still needed or should include the weekend labels.
dias_semana = ['seg', 'ter', 'qua', 'qui', 'sex']
# Customer value segments. The order matters: create_dataset_contexto pairs
# this list positionally with the sampling weights p=[0.2, 0.5, 0.3].
segmentos = ['alto_valor', 'medio_valor', 'baixo_valor']
# Periods of the day, sampled uniformly by create_dataset_contexto.
periodos_dia = ['manha', 'tarde', 'noite']

In [3]:
def create_dataset_contexto(date_start, date_end, size_sample):
    """Generate a synthetic context table with ``size_sample`` rows per calendar day.

    One block of rows is produced for every day in the inclusive range
    ``date_start``..``date_end`` and the blocks are concatenated in date order
    with a fresh 0..n-1 index.

    Columns, in order:
        date                      -- the day the row belongs to
        periodo_dia               -- uniform draw from the module-level ``periodos_dia``
        dia_semana                -- 'seg'..'dom' label derived from ``date``
        segmento                  -- draw from the module-level ``segmentos`` with
                                     weights [0.2, 0.5, 0.3] (positional)
        tempo_cliente             -- exponential(scale=365) truncated to int
        qtd_cliques_30d           -- Poisson(lam=2)
        usou_app_hoje             -- 0/1 with P(1) = 0.4
        dias_desde_ultimo_clique  -- uniform integer in [0, 90]

    All randomness comes from NumPy's global ``np.random`` state, so results
    are repeatable only if the caller seeds it beforehand.

    Parameters:
        date_start: first day of the range (anything ``pd.date_range`` accepts).
        date_end: last day of the range, inclusive.
        size_sample: number of rows to generate for each day.

    Returns:
        pandas.DataFrame with ``size_sample * number_of_days`` rows.

    Raises:
        ValueError: if the date range is empty (``date_start`` after ``date_end``).
    """
    # Labels keyed by pandas' day-of-week number (Monday=0 ... Sunday=6).
    weekday_labels = {0: 'seg', 1: 'ter', 2: 'qua', 3: 'qui', 4: 'sex', 5: 'sab', 6: 'dom'}

    date_range = pd.date_range(start=date_start, end=date_end, freq='D')
    if len(date_range) == 0:
        # Without this guard pd.concat([]) fails with an unhelpful
        # "No objects to concatenate" ValueError.
        raise ValueError(
            f"empty date range: date_start={date_start!r} is after date_end={date_end!r}"
        )

    list_dfs = []
    for date in date_range:
        df_tmp = pd.DataFrame({
            'date': [date]*size_sample
        })

        # The order of the random draws below is kept fixed on purpose: it
        # determines which values a seeded run produces.
        df_tmp['periodo_dia'] = np.random.choice(periodos_dia, size=size_sample)
        # Every row of this block shares the same date, so the label is looked
        # up once per day instead of being mapped row by row.
        df_tmp['dia_semana'] = weekday_labels[date.dayofweek]
        df_tmp['segmento'] = np.random.choice(segmentos, size=size_sample, p=[0.2, 0.5, 0.3])
        df_tmp['tempo_cliente'] = np.random.exponential(scale=365, size=size_sample).astype(int)
        df_tmp['qtd_cliques_30d'] = np.random.poisson(lam=2, size=size_sample)
        df_tmp['usou_app_hoje'] = np.random.choice([0, 1], size=size_sample, p=[0.6, 0.4])
        # randint's upper bound is exclusive, so values span 0..90.
        df_tmp['dias_desde_ultimo_clique'] = np.random.randint(0, 91, size=size_sample)

        list_dfs.append(df_tmp)

    return pd.concat(list_dfs, ignore_index=True)

def simular_recompensa(row):
    """Draw a 0/1 reward for one context row.

    A success probability is assembled from a 0.03 baseline plus additive
    adjustments read from ``row`` (any mapping-like object indexable by
    column name), then compared against a single ``np.random.rand()`` draw.

    Adjustments, applied in this order:
        * segmento: +0.2 for 'alto_valor', +0.1 for 'medio_valor'
        * 'alto_valor' in the 'manha' period: +0.1
        * 'baixo_valor' in the 'noite' period: +0.08
        * qtd_cliques_30d: +0.01 per click, capped at +0.1
        * usou_app_hoje truthy: +0.05
        * dia_semana == 'sex': -0.05
        * tempo_cliente < 30: -0.02
        * dias_desde_ultimo_clique < 3: +0.05; > 30: -0.03

    The probability is not clipped; a non-positive value simply never wins.

    Returns:
        int: 1 if the uniform draw falls below the probability, else 0.
    """
    prob = 0.03  # baseline

    # Segment effect, with the segment/period interaction nested under it.
    segment = row['segmento']
    if segment == 'alto_valor':
        prob += 0.2
        if row['periodo_dia'] == 'manha':
            prob += 0.1
    elif segment == 'medio_valor':
        prob += 0.1
    elif segment == 'baixo_valor' and row['periodo_dia'] == 'noite':
        prob += 0.08

    # Engagement: one point per recent click, saturating at ten points.
    click_bonus = row['qtd_cliques_30d'] * 0.01
    prob += 0.1 if 0.1 < click_bonus else click_bonus

    if row['usou_app_hoje']:
        prob += 0.05

    # Penalties for Fridays and for very new customers.
    if row['dia_semana'] == 'sex':
        prob -= 0.05

    if row['tempo_cliente'] < 30:
        prob -= 0.02

    # Recency of the last click: fresh activity helps, long silence hurts.
    days_since_click = row['dias_desde_ultimo_clique']
    if days_since_click < 3:
        prob += 0.05
    elif days_since_click > 30:
        prob -= 0.03

    draw = np.random.rand()
    return 1 if draw < prob else 0
    
# Build the synthetic context table: 20,000 rows for every day in the range.
df = create_dataset_contexto(date_start="2024-01-01", date_end="2024-12-01", size_sample=20000)

# Score every row with simular_recompensa. DataFrame.apply(axis=1) constructs a
# pandas Series per row, which is prohibitively slow at this row count, so the
# rows are streamed as plain tuples and handed to the scorer as dicts instead.
# Rows are visited in the same order, so the sequence of random draws (and thus
# the seeded result) is the same as with the row-wise apply.
_colunas_contexto = list(df.columns)
df['reward'] = [
    simular_recompensa(dict(zip(_colunas_contexto, valores)))
    for valores in df.itertuples(index=False, name=None)
]
df

KeyboardInterrupt: 

In [None]:
# Write the current contents of `df` to CSV. The path is relative to the
# notebook's working directory, and index=False keeps the row index out of the
# file.
df.to_csv('../dataset/raw/dataset.csv', index=False)