In [1]:
import pandas as pd
import joblib
import numpy as np
from lightgbm import LGBMClassifier
from sklearn.preprocessing import OneHotEncoder

# Carregar Dados

In [2]:

df = pd.read_csv('../dataset/raw/dataset.csv')
df['usou_app_hoje'] = df['usou_app_hoje'].apply(int)

df['date'] = pd.to_datetime(df['date'])
df['safra'] = df['date'].dt.to_period('M').astype(str)

df_study = df[df['safra'].isin(['2024-01', '2024-02', '2024-03'])]

df_study

Unnamed: 0,date,periodo_dia,dia_semana,segmento,tempo_cliente,qtd_cliques_30d,usou_app_hoje,dias_desde_ultimo_clique,reward,safra
0,2024-01-01,noite,seg,baixo_valor,831,1,0,69,0,2024-01
1,2024-01-01,noite,seg,baixo_valor,339,0,0,13,0,2024-01
2,2024-01-01,noite,seg,alto_valor,448,0,0,81,0,2024-01
3,2024-01-01,noite,seg,alto_valor,631,3,0,5,1,2024-01
4,2024-01-01,manha,seg,medio_valor,159,1,1,86,0,2024-01
...,...,...,...,...,...,...,...,...,...,...
909995,2024-03-31,tarde,dom,alto_valor,507,2,0,34,0,2024-03
909996,2024-03-31,manha,dom,baixo_valor,192,0,1,22,0,2024-03
909997,2024-03-31,manha,dom,baixo_valor,115,1,1,6,0,2024-03
909998,2024-03-31,manha,dom,medio_valor,235,4,1,77,1,2024-03


In [3]:
cols_numeric = ['tempo_cliente', 'qtd_cliques_30d', 'dias_desde_ultimo_clique']
cols_categoric = ['dia_semana', 'segmento']
cols_inf = ['date', 'periodo_dia', 'reward', 'safra']

# Feature Engineering

## Binarizar Numericos

In [4]:
def classificar_percentil(df_tmp, list_features_num):
    df_copy = df_tmp.copy()
    colunas_sucesso = []
    colunas_falhas = []
    
    percentis = [0.05, 0.25, 0.5, 0.75, 0.95]
    # rotulos = ['P05', 'P25', 'P50', 'P75', 'P95']
    
    for col in list_features_num:
        try:
            limites = df_tmp[col].quantile(percentis).values
            bins = [-np.inf] + list(limites) + [np.inf]
            
            df_copy[f'{col}_percentil'] = pd.cut(
                df_tmp[col], bins=bins, include_lowest=True, duplicates='drop'
            )
            colunas_sucesso.append(f'{col}_percentil')
        except ValueError as e:
            colunas_falhas = [].append(col)
            print(f"⚠️ Erro ao classificar '{col}': {e}")

    return df_copy, colunas_sucesso, colunas_falhas

In [5]:
from sklearn.preprocessing import KBinsDiscretizer

def binarizar_numericos(df_tmp, cols_num):
    binarizer = KBinsDiscretizer(n_bins=4, encode='onehot-dense', strategy='quantile')
    binarizer.fit(df_tmp[cols_num])
    
    # Transforma
    X_bin = binarizer.transform(df_tmp[cols_num])
    
    # Converte para DataFrame e junta com o original
    X_bin_df = pd.DataFrame(X_bin, columns=binarizer.get_feature_names_out(cols_num))
    df_trans = pd.concat([df_tmp.reset_index(drop=True), X_bin_df.reset_index(drop=True)], axis=1)
    
    return df_trans, binarizer

In [6]:
df_study_bin_numeric, binarizer_num = binarizar_numericos(df_study, cols_numeric)
joblib.dump(binarizer_num, '../dataset/processed/binarizer_num.pkl')
df_study_bin_numeric.head()



Unnamed: 0,date,periodo_dia,dia_semana,segmento,tempo_cliente,qtd_cliques_30d,usou_app_hoje,dias_desde_ultimo_clique,reward,safra,...,tempo_cliente_2.0,tempo_cliente_3.0,qtd_cliques_30d_0.0,qtd_cliques_30d_1.0,qtd_cliques_30d_2.0,qtd_cliques_30d_3.0,dias_desde_ultimo_clique_0.0,dias_desde_ultimo_clique_1.0,dias_desde_ultimo_clique_2.0,dias_desde_ultimo_clique_3.0
0,2024-01-01,noite,seg,baixo_valor,831,1,0,69,0,2024-01,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2024-01-01,noite,seg,baixo_valor,339,0,0,13,0,2024-01,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,2024-01-01,noite,seg,alto_valor,448,0,0,81,0,2024-01,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2024-01-01,noite,seg,alto_valor,631,3,0,5,1,2024-01,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,2024-01-01,manha,seg,medio_valor,159,1,1,86,0,2024-01,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


## Encoder Categoricos

In [7]:
oh_enc =  OneHotEncoder(sparse_output=False, handle_unknown='ignore')

cols_cat_encoder = oh_enc.fit_transform(df_study_bin_numeric[cols_categoric])
df_encoded = pd.DataFrame(cols_cat_encoder, columns=oh_enc.get_feature_names_out(cols_categoric))

df_study_bin_enc = pd.concat([df_study_bin_numeric.drop(columns=cols_categoric).reset_index(drop=True), df_encoded], axis=1)
df_study_bin_enc.head()

Unnamed: 0,date,periodo_dia,tempo_cliente,qtd_cliques_30d,usou_app_hoje,dias_desde_ultimo_clique,reward,safra,tempo_cliente_0.0,tempo_cliente_1.0,...,dia_semana_dom,dia_semana_qua,dia_semana_qui,dia_semana_sab,dia_semana_seg,dia_semana_sex,dia_semana_ter,segmento_alto_valor,segmento_baixo_valor,segmento_medio_valor
0,2024-01-01,noite,831,1,0,69,0,2024-01,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,2024-01-01,noite,339,0,0,13,0,2024-01,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,2024-01-01,noite,448,0,0,81,0,2024-01,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,2024-01-01,noite,631,3,0,5,1,2024-01,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,2024-01-01,manha,159,1,1,86,0,2024-01,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


# Feature Selection

In [8]:
import pandas as pd

def selecionar_features_por_importancia(X, y, top_k=None, threshold=None):
    model = LGBMClassifier(
        verbose=0,
        random_state=2025,
    )
    model.fit(X, y)

    importancias = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    }).sort_values(by='importance', ascending=False)

    # Selecionar por critério
    if top_k is not None:
        selecionadas = importancias.head(top_k)['feature'].tolist()
    elif threshold is not None:
        selecionadas = importancias[importancias['importance'] >= threshold]['feature'].tolist()
    else:
        selecionadas = importancias[importancias['importance'] > 0]['feature'].tolist()

    return selecionadas, importancias

In [9]:
cols_features = [col for col in df_study_bin_enc.columns if col not in cols_inf]
len(cols_features)

26

In [10]:
# Separar X e y
X = df_study_bin_enc[cols_features]
y = df_study_bin_enc['reward']

# # Executar seleção
features_boas, ranking = selecionar_features_por_importancia(X, y, top_k=10)
# features_boas.append('key_loja')
# features_boas.append('cod_produto')

list_feat_final = list(set(features_boas))
print("Features selecionadas:", list_feat_final)
print("\nRanking completo:")
print(ranking)

Features selecionadas: ['dia_semana_qua', 'dia_semana_sab', 'usou_app_hoje', 'segmento_alto_valor', 'dia_semana_seg', 'tempo_cliente', 'dia_semana_sex', 'dias_desde_ultimo_clique', 'qtd_cliques_30d', 'segmento_baixo_valor']

Ranking completo:
                         feature  importance
0                  tempo_cliente         882
3       dias_desde_ultimo_clique         746
1                qtd_cliques_30d         470
21                dia_semana_sex         205
2                  usou_app_hoje         181
23           segmento_alto_valor          82
24          segmento_baixo_valor          82
19                dia_semana_sab          40
20                dia_semana_seg          38
17                dia_semana_qua          37
16                dia_semana_dom          35
18                dia_semana_qui          32
22                dia_semana_ter          31
25          segmento_medio_valor          31
10           qtd_cliques_30d_2.0          29
9            qtd_cliques_30d_1.0     

In [11]:
len(list_feat_final)

10

# Preparação para Simulação

In [12]:
X_bin = binarizer_num.transform(df[cols_numeric])
X_bin_df = pd.DataFrame(X_bin, columns=binarizer_num.get_feature_names_out(cols_numeric))
df_bin_num = pd.concat([df.reset_index(drop=True), X_bin_df.reset_index(drop=True)], axis=1)
df_bin_num.head()

Unnamed: 0,date,periodo_dia,dia_semana,segmento,tempo_cliente,qtd_cliques_30d,usou_app_hoje,dias_desde_ultimo_clique,reward,safra,...,tempo_cliente_2.0,tempo_cliente_3.0,qtd_cliques_30d_0.0,qtd_cliques_30d_1.0,qtd_cliques_30d_2.0,qtd_cliques_30d_3.0,dias_desde_ultimo_clique_0.0,dias_desde_ultimo_clique_1.0,dias_desde_ultimo_clique_2.0,dias_desde_ultimo_clique_3.0
0,2024-01-01,noite,seg,baixo_valor,831,1,0,69,0,2024-01,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2024-01-01,noite,seg,baixo_valor,339,0,0,13,0,2024-01,...,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,2024-01-01,noite,seg,alto_valor,448,0,0,81,0,2024-01,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2024-01-01,noite,seg,alto_valor,631,3,0,5,1,2024-01,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,2024-01-01,manha,seg,medio_valor,159,1,1,86,0,2024-01,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


In [13]:
cols_cat_encoder = oh_enc.transform(df_bin_num[cols_categoric])
df_encoded = pd.DataFrame(cols_cat_encoder, columns=oh_enc.get_feature_names_out(cols_categoric))

df_bin_enc = pd.concat([df_bin_num.drop(columns=cols_categoric).reset_index(drop=True), df_encoded], axis=1)
df_bin_enc.head()

Unnamed: 0,date,periodo_dia,tempo_cliente,qtd_cliques_30d,usou_app_hoje,dias_desde_ultimo_clique,reward,safra,tempo_cliente_0.0,tempo_cliente_1.0,...,dia_semana_dom,dia_semana_qua,dia_semana_qui,dia_semana_sab,dia_semana_seg,dia_semana_sex,dia_semana_ter,segmento_alto_valor,segmento_baixo_valor,segmento_medio_valor
0,2024-01-01,noite,831,1,0,69,0,2024-01,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,2024-01-01,noite,339,0,0,13,0,2024-01,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,2024-01-01,noite,448,0,0,81,0,2024-01,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
3,2024-01-01,noite,631,3,0,5,1,2024-01,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
4,2024-01-01,manha,159,1,1,86,0,2024-01,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0


In [14]:
list_feat_final

['dia_semana_qua',
 'dia_semana_sab',
 'usou_app_hoje',
 'segmento_alto_valor',
 'dia_semana_seg',
 'tempo_cliente',
 'dia_semana_sex',
 'dias_desde_ultimo_clique',
 'qtd_cliques_30d',
 'segmento_baixo_valor']

In [17]:
list_features_final = cols_inf.copy()
list_features_final.extend(list_feat_final)

df_final = df_bin_enc[list_features_final]
df_final.head()

Unnamed: 0,date,periodo_dia,reward,safra,dia_semana_qua,dia_semana_sab,usou_app_hoje,segmento_alto_valor,dia_semana_seg,tempo_cliente,dia_semana_sex,dias_desde_ultimo_clique,qtd_cliques_30d,segmento_baixo_valor
0,2024-01-01,noite,0,2024-01,0.0,0.0,0,0.0,1.0,831,0.0,69,1,1.0
1,2024-01-01,noite,0,2024-01,0.0,0.0,0,0.0,1.0,339,0.0,13,0,1.0
2,2024-01-01,noite,0,2024-01,0.0,0.0,0,1.0,1.0,448,0.0,81,0,0.0
3,2024-01-01,noite,1,2024-01,0.0,0.0,0,1.0,1.0,631,0.0,5,3,0.0
4,2024-01-01,manha,0,2024-01,0.0,0.0,1,0.0,1.0,159,0.0,86,1,0.0


In [18]:
df_final.to_csv('../dataset/processed/datset_processed.csv', index=False)