In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler


# Carregar Dados

In [2]:

# Read the raw events file and normalise the column types used downstream.
df = pd.read_csv('../dataset/raw/dataset.csv')

# `usou_app_hoje` is cast element-wise to int so it can be used as a numeric flag.
df['usou_app_hoje'] = df['usou_app_hoje'].map(int)

# Parse the event date, then derive `safra`: the calendar month of the event
# rendered as a 'YYYY-MM' string.
df['date'] = pd.to_datetime(df['date'])
df['safra'] = df['date'].dt.to_period('M').astype(str)

# Restrict the study sample to the first three months of 2024.
_in_study_window = df['safra'].isin(['2024-01', '2024-02', '2024-03'])
df_study = df[_in_study_window]

df_study

Unnamed: 0,date,periodo_dia,dia_semana,segmento,tempo_cliente,qtd_cliques_30d,usou_app_hoje,dias_desde_ultimo_clique,reward,safra
0,2024-01-01,tarde,seg,medio_valor,834,1,0,37,0,2024-01
1,2024-01-01,noite,seg,medio_valor,37,1,0,86,0,2024-01
2,2024-01-01,manha,seg,baixo_valor,163,3,0,61,0,2024-01
3,2024-01-01,tarde,seg,medio_valor,854,1,1,69,0,2024-01
4,2024-01-01,manha,seg,alto_valor,19,2,1,75,0,2024-01
...,...,...,...,...,...,...,...,...,...,...
475839,2024-03-31,manha,dom,baixo_valor,178,2,1,1,0,2024-03
475840,2024-03-31,manha,dom,baixo_valor,124,1,0,8,0,2024-03
475841,2024-03-31,tarde,dom,baixo_valor,763,2,0,62,0,2024-03
475842,2024-03-31,noite,dom,medio_valor,331,5,1,29,1,2024-03


In [4]:
# Mean reward per time-of-day bucket, highest first.
# NOTE(review): this aggregates the full `df`, not the `df_study` window — confirm that is intended.
df['reward'].groupby(df['periodo_dia']).mean().sort_values(ascending=False)

periodo_dia
tarde    0.162066
noite    0.155968
manha    0.147629
Name: reward, dtype: float64

In [5]:
# Hand-entered reward rates, rescaled so the three values sum to 1.
# NOTE(review): these figures are typed in by hand and differ slightly from the
# aggregate printed in the previous cell; presumably they come from another
# sample and follow the order tarde, noite, manha — confirm the source.
probs = np.array([0.1613, 0.1576, 0.1490])
probs /= probs.sum()
probs

array([0.34473178, 0.33682411, 0.31844411])

In [6]:
# Column roles used by the preprocessing and feature-selection steps.

# Numeric columns: standardised by the preprocessor.
cols_numeric = [
    'tempo_cliente',
    'qtd_cliques_30d',
    'dias_desde_ultimo_clique',
]

# Categorical columns: one-hot encoded by the preprocessor.
cols_categoric = [
    'dia_semana',
    'segmento',
]

# Informational columns: carried alongside the transformed features without
# going through the preprocessor.
cols_inf = [
    'date',
    'periodo_dia',
    'reward',
    'usou_app_hoje',
    'safra',
]

# Feature Engineering

In [7]:
# Build the preprocessor: one-hot encode the categorical columns and
# standardise the numeric ones. The step names ('cat', 'num') are used below
# to look the fitted encoder up again.
_categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cols_categoric)
_numeric_step = ('num', StandardScaler(), cols_numeric)
preprocessor = ColumnTransformer(transformers=[_categorical_step, _numeric_step])

# Fit on the study window and transform it in one pass.
X_transformed = preprocessor.fit_transform(df_study)

# Recover the output column names: the encoder's generated names first, then
# the numeric columns, matching the order of the transformer steps above.
ohe = preprocessor.named_transformers_['cat']
ohe_features = ohe.get_feature_names_out(cols_categoric)
feature_names = [*ohe_features, *cols_numeric]

# The transformer may hand back a sparse matrix; densify it only in that case.
if hasattr(X_transformed, 'toarray'):
    _X_dense = X_transformed.toarray()
else:
    _X_dense = X_transformed

# Wrap the matrix in a DataFrame aligned to the original row index.
X_final = pd.DataFrame(_X_dense, columns=feature_names, index=df_study.index)

# Put the informational columns back next to the transformed features.
df_transformer = pd.concat([df_study[cols_inf], X_final], axis=1)
df_transformer

Unnamed: 0,date,periodo_dia,reward,usou_app_hoje,safra,dia_semana_dom,dia_semana_qua,dia_semana_qui,dia_semana_sab,dia_semana_seg,dia_semana_sex,dia_semana_ter,segmento_alto_valor,segmento_baixo_valor,segmento_medio_valor,tempo_cliente,qtd_cliques_30d,dias_desde_ultimo_clique
0,2024-01-01,tarde,0,0,2024-01,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.282001,-0.706617,-0.305774
1,2024-01-01,noite,0,0,2024-01,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,-0.896774,-0.706617,1.557655
2,2024-01-01,manha,0,0,2024-01,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,-0.552325,0.708279,0.606926
3,2024-01-01,tarde,0,1,2024-01,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.336675,-0.706617,0.911159
4,2024-01-01,manha,0,1,2024-01,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,-0.945981,0.000831,1.139334
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475839,2024-03-31,manha,0,1,2024-03,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.511320,0.000831,-1.674824
475840,2024-03-31,manha,0,0,2024-03,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,-0.658940,-0.706617,-1.408619
475841,2024-03-31,tarde,0,0,2024-03,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.087907,0.000831,0.644955
475842,2024-03-31,noite,1,1,2024-03,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.093060,2.123175,-0.610007


# Feature Selection

In [8]:
import pandas as pd
from sklearn.linear_model import LogisticRegression

def selecionar_features_por_importancia_log(X, y, top_k=None, threshold=None):
    """Rank features by logistic-regression coefficient magnitude and pick a subset.

    A ``LogisticRegression(max_iter=1000, random_state=2025)`` is fitted on
    ``(X, y)``. The importance of a column is the absolute value of its
    coefficient; when the fitted model exposes one coefficient row per class,
    the absolute values are averaged over the rows so every class contributes
    (for a single row this is exactly ``abs(coef_[0])``).

    Caution: coefficient magnitudes are only comparable between columns that
    are on comparable scales.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; ``X.columns`` supplies the feature names.
    y : array-like
        Target passed to ``LogisticRegression.fit``.
    top_k : int, optional
        Keep the ``top_k`` highest-ranked features. When given, it takes
        precedence and ``threshold`` is ignored.
    threshold : float, optional
        Keep features whose importance is ``>= threshold``. Only used when
        ``top_k`` is ``None``.

    If neither ``top_k`` nor ``threshold`` is given, every feature with a
    strictly positive importance is kept.

    Returns
    -------
    tuple[list, pandas.DataFrame]
        The selected feature names in ranking order, and the full ranking
        (columns ``feature`` and ``importance``, sorted by descending
        importance, original positional index preserved).

    Raises
    ------
    ValueError
        If ``top_k`` is negative.
    """
    # A negative value would make ``head()`` return "all but the last k" rows,
    # which is never what a top-k request means.
    if top_k is not None and top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    model = LogisticRegression(max_iter=1000, random_state=2025)
    model.fit(X, y)

    # Absolute coefficient as the importance measure, averaged across the
    # per-class rows instead of silently using only the first one.
    importance = np.abs(np.atleast_2d(model.coef_)).mean(axis=0)

    # Stable sort so that tied importances keep their column order and the
    # ranking is reproducible.
    importancias = pd.DataFrame({
        'feature': X.columns,
        'importance': importance
    }).sort_values(by='importance', ascending=False, kind='mergesort')

    # Apply the selection criterion.
    if top_k is not None:
        selecionadas = importancias.head(top_k)['feature'].tolist()
    elif threshold is not None:
        selecionadas = importancias[importancias['importance'] >= threshold]['feature'].tolist()
    else:
        selecionadas = importancias[importancias['importance'] > 0]['feature'].tolist()

    return selecionadas, importancias

In [9]:
# Candidate model inputs: every transformed column that is not informational...
_info_cols = set(cols_inf)
cols_features = [name for name in df_transformer.columns if name not in _info_cols]

# ...plus `usou_app_hoje`, which is listed as informational but is still
# offered to the selector as a feature.
cols_features += ['usou_app_hoje']
len(cols_features)

14

In [10]:
# Display the candidate feature columns for a visual check.
cols_features

['dia_semana_dom',
 'dia_semana_qua',
 'dia_semana_qui',
 'dia_semana_sab',
 'dia_semana_seg',
 'dia_semana_sex',
 'dia_semana_ter',
 'segmento_alto_valor',
 'segmento_baixo_valor',
 'segmento_medio_valor',
 'tempo_cliente',
 'qtd_cliques_30d',
 'dias_desde_ultimo_clique',
 'usou_app_hoje']

In [11]:
# Separar X e y
X = df_transformer[cols_features]
y = df_transformer['reward']

# # Executar seleção
features_boas, ranking = selecionar_features_por_importancia_log(X, y, top_k=10)
# features_boas.append('key_loja')
# features_boas.append('cod_produto')

list_feat_final = list(set(features_boas))
print("Features selecionadas:", list_feat_final)
print("\nRanking completo:")
print(ranking)

Features selecionadas: ['segmento_medio_valor', 'segmento_alto_valor', 'dia_semana_sex', 'dia_semana_qui', 'dia_semana_sab', 'dia_semana_qua', 'dia_semana_ter', 'segmento_baixo_valor', 'dia_semana_dom', 'usou_app_hoje']

Ranking completo:
                     feature  importance
8       segmento_baixo_valor    1.383985
5             dia_semana_sex    0.471751
7        segmento_alto_valor    0.338203
13             usou_app_hoje    0.303706
9       segmento_medio_valor    0.256834
6             dia_semana_ter    0.188197
3             dia_semana_sab    0.154325
2             dia_semana_qui    0.153999
1             dia_semana_qua    0.152244
0             dia_semana_dom    0.151303
11           qtd_cliques_30d    0.087108
12  dias_desde_ultimo_clique    0.082677
10             tempo_cliente    0.072447
4             dia_semana_seg    0.030797


In [12]:
# Number of features kept after de-duplication.
len(list_feat_final)

10

# Save

In [13]:
# Persist the selection and the fitted preprocessor for later steps.
# Create the output folder first: joblib.dump does not create missing
# directories, so a fresh checkout without `processed/` would fail here.
Path('../dataset/processed').mkdir(parents=True, exist_ok=True)

# NOTE(review): `features_boas` contains 'usou_app_hoje', which does not pass
# through `preprocessor`; presumably the consumer adds it separately — confirm.
joblib.dump(features_boas, '../dataset/processed/features_importantes.pkl')
joblib.dump(preprocessor, '../dataset/processed/preprocessor.pkl')

['../dataset/processed/preprocessor.pkl']