In [7]:
core_features = [
    "datetime",
    "weekday",
    "hour",
    "advertiser_bundle",
    "advertiser_category",
    "advertiser_subcategory",
    "country",
    "dev_os",
    "dev_model",
    "dev_make",
    "dev_osv",
    "release_date",
    "release_msrp",
]

# Historial de revenue/compras
rev_buy_history_features = [
    "iap_revenue_usd_category_bottom_taxonomy",
    "iap_revenue_usd_bundle",
    "num_buys_category_bottom_taxonomy",
    "num_buys_bundle",
    "whale_users_bundle_num_buys_prank",
    "whale_users_bundle_revenue_prank",
    "whale_users_bundle_total_num_buys",
    "whale_users_bundle_total_revenue",
    "last_buy",
    "last_buy_ts_bundle",
    "last_buy_ts_category",
]

# Historial de installs/bundles
install_bundle_history_features = [
    "bundles_ins",
    "bundles_cat_bottom_taxonomy",
    "user_bundles_l28d",
    "new_bundles",
    "bcat_bottom_taxonomy",
]

# Actividad de usuario
user_activity_features = [
    "avg_act_days",
    "avg_daily_sessions",
    "avg_duration",
    "avg_days_ins",
    "weeks_since_first_seen", 
    "weekend_ratio",
    "hour_ratio",
    "wifi_ratio",
]

# Historial de primeras y últimas acciones de uso
usage_timing_features = [
    "first_request_ts_bundle",
    "first_request_ts_category_bottom_taxonomy",
    "last_install_ts_bundle",
    "last_ins",
]

# Métricas de anuncios
ad_metrics_features = [
    "cpm_pct_rk",
    "rwd_prank",
]

# Acciones sobre bundles
bundle_action_features = [
    "user_actions_bundles_action_count",
    "user_actions_bundles_action_last_timestamp",
]

targets = [
	"iap_revenue_d7",
	"buyer_d7",
]

# Si quieres todo junto (para cargar columnas del parquet):
all_selected_features = (
    core_features
    + rev_buy_history_features
    + install_bundle_history_features
    + user_activity_features
    + usage_timing_features
    + ad_metrics_features
    + bundle_action_features
	+ targets
	+ ["row_id"] 
)

In [8]:
import dask
import dask.dataframe as dd
import pandas as pd

dask.config.set({"dataframe.convert-string": False})

TRAIN_PATH = "./data/train/train"

# 1. Cargar dataset completo en Dask
ddf = dd.read_parquet(
    TRAIN_PATH,
    columns=all_selected_features,
	engine="pyarrow",
)

# 2. Muestreo (porcentaje ajustable)
FRAC_SAMPLE = 0.01  # 1% de 20M -> ~200k filas
sample = ddf.sample(frac=FRAC_SAMPLE, random_state=42).compute()

print(sample.shape)
sample.head()

(206009, 48)


Unnamed: 0,datetime,weekday,hour,advertiser_bundle,advertiser_category,advertiser_subcategory,country,dev_os,dev_model,dev_make,...,first_request_ts_category_bottom_taxonomy,last_install_ts_bundle,last_ins,cpm_pct_rk,rwd_prank,user_actions_bundles_action_count,user_actions_bundles_action_last_timestamp,iap_revenue_d7,buyer_d7,row_id
122775,2025-10-01-01-00,3,1,d90c371450511851756cdd246a3e0d413826e744,application,travel & local,mx,android,2311drk48g,xiaomi,...,"[(Bubble Shooter, 1758067200)]",,,"[(b, 0.0954035770549532)]","[(cd7838a8ea84117ac3ce795ef070f2dcb0eca7f5, 0....","[(4ed9c0e40d0c3ccfcaa1de9deccc5b73ce17dac7, [(...","[(4ed9c0e40d0c3ccfcaa1de9deccc5b73ce17dac7, [(...",0.0,0,9ff575cc-7488-42dd-a284-92277d6d75fa
1776,2025-10-01-00-00,3,0,3714e810963165f514bbeec2cc0b54e91239314b,application,dating,us,android,moto g 5g 2024,motorola,...,"[(Platformer / Runner, 1758412800), (Video Edi...","[(4ed9c0e40d0c3ccfcaa1de9deccc5b73ce17dac7, 17...",1758474000.0,"[(i, 0.45697978681223245), (r, 0.3274703716948...","[(d55fcfff4d87f95effb170b0436b380d2f046e6d, 0....","[(db42749ef73fd488a95e37718f033eab132da07c, [(...","[(db42749ef73fd488a95e37718f033eab132da07c, [(...",0.0,0,4594fe5b-0c86-4d57-9dff-7861dbc83994
102030,2025-10-01-01-00,3,1,7eaeef21e215654ea75a5ef78692d1f8ffd0ca54,sport betting,unknown,ru,android,2409brn2ca,xiaomi,...,,,,,,,,0.0,0,a533c251-5a8f-4443-bdef-5b0cdb9369da
1322,2025-10-01-01-00,3,1,72e7756b6e94619b84ace04509dfd105cbf39b0e,sport betting,unknown,us,android,moto g 5g 2024,motorola,...,,,,"[(i, 0.2610404881680478)]","[(b74cace594a9b39fbb0ec28efc2a57fae7ffcf70, 0....","[(001b729f996da29d01e9e3641ac7fb153e6bc98e, [(...","[(001b729f996da29d01e9e3641ac7fb153e6bc98e, [(...",6.310399,1,de71033d-595f-427c-8be2-57880a88097b
29908,2025-10-01-00-00,3,0,4dd4aa85cf8fae402f1dd15d2204226925b483e7,games,games/action,ni,android,x6525,infinix,...,,,,,,,,0.0,0,42847219-074f-4286-be24-d8ca4f0ec5e6


In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split

df = sample.copy()

TARGET = "iap_revenue_d7"
drop_cols = [TARGET, "row_id", "buyer_d7"]

# 1) Definimos solo categóricas "simples"
simple_cat_cols = [
    "advertiser_bundle",
    "advertiser_category",
    "advertiser_subcategory",
    "country",
    "dev_os",
    "dev_model",
    "dev_make",
    "dev_osv",
    "weekday",
    "hour",
]

# columnas categóricas que realmente existen en tu df
cat_cols = [c for c in simple_cat_cols if c in df.columns]

# 2) Feature columns iniciales
feature_cols = [c for c in df.columns if c not in drop_cols]

# 3) Quitamos columnas object raras (listas/dicts) que no estén en cat_cols
bad_obj_cols = [
    c for c in feature_cols
    if (df[c].dtype == "object") and (c not in cat_cols)
]

print("Quitando columnas object no simples:", bad_obj_cols)

feature_cols = [c for c in feature_cols if c not in bad_obj_cols]

# 4) Codificamos SOLO las categóricas simples
oe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
if cat_cols:
    df[cat_cols] = oe.fit_transform(df[cat_cols])

# 5) Preparar X, y
X = df[feature_cols]
y = np.log1p(df[TARGET])

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

train_data = lgb.Dataset(
    X_train, label=y_train,
    categorical_feature=cat_cols if cat_cols else None,
)

val_data = lgb.Dataset(
    X_val, label=y_val,
    categorical_feature=cat_cols if cat_cols else None,
)

params = {
    "objective": "regression",
    "metric": "l2",
    "learning_rate": 0.05,
    "num_leaves": 64,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 1,
    "verbose": -1,
}

model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, val_data],
    num_boost_round=2000,
)

y_val_pred_log = model.predict(X_val)
y_val_pred = np.expm1(y_val_pred_log)

from sklearn.metrics import mean_squared_log_error
msle = mean_squared_log_error(np.expm1(y_val), y_val_pred)
print("MSLE baseline:", msle)

Quitando columnas object no simples: ['release_date', 'iap_revenue_usd_category_bottom_taxonomy', 'iap_revenue_usd_bundle', 'num_buys_category_bottom_taxonomy', 'num_buys_bundle', 'whale_users_bundle_num_buys_prank', 'whale_users_bundle_revenue_prank', 'whale_users_bundle_total_num_buys', 'whale_users_bundle_total_revenue', 'last_buy_ts_bundle', 'last_buy_ts_category', 'bundles_ins', 'bundles_cat_bottom_taxonomy', 'user_bundles_l28d', 'new_bundles', 'bcat_bottom_taxonomy', 'avg_daily_sessions', 'avg_duration', 'hour_ratio', 'first_request_ts_bundle', 'first_request_ts_category_bottom_taxonomy', 'last_install_ts_bundle', 'cpm_pct_rk', 'rwd_prank', 'user_actions_bundles_action_count', 'user_actions_bundles_action_last_timestamp']
MSLE baseline: 0.18773834255368788
