In [1]:
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd



PROJECT_ROOT = os.getcwd()

if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("Contenido de ./data:", os.listdir(os.path.join(PROJECT_ROOT, "data")))

PROJECT_ROOT: c:\Users\adria.flores\Documents\Projects\hacks\datathon2025-smadex
Contenido de ./data: ['sample_submission.csv', 'test', 'train']


In [2]:
TARGET_COLUMN = ["iap_revenue_d7"]
ROW_ID_COLUMN = ["row_id"]
OTHER_TARGET_COLUMNS = [
    "buyer_d1",
    "buyer_d7",
    "buyer_d14",
    "buyer_d28",
    "buy_d7",
    "buy_d14",
    "buy_d28",
    # other revenue horizons (excluding the main target)
    "iap_revenue_d14",
    "iap_revenue_d28",
    # registration / retention labels
    "registration",
    "retention_d1_to_d7",
    "retention_d3_to_d7",
    "retention_d7_to_d14",
    "retention_d1",
    "retention_d3",
    "retentiond7",  # note: name without underscore, as in the dataset description
]

In [3]:
import dask
import dask.dataframe as dd
dask.config.set({"dataframe.convert-string": False})

dataset_path = "./data/train/train"
ddf_meta = dd.read_parquet(dataset_path)

# take ~2% of the data (so ~400k rows if you have 20M)
sample_frac = 0.02

ddf_sample = ddf_meta.sample(frac=sample_frac, random_state=42)

In [26]:
cols_to_drop = OTHER_TARGET_COLUMNS+ROW_ID_COLUMN

In [27]:
all_cols = list(ddf_meta.columns)
cols_to_read = [c for c in all_cols if c not in cols_to_drop]

In [28]:
ddf = dd.read_parquet(dataset_path, columns=cols_to_read)

In [4]:
# bring it to pandas for easier modeling with sklearn, etc.
df_sample = ddf_sample.compute() # consider using .head() for even smaller data
print(df_sample.shape)

(412017, 85)


In [5]:
y = df_sample[TARGET_COLUMN]
X = df_sample.drop(columns=TARGET_COLUMN+OTHER_TARGET_COLUMNS+ROW_ID_COLUMN)

train_row_id = df_sample[ROW_ID_COLUMN]

In [6]:
missing_ratio = X.isna().mean()
missing_ratio_sorted = missing_ratio.sort_values(ascending=False)
missing_ratio_sorted

advertiser_actions_action_count             0.987675
advertiser_actions_action_last_timestamp    0.987675
last_advertiser_action                      0.987675
rev_by_adv                                  0.983231
last_buy_ts_category                        0.981190
                                              ...   
dev_os                                      0.000002
advertiser_bundle                           0.000000
hour                                        0.000000
weekday                                     0.000000
datetime                                    0.000000
Length: 67, dtype: float64

In [7]:
cols_to_drop = missing_ratio[missing_ratio > 0.11].index
X_rm = X.drop(columns=cols_to_drop)
print(f"Dropped {len(cols_to_drop)} columns with more than 11% missing values.")

#Chech the size of X
print("Shape of X after dropping columns:", X_rm.shape)

#Check the variables of X
print("Columns of X after dropping columns:", X_rm.columns.tolist())

Dropped 54 columns with more than 11% missing values.
Shape of X after dropping columns: (412017, 13)
Columns of X after dropping columns: ['advertiser_bundle', 'advertiser_category', 'advertiser_subcategory', 'country', 'dev_make', 'dev_model', 'dev_os', 'dev_osv', 'hour', 'release_date', 'release_msrp', 'weekday', 'datetime']


In [8]:
from sklearn.model_selection import train_test_split
import numpy as np

buyer_flag = (y > 0).astype(int)

X_train, X_val, y_train, y_val = train_test_split(
    X_rm,
    y,
    test_size=0.2,
    random_state=42,
    stratify=buyer_flag,
)

In [9]:
cat_cols = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

for c in cat_cols:
    X_train[c] = X_train[c].astype("category")
    X_val[c] = X_val[c].astype("category")

In [10]:
y_train_log = np.log1p(y_train)
y_val_log   = np.log1p(y_val)

In [23]:
import lightgbm as lgb

model = lgb.LGBMRegressor(
    objective="regression",
    n_estimators=20000,
    learning_rate=0.05,
    num_leaves=256,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)

model.fit(
    X_train,
    y_train_log,          # log target
    eval_set=[(X_val, y_val_log)],
    eval_metric="rmse",   # metric for logging, MSLE we'll compute ourselves
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0,004606 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 5082
[LightGBM] [Info] Number of data points in the train set: 329613, number of used features: 13
[LightGBM] [Info] Start training from score 0,067368


0,1,2
,boosting_type,'gbdt'
,num_leaves,256
,max_depth,-1
,learning_rate,0.05
,n_estimators,20000
,subsample_for_bin,200000
,objective,'regression'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [24]:
from sklearn.metrics import mean_squared_log_error

y_val_pred_log = model.predict(X_val, num_iteration=model.best_iteration_)
y_val_pred = np.expm1(y_val_pred_log)

# Ensure no negatives due to numeric weirdness
y_val_pred = np.clip(y_val_pred, 0, None)

msle_val = mean_squared_log_error(y_val, y_val_pred)
print("Validation MSLE:", msle_val)

Validation MSLE: 0.17205776139485252


In [13]:
feature_cols = X_rm.columns.tolist()
cat_cols = X_rm.select_dtypes(include=["object", "category"]).columns.tolist()

In [14]:
import dask.dataframe as dd
import numpy as np
import pandas as pd

test_path = "./data/test"

ddf_test = dd.read_parquet(test_path)

print("Test dataset columns:", ddf_test.columns.tolist())

# Keep same feature columns + row_id
ddf_test_small = ddf_test[ROW_ID_COLUMN + feature_cols]

#Check the size of ddf_test_small, the number of rows and columns
print("Number of rows in ddf_test_small:", len(ddf_test_small))

Test dataset columns: ['advertiser_bundle', 'advertiser_category', 'advertiser_subcategory', 'advertiser_bottom_taxonomy_level', 'carrier', 'country', 'region', 'dev_make', 'dev_model', 'dev_os', 'dev_osv', 'hour', 'release_date', 'release_msrp', 'weekday', 'avg_act_days', 'avg_daily_sessions', 'avg_days_ins', 'avg_duration', 'bcat', 'bcat_bottom_taxonomy', 'bundles_cat', 'bundles_cat_bottom_taxonomy', 'bundles_ins', 'city_hist', 'country_hist', 'cpm', 'cpm_pct_rk', 'ctr', 'ctr_pct_rk', 'dev_language_hist', 'dev_osv_hist', 'first_request_ts', 'first_request_ts_bundle', 'first_request_ts_category_bottom_taxonomy', 'hour_ratio', 'iap_revenue_usd_bundle', 'iap_revenue_usd_category', 'iap_revenue_usd_category_bottom_taxonomy', 'last_buy', 'last_buy_ts_bundle', 'last_buy_ts_category', 'last_ins', 'last_install_ts_bundle', 'last_install_ts_category', 'advertiser_actions_action_count', 'advertiser_actions_action_last_timestamp', 'user_actions_bundles_action_count', 'user_actions_bundles_actio

In [15]:
df_test = ddf_test_small.compute()
print(df_test.shape)

test_row_id = df_test["row_id"]
X_test = df_test[feature_cols]

(13188409, 14)


In [16]:
X_test = X_test.copy()

for col in cat_cols:
    if col in X_test.columns:
        X_test[col] = X_test[col].astype("category")

In [17]:
# Predict in log-space
test_pred_log = model.predict(X_test, num_iteration=model.best_iteration_)

# Back to original scale
test_pred = np.expm1(test_pred_log)

# Avoid negatives from numeric noise
test_pred = np.clip(test_pred, 0, None)

submission = pd.DataFrame({
    "row_id": test_row_id,
    "iap_revenue_d7": test_pred,
})

import time
# Add timestamp to submission filename
timestamp = time.strftime("%Y%m%d-%H%M%S")
submission.to_csv(f"submission_baseline_{timestamp}.csv", index=False)
print(submission.head())

                                 row_id  iap_revenue_d7
0  e2f514a9-d922-4a17-bf94-f228bf4cd82f        0.006316
1  4bfc70d3-d619-410a-9683-4cd759f30f32        0.006980
2  ad433b66-b41e-4157-a6fd-24cd30701f6a        0.005570
3  5ed964d6-ddce-42e8-9fad-276eb7f64c2f        0.013729
4  81b73a45-c395-4d08-a4a3-513873440db3        0.006979


In [18]:
submission.shape

(13188409, 2)