# Imports

In [1]:
import os
import sys
import matplotlib.pyplot as plt
import pandas as pd

# Add the project root to `sys.path` so local modules can be imported

In [2]:
# Use the kernel's current working directory as the project root.
PROJECT_ROOT = os.getcwd()

# Put the root at the front of the import search path; the membership check
# keeps a re-run of this cell from inserting a duplicate entry.
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

# Echo the root, then list the entries of its "data" folder.
print("PROJECT_ROOT:", PROJECT_ROOT)
data_dir = os.path.join(PROJECT_ROOT, "data")
print("Contenido de ./data:", os.listdir(data_dir))

PROJECT_ROOT: c:\Users\adria.flores\Documents\Projects\hacks\datathon2025-smadex
Contenido de ./data: ['sample_submission.csv', 'test', 'train']


# Read the train parquet and split it into train and val sets

In [4]:
import dask.dataframe as dd
import dask
# Disable dask's `dataframe.convert-string` option; object columns are cast
# to the "string" dtype explicitly further down.
dask.config.set({"dataframe.convert-string": False})

DATASET_PATH = "data/train/train"

# Boundaries of the time-based split. They are compared against `datetime`
# as plain strings (assumes values formatted like "2025-10-01-00-00").
VAL_START = "2025-10-07-00-00"  # inclusive lower bound of the validation window
VAL_END = "2025-10-08-00-00"    # exclusive upper bound of the validation window


def _cast_object_columns(frame):
    """Cast every object-dtype column of `frame` to the "string" dtype.

    Args:
        frame: dask DataFrame to convert.

    Returns:
        A `(converted_frame, object_columns)` tuple, where `object_columns`
        is the list of column names that were cast.
    """
    object_columns = list(frame.select_dtypes(include=["object"]).columns)
    return frame.astype({c: "string" for c in object_columns}), object_columns


# Train: rows with `datetime` strictly before the validation window.
train_ddf = dd.read_parquet(
    DATASET_PATH,
    filters=[("datetime", "<", VAL_START)],
)

# Validation: rows with `datetime` inside [VAL_START, VAL_END).
val_ddf = dd.read_parquet(
    DATASET_PATH,
    filters=[
        ("datetime", ">=", VAL_START),
        ("datetime", "<", VAL_END),
    ],
)

train_ddf, obj_cols_train = _cast_object_columns(train_ddf)
val_ddf, obj_cols_val = _cast_object_columns(val_ddf)

print("Columnas object que pasamos a string (train):", obj_cols_train)

# Now the splits can be saved. overwrite=True clears each target directory
# first, so part files left by an earlier run (e.g. one with a different
# number of partitions) cannot end up mixed into the new split.
train_ddf.to_parquet("data/split/train", write_index=False, overwrite=True)
val_ddf.to_parquet("data/split/val", write_index=False, overwrite=True)

Columnas object que pasamos a string (train): ['advertiser_bundle', 'advertiser_category', 'advertiser_subcategory', 'advertiser_bottom_taxonomy_level', 'carrier', 'country', 'region', 'dev_make', 'dev_model', 'dev_os', 'dev_osv', 'hour', 'release_date', 'avg_daily_sessions', 'avg_duration', 'bcat', 'bcat_bottom_taxonomy', 'bundles_cat', 'bundles_cat_bottom_taxonomy', 'bundles_ins', 'city_hist', 'country_hist', 'cpm', 'cpm_pct_rk', 'ctr', 'ctr_pct_rk', 'dev_language_hist', 'dev_osv_hist', 'first_request_ts_bundle', 'first_request_ts_category_bottom_taxonomy', 'hour_ratio', 'iap_revenue_usd_bundle', 'iap_revenue_usd_category', 'iap_revenue_usd_category_bottom_taxonomy', 'last_buy_ts_bundle', 'last_buy_ts_category', 'last_install_ts_bundle', 'last_install_ts_category', 'advertiser_actions_action_count', 'advertiser_actions_action_last_timestamp', 'user_actions_bundles_action_count', 'user_actions_bundles_action_last_timestamp', 'last_advertiser_action', 'new_bundles', 'num_buys_bundle', 

In [None]:
# Load only the two columns needed for the buyer-rate checks.
ddf = dd.read_parquet(DATASET_PATH, columns=["datetime", "buyer_d7"])

# Row masks built by comparing `datetime` against string literals.
before_val_day = ddf["datetime"] < "2025-10-07-00-00"
from_val_day = ddf["datetime"] >= "2025-10-07-00-00"
before_next_day = ddf["datetime"] < "2025-10-08-00-00"

# Train is everything before the validation day; validation is that one day.
train_ddf = ddf[before_val_day]
val_ddf = ddf[from_val_day & before_next_day]

def buyer_stats(ddf, name):
    """Print the row count plus the sum and mean of `buyer_d7` for a frame.

    Args:
        ddf: dask DataFrame that contains a `buyer_d7` column.
        name: label printed in front of the statistics.

    Returns:
        None. The statistics are printed as
        ``"<name>: total=..., buyers=..., rate=..."``.
    """
    import dask  # local import keeps the function usable on its own

    # Evaluate the three aggregates in one dask.compute call so dask can
    # share the work of loading the data between them, instead of running
    # three independent .compute() passes.
    total, buyers, rate = dask.compute(
        ddf.shape[0],
        ddf["buyer_d7"].sum(),
        ddf["buyer_d7"].mean(),
    )
    print(f"{name}: total={total}, buyers={buyers}, rate={rate:.6f}")

# Report the same statistics for both splits, train first.
for split_frame, split_label in ((train_ddf, "TRAIN"), (val_ddf, "VAL")):
    buyer_stats(split_frame, split_label)

In [None]:
# Keep only the date part (no time): the first 10 characters of `datetime`,
# which assumes string values shaped like "2025-10-01-00-00".
date_only = train_ddf["datetime"].str.slice(0, 10)
train_ddf_dates = train_ddf.assign(date=date_only)

# Mean of `buyer_d7` per date, materialised by .compute() and then renamed
# to make clear that the column holds a rate.
rate_by_date_lazy = train_ddf_dates[["date", "buyer_d7"]].groupby("date").mean()
buyers_by_date = rate_by_date_lazy.compute().rename(
    columns={"buyer_d7": "buyer_d7_rate"}
)

print(buyers_by_date)
