In [1]:
import os
import sys
import matplotlib.pyplot as plt



# The current working directory is treated as the project root.
PROJECT_ROOT = os.getcwd()

# Register the project root at the front of the import path, but only once,
# so re-running this cell does not pile up duplicate entries.
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.insert(0, PROJECT_ROOT)

print("PROJECT_ROOT:", PROJECT_ROOT)

# Sanity check: list what is inside <project root>/data.
_data_dir = os.path.join(PROJECT_ROOT, "data")
print("Contenido de ./data:", os.listdir(_data_dir))

PROJECT_ROOT: c:\Users\adria.flores\Documents\Projects\hacks\datathon2025-smadex
Contenido de ./data: ['sample_submission.csv', 'test', 'train']


In [2]:
import dask
import dask.dataframe as dd
# With pandas >= 2.0.0, dask converts object columns to strings by default,
# e.g. [("i", 0.48)] becomes '[("i", 0.48)]'. Turn that conversion off so the
# object columns keep their original Python values.
dask.config.set({"dataframe.convert-string": False})

# Location of the training parquet dataset, relative to the working directory.
dataset_path = "./data/train"

# Row filter on the `datetime` column: lower bound inclusive, upper bound
# exclusive.
filters = [
    ("datetime", ">=", "2025-10-01-00-00"),
    ("datetime", "<", "2025-10-13-00-00"),
]

# Lazy dask DataFrame over the filtered training data.
ddf = dd.read_parquet(dataset_path, filters=filters)

In [None]:
# Exploratory overview of the lazy dask DataFrame `ddf`: structure, preview,
# numeric summary, missing-value percentages, top categorical values and
# histograms of the numeric columns. Every `.compute()` below triggers its own
# pass over the data, so this cell can be slow on the full dataset.

# 1. Basic structure
print("=== Basic info ===")
print("Number of partitions:", ddf.npartitions)
print("Columns:", list(ddf.columns))

print("\n=== Dtypes ===")
print(ddf.dtypes)

# 2. Quick look at the data
print("\n=== Head ===")
display(ddf.head())  # small preview, does not require full compute

# 3. Summary statistics for numeric columns
print("\n=== Numeric summary (describe) ===")
numeric_summary = ddf.describe().compute()
display(numeric_summary)

# 4. Missing values per column (as percentage)
print("\n=== Missing values (%) per column ===")
missing_pct = (ddf.isna().mean() * 100).compute()
display(missing_pct.sort_values(ascending=False))

# 5. Categorical / object-like columns: top values
print("\n=== Top values for categorical columns ===")
cat_cols = ddf.select_dtypes(include=["object", "category"]).columns

for col in cat_cols:
    print(f"\nColumn: {col}")
    try:
        vc = ddf[col].value_counts().compute().head(10)
    except TypeError as exc:
        # Some object columns hold list-like cells (the head preview shows
        # this for e.g. `user_bundles_l28d`). Such values are unhashable, so
        # they cannot be counted; skip the column instead of aborting the
        # whole cell before the histogram section runs.
        print(f"Skipped (values are not hashable): {exc}")
        continue
    display(vc)

# 6. Numeric distributions: histograms for a subset of columns
print("\n=== Histograms for numeric columns (sample) ===")
numeric_cols = ddf.select_dtypes(include=["number"]).columns

# Take a sample to avoid computing the entire dataset
sample = ddf[numeric_cols].sample(frac=0.05, random_state=42).compute()

for col in numeric_cols:
    # Explicit figure/axes so each plot is self-contained and can be released
    # once shown; the loop creates one figure per numeric column.
    fig, ax = plt.subplots()
    sample[col].hist(bins=30, ax=ax)
    ax.set_title(f"Histogram of {col}")
    ax.set_xlabel(col)
    ax.set_ylabel("Frequency")
    plt.show()
    plt.close(fig)


=== Basic info ===
Number of partitions: 144
Columns: ['buyer_d1', 'buyer_d7', 'buyer_d14', 'buyer_d28', 'buy_d7', 'buy_d14', 'buy_d28', 'iap_revenue_d7', 'iap_revenue_d14', 'iap_revenue_d28', 'registration', 'retention_d1_to_d7', 'retention_d3_to_d7', 'retention_d7_to_d14', 'retention_d1', 'retention_d3', 'retentiond7', 'advertiser_bundle', 'advertiser_category', 'advertiser_subcategory', 'advertiser_bottom_taxonomy_level', 'carrier', 'country', 'region', 'dev_make', 'dev_model', 'dev_os', 'dev_osv', 'hour', 'release_date', 'release_msrp', 'weekday', 'avg_act_days', 'avg_daily_sessions', 'avg_days_ins', 'avg_duration', 'bcat', 'bcat_bottom_taxonomy', 'bundles_cat', 'bundles_cat_bottom_taxonomy', 'bundles_ins', 'city_hist', 'country_hist', 'cpm', 'cpm_pct_rk', 'ctr', 'ctr_pct_rk', 'dev_language_hist', 'dev_osv_hist', 'first_request_ts', 'first_request_ts_bundle', 'first_request_ts_category_bottom_taxonomy', 'hour_ratio', 'iap_revenue_usd_bundle', 'iap_revenue_usd_category', 'iap_revenu

Unnamed: 0,buyer_d1,buyer_d7,buyer_d14,buyer_d28,buy_d7,buy_d14,buy_d28,iap_revenue_d7,iap_revenue_d14,iap_revenue_d28,...,user_bundles_l28d,weekend_ratio,weeks_since_first_seen,wifi_ratio,whale_users_bundle_num_buys_prank,whale_users_bundle_revenue_prank,whale_users_bundle_total_num_buys,whale_users_bundle_total_revenue,row_id,datetime
0,0,1,1,1,1,1,1,2.147718,2.147718,2.147718,...,"[88981729bd5c1e5aea9ada4bce00a2531e9e98f7, 25c...",0.019802,6.0,0.913366,,,,,819ecc0e-1a97-43ed-83f6-b9ede4f7fc48,2025-10-01-00-00
1,0,0,0,0,0,0,0,0.0,0.0,0.0,...,,,,,,,,,0a7fbf18-5041-42af-bd0a-0cb6586b8598,2025-10-01-00-00
2,0,0,0,0,0,0,0,0.0,0.0,0.0,...,"[6506b7e0a24666debd08f74266800f2eb154df5a, 150...",0.399021,6.0,0.999388,,,,,fc1a2689-b136-4ffa-b23b-9d8215bd720f,2025-10-01-00-00
3,0,0,0,0,0,0,0,0.0,0.0,0.0,...,"[2b472e3dc96f1847490d7411b25e12ed417b9714, 3ba...",0.121547,6.0,1.0,,,,,0340fcc6-50bd-42ab-b9f4-4c1184b640cb,2025-10-01-00-00
4,0,0,0,0,0,0,0,0.0,0.0,0.0,...,"[1031535cf2a1315422fd05d321349bcd3c3ffc04, 478...",0.293285,6.0,0.160243,,,,,219d253f-bef4-4039-84b2-ed55f009cc43,2025-10-01-00-00



=== Numeric summary (describe) ===


Unnamed: 0,buyer_d1,buyer_d7,buyer_d14,buyer_d28,buy_d7,buy_d14,buy_d28,iap_revenue_d7,iap_revenue_d14,iap_revenue_d28,...,release_msrp,weekday,avg_act_days,avg_days_ins,first_request_ts,last_buy,last_ins,weekend_ratio,weeks_since_first_seen,wifi_ratio
count,20600580.0,20600580.0,20600580.0,20600580.0,20600580.0,20600580.0,20600580.0,20600580.0,20600580.0,20600580.0,...,18479030.0,20600580.0,11172710.0,921996.0,9405562.0,377675.0,3026575.0,11600270.0,12248960.0,11599480.0
mean,0.02376821,0.03343634,0.03532721,0.03717182,0.07203491,0.09582308,0.1322014,1.437688,1.773321,2.332586,...,522.5308,4.389697,3.862084,5.392996,1758974000.0,1758342000.0,1758553000.0,0.3316598,4.882932,0.5782966
std,0.1523263,0.1797731,0.1846055,0.1891827,0.9459988,1.524582,2.566221,431.2014,441.5945,482.4463,...,496.9947,1.968518,2.085652,5.666884,445976.9,767671.6,753614.6,0.2719729,1.940092,0.4176631
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-5.8,-14.30459,...,23.0,1.0,1.0,0.0,1758067000.0,1756598000.0,1756771000.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,182.0,3.0,2.0,1.0,1758499000.0,1758367000.0,1758277000.0,0.1470588,5.0,0.2692308
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,300.0,5.0,4.0,4.0,1759190000.0,1758870000.0,1758954000.0,0.2997022,6.0,0.8531469
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,899.0,6.0,6.0,8.5,1759363000.0,1759104000.0,1759363000.0,0.5,6.0,1.0
max,1.0,1.0,1.0,1.0,539.0,1324.0,2443.0,861191.1,861191.1,861191.1,...,5160.0,7.0,7.0,28.0,1759622000.0,1759709000.0,1759709000.0,1.0,6.0,1.0



=== Missing values (%) per column ===
