In [2]:
import pandas as pd
import numpy as np
import os

print("📦 Loading event logs...")
event_df = pd.read_parquet("../../Dataset/add_event.parquet")

# Ensure dtypes are optimal
event_df["id2"] = event_df["id2"].astype(str)
event_df["id3"] = event_df["id3"].astype(str)
event_df["id4"] = pd.to_datetime(event_df["id4"], errors="coerce")

# Only keep relevant rows
event_df = event_df[event_df["id6"] == "Tiles"].dropna(subset=["id4"])

# Sort by time for delay calculation
event_df.sort_values(["id2", "id3", "id4"], inplace=True)

print("🧮 Calculating delays...")
# Compute click delays per (id2, id3)
event_df["click_delay"] = event_df.groupby(["id2", "id3"])["id4"].diff().dt.total_seconds()
event_df["click_delay"].fillna(0, inplace=True)

print("🔁 Aggregating event stats...")

def aggregate_event(df, level):
    grouped = df.groupby(level).agg(
        clicks=("id6", "count"),
        views=("id6", "count"),  # same as clicks if no separate views
        click_rate=("id6", "count"),  # placeholder, = clicks/views = 1
        avg_click_delay=("click_delay", "mean"),
        min_click_delay=("click_delay", "min"),
        max_click_delay=("click_delay", "max"),
        std_click_delay=("click_delay", "std")
    ).fillna(0)

    grouped["click_rate"] = 1.0  # If views==clicks
    return grouped.reset_index()

# (1) id2-id3 pair
pair_df = aggregate_event(event_df, ["id2", "id3"])
pair_df.to_parquet("../../Code/EventsEncoding/event_pair_agg.parquet", index=False)

# (2) id2 (customer)
cust_df = aggregate_event(event_df, "id2")
cust_df.to_parquet("../../Code/EventsEncoding/event_customer_agg.parquet", index=False)

# (3) id3 (offer)
offer_df = aggregate_event(event_df, "id3")
offer_df.to_parquet("../../Code/EventsEncoding/event_offer_agg.parquet", index=False)

print("✅ Event encoder done (pair + customer + offer level)")


📦 Loading event logs...
🧮 Calculating delays...


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  event_df["click_delay"].fillna(0, inplace=True)


🔁 Aggregating event stats...
✅ Event encoder done (pair + customer + offer level)


In [10]:
import pandas as pd

pair_df = pd.read_parquet("../EventsEncoding/event_pair_agg.parquet")
cust_df = pd.read_parquet("../EventsEncoding/event_customer_agg.parquet")
offer_df = pd.read_parquet("../EventsEncoding/event_offer_agg.parquet")

print("Pair-level shape:", pair_df.shape)
print("Customer-level shape:", cust_df.shape)
print("Offer-level shape:", offer_df.shape)

print("Columns:", pair_df.columns.tolist())


Pair-level shape: (5071961, 9)
Customer-level shape: (156657, 8)
Offer-level shape: (867, 8)
Columns: ['id2', 'id3', 'clicks', 'views', 'click_rate', 'avg_click_delay', 'min_click_delay', 'max_click_delay', 'std_click_delay']


In [14]:
import numpy as np

# --------- Load the aggregated Parquet files ---------
pair_agg = pd.read_parquet("event_pair_agg.parquet")
customer_agg = pd.read_parquet("event_customer_agg.parquet")
offer_agg = pd.read_parquet("event_offer_agg.parquet")

# --------- 1. Shape and Column Sanity ---------
print("\n📐 Aggregate shapes & columns:")
print("Pair-level:", pair_agg.shape)
print("Columns:", list(pair_agg.columns))
print("Customer-level:", customer_agg.shape)
print("Offer-level:", offer_agg.shape)

# --------- 2. Null Checks ---------
print("\n🧪 Null checks:")
print("Pair-level nulls:\n", pair_agg.isnull().sum())
print("Customer-level nulls:\n", customer_agg.isnull().sum())
print("Offer-level nulls:\n", offer_agg.isnull().sum())

# --------- 3. Consistency Check for click_rate ---------
def check_click_rate(df, level_name):
    expected = df["clicks"] / df["views"]
    actual = df["click_rate"]
    mismatched = ~np.isclose(expected, actual, atol=1e-3)
    print(f"🔍 {level_name} click_rate mismatches: {mismatched.sum()}")

check_click_rate(pair_agg, "Pair-level")
check_click_rate(customer_agg, "Customer-level")
check_click_rate(offer_agg, "Offer-level")

# --------- 4. Sample Preview ---------
print("\n🧾 Sample entries:")
print("Pair-level:\n", pair_agg.sample(2))
print("Customer-level:\n", customer_agg.sample(2))
print("Offer-level:\n", offer_agg.sample(2))





📐 Aggregate shapes & columns:
Pair-level: (5071961, 9)
Columns: ['id2', 'id3', 'clicks', 'views', 'click_rate', 'avg_click_delay', 'min_click_delay', 'max_click_delay', 'std_click_delay']
Customer-level: (156657, 8)
Offer-level: (867, 8)

🧪 Null checks:
Pair-level nulls:
 id2                0
id3                0
clicks             0
views              0
click_rate         0
avg_click_delay    0
min_click_delay    0
max_click_delay    0
std_click_delay    0
dtype: int64
Customer-level nulls:
 id2                0
clicks             0
views              0
click_rate         0
avg_click_delay    0
min_click_delay    0
max_click_delay    0
std_click_delay    0
dtype: int64
Offer-level nulls:
 id3                0
clicks             0
views              0
click_rate         0
avg_click_delay    0
min_click_delay    0
max_click_delay    0
std_click_delay    0
dtype: int64
🔍 Pair-level click_rate mismatches: 0
🔍 Customer-level click_rate mismatches: 0
🔍 Offer-level click_rate mismatches: 0
