In [None]:
import pandas as pd
import numpy as np

# Raise pandas' display limits so wide/long feature tables are shown in full
# in cell output (up to 50 columns and 100 rows).
pd.options.display.max_columns = 50
pd.options.display.max_rows = 100

In [None]:
# Read the raw event logs and the submission template from the working directory.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

In [None]:
# Remove rows that are exact duplicates across every column, in both splits.
train, test = train.drop_duplicates(), test.drop_duplicates()

In [None]:
# Convert the raw event_time values into pandas datetimes in both splits.
train = train.assign(event_time=lambda events: pd.to_datetime(events["event_time"]))
test = test.assign(event_time=lambda events: pd.to_datetime(events["event_time"]))

In [None]:
def extract_datetime_features(df, datetime_cols):
    """Derive calendar, flag and cyclical features from datetime columns.

    For every column name in ``datetime_cols`` a family of ``<col>_*`` columns
    is written into ``df``: basic components (year ... second), boolean flags
    (weekend, month/quarter/year start and end, day-of-month after the 15th)
    and sine/cosine encodings of the hour (period 24) and weekday (period 7).

    The frame is modified in place and also returned.

    Args:
        df: DataFrame holding the datetime columns.
        datetime_cols: Iterable of column names whose values support the
            ``.dt`` accessor.

    Returns:
        The same ``df`` object with the new columns added.
    """
    # (output suffix, .dt attribute) pairs, in the order the columns are created.
    calendar_parts = (
        ("year", "year"),
        ("month", "month"),
        ("day", "day"),
        ("weekday", "dayofweek"),
        ("dayofyear", "dayofyear"),
        ("quarter", "quarter"),
        ("hour", "hour"),
        ("minute", "minute"),
        ("second", "second"),
    )
    # .dt attributes whose name is reused verbatim as the output suffix.
    boundary_flags = (
        "is_month_start",
        "is_month_end",
        "is_quarter_start",
        "is_quarter_end",
        "is_year_start",
        "is_year_end",
    )
    full_turn = 2 * np.pi

    for col in datetime_cols:
        stamps = df[col].dt
        hour_of_day = stamps.hour
        day_of_week = stamps.dayofweek

        # Basic components
        for suffix, attribute in calendar_parts:
            df[f"{col}_{suffix}"] = getattr(stamps, attribute)

        # Boolean features (dayofweek 5 and 6 are Saturday and Sunday)
        df[f"{col}_is_weekend"] = day_of_week >= 5
        for flag in boundary_flags:
            df[f"{col}_{flag}"] = getattr(stamps, flag)
        df[f"{col}_after_15"] = stamps.day > 15

        # Cyclical encoding of the hour of day
        df[f"{col}_hour_sin"] = np.sin(full_turn * hour_of_day / 24)
        df[f"{col}_hour_cos"] = np.cos(full_turn * hour_of_day / 24)

        # Cyclical encoding of the day of week
        df[f"{col}_weekday_sin"] = np.sin(full_turn * day_of_week / 7)
        df[f"{col}_weekday_cos"] = np.cos(full_turn * day_of_week / 7)

    return df

def extract_features(df_train):
    """Add ratio, duration and rate features to a session-level table.

    Despite the parameter name, the function is applied to both the train and
    the test session tables. The frame is modified in place and also returned.

    Columns read: the four ``<kind>_count`` and ``user_past_<kind>_count``
    columns (kind in buy / add_cart / view / remove_cart), ``total_events``,
    ``user_past_event_count``, ``user_nunique_session``, ``min_date``,
    ``max_date``, ``user_last_event_time`` and ``user_first_event_time``.

    Args:
        df_train: Session-level DataFrame with the columns listed above.

    Returns:
        The same DataFrame object with the new feature columns added.
    """
    event_kinds = ("buy", "add_cart", "view", "remove_cart")
    seconds_per_day = 86400
    tiny = 1e-6  # keeps the rate denominators non-zero for zero-length spans

    def days_between(later_col, earlier_col):
        # Difference between two datetime columns, expressed in days.
        span = df_train[later_col] - df_train[earlier_col]
        return span.dt.total_seconds() / seconds_per_day

    # Share of each event type within the current session.
    for kind in event_kinds:
        df_train[f"{kind}_ratio"] = df_train[f"{kind}_count"] / df_train["total_events"]

    # Share of each event type within the user's earlier events; this is NaN
    # (0 / 0) for users with no earlier events.
    for kind in event_kinds:
        df_train[f"user_past_{kind}_ratio"] = (
            df_train[f"user_past_{kind}_count"] / df_train["user_past_event_count"]
        )

    df_train["session_duration"] = days_between("max_date", "min_date")
    df_train["last_activity_gap"] = days_between("min_date", "user_last_event_time")
    df_train["user_activity_duration"] = days_between(
        "user_last_event_time", "user_first_event_time"
    )

    past_span = df_train["user_activity_duration"] + tiny
    df_train["user_past_session_count_ratio"] = (
        df_train["user_nunique_session"] / df_train["user_past_event_count"]
    )
    df_train["user_past_session_time_ratio"] = df_train["user_nunique_session"] / past_span
    df_train["user_past_event_time_ratio"] = df_train["user_past_event_count"] / past_span

    # Events per day within the current session.
    df_train["event_frequency"] = df_train["total_events"] / (
        df_train["session_duration"] + tiny
    )

    return df_train

# Train

In [None]:
# Distinct products per (session, category) pair in the train events.
train_products_per_category = (
    train.groupby(["user_session", "category_id"])["product_id"].nunique()
)
# Shannon entropy (natural log) of those counts within each session; it is 0
# when all of a session's products fall in a single category.
train_session_entropy = train_products_per_category.groupby("user_session").agg(
    lambda counts: -np.sum((counts / counts.sum()) * np.log(counts / counts.sum()))
)

# One row per session: time span, owner, target value, per-event-type counts
# and product/category diversity statistics.
df_train = (
    train.groupby("user_session")
    .agg(
        min_date=("event_time", "min"),
        max_date=("event_time", "max"),
        user_id=("user_id", "first"),
        session_value=("session_value", "first"),
        buy_count=("event_type", lambda kinds: kinds.eq("BUY").sum()),
        add_cart_count=("event_type", lambda kinds: kinds.eq("ADD_CART").sum()),
        view_count=("event_type", lambda kinds: kinds.eq("VIEW").sum()),
        remove_cart_count=("event_type", lambda kinds: kinds.eq("REMOVE_CART").sum()),
        total_events=("event_time", "count"),
        unique_product_count=("product_id", "nunique"),
        # Number of events on the session's most frequent product.
        top_product_count=("product_id", lambda products: products.value_counts().iloc[0]),
        unique_category_count=("category_id", "nunique"),
        # How many times consecutive rows move from one category to another.
        # NOTE(review): relies on each session's rows being in chronological
        # order in `train` - confirm.
        category_switch_count=(
            "category_id",
            lambda cats: (cats.ne(cats.shift()) & cats.shift().notna()).sum(),
        ),
        # Events beyond the first on each product / category within the session.
        intra_session_revisit=(
            "product_id",
            lambda products: (products.value_counts() - 1).clip(lower=0).sum(),
        ),
        intra_session_category_revisit=(
            "category_id",
            lambda cats: (cats.value_counts() - 1).clip(lower=0).sum(),
        ),
    )
    .reset_index()
)

# Greater than 1 means some category contributed more than one product.
df_train["product_per_category"] = (
    df_train["unique_product_count"] / df_train["unique_category_count"]
)
# category_entropy carries more information than product_per_category
# (entropy = 0 indicates a more consistent user). The clip turns the -0.0
# values the formula can produce into 0; per the original author no other
# negative values occurred.
df_train["category_entropy"] = (
    df_train["user_session"].map(train_session_entropy).clip(lower=0)
)

In [None]:
# For each (session, product) pair: how many distinct event types it received.
event_per_product = (
    train.groupby(["user_session", "product_id"])["event_type"]
    .nunique()
    .reset_index(name="unique_event_count")
)

# Per-session mono/duo/trio/tetra counts: the number of products that received
# exactly 1 / 2 / 3 / 4 distinct event types in the session.
mono_duo_counts = (
    event_per_product.groupby("user_session")["unique_event_count"]
    .value_counts()
    .unstack(fill_value=0)
    # unstack only creates a column for values that actually occur in the data.
    # Add any of 1-4 that are missing (as zeros) so the output schema does not
    # depend on the data and stays aligned between the train and test tables.
    .pipe(
        lambda counts: counts.reindex(
            columns=counts.columns.union([1, 2, 3, 4]), fill_value=0
        )
    )
    .rename(columns={
        1: "session_product_mono_count",
        2: "session_product_duo_count",
        3: "session_product_trio_count",
        4: "session_product_tetra_count"
    })
    .reset_index()
)

df_train = df_train.merge(mono_duo_counts, on="user_session", how="left")

In [None]:
# Create both per-user timestamp columns as all-NaT with a timezone-aware
# (UTC) nanosecond dtype. The per-session loop later writes event_time values
# into them row by row.
# NOTE(review): presumably created tz-aware up front because event_time is
# tz-aware - confirm against the raw data.
df_train["user_last_event_time"] = pd.Series(
    pd.NaT, index=df_train.index, dtype="datetime64[ns]"
).dt.tz_localize("UTC")

df_train["user_first_event_time"] = pd.Series(
    pd.NaT, index=df_train.index, dtype="datetime64[ns]"
).dt.tz_localize("UTC")

In [None]:
# Per-session user history features: for every session, summarise the same
# user's train events that happened strictly before the session started.
#
# The events are bucketed by user once up front, so each iteration filters only
# that user's rows instead of re-scanning the whole `train` frame per session.
train_events_by_user = {
    user: events for user, events in train.groupby("user_id", sort=False)
}
# Empty frame with train's columns, used when a user has no bucket
# (e.g. a missing user_id, which groupby drops).
no_train_events = train.iloc[0:0]

# `row_idx` rather than `id`, so the builtin id() is not shadowed in the kernel.
for row_idx, row in df_train.iterrows():
    user_events = train_events_by_user.get(row["user_id"], no_train_events)
    selected = user_events[user_events["event_time"] < row["min_date"]]

    # Volume of past activity, overall and per event type.
    df_train.loc[row_idx, "user_past_event_count"] = len(selected)
    df_train.loc[row_idx, "user_past_add_cart_count"] = (selected["event_type"] == "ADD_CART").sum()
    df_train.loc[row_idx, "user_past_view_count"] = (selected["event_type"] == "VIEW").sum()
    df_train.loc[row_idx, "user_past_buy_count"] = (selected["event_type"] == "BUY").sum()
    df_train.loc[row_idx, "user_past_remove_cart_count"] = (selected["event_type"] == "REMOVE_CART").sum()

    # Breadth and time span of past activity (NaT when there is no past event).
    df_train.loc[row_idx, "user_nunique_session"] = selected["user_session"].nunique()
    df_train.loc[row_idx, "user_nunique_product"] = selected["product_id"].nunique()
    df_train.loc[row_idx, "user_nunique_category"] = selected["category_id"].nunique()
    df_train.loc[row_idx, "user_last_event_time"] = selected["event_time"].max()
    df_train.loc[row_idx, "user_first_event_time"] = selected["event_time"].min()

    # Inter-session revisit count: past events beyond the first on each product.
    product_counts = selected['product_id'].value_counts()
    revisit_count = (product_counts.sub(1).clip(lower=0)).sum()
    df_train.loc[row_idx, 'inter_session_revisit'] = revisit_count

    # Inter-session category revisit count: same idea per category.
    category_counts = selected['category_id'].value_counts()
    inter_session_category_count = (category_counts.sub(1).clip(lower=0)).sum()
    df_train.loc[row_idx, 'inter_session_category_count'] = inter_session_category_count

    # Number of past products that received exactly 1 / 2 / 3 / 4 distinct
    # event types. Uses its own name so the `event_per_product` DataFrame built
    # in an earlier cell is not overwritten with a Series.
    past_event_kinds_per_product = selected.groupby("product_id")["event_type"].nunique().value_counts()
    df_train.loc[row_idx, "user_past_product_mono_count"] = past_event_kinds_per_product.get(1, 0)
    df_train.loc[row_idx, "user_past_product_duo_count"] = past_event_kinds_per_product.get(2, 0)
    df_train.loc[row_idx, "user_past_product_trio_count"] = past_event_kinds_per_product.get(3, 0)
    df_train.loc[row_idx, "user_past_product_tetra_count"] = past_event_kinds_per_product.get(4, 0)

    # Same breakdown per past category.
    event_per_category = selected.groupby("category_id")["event_type"].nunique().value_counts()
    df_train.loc[row_idx, "user_past_category_mono_count"] = event_per_category.get(1, 0)
    df_train.loc[row_idx, "user_past_category_duo_count"] = event_per_category.get(2, 0)
    df_train.loc[row_idx, "user_past_category_trio_count"] = event_per_category.get(3, 0)
    df_train.loc[row_idx, "user_past_category_tetra_count"] = event_per_category.get(4, 0)

    # Total / mean session_value over the user's past sessions, taking one row
    # per session. The mean is NaN when there is no past session.
    past_sessions = selected.drop_duplicates(subset="user_session")
    df_train.loc[row_idx, "past_value_sum"] = past_sessions["session_value"].sum()
    df_train.loc[row_idx, "past_value_mean"] = past_sessions["session_value"].mean()

In [None]:
# Add the ratio/duration features, then the calendar features of the session
# start time.
df_train = (
    df_train
    .pipe(extract_features)
    .pipe(extract_datetime_features, ["min_date"])
)

In [None]:
# The raw per-user timestamps were only needed to derive the gap/duration
# features, so they are removed from the final table.
df_train = df_train.drop(["user_last_event_time","user_first_event_time"],axis=1)

# Exclude the two listed sessions from the training table.
# The previous condition joined two `!=` comparisons with `|`, which is True
# for every row (no session id equals both values), so nothing was ever
# removed. `~isin` expresses the intended "neither of these" filter. Any shape
# recorded before this fix will be larger by the number of excluded sessions
# present in the data.
# NOTE(review): the reason for excluding these two sessions is not stated in
# the notebook - presumably outliers; confirm.
df_train = df_train[~df_train["user_session"].isin(["SESSION_000000", "SESSION_020888"])]

In [None]:
# Features that are NaN when the user has no earlier events (or, for
# category_entropy, when the session has no mapped entropy) are set to 0.
columns_0 = [
    "user_activity_duration",
    "user_past_event_time_ratio",
    "user_past_session_time_ratio",
    "user_past_session_count_ratio",
    "user_past_remove_cart_ratio",
    "user_past_view_ratio",
    "user_past_add_cart_ratio",
    "user_past_buy_ratio",
    "category_entropy",
    "past_value_mean",
]
# A missing gap since the user's last activity gets the sentinel value 999.
columns_99 = ["last_activity_gap"]

df_train[columns_0] = df_train[columns_0].fillna(0)
df_train[columns_99] = df_train[columns_99].fillna(999)

In [None]:
# Write the finished training feature table to disk without the row index.
df_train.to_csv(path_or_buf="train_final.csv", index=False)

In [None]:
# Display the (rows, columns) shape of the finished training feature table.
df_train.shape

(70736, 78)

In [None]:
# Remove the training feature table from the namespace; it has already been
# written to train_final.csv. Presumably done to free memory before the test
# features are built.
del df_train

# Test

In [None]:
# One row per test session: time span, owner, per-event-type counts and
# product/category diversity statistics. Mirrors the train aggregation, minus
# the session_value target.
df_test = (
    test.groupby("user_session")
            .agg(
                min_date=("event_time", "min"),
                max_date=("event_time", "max"),
                user_id=("user_id", "first"),
                buy_count=("event_type", lambda x: (x == "BUY").sum()),
                add_cart_count=("event_type", lambda x: (x == "ADD_CART").sum()),
                view_count=("event_type", lambda x: (x == "VIEW").sum()),
                remove_cart_count=("event_type", lambda x: (x == "REMOVE_CART").sum()),
                total_events=("event_time", "count"),

                unique_product_count = ("product_id", "nunique"),
                # Number of events on the session's most frequent product.
                top_product_count = ("product_id", lambda x: x.value_counts().iloc[0]),
                unique_category_count = ("category_id", "nunique"),
                # How many times consecutive rows move from one category to another.
                # NOTE(review): relies on each session's rows being in
                # chronological order in `test` - confirm.
                category_switch_count = ("category_id",lambda x: ((x != x.shift(1)) & x.shift(1).notna()).sum()),

                # Events beyond the first on each product within the session.
                intra_session_revisit=("product_id",lambda x: x.value_counts().sub(1).clip(lower=0).sum()),

                # Events beyond the first on each category within the session.
                intra_session_category_revisit=("category_id",lambda x: x.value_counts().sub(1).clip(lower=0).sum()),
            )
            .reset_index()
            .assign(
            # Greater than 1 means some category contributed more than one product.
            product_per_category = (lambda a: (a["unique_product_count"] / a["unique_category_count"])),

            # category_entropy carries more information than product_per_category
            # (entropy = 0 indicates a more consistent user).
            # Fix: the entropy must come from the TEST events. It was previously
            # computed from `train`, so test session ids found no match, mapped
            # to NaN, and were later zero-filled.
            category_entropy = lambda a: a['user_session'].map(
                test.groupby(['user_session','category_id'])['product_id']
                .nunique()  # unique products per category, per session
                .groupby('user_session')
                .agg(lambda x: -np.sum((x / x.sum()) * np.log(x / x.sum())))
                ).clip(lower=0) # turns the -0.0 values the formula can produce into 0; per the original author no other negatives occurred
        )
    )

In [None]:
# For each (session, product) pair: how many distinct event types it received.
event_per_product = (
    test.groupby(["user_session", "product_id"])["event_type"]
    .nunique()
    .reset_index(name="unique_event_count")
)

# Per-session mono/duo/trio/tetra counts: the number of products that received
# exactly 1 / 2 / 3 / 4 distinct event types in the session.
mono_duo_counts = (
    event_per_product.groupby("user_session")["unique_event_count"]
    .value_counts()
    .unstack(fill_value=0)
    # unstack only creates a column for values that actually occur in the data.
    # Add any of 1-4 that are missing (as zeros) so the test table always has
    # the same four columns as the train table, even if e.g. no test product
    # received all four event types.
    .pipe(
        lambda counts: counts.reindex(
            columns=counts.columns.union([1, 2, 3, 4]), fill_value=0
        )
    )
    .rename(columns={
        1: "session_product_mono_count",
        2: "session_product_duo_count",
        3: "session_product_trio_count",
        4: "session_product_tetra_count"
    })
    .reset_index()
)

df_test = df_test.merge(mono_duo_counts, on="user_session", how="left")

In [None]:
# Create both per-user timestamp columns as all-NaT with a timezone-aware
# (UTC) nanosecond dtype. The per-session loop later writes event_time values
# into them row by row.
# NOTE(review): presumably created tz-aware up front because event_time is
# tz-aware - confirm against the raw data.
df_test["user_last_event_time"] = pd.Series(
    pd.NaT, index=df_test.index, dtype="datetime64[ns]"
).dt.tz_localize("UTC")

df_test["user_first_event_time"] = pd.Series(
    pd.NaT, index=df_test.index, dtype="datetime64[ns]"
).dt.tz_localize("UTC")

In [None]:
# Per-session user history features for the test table. A test session's
# history is ALL of the same user's train events plus the user's test events
# that happened strictly before the session started.
# NOTE(review): train events are included regardless of their timestamp, which
# assumes the test period follows the train period - confirm.
#
# Events are bucketed by user once up front, so each iteration touches only
# that user's rows instead of re-scanning the full `train` and `test` frames.
train_events_by_user = {
    user: events for user, events in train.groupby("user_id", sort=False)
}
test_events_by_user = {
    user: events for user, events in test.groupby("user_id", sort=False)
}
# Empty frames with the right columns, for users without a bucket in a split.
no_train_events = train.iloc[0:0]
no_test_events = test.iloc[0:0]

# `row_idx` rather than `id`, so the builtin id() is not shadowed in the kernel.
for row_idx, row in df_test.iterrows():
    user_train_events = train_events_by_user.get(row["user_id"], no_train_events)
    user_test_events = test_events_by_user.get(row["user_id"], no_test_events)
    selected = pd.concat([
        user_train_events,
        user_test_events[user_test_events["event_time"] < row["min_date"]]
    ], ignore_index=True)

    # Volume of past activity, overall and per event type.
    df_test.loc[row_idx, "user_past_event_count"] = len(selected)
    df_test.loc[row_idx, "user_past_add_cart_count"] = (selected["event_type"] == "ADD_CART").sum()
    df_test.loc[row_idx, "user_past_view_count"] = (selected["event_type"] == "VIEW").sum()
    df_test.loc[row_idx, "user_past_buy_count"] = (selected["event_type"] == "BUY").sum()
    df_test.loc[row_idx, "user_past_remove_cart_count"] = (selected["event_type"] == "REMOVE_CART").sum()

    # Breadth and time span of past activity (NaT when there is no past event).
    df_test.loc[row_idx, "user_nunique_session"] = selected["user_session"].nunique()
    df_test.loc[row_idx, "user_nunique_product"] = selected["product_id"].nunique()
    df_test.loc[row_idx, "user_nunique_category"] = selected["category_id"].nunique()
    df_test.loc[row_idx, "user_last_event_time"] = selected["event_time"].max()
    df_test.loc[row_idx, "user_first_event_time"] = selected["event_time"].min()

    # Inter-session revisit count: past events beyond the first on each product.
    product_counts = selected['product_id'].value_counts()
    revisit_count = (product_counts.sub(1).clip(lower=0)).sum()
    df_test.loc[row_idx, 'inter_session_revisit'] = revisit_count

    # Inter-session category revisit count: same idea per category.
    category_counts = selected['category_id'].value_counts()
    inter_session_category_count = (category_counts.sub(1).clip(lower=0)).sum()
    df_test.loc[row_idx, 'inter_session_category_count'] = inter_session_category_count

    # Number of past products that received exactly 1 / 2 / 3 / 4 distinct
    # event types. Uses its own name so the `event_per_product` DataFrame built
    # in an earlier cell is not overwritten with a Series.
    past_event_kinds_per_product = selected.groupby("product_id")["event_type"].nunique().value_counts()
    df_test.loc[row_idx, "user_past_product_mono_count"] = past_event_kinds_per_product.get(1, 0)
    df_test.loc[row_idx, "user_past_product_duo_count"] = past_event_kinds_per_product.get(2, 0)
    df_test.loc[row_idx, "user_past_product_trio_count"] = past_event_kinds_per_product.get(3, 0)
    df_test.loc[row_idx, "user_past_product_tetra_count"] = past_event_kinds_per_product.get(4, 0)

    # Same breakdown per past category.
    event_per_category = selected.groupby("category_id")["event_type"].nunique().value_counts()
    df_test.loc[row_idx, "user_past_category_mono_count"] = event_per_category.get(1, 0)
    df_test.loc[row_idx, "user_past_category_duo_count"] = event_per_category.get(2, 0)
    df_test.loc[row_idx, "user_past_category_trio_count"] = event_per_category.get(3, 0)
    df_test.loc[row_idx, "user_past_category_tetra_count"] = event_per_category.get(4, 0)

    # Total / mean session_value over the user's past sessions, taking one row
    # per session. sum() and mean() skip missing values, so rows without a
    # session_value do not contribute; the mean is NaN when none is available.
    past_sessions = selected.drop_duplicates(subset="user_session")
    df_test.loc[row_idx, "past_value_sum"] = past_sessions["session_value"].sum()
    df_test.loc[row_idx, "past_value_mean"] = past_sessions["session_value"].mean()

In [None]:
# Add the ratio/duration features, then the calendar features of the session
# start time.
df_test = (
    df_test
    .pipe(extract_features)
    .pipe(extract_datetime_features, ["min_date"])
)

In [None]:
# The raw per-user timestamps were only needed to derive the gap/duration
# features, so they are removed from the final table.
df_test = df_test.drop(columns=["user_last_event_time", "user_first_event_time"])

In [None]:
# Features that are NaN when the user has no earlier events (or, for
# category_entropy, when the session has no mapped entropy) are set to 0.
# "past_value_mean" is included so the test table is filled exactly like the
# train table; it was previously omitted here, leaving NaN in test only.
columns_0= ["user_activity_duration","user_past_event_time_ratio","user_past_session_time_ratio","user_past_session_count_ratio",
           "user_past_remove_cart_ratio","user_past_view_ratio","user_past_add_cart_ratio","user_past_buy_ratio","category_entropy","past_value_mean"]
# A missing gap since the user's last activity gets the sentinel value 999.
columns_99= ["last_activity_gap"]
for i in columns_0:
    df_test[i] = df_test[i].fillna(0)
for i in columns_99:
    df_test[i] = df_test[i].fillna(999)

In [None]:
# Write the finished test feature table to disk without the row index.
df_test.to_csv(path_or_buf="test_final.csv", index=False)