# In this notebook, the co-visit window is (0, 1]: only the item viewed immediately after each item is counted

In [16]:
from collections import defaultdict, Counter
from typing import List, Dict, Union

from tqdm import tqdm
import polars as pl

In [17]:
def construct_session_sequnce(df: pl.DataFrame) -> pl.DataFrame:
    """Flatten sessions into one row per viewed item and number the rows.

    The misspelled name is kept on purpose: other cells call the function
    under this name.

    Args:
        df: Frame with a ``session_id`` column and a ``prev_items`` column
            that can be exploded (presumably a list of item ids per
            session -- confirm against the parquet schema).

    Returns:
        The exploded frame with an extra ``sequence_num`` column holding a
        running count of rows within each ``session_id``.
    """
    # Expand each entry of prev_items into its own row.
    df = df.explode("prev_items")
    # Hand the window expression directly to with_columns instead of first
    # materialising it through a separate df.select(...) call.
    return df.with_columns(
        pl.col("session_id").cum_count().over("session_id").alias("sequence_num")
    )

In [18]:
def covisit_matrix(df: pl.DataFrame) -> pl.DataFrame:
    """Count how often one item is immediately followed by another item.

    Args:
        df: Frame with ``session_id``, ``prev_items`` and ``sequence_num``
            columns, one row per viewed item.

    Returns:
        Frame with the columns ``item``, ``candidate_item`` and
        ``consective_1_weight``, where the weight is the number of times
        ``candidate_item`` appeared exactly one position after ``item`` in
        the same session. Pairs where both sides are the same item are
        excluded.
    """
    steps = df.select(["session_id", "prev_items", "sequence_num"])

    # Self-join on the session: every row is paired with every row of the
    # same session; the right-hand copies get the "_right" suffix.
    pairs = steps.join(steps, on="session_id")

    # Cast before subtracting so that a negative distance cannot wrap around
    # if sequence_num is an unsigned integer column.
    pairs = pairs.with_columns(
        (
            pl.col("sequence_num_right").cast(pl.Int64)
            - pl.col("sequence_num").cast(pl.Int64)
        ).alias("diff_sequence_num")
    )
    # Keep only "right item comes directly after left item" ...
    pairs = pairs.filter(pl.col("diff_sequence_num") == 1)
    # ... and drop immediate repeats of the same item.
    pairs = pairs.filter(pl.col("prev_items") != pl.col("prev_items_right"))

    # Each surviving pair contributes a weight of 1. Only the weight column
    # is aggregated; summing every remaining column (session id, sequence
    # numbers) would be wasted work because they are discarded afterwards.
    pairs = pairs.with_columns(pl.lit(1).alias("consective_1_weight"))
    counts = pairs.group_by(["prev_items", "prev_items_right"]).agg(
        pl.col("consective_1_weight").sum()
    )

    return counts.rename(
        {"prev_items": "item", "prev_items_right": "candidate_item"}
    ).select(["item", "candidate_item", "consective_1_weight"])

In [19]:
"""
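From the co-visit matrix (co_visit_matrix), keep only the item pairs in which both items are available in a given locale (each entry of the LOCALES list), then compute the co-visit weights and rank them.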
"""
def filter_by_locale_availability(
    co_visit_matrix: pl.DataFrame,
    product: pl.DataFrame,
    locales: Union[List[str], None] = None,
    top_n: Union[int, None] = None,
) -> pl.DataFrame:
    """Keep, per locale, the top co-visited pairs whose items are both available there.

    Args:
        co_visit_matrix: Frame with ``item``, ``candidate_item`` and
            ``consective_1_weight`` columns.
        product: Frame with ``id`` and ``available_locales`` columns
            (``available_locales`` is assumed to hold the collection of
            locales per product -- confirm against the product parquet).
        locales: Locales to build a table for. Defaults to the notebook-level
            ``LOCALES`` constant, looked up at call time. Must not be empty,
            since the per-locale frames are concatenated at the end.
        top_n: Maximum number of candidates kept per item and locale.
            Defaults to the notebook-level ``TOP_N`` constant.

    Returns:
        One frame with the rows of all locales stacked, with the columns
        ``item``, ``candidate_item``, ``consective_1_weight``, ``locale`` and
        ``consective_1_rank`` (rank of the weight within the item, highest
        weight first, ties sharing the smallest rank).
    """
    # Fall back to the notebook configuration so existing two-argument calls
    # behave exactly as before.
    if locales is None:
        locales = LOCALES
    if top_n is None:
        top_n = TOP_N

    # One row per product id, carrying only the locale information.
    locale_lookup = product.unique(subset=["id"]).select(["id", "available_locales"])

    # Attach the locales of both sides of every pair. Left joins keep pairs
    # whose item is missing from the product table; their locale columns are
    # null, so they do not pass the membership filter below.
    pairs = (
        co_visit_matrix
        .join(locale_lookup, left_on="item", right_on="id", how="left")
        .rename({"available_locales": "item_locales"})
        .join(locale_lookup, left_on="candidate_item", right_on="id", how="left")
        .rename({"available_locales": "candidate_item_locales"})
    )

    per_locale = []
    for locale in locales:
        both_available = (
            pl.lit(locale).is_in(pl.col("item_locales"))
            & pl.lit(locale).is_in(pl.col("candidate_item_locales"))
        )
        ranked = (
            pairs.filter(both_available)
            .with_columns(pl.lit(locale).alias("locale"))
            .select(["item", "candidate_item", "consective_1_weight", "locale"])
            # Sort so that head() below keeps the heaviest candidates per item.
            .sort(["item", "consective_1_weight"], descending=[False, True])
            .group_by("item", maintain_order=True)
            .head(top_n)
            .with_columns(
                pl.col("consective_1_weight")
                .rank(descending=True, method="min")
                .over("item")
                .alias("consective_1_rank")
            )
        )
        per_locale.append(ranked)

    return pl.concat(per_locale)

**Build the co-visit matrix**

In [20]:
# The two commented-out settings below are not used by any cell shown in this
# notebook; presumably left over from a weighted-window variant -- TODO confirm
# and delete if so.
#WINDOW_N = 10
#WEIGHTS = {-10:0.01, -9:0.01, -8:0.01, -7:0.01, -6:0.01, -5:0.05, -4:0.05, -3:0.1, -2:0.25, -1:0.5, 1:3, \ 2:0.5, 3:0.25, 4:0.1, 5:0.05, 6:0.01, 7:0.01, 8:0.01, 9:0.01, 10:0.01, }
# Maximum number of candidate items kept per item within each locale.
TOP_N = 200
# Locales for which the co-visit table is filtered and evaluated.
LOCALES = ["DE", "UK", "JP"]
# Tag corresponding to the `{version}` placeholder in the output parquet file names.
version = "12"

In [21]:
# Load the training sessions and flatten them to one row per viewed item.
train = construct_session_sequnce(
    pl.read_parquet("/root/KDDCUP/KDDCUP_DATA_CLEANED/task1_session_train.parquet")
)
# Product table, used for the per-locale availability filter.
product = pl.read_parquet("/root/KDDCUP/KDDCUP_DATA_CLEANED/product.parquet")
# Count consecutive item pairs, then keep the top candidates per locale.
co_visit_matrix = filter_by_locale_availability(covisit_matrix(train), product)

  df = df.with_columns(


In [22]:
# Preview the first rows of the locale-filtered co-visit table.
# NOTE(review): the saved output below lists `co_visit_weight` / `co_visit_rank`,
# whereas the code in this notebook names those columns `consective_1_weight` /
# `consective_1_rank` -- the output looks stale; re-run the cell to refresh it.
co_visit_matrix.head()

item,candidate_item,co_visit_weight,locale,co_visit_rank
str,str,f64,str,u32
"""0007440847""","""0261103563""",0.5,"""DE""",1
"""0007440847""","""3608938184""",0.5,"""DE""",1
"""0007440847""","""3608939849""",0.4,"""DE""",3
"""0007440847""","""3423214120""",0.25,"""DE""",4
"""0007440847""","""0008376107""",0.25,"""DE""",4


**Save the result**

In [23]:
# The path must be an f-string: without the prefix the `{version}` placeholder
# is not substituted and the file is written with a literal "{version}" in its
# name. NOTE(review): any consumer that reads the old literal file name needs
# to be pointed at the versioned name.
co_visit_matrix.write_parquet(f"/root/KDDCUP/KDDCUP_RECALL_INTERM/co_visit_matrix_{version}_for_train_or_eval.parquet")

**Evaluate the candidates on the training sessions (MRR@100)**

In [24]:
def _mrr_at_k(label_lists, k=100):
    """Mean reciprocal rank over per-session label lists, cut off at ``k``.

    Args:
        label_lists: One list per session with the labels of that session's
            candidates in ranked order; an entry equal to 1 marks a hit.
        k: Only the first ``k`` candidates of each session are inspected.

    Returns:
        The mean over all sessions of 1 / (position of the first hit), where
        a session without a hit in its first ``k`` candidates contributes 0.
        Returns 0.0 for an empty input instead of dividing by zero.
    """
    if not label_lists:
        return 0.0
    total = 0.0
    for labels in label_lists:
        for position, label in enumerate(labels[:k], start=1):
            if label == 1:
                total += 1 / position
                break
    return total / len(label_lists)


# Start again from the raw sessions (one row per session).
train = pl.read_parquet("/root/KDDCUP/KDDCUP_DATA_CLEANED/task1_session_train.parquet")

# The last viewed item of every session is the key for the candidate lookup.
last_item_list = [prev_items[-1] for prev_items in train["prev_items"].to_list()]
train = train.with_columns(pl.Series(name="last_item", values=last_item_list))

train = train[["session_id", "locale", "last_item", "next_item"]]

# Merge candidates and add the label, one locale at a time.
label_lists = []
for locale in LOCALES:
    df = train.filter(pl.col("locale") == locale)
    matrix = co_visit_matrix.filter(pl.col("locale") == locale)
    # Left join: sessions whose last item has no candidates keep a single row
    # with null candidate columns, so they still count in the denominator.
    df = df.join(matrix, left_on=["locale", "last_item"], right_on=["locale", "item"], how="left")
    # Within each session, order candidates from highest to lowest weight.
    df = df.sort(["session_id", "consective_1_weight"], descending=[False, True])
    df = df.with_columns((pl.col("candidate_item") == pl.col("next_item")).cast(pl.Int8).alias("label"))
    label_lists.extend(df.group_by("session_id", maintain_order=True).all()["label"].to_list())

# MRR@100
mrr = _mrr_at_k(label_lists, k=100)
print("MRR:", round(mrr, 5))

MRR: 0.10847


**Now append next_item to prev_items and rebuild the co-visit matrix for inference**

In [2]:
# Start again from the raw training sessions.
# NOTE: this cell needs the import cell and the function/config cells to have
# been run in the current kernel first.
train = pl.read_parquet("/root/KDDCUP/KDDCUP_DATA_CLEANED/task1_session_train.parquet")

# Extend every session's history with its next_item.
prev_items_list = train["prev_items"].to_list()
next_item_list = train["next_item"].to_list()
prev_items_list_updated = [
    history + [target]
    for history, target in zip(prev_items_list, next_item_list)
]
train = train.with_columns(
    pl.Series(name="prev_items", values=prev_items_list_updated)
)

# Same pipeline as for the train/eval matrix, now on the extended sessions.
train = construct_session_sequnce(train)
product = pl.read_parquet("/root/KDDCUP/KDDCUP_DATA_CLEANED/product.parquet")
co_visit_matrix = filter_by_locale_availability(covisit_matrix(train), product)


NameError: name 'pl' is not defined

**Save the result**

In [None]:
# The path must be an f-string: without the prefix the `{version}` placeholder
# is not substituted and the file is written with a literal "{version}" in its
# name. NOTE(review): any consumer that reads the old literal file name needs
# to be pointed at the versioned name.
co_visit_matrix.write_parquet(f"/root/KDDCUP/KDDCUP_RECALL_INTERM/co_visit_matrix_{version}_for_inference.parquet")