In [1]:
import polars as pl
import numpy as np 



def load_candidates_df(filepath: str) -> pl.DataFrame:
    """Load candidates data from a CSV file into a Polars DataFrame."""
    df = pl.read_parquet(filepath)
    return df

parquet_df_path = "candidates_stage1.parquet"
candidates_df = load_candidates_df(parquet_df_path)
candidates_df.head()

customer_id,item_id
i64,str
14732,"""0029140000031"""
14732,"""5471000000006"""
14732,"""5471000000005"""
14732,"""2744000000008"""
14732,"""2744000000009"""


In [1]:
candidates_df.shape

NameError: name 'candidates_df' is not defined

In [4]:
purchase_df_path = "./data/table/purchase_new.parquet"
purchase_df = pl.read_parquet(purchase_df_path)
purchase_df = purchase_df.select(['customer_id','item_id'])
purchase_df.head()

customer_id,item_id
i32,str
5254214,"""7115000000004"""
7573232,"""0029130000030"""
8187418,"""3496000000053"""
8187418,"""2700000000002"""
6931560,"""0029110000036"""


In [5]:
def group_by_customer_id(purchase_df: pl.DataFrame) -> pl.DataFrame:
    """Group the purchase DataFrame by customer_id and aggregate item_id into a list."""
    grouped_df = purchase_df.group_by("customer_id").agg(
        pl.col("item_id").alias("item_id")
    )
    return grouped_df



In [6]:
candidates_df = group_by_customer_id(candidates_df)
candidates_df.head()

customer_id,item_id
i64,list[str]
7961288,"[""2542000000003"", ""5950000000002"", … ""5858000000001""]"
1107503,"[""4869000000011"", ""4869000000001"", … ""5617000000001""]"
7057750,"[""2488000000006"", ""0028090000046"", … ""5137000000002""]"
4042637,"[""6697000000002"", ""6697000000001"", … ""1434000000013""]"
4324589,"[""6065000000004"", ""6065000000003"", … ""5537000000012""]"


In [7]:
history_df = group_by_customer_id(purchase_df)
history_df.head()

customer_id,item_id
i32,list[str]
7111746,"[""4789000000015"", ""2231000000012"", … ""0008104000002""]"
941028,"[""2287000850001"", ""2287000850001"", … ""2287000850001""]"
8033118,"[""4467000000001"", ""4467000000004"", … ""6848000000003""]"
2655951,"[""2180000000025"", ""2166000000003"", … ""4279000000061""]"
2849561,"[""5843000000005"", ""6997000000032"", … ""2202000000008""]"


In [8]:
groundtruth_path = 'data/final_groundtruth.pkl'
import pickle
with open(groundtruth_path, 'rb') as f:
    groundtruth = pickle.load(f)

def convert_dict_to_pl_df(groundtruth: dict) -> pl.DataFrame:
    """Convert a dictionary of groundtruth data to a Polars DataFrame."""
    records = [{"customer_id": k, "item_id": v} for k, v in groundtruth.items()]
    df = pl.DataFrame(records)
    return df

groundtruth = pl.from_pandas(groundtruth)
groundtruth.head()

customer_id,item_id
i32,list[str]
2337685,"[""0020010000305""]"
7934799,"[""0020010000438""]"
2052333,"[""3513000000064"", ""2403000000004"", … ""2125000000025""]"
6548920,"[""6701000000004""]"
368770,"[""5420000000003"", ""5420000000002"", … ""6768000000003""]"


In [23]:
candidates_df.head()

customer_id,item_id
i64,str
2033290,"""0020020000185"""
2033290,"""5857000000001"""
2033290,"""4355000000001"""
2033290,"""1371000000006"""
2033290,"""6697000000001"""


In [16]:
def precision_at_k(df_pred, df_gt, df_hist=None, filter_bought_items=True, K=10):
    precisions = []
    cold_start_users = []
    
    gt_dict = dict(zip(df_gt["customer_id"], df_gt["item_id"]))
    hist_dict = dict(zip(df_hist["customer_id"], df_hist["item_id"])) if df_hist is not None else {}

    for user, pred_items in zip(df_pred["customer_id"], df_pred["item_id"]):
        if user not in gt_dict:
            continue
        
        gt_items = set(gt_dict[user])
        relevant_items = gt_items.copy()
        
        if filter_bought_items and user in hist_dict:
            relevant_items -= set(hist_dict[user])
        
        if len(relevant_items) == 0:
            cold_start_users.append(user)
            continue

        # Lấy top K
        top_k = pred_items[:100]
        hits = len(set(top_k) & relevant_items)
        # SỬA: Chia cho số items thực tế (nếu < K)
        precisions.append(hits / K)

    mean_precision = np.mean(precisions) if precisions else 0.0
    return mean_precision, cold_start_users, len(df_gt)

In [10]:
def get_shape_per_user(df: pl.DataFrame) -> pl.DataFrame:
    """Get the number of items per user in the DataFrame."""
    shape_df = df.with_columns(
        pl.col("item_id").list.len().alias("num_items")
    )
    return shape_df

df_shape = get_shape_per_user(candidates_df)
df_shape.head(100)

customer_id,item_id,num_items
i64,list[str],u32
7961288,"[""2542000000003"", ""5950000000002"", … ""5858000000001""]",200
1107503,"[""4869000000011"", ""4869000000001"", … ""5617000000001""]",200
7057750,"[""2488000000006"", ""0028090000046"", … ""5137000000002""]",200
4042637,"[""6697000000002"", ""6697000000001"", … ""1434000000013""]",200
4324589,"[""6065000000004"", ""6065000000003"", … ""5537000000012""]",200
…,…,…
7819185,"[""4584000000006"", ""0225000000007"", … ""5435000000002""]",200
8047129,"[""0212000000012"", ""0210000000005"", … ""0029140000031""]",200
1395353,"[""4027000000037"", ""6053000000062"", … ""3855000000001""]",200
5224727,"[""3880000000002"", ""3880000000001"", … ""0029140000031""]",200


In [18]:
precision, cold,  nusers = precision_at_k(candidates_df, groundtruth, history_df, filter_bought_items=True, K=10)
print("precision at K:", precision)
print("cold start users:", len(cold))
print("total users:", nusers)

precision at K: 0.08453694774317087
cold start users: 81492
total users: 644970


In [1]:
import pickle

def load_file(file_path: str):
    with open(file_path, 'rb') as f:
        data = pickle.load(f)
    return data

data = load_file('data/final_groundtruth.pkl')


In [4]:
data = pl.from_pandas(data)

In [5]:
data.head()

customer_id,item_id
i32,list[str]
2337685,"[""0020010000305""]"
7934799,"[""0020010000438""]"
2052333,"[""3513000000064"", ""2403000000004"", … ""2125000000025""]"
6548920,"[""6701000000004""]"
368770,"[""5420000000003"", ""5420000000002"", … ""6768000000003""]"


In [3]:
import polars as pl

result = pl.read_parquet('data/table/purchase_new.parquet')

In [14]:
def count_users_in_transaction(groundtruth: pl.DataFrame, transaction: pl.DataFrame) -> dict:
    # user trong groundtruth
    gt_users = groundtruth.select("customer_id").unique()

    # user trong transaction
    tx_users = transaction.select("customer_id").unique()

    # user groundtruth có xuất hiện trong transaction
    users_in_tx = gt_users.join(tx_users, on="customer_id", how="inner")

    return {
        "total_users_groundtruth": gt_users.height,
        "users_appeared_in_transaction": users_in_tx.height,
        "users_not_appeared_in_transaction": gt_users.height - users_in_tx.height,
    }

stats = count_users_in_transaction(data, result)
stats

{'total_users_groundtruth': 644970,
 'users_appeared_in_transaction': 509735,
 'users_not_appeared_in_transaction': 135235}

In [12]:
merged_df.shape

(22836646, 18)

In [3]:
df_grouped = result.sort(["customer_id", "pred_score"], descending=[False, True]).group_by("customer_id", maintain_order=True).agg(pl.col("item_id"))


In [4]:
df_grouped

customer_id,item_id
i64,list[str]
14732,"[""6768000000005"", ""7176000000002"", … ""1512000000004""]"
15126,"[""6768000000004"", ""2803000000011"", … ""6768000000005""]"
16520,"[""1512000000004"", ""5950000000001"", … ""2803000000013""]"
17212,"[""6768000000004"", ""5950000000001"", … ""7176000000002""]"
17224,"[""2803000000011"", ""1512000000004"", … ""6768000000004""]"
…,…
9264155,"[""2798000000001"", ""2803000000013"", … ""6768000000005""]"
9264156,"[""2798000000001"", ""5950000000001"", … ""2803000000011""]"
9264157,"[""3773000000004"", ""6768000000005"", … ""2803000000013""]"
9264158,"[""7176000000002"", ""2798000000001"", … ""3773000000004""]"


In [5]:
import pickle

def load_file(file_path: str):
    with open(file_path, 'rb') as f:
        data = pickle.load(f)
    return data

data = load_file('data/final_groundtruth.pkl')

In [8]:
data = pl.from_pandas(data)

In [12]:
final_submiss = df_grouped.join(data , on = "customer_id"  ).drop('item_id_right')

In [18]:
final_submiss.head()

customer_id,item_id
i64,list[str]
17212,"[""6768000000004"", ""5950000000001"", … ""7176000000002""]"
28879,"[""2803000000013"", ""3773000000004"", … ""2798000000001""]"
29041,"[""5950000000001"", ""6768000000004"", … ""2803000000011""]"
29421,"[""3773000000004"", ""7176000000002"", … ""4690000000001""]"
29718,"[""6768000000004"", ""2803000000011"", … ""2798000000001""]"


In [19]:
import json

# key = customer_id (string), value = list[item_id]
out = {
    str(cid): items
    for cid, items in final_submiss.select(["customer_id", "item_id"]).iter_rows()
}

with open("final_submission.json", "w", encoding="utf-8") as f:
    json.dump(out, f, ensure_ascii=False, indent=2)