# Getting started with the EB-NeRD

In [1]:
import os
from pathlib import Path

import polars as pl

import sys

# Make the project's `src` package importable for the imports below.
# The checkout location can be overridden with the EBNERD_REPO_ROOT environment
# variable; the original absolute path stays as the fallback so existing
# setups keep working unchanged.
sys.path.append(
    os.environ.get(
        "EBNERD_REPO_ROOT", "/Users/frederiknagel/Desktop/NewsRecommendation_19"
    )
)

from src.ebrec.utils._descriptive_analysis import (
    min_max_impression_time_behaviors,
    min_max_impression_time_history,
)
from src.ebrec.utils._polars import slice_join_dataframes
from src.ebrec.utils._behaviors import (
    create_binary_labels_column,
    sampling_strategy_wu2019,
    truncate_history,
)
from src.ebrec.utils._constants import *
from src.ebrec.utils._python import compute_npratio

## Load dataset:

In [3]:

# Root folder holding the EB-NeRD split directories used below.
# Override with the EBNERD_DATA_DIR environment variable; the original absolute
# path is kept as the default so existing setups behave exactly as before.
PATH = Path(
    os.environ.get(
        "EBNERD_DATA_DIR",
        "/Users/frederiknagel/Desktop/NewsRecommendation_19/ebnerd_data",
    )
)
# Sub-folder names (relative to PATH) for the train/validation and test data.
TRAIN_VAL_SPLIT = "ebnerd_small"  # [ebnerd_demo, ebnerd_small, ebnerd_large]
TEST_SPLIT = "ebnerd_testset"

In [4]:
# Lazily scan the parquet files of every split; the frames are only
# materialised where a later cell calls `.collect()`.
# Each frame is bound to exactly one descriptive name.

# Train split
df_behaviors_train = pl.scan_parquet(
    PATH.joinpath(TRAIN_VAL_SPLIT, "train", "behaviors.parquet")
)
df_history_train = pl.scan_parquet(
    PATH.joinpath(TRAIN_VAL_SPLIT, "train", "history.parquet")
)

# Validation split
df_behaviors_val = pl.scan_parquet(
    PATH.joinpath(TRAIN_VAL_SPLIT, "validation", "behaviors.parquet")
)
df_history_val = pl.scan_parquet(
    PATH.joinpath(TRAIN_VAL_SPLIT, "validation", "history.parquet")
)

# Test split
df_behaviors_test = pl.scan_parquet(
    PATH.joinpath(TEST_SPLIT, "test", "behaviors.parquet")
)
df_history_test = pl.scan_parquet(
    PATH.joinpath(TEST_SPLIT, "test", "history.parquet")
)

# NOTE(review): articles are read from the test-split folder rather than the
# train/validation one — confirm this is the intended articles file.
df_articles = pl.scan_parquet(PATH.joinpath(TEST_SPLIT, "articles.parquet"))

### Check min/max time-stamps in the data-split period

In [5]:
# Report the earliest/latest timestamps of the train history and of the train
# behaviors, one after the other.
history_time_range = min_max_impression_time_history(df_history_train).collect()
print(f"History: {history_time_range}")

behaviors_time_range = min_max_impression_time_behaviors(df_behaviors_train).collect()
print(f"Behaviors: {behaviors_time_range}")

History: shape: (1, 2)
┌─────────────────────┬─────────────────────┐
│ min                 ┆ max                 │
│ ---                 ┆ ---                 │
│ datetime[μs]        ┆ datetime[μs]        │
╞═════════════════════╪═════════════════════╡
│ 2023-04-27 07:00:00 ┆ 2023-05-18 06:59:59 │
└─────────────────────┴─────────────────────┘
Behaviors: shape: (1, 2)
┌─────────────────────┬─────────────────────┐
│ min                 ┆ max                 │
│ ---                 ┆ ---                 │
│ datetime[μs]        ┆ datetime[μs]        │
╞═════════════════════╪═════════════════════╡
│ 2023-05-18 07:00:01 ┆ 2023-05-25 06:59:58 │
└─────────────────────┴─────────────────────┘


## Add History to Behaviors

In [6]:
# Keep only the user id and the history article-id column, then hand the frame
# to `truncate_history`.
# NOTE(review): presumably each history is cut/padded to 30 ids using 0 as the
# padding value — verify against the helper's implementation.
df_history = truncate_history(
    df_history_train.select(DEFAULT_USER_COL, DEFAULT_HISTORY_ARTICLE_ID_COL),
    column=DEFAULT_HISTORY_ARTICLE_ID_COL,
    history_size=30,
    padding_value=0,
    enable_warning=False,
)
# Materialise just the first five rows for display.
df_history.head(5).collect()

user_id,article_id_fixed
u32,list[i32]
13538,"[9767342, 9767751, … 9769366]"
14241,"[9763401, 9763250, … 9767852]"
20396,"[9763634, 9763401, … 9769679]"
34912,"[9766722, 9759476, … 9770882]"
37953,"[9762836, 9763942, … 9769306]"


In [7]:
# Materialise the train behaviors and the train history, then join them on the
# user column via `slice_join_dataframes`.
# NOTE(review): the join uses the full `df_history_train`, not the truncated
# `df_history` built in the previous cell — confirm that is intended.
df = slice_join_dataframes(
    df1=df_behaviors_train.collect(),
    df2=df_history_train.collect(),
    how="left",
    on=DEFAULT_USER_COL,
)
df.head(5)

impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage,impression_time_fixed,scroll_percentage_fixed,article_id_fixed,read_time_fixed
u32,i32,datetime[μs],f32,f32,i8,list[i32],list[i32],u32,bool,i8,i8,i8,bool,u32,f32,f32,list[datetime[μs]],list[f32],list[i32],list[f32]
149474,,2023-05-24 07:47:53,13.0,,2,"[9778623, 9778682, … 9778728]",[9778657],139836,False,,,,False,759,7.0,22.0,"[2023-05-03 19:04:15, 2023-05-03 19:05:22, … 2023-05-14 19:52:58]","[100.0, 89.0, … 47.0]","[9745590, 9748574, … 9765156]","[60.0, 11.0, … 3.0]"
150528,,2023-05-24 07:33:25,25.0,,2,"[9778718, 9778728, … 9778682]",[9778623],143471,False,,,,False,1240,287.0,100.0,"[2023-04-27 08:05:09, 2023-04-27 10:05:55, … 2023-05-18 06:56:14]","[21.0, 100.0, … 69.0]","[9737881, 9738659, … 9770989]","[7.0, 24.0, … 9.0]"
153068,9778682.0,2023-05-24 07:09:04,78.0,100.0,1,"[9778657, 9778669, … 9778682]",[9778669],151570,False,,,,False,1976,45.0,100.0,"[2023-04-27 14:07:16, 2023-04-27 14:08:16, … 2023-05-18 06:33:24]","[100.0, null, … 100.0]","[9738303, 9738993, … 9770829]","[59.0, 1.0, … 60.0]"
153070,9777492.0,2023-05-24 07:13:14,26.0,100.0,1,"[9020783, 9778444, … 9778628]",[9778628],151570,False,,,,False,1976,4.0,18.0,"[2023-04-27 14:07:16, 2023-04-27 14:08:16, … 2023-05-18 06:33:24]","[100.0, null, … 100.0]","[9738303, 9738993, … 9770829]","[59.0, 1.0, … 60.0]"
153071,9778623.0,2023-05-24 07:11:08,125.0,100.0,1,"[9777492, 9774568, … 9775990]",[9777492],151570,False,,,,False,1976,26.0,100.0,"[2023-04-27 14:07:16, 2023-04-27 14:08:16, … 2023-05-18 06:33:24]","[100.0, null, … 100.0]","[9738303, 9738993, … 9770829]","[59.0, 1.0, … 60.0]"


## Generate labels

Here's an example of how to generate binary labels based on ``article_ids_clicked`` and ``article_ids_inview``:

In [8]:
# Build the binary `labels` column from the clicked / in-view article columns
# and add `labels_len`, the length of each labels list, for inspection.
create_binary_labels_column(
    df.select(DEFAULT_CLICKED_ARTICLES_COL, DEFAULT_INVIEW_ARTICLES_COL),
    shuffle=True,
    seed=123,
).with_columns(
    pl.col("labels").list.len().name.suffix("_len")
).head(5)

article_ids_clicked,article_ids_inview,labels,labels_len
list[i32],list[i32],list[i8],u32
[9778657],"[9778623, 9778657, … 9778736]","[0, 1, … 0]",6
[9778623],"[9778657, 9778769, … 9778669]","[0, 0, … 0]",9
[9778669],"[9756397, 9693002, … 9778682]","[0, 0, … 0]",7
[9778628],"[9778444, 9778718, … 9430567]","[0, 0, … 0]",8
[9777492],"[9770218, 9131971, … 9778623]","[0, 0, … 0]",9


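An example using the downsampling strategy employed by Wu et al. (2019); with `NPRATIO = 2`, each impression shown below is reduced to its clicked article plus two in-view articles that were not clicked: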

In [9]:
# Value passed as `npratio` to the sampling helper below.
NPRATIO = 2

# Down-sample the in-view articles first, then create the binary labels and
# add `labels_len` (length of each labels list) for inspection.
(
    df.select(DEFAULT_CLICKED_ARTICLES_COL, DEFAULT_INVIEW_ARTICLES_COL)
    .pipe(
        sampling_strategy_wu2019,
        npratio=NPRATIO,
        shuffle=False,
        with_replacement=True,
        seed=123,
    )
    .pipe(create_binary_labels_column, shuffle=True, seed=123)
    .with_columns(pl.col("labels").list.len().name.suffix("_len"))
    .head(5)
)

article_ids_clicked,article_ids_inview,labels,labels_len
list[i64],list[i64],list[i8],u32
[9778657],"[9778669, 9778728, 9778657]","[0, 0, 1]",3
[9778623],"[9778669, 9778682, 9778623]","[0, 0, 1]",3
[9778669],"[9778669, 9778682, 9776259]","[1, 0, 0]",3
[9778628],"[7213923, 9430567, 9778628]","[0, 0, 1]",3
[9777492],"[9775990, 9771223, 9777492]","[0, 0, 1]",3
