In [31]:
import os
import polars as pl

In [32]:
from pathlib import Path

from ebrec.utils._descriptive_analysis import (
    min_max_impression_time_behaviors, 
    min_max_impression_time_history
)
from ebrec.utils._polars import slice_join_dataframes
from ebrec.utils._behaviors import (
    create_binary_labels_column,
    sampling_strategy_wu2019,
    truncate_history,
)
from ebrec.utils._constants import (
    DEFAULT_HISTORY_ARTICLE_ID_COL,
    DEFAULT_CLICKED_ARTICLES_COL,
    DEFAULT_INVIEW_ARTICLES_COL,
    DEFAULT_USER_COL
)

In [18]:
# Root folder of the demo dataset. Kept in one place so the split folder and
# the shared articles file cannot drift apart when the data location changes.
DATA_ROOT = "../../data_demo"
FULL_PATH = os.path.join(DATA_ROOT, 'train')

# Load the data lazily: scan_parquet only sets up the query, rows are read
# when .collect() is called.
df_behaviors = pl.scan_parquet(os.path.join(FULL_PATH, 'behaviors.parquet'))
df_history = pl.scan_parquet(os.path.join(FULL_PATH, 'history.parquet'))
# articles.parquet is read from the dataset root, not from the split folder.
df_articles = pl.scan_parquet(os.path.join(DATA_ROOT, 'articles.parquet'))

# Preview the first five behaviour rows.
df_behaviors.head(5).collect()

impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage
u32,i32,datetime[μs],f32,f32,i8,list[i32],list[i32],u32,bool,i8,i8,i8,bool,u32,f32,f32
48401,,2023-05-21 21:06:50,21.0,,2,"[9774516, 9771051, … 9759966]",[9759966],22779,False,,,,False,21,16.0,27.0
152513,9778745.0,2023-05-24 07:31:26,30.0,100.0,1,"[9778669, 9778736, … 9777397]",[9778661],150224,False,,,,False,298,2.0,48.0
155390,,2023-05-24 07:30:33,45.0,,1,"[9778369, 9777856, … 9778448]",[9777856],160892,False,,,,False,401,215.0,100.0
214679,,2023-05-23 05:25:40,33.0,,2,"[9776715, 9776406, … 9776855]",[9776566],1001055,False,,,,False,1357,40.0,47.0
214681,,2023-05-23 05:31:54,21.0,,2,"[9775202, 9776855, … 9776570]",[9776553],1001055,False,,,,False,1358,5.0,49.0


In [19]:
# Preview the first five rows of the lazily scanned per-user history data.
df_history.head(5).collect()

user_id,impression_time_fixed,scroll_percentage_fixed,article_id_fixed,read_time_fixed
u32,list[datetime[μs]],list[f32],list[i32],list[f32]
13538,"[2023-04-27 10:17:43, 2023-04-27 10:18:01, … 2023-05-17 20:36:34]","[100.0, 35.0, … 100.0]","[9738663, 9738569, … 9769366]","[17.0, 12.0, … 16.0]"
58608,"[2023-04-27 18:48:09, 2023-04-27 18:48:45, … 2023-05-17 19:46:40]","[37.0, 61.0, … null]","[9739362, 9739179, … 9770333]","[2.0, 24.0, … 0.0]"
95507,"[2023-04-27 15:20:28, 2023-04-27 15:20:47, … 2023-05-17 14:57:46]","[60.0, 100.0, … null]","[9739035, 9738646, … 9769450]","[18.0, 29.0, … 0.0]"
106588,"[2023-04-27 08:29:09, 2023-04-27 08:29:26, … 2023-05-16 05:50:52]","[24.0, 57.0, … 100.0]","[9738292, 9738216, … 9747803]","[9.0, 15.0, … 33.0]"
617963,"[2023-04-27 14:42:25, 2023-04-27 14:43:10, … 2023-05-18 02:28:09]","[100.0, 100.0, … 90.0]","[9739035, 9739088, … 9770798]","[45.0, 29.0, … 22.0]"


In [20]:
# Preview the first five rows of the lazily scanned articles data.
df_articles.head(5).collect()

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str
3037230,"""Ishockey-spiller: Jeg troede j…","""ISHOCKEY: Ishockey-spilleren S…",2023-06-29 06:20:57,False,"""Ambitionerne om at komme til U…",2003-08-28 08:55:00,,"""article_default""","""https://ekstrabladet.dk/sport/…",[],[],"[""Kriminalitet"", ""Kendt"", … ""Mindre ulykke""]",142,"[327, 334]","""sport""",,,,0.9752,"""Negative"""
3044020,"""Prins Harry tvunget til dna-te…","""Hoffet tvang Prins Harry til a…",2023-06-29 06:21:16,False,"""Den britiske tabloidavis The S…",2005-06-29 08:47:00,"[3097307, 3097197, 3104927]","""article_default""","""https://ekstrabladet.dk/underh…","[""Harry"", ""James Hewitt""]","[""PER"", ""PER""]","[""Kriminalitet"", ""Kendt"", … ""Personfarlig kriminalitet""]",414,[432],"""underholdning""",,,,0.7084,"""Negative"""
3057622,"""Rådden kørsel på blå plader""","""Kan ikke straffes: Udenlandske…",2023-06-29 06:21:24,False,"""Slingrende spritkørsel. Grove …",2005-10-10 07:20:00,[3047102],"""article_default""","""https://ekstrabladet.dk/nyhede…",[],[],"[""Kriminalitet"", ""Transportmiddel"", ""Bil""]",118,[133],"""nyheder""",,,,0.9236,"""Negative"""
3073151,"""Mærsk-arvinger i livsfare""","""FANGET I FLODBØLGEN: Skibsrede…",2023-06-29 06:21:38,False,"""To oldebørn af skibsreder Mærs…",2005-01-04 06:59:00,"[3067474, 3067478, 3153705]","""article_default""","""https://ekstrabladet.dk/nyhede…",[],[],"[""Erhverv"", ""Privat virksomhed"", … ""Rejse""]",118,[133],"""nyheder""",,,,0.9945,"""Negative"""
3193383,"""Skød svigersøn gennem babydyne""","""44-årig kvinde tiltalt for dra…",2023-06-29 06:22:57,False,"""En 44-årig mormor blev i dag f…",2003-09-15 15:30:00,,"""article_default""","""https://ekstrabladet.dk/krimi/…",[],[],"[""Kriminalitet"", ""Personfarlig kriminalitet""]",140,[],"""krimi""",,,,0.9966,"""Negative"""


In [21]:
# Attach each user's history columns to the behaviour rows: left join on the
# user id column, so every behaviour row is kept.
# Both lazy scans are fully collected into memory before being handed to the
# helper.
# NOTE(review): slice_join_dataframes is a project helper not visible here;
# presumably it performs the join in slices to bound memory use — confirm.
df_full_behaviour = slice_join_dataframes(
    df1=df_behaviors.collect(),
    df2=df_history.collect(),
    on=DEFAULT_USER_COL,
    how="left",
)
# Show the first five joined rows.
df_full_behaviour.head(5)

impression_id,article_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,article_ids_clicked,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,next_read_time,next_scroll_percentage,impression_time_fixed,scroll_percentage_fixed,article_id_fixed,read_time_fixed
u32,i32,datetime[μs],f32,f32,i8,list[i32],list[i32],u32,bool,i8,i8,i8,bool,u32,f32,f32,list[datetime[μs]],list[f32],list[i32],list[f32]
48401,,2023-05-21 21:06:50,21.0,,2,"[9774516, 9771051, … 9759966]",[9759966],22779,False,,,,False,21,16.0,27.0,"[2023-04-27 09:05:54, 2023-04-27 09:06:09, … 2023-05-18 06:26:39]","[28.0, 17.0, … 15.0]","[9738452, 9737521, … 9770541]","[5.0, 4.0, … 7.0]"
152513,9778745.0,2023-05-24 07:31:26,30.0,100.0,1,"[9778669, 9778736, … 9777397]",[9778661],150224,False,,,,False,298,2.0,48.0,"[2023-04-29 11:34:06, 2023-04-29 11:34:25, … 2023-05-18 06:13:47]","[100.0, 49.0, … 24.0]","[9740087, 9741986, … 9735909]","[18.0, 244.0, … 7.0]"
155390,,2023-05-24 07:30:33,45.0,,1,"[9778369, 9777856, … 9778448]",[9777856],160892,False,,,,False,401,215.0,100.0,"[2023-04-27 09:10:33, 2023-04-27 09:20:25, … 2023-05-17 15:51:19]","[100.0, 20.0, … 100.0]","[9738557, 9738211, … 9770178]","[583.0, 257.0, … 158.0]"
214679,,2023-05-23 05:25:40,33.0,,2,"[9776715, 9776406, … 9776855]",[9776566],1001055,False,,,,False,1357,40.0,47.0,"[2023-04-27 12:12:45, 2023-04-27 12:13:30, … 2023-05-18 05:31:44]","[100.0, 100.0, … 28.0]","[9738777, 9738663, … 9769981]","[35.0, 62.0, … 16.0]"
214681,,2023-05-23 05:31:54,21.0,,2,"[9775202, 9776855, … 9776570]",[9776553],1001055,False,,,,False,1358,5.0,49.0,"[2023-04-27 12:12:45, 2023-04-27 12:13:30, … 2023-05-18 05:31:44]","[100.0, 100.0, … 28.0]","[9738777, 9738663, … 9769981]","[35.0, 62.0, … 16.0]"


In [21]:
# Pick one example from the joined frame for inspection: positional index 1,
# i.e. the second row.
# NOTE(review): integer indexing on the joined frame is expected to yield a
# one-row frame rather than a tuple/dict — confirm for the installed polars
# version.
row = 1
select_row = df_full_behaviour[row]


In [57]:
# Development helper: re-execute the local NeRD_data module so that edits made
# on disk are picked up without restarting the kernel.
import importlib

import NeRD_data

importlib.reload(NeRD_data)

# Bind the dataset class only after the reload so it comes from the fresh
# module code.
from torch.utils.data import DataLoader
from NeRD_data import EB_NeRDDataset

In [58]:
from transformers import BertTokenizer
import yaml

# Load the experiment configuration. The encoding is given explicitly so the
# file is decoded the same way on every platform instead of falling back to
# the locale default. safe_load is used so the YAML cannot construct arbitrary
# Python objects.
with open('./config.yaml', encoding='utf-8') as f:
    cfg = yaml.safe_load(f)

In [56]:
# Build the tokenizer named in the config, the dataset (configured from the
# 'dataset' section of the config), and a shuffled DataLoader with batches of
# two samples.
# NOTE(review): the recorded run of this cell failed with
# "ColumnNotFoundError: labels". Nothing in this cell references a 'labels'
# column, so the lookup presumably happens inside EB_NeRDDataset — confirm and
# fix there.
# NOTE(review): cfg is defined by a cell whose recorded execution count is
# higher than this cell's; verify the notebook runs top to bottom on a fresh
# kernel.
tokenizer = BertTokenizer.from_pretrained(cfg['model']['pretrained_model_name'])
dataset = EB_NeRDDataset(tokenizer, **cfg['dataset'])
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

ColumnNotFoundError: labels

In [28]:
# Pull a single batch from the DataLoader as a smoke test.
# NOTE(review): the recorded run failed in default_collate because a sample
# contained a polars Series. Presumably the dataset's item accessor must
# convert its fields to tensors / numpy arrays / lists, or a custom collate_fn
# must be passed to the DataLoader — confirm against EB_NeRDDataset.
x = next(iter(dataloader))

TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'polars.series.series.Series'>