## Init Lib

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import matplotlib.ticker as mticker
from matplotlib.ticker import MaxNLocator
from transformers import AutoTokenizer, AutoModel
from pathlib import Path
import os
import sys
import polars as pl # used to read the .parquet files so its important
import numpy as np
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

# Add the parent directory (or specify path to 'utils' if it's higher)
dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..')) 
sys.path.append(dir)

from dataloader.NRMSdataloader import NRMSDataLoader
from models.nrms import NRMSModelPytorch
from eval.metricEval import MetricEvaluator, AucScore, MrrScore, NdcgScore

# Now you can import from utils
from utils import add_known_user_column, add_prediction_scores
from utils import get_transformers_word_embeddings, concat_str_columns,convert_text2encoding_with_transformers, create_article_id_to_value_mapping
from utils import get_script_directory, slice_join_dataframes, truncate_history,sampling_strategy_wu2019, create_binary_labels_column

In [2]:
import pandas as pd
import numpy as np

def decompose_datetime(datetime_str):
    # Convert to datetime object
    datetime_obj = pd.to_datetime(datetime_str)
    
    # Extract relevant features from the datetime object
    features = {
        'year': datetime_obj.year,
        'month': datetime_obj.month,
        'day': datetime_obj.day,
        'hour': datetime_obj.hour,
        'minute': datetime_obj.minute,
        'second': datetime_obj.second,
        'day_of_week': datetime_obj.weekday(),  # 0=Monday, 6=Sunday
        'is_weekend': 1 if datetime_obj.weekday() >= 5 else 0,
        'day_of_year': datetime_obj.dayofyear,
        'is_leap_year': 1 if (datetime_obj.year % 4 == 0 and (datetime_obj.year % 100 != 0 or datetime_obj.year % 400 == 0)) else 0
    }

    # Normalize certain features to [0, 1] range
    normalized_features = [
        features['year'] / 2024,  # Assuming year range of interest is up to 2023
        features['month'] / 12,
        features['day'] / 31,
        features['hour'] / 24,
        features['minute'] / 60,
        features['second'] / 60,
        features['day_of_week'] / 6,  # Normalize weekday to [0, 1]
        features['is_weekend'],  # Already binary
        features['day_of_year'] / 365,
        features['is_leap_year']
    ]
    
    return normalized_features

## Load Data

In [3]:
basic_path = get_script_directory()

DEFAULT_IS_BEYOND_ACCURACY_COL = "is_beyond_accuracy"


PATH = Path(basic_path+"/Data")
TRAIN_VAL_SPLIT = f"ebnerd_demo"  # [ebnerd_demo, ebnerd_small, ebnerd_large]
TEST_SPLIT = f"ebnerd_testset"  # "ebnerd_testset", "ebnerd_testset_gt"

#_____________________Training____________________________
# Reads the behaviors file from training data
df_behaviors_train = df_behaviors = pl.scan_parquet(
    PATH.joinpath(TRAIN_VAL_SPLIT, "train", "behaviors.parquet")
)
# Reads the history file from training data
df_history_train = df_behaviors = pl.scan_parquet(
    PATH.joinpath(TRAIN_VAL_SPLIT, "train", "history.parquet")
)

#_____________________Validation____________________________
# Reads the behaviors file from Validation data
df_behaviors_val = df_behaviors = pl.scan_parquet(
    PATH.joinpath(TRAIN_VAL_SPLIT, "validation", "behaviors.parquet")
)
# Reads the History file from Validation data
df_history_val = df_behaviors = pl.scan_parquet(
    PATH.joinpath(TRAIN_VAL_SPLIT, "validation", "history.parquet")
)

#_____________________Test____________________________
# Reads the behaviors file from test data
df_behaviors_test = df_behaviors = (
    pl.scan_parquet(PATH.joinpath(TEST_SPLIT, "test", "behaviors.parquet"))
    .filter(~pl.col(DEFAULT_IS_BEYOND_ACCURACY_COL))
    .drop(DEFAULT_IS_BEYOND_ACCURACY_COL)
)
# Reads the History file from test data
df_history_test = df_behaviors = pl.scan_parquet(
    PATH.joinpath(TEST_SPLIT, "test", "history.parquet")
)

# ?? seem we already read this file ? but now without .drop(DEFAULT_IS_BEYOND_ACCURACY_COL)
df_behaviors_test_ba = df_behaviors = pl.scan_parquet(
    PATH.joinpath(TEST_SPLIT, "test", "behaviors.parquet")
).filter(pl.col(DEFAULT_IS_BEYOND_ACCURACY_COL))

#_____________________Reads Articles ??____________________________
df_articles = pl.scan_parquet(PATH.joinpath(TEST_SPLIT,"articles.parquet")).collect()

PLOT_PATH = Path("plot")

In [4]:
print(df_articles.columns)

print(len(df_history_train.collect()))  
print(len(df_history_val.collect()))  
col_names_1 = df_history_val.collect().columns
print(col_names_1)
col_names_1 = df_history_train.collect().columns
print(col_names_1)

print(df_history_train.collect())

test = df_history_train.collect()['impression_time_fixed'][2]
print(test)



['article_id', 'title', 'subtitle', 'last_modified_time', 'premium', 'body', 'published_time', 'image_ids', 'article_type', 'url', 'ner_clusters', 'entity_groups', 'topics', 'category', 'subcategory', 'category_str', 'total_inviews', 'total_pageviews', 'total_read_time', 'sentiment_score', 'sentiment_label']
1590
1562
['user_id', 'impression_time_fixed', 'scroll_percentage_fixed', 'article_id_fixed', 'read_time_fixed']
['user_id', 'impression_time_fixed', 'scroll_percentage_fixed', 'article_id_fixed', 'read_time_fixed']
shape: (1_590, 5)
┌─────────┬──────────────────────┬─────────────────────┬─────────────────────┬─────────────────────┐
│ user_id ┆ impression_time_fixe ┆ scroll_percentage_f ┆ article_id_fixed    ┆ read_time_fixed     │
│ ---     ┆ d                    ┆ ixed                ┆ ---                 ┆ ---                 │
│ u32     ┆ ---                  ┆ ---                 ┆ list[i32]           ┆ list[f32]           │
│         ┆ list[datetime[μs]]   ┆ list[f32]        

## Initialize Training and Validation split

In [5]:
def ebnerd_from_path(path: Path, history_size: int = 30) -> pl.DataFrame:
    """
    Load ebnerd - function
    """
    df_history = (
        pl.scan_parquet(path.joinpath("history.parquet"))
        .select("user_id", "article_id_fixed")
        .pipe(
            truncate_history,
            column="article_id_fixed",
            history_size=history_size,
            padding_value=0,
            enable_warning=False,
        )
    )
    df_behaviors = (
        pl.scan_parquet(path.joinpath("behaviors.parquet"))
        .collect()
        .pipe(
            slice_join_dataframes,
            df2=df_history.collect(),
            on="user_id",
            how="left",
        )
    )
    return df_behaviors

DATASPLIT = f"ebnerd_demo"

COLUMNS = [
    "user_id",
    "article_id_fixed",
    "article_ids_inview",
    "article_ids_clicked",
    "read_time",
    "scroll_percentage",
    "impression_time",
    "impression_id",
]
HISTORY_SIZE = 20 # Size of the article title length to include? so any article that is longer is not included?
FRACTION = 0.01  # We only include 0.01% of the data? 

df_train = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "train"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(
        sampling_strategy_wu2019,
        npratio=4,
        shuffle=True,
        with_replacement=True,
        seed=123,
    )
    .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION)
)
# =>
df_validation = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "validation"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION)
)

df_articles = pl.read_parquet(PATH.joinpath(DATASPLIT,"articles.parquet"))
#df_articles.head(2)

In [6]:
print(df_train.columns)

print(df_train)

"""
article_ids = df_train['article_id_fixed'][5]
print(article_ids)
clicked = df_train['scroll_percentage'][5]
print(clicked)
impression = df_train['impression_time'][5]
print(impression)
label = df_train['labels'][5]
print(label)
"""

['user_id', 'article_id_fixed', 'article_ids_inview', 'article_ids_clicked', 'read_time', 'scroll_percentage', 'impression_time', 'impression_id', 'labels']
shape: (248, 9)
┌─────────┬────────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ user_id ┆ article_id ┆ article_i ┆ article_i ┆ … ┆ scroll_pe ┆ impressio ┆ impressio ┆ labels    │
│ ---     ┆ _fixed     ┆ ds_inview ┆ ds_clicke ┆   ┆ rcentage  ┆ n_time    ┆ n_id      ┆ ---       │
│ u32     ┆ ---        ┆ ---       ┆ d         ┆   ┆ ---       ┆ ---       ┆ ---       ┆ list[i8]  │
│         ┆ list[i32]  ┆ list[i64] ┆ ---       ┆   ┆ f32       ┆ datetime[ ┆ u32       ┆           │
│         ┆            ┆           ┆ list[i64] ┆   ┆           ┆ μs]       ┆           ┆           │
╞═════════╪════════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ 1961735 ┆ [9768819,  ┆ [9758424, ┆ [9775518] ┆ … ┆ 38.0      ┆ 2023-05-2 ┆ 491195657 ┆ [0, 1, …  │
│         ┆ 9768820

"\narticle_ids = df_train['article_id_fixed'][5]\nprint(article_ids)\nclicked = df_train['scroll_percentage'][5]\nprint(clicked)\nimpression = df_train['impression_time'][5]\nprint(impression)\nlabel = df_train['labels'][5]\nprint(label)\n"

## Tokenize data: 

In [7]:
TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-base"
TEXT_COLUMNS_TO_USE = ["subtitle", "title"]
MAX_TITLE_LENGTH = 30
Test_col_to_use = ["published_time"]

# LOAD HUGGINGFACE:
transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)
transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)

# We'll init the word embeddings using the
word2vec_embedding = get_transformers_word_embeddings(transformer_model)
#
df_articles_old, cat_cal = concat_str_columns(df_articles, columns=TEXT_COLUMNS_TO_USE)
df_articles_old, token_col_title = convert_text2encoding_with_transformers(
    df_articles_old, transformer_tokenizer, cat_cal, max_length=MAX_TITLE_LENGTH
)

print(token_col_title)

# =>
article_mapping_old = create_article_id_to_value_mapping(
    df=df_articles_old, value_col=token_col_title
)


df_articles_test, cat_cal_test = concat_str_columns(df_articles, columns=Test_col_to_use)
article_mapping_test = create_article_id_to_value_mapping(
    df=df_articles_test, value_col="published_time"
)

print(article_mapping_test)

decompose_datetime


subtitle-title_encode_FacebookAI/xlm-roberta-base
{3037230: '2003-08-28 08:55:00.000000', 3044020: '2005-06-29 08:47:00.000000', 3057622: '2005-10-10 07:20:00.000000', 3073151: '2005-01-04 06:59:00.000000', 3193383: '2003-09-15 15:30:00.000000', 3196611: '2005-06-10 05:40:00.000000', 3200325: '2002-06-25 05:10:00.000000', 3200913: '2003-09-11 08:55:00.000000', 3209311: '2003-03-20 12:50:00.000000', 3209357: '2005-02-26 04:45:00.000000', 3223114: '2005-02-10 15:39:00.000000', 3251664: '2002-05-17 07:10:00.000000', 3971783: '2013-04-17 17:27:57.000000', 3977720: '2013-05-01 14:11:38.000000', 3983641: '2013-04-11 13:24:43.000000', 3987418: '2013-03-19 10:10:30.000000', 3988589: '2013-03-12 15:20:40.000000', 3993395: '2013-02-05 05:21:28.000000', 3994264: '2013-01-31 06:16:05.000000', 3997229: '2013-01-16 06:49:35.000000', 3997233: '2013-01-16 05:46:14.000000', 3998680: '2013-01-07 16:45:55.000000', 4000567: '2012-12-26 07:53:18.000000', 4000823: '2012-12-21 06:00:31.000000', 4003163: '201

<function __main__.decompose_datetime(datetime_str)>

In [9]:
keys = article_mapping_test.keys()
print(keys)

test = article_mapping_test.get(3251664)

print(test)
print(len(test))

dict_keys([3037230, 3044020, 3057622, 3073151, 3193383, 3196611, 3200325, 3200913, 3209311, 3209357, 3223114, 3251664, 3971783, 3977720, 3983641, 3987418, 3988589, 3993395, 3994264, 3997229, 3997233, 3998680, 4000567, 4000823, 4003163, 4003362, 4003431, 4003926, 4007811, 4008213, 4009177, 4014381, 4021392, 4024289, 4024449, 4026960, 4035059, 4035605, 4040731, 4040737, 4043121, 4044042, 4045439, 4048845, 4050800, 4051858, 4055236, 4056868, 4062565, 4066000, 4067703, 4067935, 4069792, 4073764, 4074883, 4081011, 4081857, 4083620, 4084225, 4085973, 4086433, 4091672, 4092282, 4096785, 4102985, 4106368, 4108345, 4116752, 4117997, 4122820, 4123699, 4135480, 4148823, 4152423, 4156692, 4160452, 4160891, 4163017, 4166146, 4166549, 4171506, 4174342, 4179715, 4180288, 4196366, 4202892, 4210181, 4210425, 4211908, 4212112, 4212220, 4214211, 4215525, 4217156, 4219741, 4219768, 4219799, 4220534, 4223060, 4225213, 4233674, 4234038, 4241055, 4241857, 4251878, 4257110, 4263399, 4265340, 4277709, 4280152,

In [10]:
for key in article_mapping_test:
    article_mapping_test[key] = decompose_datetime(article_mapping_test[key])

In [11]:
keys = article_mapping_test.keys()
print(keys)

test = article_mapping_test.get(3251664)

print(test)
print(len(test))

dict_keys([3037230, 3044020, 3057622, 3073151, 3193383, 3196611, 3200325, 3200913, 3209311, 3209357, 3223114, 3251664, 3971783, 3977720, 3983641, 3987418, 3988589, 3993395, 3994264, 3997229, 3997233, 3998680, 4000567, 4000823, 4003163, 4003362, 4003431, 4003926, 4007811, 4008213, 4009177, 4014381, 4021392, 4024289, 4024449, 4026960, 4035059, 4035605, 4040731, 4040737, 4043121, 4044042, 4045439, 4048845, 4050800, 4051858, 4055236, 4056868, 4062565, 4066000, 4067703, 4067935, 4069792, 4073764, 4074883, 4081011, 4081857, 4083620, 4084225, 4085973, 4086433, 4091672, 4092282, 4096785, 4102985, 4106368, 4108345, 4116752, 4117997, 4122820, 4123699, 4135480, 4148823, 4152423, 4156692, 4160452, 4160891, 4163017, 4166146, 4166549, 4171506, 4174342, 4179715, 4180288, 4196366, 4202892, 4210181, 4210425, 4211908, 4212112, 4212220, 4214211, 4215525, 4217156, 4219741, 4219768, 4219799, 4220534, 4223060, 4225213, 4233674, 4234038, 4241055, 4241857, 4251878, 4257110, 4263399, 4265340, 4277709, 4280152,

## Create Batch Dataset

In [17]:
train_dataloader = NRMSDataLoader(
    behaviors=df_train,
    article_dict=article_mapping_old,
    unknown_representation="zeros",
    history_column="article_id_fixed",
    eval_mode=False,
    batch_size=32,
)



val_dataloader = NRMSDataLoader(
    behaviors=df_validation,
    article_dict=article_mapping_old,
    unknown_representation="zeros",
    history_column="article_id_fixed",
    eval_mode=True,
    batch_size=32,
)


Time_dataloader_train = NRMSDataLoader(
    behaviors=df_train,
    article_dict=article_mapping_test,
    unknown_representation="zeros",
    history_column="article_id_fixed",
    eval_mode=False,
    batch_size=32,
)

Time_dataloader_val = NRMSDataLoader(
    behaviors=df_validation,
    article_dict=article_mapping_test,
    unknown_representation="zeros",
    history_column="article_id_fixed",
    eval_mode=True,
    batch_size=32,
)


In [19]:

(X1_t_train,X2_t_train), Y_t_train = Time_dataloader_train[0]
(X1_t_val,X2_t_val), Y_t_val = Time_dataloader_val[0]

(X1_train,X2_train), Y_train = train_dataloader[0]
(X1_val,X2_val), Y_val = val_dataloader[0]

print(X1_t_train.shape)
print(X1_train.shape)
print("")
print(X2_t_train.shape)
print(X2_train.shape)
print("")
print(Y_t_train.shape)
print(Y_train.shape)
print("")


print(X1_t_val.shape)
print(X1_val.shape)
print("")
print(X2_t_val.shape)
print(X2_val.shape)
print("")
print(Y_t_val.shape)
print(Y_val.shape)
print("")


(32, 20, 10)
(32, 20, 30)

(32, 5, 10)
(32, 5, 30)

(32, 5)
(32, 5)

(393, 20, 10)
(393, 20, 30)

(393, 1, 10)
(393, 1, 30)

(393, 1)
(393, 1)

