In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import matplotlib.ticker as mticker
from matplotlib.ticker import MaxNLocator
from transformers import AutoTokenizer, AutoModel
from pathlib import Path
import os
import sys
import polars as pl # used to read the .parquet files so its important
import numpy as np
import warnings

# Suppress all warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Make the directory two levels above the current working directory importable,
# so the project-local imports that follow can be resolved.
# The path is stored as PROJECT_ROOT (not `dir`) so the builtin `dir()` is not
# shadowed for the remainder of the notebook.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), "..", ".."))
if PROJECT_ROOT not in sys.path:  # re-running the cell must not stack duplicates
    sys.path.append(PROJECT_ROOT)

from dataloader.NRMSdataloader import NRMSDataLoader
from models.NRMS_tensorflow import NRMSModel
from eval.metricEval import MetricEvaluator, AucScore, MrrScore, NdcgScore

# Now you can import from utils
from utils import add_known_user_column, add_prediction_scores
from utils import get_transformers_word_embeddings, concat_str_columns,convert_text2encoding_with_transformers, create_article_id_to_value_mapping
from utils import get_script_directory, slice_join_dataframes, truncate_history,sampling_strategy_wu2019, create_binary_labels_column

In [2]:
# Base directory reported by the project helper; the datasets live in "<base>/Data".
basic_path = get_script_directory()

# Name of the column used below as a boolean filter on the test behaviors file.
DEFAULT_IS_BEYOND_ACCURACY_COL = "is_beyond_accuracy"

PATH = Path(basic_path) / "Data"
TRAIN_VAL_SPLIT = "ebnerd_demo"  # one of: ebnerd_demo, ebnerd_small, ebnerd_large
TEST_SPLIT = "ebnerd_testset"  # one of: ebnerd_testset, ebnerd_testset_gt

# Every frame below except `df_articles` comes from pl.scan_parquet, i.e. it is
# a lazy frame. Each one is bound to exactly one descriptive name.

# ----------------------------- Training -----------------------------
df_behaviors_train = pl.scan_parquet(
    PATH.joinpath(TRAIN_VAL_SPLIT, "train", "behaviors.parquet")
)
df_history_train = pl.scan_parquet(
    PATH.joinpath(TRAIN_VAL_SPLIT, "train", "history.parquet")
)

# ---------------------------- Validation ----------------------------
df_behaviors_val = pl.scan_parquet(
    PATH.joinpath(TRAIN_VAL_SPLIT, "validation", "behaviors.parquet")
)
df_history_val = pl.scan_parquet(
    PATH.joinpath(TRAIN_VAL_SPLIT, "validation", "history.parquet")
)

# ------------------------------- Test -------------------------------
# Test behaviors restricted to rows where the beyond-accuracy flag is False;
# the flag column is dropped afterwards.
df_behaviors_test = (
    pl.scan_parquet(PATH.joinpath(TEST_SPLIT, "test", "behaviors.parquet"))
    .filter(~pl.col(DEFAULT_IS_BEYOND_ACCURACY_COL))
    .drop(DEFAULT_IS_BEYOND_ACCURACY_COL)
)
df_history_test = pl.scan_parquet(
    PATH.joinpath(TEST_SPLIT, "test", "history.parquet")
)

# Same behaviors file as `df_behaviors_test`, but keeping only the rows where
# the beyond-accuracy flag is True (the flag column itself is retained).
df_behaviors_test_ba = pl.scan_parquet(
    PATH.joinpath(TEST_SPLIT, "test", "behaviors.parquet")
).filter(pl.col(DEFAULT_IS_BEYOND_ACCURACY_COL))

# ----------------------------- Articles -----------------------------
# Article table of the test split, materialised eagerly.
# NOTE(review): a later cell re-reads `df_articles` from the train/val split,
# which replaces this frame — confirm which article table is intended.
df_articles = pl.scan_parquet(PATH.joinpath(TEST_SPLIT, "articles.parquet")).collect()

PLOT_PATH = Path("plot")

In [3]:

# Reference note only: this bare string literal is never assigned to a name; as
# the cell's last expression its repr is simply echoed as the cell output.
# NOTE(review): the text is not valid Python as written (an import list mixed
# with assignments) — it appears to record column-name constants next to their
# string values; confirm against the constants module it mentions.
"""
from src.ebrec.utils._constants import (
    DEFAULT_HISTORY_ARTICLE_ID_COL, = f"{"article_id"}_fixed"
    DEFAULT_CLICKED_ARTICLES_COL, = "article_ids_clicked"
    DEFAULT_INVIEW_ARTICLES_COL, = "article_ids_inview"
    DEFAULT_IMPRESSION_ID_COL, = "impression_id"
    DEFAULT_SUBTITLE_COL, = "subtitle"
    DEFAULT_LABELS_COL, = "labels"
    DEFAULT_TITLE_COL, =  "title"
    DEFAULT_USER_COL, = "user_id"
)
"""

'\nfrom src.ebrec.utils._constants import (\n    DEFAULT_HISTORY_ARTICLE_ID_COL, = f"{"article_id"}_fixed"\n    DEFAULT_CLICKED_ARTICLES_COL, = "article_ids_clicked"\n    DEFAULT_INVIEW_ARTICLES_COL, = "article_ids_inview"\n    DEFAULT_IMPRESSION_ID_COL, = "impression_id"\n    DEFAULT_SUBTITLE_COL, = "subtitle"\n    DEFAULT_LABELS_COL, = "labels"\n    DEFAULT_TITLE_COL, =  "title"\n    DEFAULT_USER_COL, = "user_id"\n)\n'

In [4]:
def ebnerd_from_path(path: Path, history_size: int = 30) -> pl.DataFrame:
    """
    Build one behaviors frame with each user's click history attached.

    Reads `history.parquet` and `behaviors.parquet` from `path`. Only the
    "user_id" and "article_id_fixed" history columns are kept, and the history
    column is passed through `truncate_history` (padding value 0, warnings
    disabled) before being joined onto the behaviors via
    `slice_join_dataframes` on "user_id" with how="left".

    Args:
        path: Directory containing `history.parquet` and `behaviors.parquet`.
        history_size: Forwarded to `truncate_history` as `history_size`
            (presumably the number of history items kept per user — confirm
            in `truncate_history`).

    Returns:
        Whatever `slice_join_dataframes` returns for the collected behaviors
        and history frames (annotated as a `pl.DataFrame`).
    """
    history_lazy = pl.scan_parquet(path.joinpath("history.parquet")).select(
        "user_id", "article_id_fixed"
    )
    history_truncated = truncate_history(
        history_lazy,
        column="article_id_fixed",
        history_size=history_size,
        padding_value=0,
        enable_warning=False,
    )

    # Behaviors are materialised first; the history is collected only when it
    # is handed to the join helper.
    behaviors = pl.scan_parquet(path.joinpath("behaviors.parquet")).collect()
    return slice_join_dataframes(
        behaviors,
        df2=history_truncated.collect(),
        on="user_id",
        how="left",
    )

In [5]:
# Dataset root ("<script directory>/Data") and the split used for training and validation.
basic_path = get_script_directory()
PATH = Path(f"{basic_path}/Data")

# Available splits: ebnerd_demo, ebnerd_small, ebnerd_large
DATASPLIT = "ebnerd_demo"

In [6]:
# Columns kept from the joined behaviors/history frame.
COLUMNS = [
    "user_id",
    "article_id_fixed",
    "article_ids_inview",
    "article_ids_clicked",
    "impression_id",
]
HISTORY_SIZE = 10  # forwarded to ebnerd_from_path as history_size
FRACTION = 0.01  # fraction of rows kept by .sample() for a quick run
SEED = 123  # one seed for every stochastic step so a re-run gives the same subsets

# Training frame: negative sampling via sampling_strategy_wu2019, binary labels,
# then a seeded random subsample.
df_train = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "train"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(
        sampling_strategy_wu2019,
        npratio=4,  # presumably 4 negatives per clicked article — confirm in the helper
        shuffle=True,
        with_replacement=True,
        seed=SEED,
    )
    .pipe(create_binary_labels_column)
    # Seeded: an unseeded sample would change the training subset on every run.
    .sample(fraction=FRACTION, seed=SEED)
)

# Validation frame: no negative sampling, only labels and the seeded subsample.
df_validation = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "validation"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION, seed=SEED)
)
df_train.head(2)

user_id,article_id_fixed,article_ids_inview,article_ids_clicked,impression_id,labels
u32,list[i32],list[i64],list[i64],u32,list[i8]
1927519,"[9765955, 9767268, … 9763284]","[9772660, 9772768, … 9772750]",[9772660],182127655,"[1, 0, … 0]"
1047632,"[9768328, 9769367, … 9770541]","[9772168, 9772193, … 9772092]",[9772193],255153419,"[0, 1, … 0]"


In [7]:
# Article table of the selected split, read eagerly; preview the first two rows.
df_articles = pl.read_parquet(PATH / DATASPLIT / "articles.parquet")
df_articles.head(2)

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str
3037230,"""Ishockey-spiller: Jeg troede j…","""ISHOCKEY: Ishockey-spilleren S…",2023-06-29 06:20:57,False,"""Ambitionerne om at komme til U…",2003-08-28 08:55:00,,"""article_default""","""https://ekstrabladet.dk/sport/…",[],[],"[""Kriminalitet"", ""Kendt"", … ""Mindre ulykke""]",142,"[327, 334]","""sport""",,,,0.9752,"""Negative"""
3044020,"""Prins Harry tvunget til dna-te…","""Hoffet tvang Prins Harry til a…",2023-06-29 06:21:16,False,"""Den britiske tabloidavis The S…",2005-06-29 08:47:00,"[3097307, 3097197, 3104927]","""article_default""","""https://ekstrabladet.dk/underh…","[""Harry"", ""James Hewitt""]","[""PER"", ""PER""]","[""Kriminalitet"", ""Kendt"", … ""Personfarlig kriminalitet""]",414,[432],"""underholdning""",,,,0.7084,"""Negative"""


In [8]:
# Text-encoding configuration.
MAX_TITLE_LENGTH = 30
TEXT_COLUMNS_TO_USE = ["subtitle", "title"]
TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-base"

# Load the pretrained Hugging Face tokenizer and model.
transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)
transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)

# Word embeddings taken from the transformer; they are passed to NRMSModel as
# `word2vec_embedding` when the model is built.
word2vec_embedding = get_transformers_word_embeddings(transformer_model)

# Combine the selected text columns into one column (its name is returned as
# `cat_cal`), then encode that column with the tokenizer, limited to
# MAX_TITLE_LENGTH via `max_length`.
df_articles, cat_cal = concat_str_columns(df_articles, columns=TEXT_COLUMNS_TO_USE)
df_articles, token_col_title = convert_text2encoding_with_transformers(
    df_articles,
    transformer_tokenizer,
    cat_cal,
    max_length=MAX_TITLE_LENGTH,
)

# Lookup from article id to the encoded-text column value (consumed by the data loaders).
article_mapping = create_article_id_to_value_mapping(
    df=df_articles,
    value_col=token_col_title,
)

In [9]:
# Settings shared by the training and validation loaders; only the behaviors
# frame and the eval_mode flag differ between the two.
_shared_loader_kwargs = dict(
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column="article_id_fixed",
    batch_size=32,
)

train_dataloader = NRMSDataLoader(
    behaviors=df_train, eval_mode=False, **_shared_loader_kwargs
)
val_dataloader = NRMSDataLoader(
    behaviors=df_validation, eval_mode=True, **_shared_loader_kwargs
)



In [10]:
# Re-apply the blanket warning filter right before TensorFlow is imported
# (`warnings` itself is already imported in the first cell).
warnings.filterwarnings("ignore")
import tensorflow as tf


class hparams_nrms:
    """Hyper-parameter namespace; the class object itself is passed to NRMSModel as `hparams`."""

    # INPUT DIMENSIONS — tied to the data-pipeline constants so the model input
    # shapes cannot drift from the tokenisation length / history truncation.
    title_size: int = MAX_TITLE_LENGTH
    history_size: int = HISTORY_SIZE
    # MODEL ARCHITECTURE
    head_num: int = 20
    head_dim: int = 20
    attention_hidden_dim: int = 200
    # MODEL OPTIMIZER:
    optimizer: str = "adam"
    loss: str = "cross_entropy_loss"
    dropout: float = 0.2
    learning_rate: float = 0.0001


MODEL_NAME = "NRMS"
LOG_DIR = f"downloads/runs/{MODEL_NAME}"
MODEL_WEIGHTS = "downloads/data/state_dict/NRMS/weights.weights.h5"
EPOCHS = 10

# CALLBACKS — constructed but currently NOT passed to fit() (see the
# commented-out `callbacks=` argument below).
# NOTE(review): the recorded run reports val_loss: 0.0000e+00 on every epoch,
# so EarlyStopping on "val_loss" would not be meaningful until the validation
# loss is verified.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR, histogram_freq=1)
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2)
#modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(
    #filepath=MODEL_WEIGHTS, save_best_only=False, save_weights_only=True, verbose=1
#)

model = NRMSModel(
    hparams=hparams_nrms,
    word2vec_embedding=word2vec_embedding,
    seed=42,
)
hist = model.model.fit(
    train_dataloader,
    validation_data=val_dataloader,
    epochs=EPOCHS,
    #callbacks=[tensorboard_callback, early_stopping], #, modelcheckpoint
)
#_ = model.model.load_weights(filepath=MODEL_WEIGHTS)

NEWSENCODER - embedded layer: (None, 30, 768)
WQ shape: (768, 400)
NEWSENCODER - att 1: (None, 30, 400)
NEWSENCODER - att 2: (None, 400)
USERENCODER - His_in: (None, 10, 30)
USERENCODER - Clicked: (None, 10, 400)
WQ shape: (400, 400)
USERENCODER - att 1: (None, 10, 400)
USERENCODER - att 2: (None, 400)
Epoch 1/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 2s/step - loss: 1.6222 - val_loss: 0.0000e+00
Epoch 2/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1s/step - loss: 1.5509 - val_loss: 0.0000e+00
Epoch 3/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1s/step - loss: 1.5074 - val_loss: 0.0000e+00
Epoch 4/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1s/step - loss: 1.4758 - val_loss: 0.0000e+00
Epoch 5/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1s/step - loss: 1.4018 - val_loss: 0.0000e+00
Epoch 6/10
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 1s/step - loss: 1.3316

In [11]:
# Run the model's `scorer` sub-model over the validation loader. The result is
# later converted with `.tolist()` and handed to `add_prediction_scores`.
pred_validation = model.scorer.predict(val_dataloader)

[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 868ms/step


In [12]:
# Sanity check: list the validation frame's columns before prediction scores are attached.
print(df_validation.columns)

['user_id', 'article_id_fixed', 'article_ids_inview', 'article_ids_clicked', 'impression_id', 'labels']


In [13]:

# Attach the model predictions to the validation frame, then pass the result to
# `add_known_user_column` together with the training user ids.
# NOTE(review): this rebinds `df_validation`, so re-running the cell feeds the
# already-augmented frame back in.
df_validation = add_known_user_column(
    add_prediction_scores(df_validation, pred_validation.tolist()),
    known_users=df_train["user_id"],
)

In [14]:

# Ranking metrics computed per impression from the label and score lists.
metric_functions = [AucScore(), MrrScore(), NdcgScore(k=5), NdcgScore(k=10)]

metrics = MetricEvaluator(
    labels=df_validation["labels"].to_list(),
    predictions=df_validation["scores"].to_list(),
    metric_functions=metric_functions,
)
metrics.evaluate()






<MetricEvaluator class>: 
 {
    "auc": 0.5609728847930222,
    "mrr": 0.351708404178917,
    "ndcg@5": 0.3962766772528227,
    "ndcg@10": 0.4768827168257954
}

In [None]:
def _accuracy(y_true, y_pred, label=None):
    """Return the fraction of positions where `y_pred` equals `y_true`.

    Args:
        y_true: Flat sequence of ground-truth labels.
        y_pred: Flat sequence of predicted labels, same length as `y_true`.
        label: If given, only positions whose true label equals `label` count.

    Returns:
        The accuracy as a float, or NaN when no position is selected (empty
        input or a class that does not occur) instead of dividing by zero.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"labels and predictions differ in length: {y_true.shape} vs {y_pred.shape}"
        )
    hits = y_true == y_pred
    if label is not None:
        hits = hits[y_true == label]
    return float(hits.mean()) if hits.size else float("nan")


THRESHOLD = 0.5  # a score strictly above this counts as a predicted click

labels = df_validation["labels"].to_list()
predictions = df_validation["scores"].to_list()

# Flatten the per-impression lists into one long sequence each.
flat_labels = [item for sublist in labels for item in sublist]
flat_pred = [item for sublist in predictions for item in sublist]
binary_values = [1 if prob > THRESHOLD else 0 for prob in flat_pred]

overall_accuracy = _accuracy(flat_labels, binary_values)
zero_accuracy = _accuracy(flat_labels, binary_values, label=0)  # non-clicked candidates
one_accuracy = _accuracy(flat_labels, binary_values, label=1)  # clicked candidates

print(f"Overall Accuracy: {overall_accuracy:.2%}")
print(f"0-Label Accuracy: {zero_accuracy:.2%}")
print(f"1-Label Accuracy: {one_accuracy:.2%}")


#______Pytorch softmax model results:_________
#Average overall Accuracy: 68.18 %
#Average 0-label Accuracy: 71.17 %
#Average 1-label Accuracy: 32.6 %
#________________________________________


#______Pytorch sigmoid (weighted) model results:_________
#Average overall Accuracy: 69.1 %
#Average 0-label Accuracy: 72.8 %
#Average 1-label Accuracy: 31.17 %
#________________________________________


#______Pytorch softmax - add  model results:_________
#Average overall Accuracy: 51.87 %
#Average 0-label Accuracy: 52.15 %
#Average 1-label Accuracy: 50.86 %
#________________________________________

#______Pytorch softmax - ffnn  model results:_________
#Average overall Accuracy: 90.99 %
#Average 0-label Accuracy: 99.83 %
#Average 1-label Accuracy: 0.0 %
#________________________________________

#______Pytorch sigmoid - ffnn  model results:_________
#Average overall Accuracy: 55.81 %
#Average 0-label Accuracy: 56.23 %
#Average 1-label Accuracy: 50.97 %
#________________________________________


#______Pytorch sigmoid - add  model results:_________
#Average overall Accuracy: 55.81 %
#Average 0-label Accuracy: 56.23 %
#Average 1-label Accuracy: 50.97 %
#________________________________________

#______Pytorch sigmoid - ffnn  model results:_________
#Average overall Accuracy: 64.03 %
#Average 0-label Accuracy: 67.22 %
#Average 1-label Accuracy: 32.34 %
#________________________________________

Overall Accuracy: 77.95%
0-Label Accuracy: 82.89%
1-Label Accuracy: 23.05%
